# 데이터 전처리 및 시각화
- Pandas, Matplotlib 활용
- Pandas
  - 2008년 출시, Wes McKineey 개발
  - 목적 : 데이터 분석 및 조작을 위한 고성능 처리(Pandas Core가 c언어로 처리되어 있다), 데이터 구조를 지원하는 라이브러리
  - 특징 : 데이터 프레임(DataFrame) 및 시리즈(Series)를 바탕으로 데이터 처리
    1. 데이터 처리 빠르고 효율적
    2. 지원하는 생태계가 넓어 다른 라이브러리와 연결/통합 가능
    3. 인덱싱 지원, 효율적인 처리 가능
    4. 벡터화된 연산 지원
    5. Numpy를 기반, Numpy 연산 대부분 호환
  - Series는 한 Column과 데이터를 함께 묶은 개념

### EDA
- EDA?
  - EDA(Exploratory Data Analysis), 탐색적 데이터 분석
- Colab에서는 Pandas 관련 간단한 EDA가 존재


In [2]:
%pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading pytz-2024.2-py2.py3-none-any.whl (508 kB)
Downloading tzdata-2025.1-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.3 pytz-2024.2 tzdata-2025.1
[0mNote: you may need to restart the kernel to use updated packages.


In [11]:
import pandas as pd

data = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [25, 30, 35],
    "City": ["New York", "Paris", "London"],
}

df = pd.DataFrame(data)
df.to_csv("sample_data.csv", index=False)

df = pd.read_csv("sample_data.csv")
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Paris
2,Charlie,35,London


In [16]:
print("Original DataFrame:")
print(df)

print("\nAge Column: ")
print(df["Age"])


filtered_df = df[df["Age"] > 28]
print("\nFiltered DataFrame (Age > 28):")
print(filtered_df)

df["Salary"] = [70000, 80000, 90000]
print("\nDataFrame with Salary column:")
print(df)

Original DataFrame:
      Name  Age      City  Salary
0    Alice   25  New York   70000
1      Bob   30     Paris   80000
2  Charlie   35    London   90000

Age Column: 
0    25
1    30
2    35
Name: Age, dtype: int64

Filtered DataFrame (Age > 28):
      Name  Age    City  Salary
1      Bob   30   Paris   80000
2  Charlie   35  London   90000

DataFrame with Salary column:
      Name  Age      City  Salary
0    Alice   25  New York   70000
1      Bob   30     Paris   80000
2  Charlie   35    London   90000


In [18]:
type(df["Age"][0])

numpy.int64

# 데이터 변환 및 집계
- [Pandas Cheat Sheet](https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf)
  - Pandas 관련 내용 간단 정리

In [19]:
import pandas as pd
import numpy as np

np.random.seed(0)
data = {
    "Name": np.random.choice(["Alice", "Bob", "Charlie", "David", "Eve", None], size=1000),
    "Age": np.random.randint(20, 60, size=1000),
    "City":np.random.choice(["Seoul", "New York", "Paris", "London", "Berlin", "Tokyo"], size=1000),
    "Salary": np.random.randint(50000, 120000, size=1000),
}

departments = ["HR", "Marketing", "Sales", "IT", "Finance"]
np.random.seed(0)
df2 = pd.DataFrame({
    "Name": df["Name"].dropna().unique(), # unique: 중복 제거
    "Department": np.random.choice(departments, size=len(df["Name"].dropna().unique()))
})

In [23]:
%pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
# 저장/불러오기
df = pd.DataFrame(data)
df.to_csv("sample_data.csv", index=False)
df2.to_csv("sample_data2.csv", index=False)
df.to_excel("sample_data.xlsx", index=False) # 3만개 정도 넘어가면 저장 못할 수 있다 (Microsoft excel의 한계)
df.to_json("sample_data.json", index=False) # 정보 처리 중간에 오류 발생하면 처리 불가
df.to_json("sample_data_multilines.json", orient="records", index=False, lines=True) # 오류 발생 개선

df = pd.read_excel("sample_data.xlsx")
print(df)
df = pd.read_csv("sample_data.csv")
print(df)
df = pd.read_json("sample_data.json")
print(df)

      Name  Age      City  Salary
0      Eve   35     Seoul   50889
1      NaN   40    Berlin  110224
2    Alice   29     Tokyo   80141
3    David   24     Paris   59740
4    David   44     Paris   62998
..     ...  ...       ...     ...
995    Eve   24  New York   77062
996    NaN   30     Tokyo   76061
997    NaN   26     Tokyo   55645
998  David   28     Seoul  110482
999  Alice   28    Berlin   61768

[1000 rows x 4 columns]
      Name  Age      City  Salary
0      Eve   35     Seoul   50889
1      NaN   40    Berlin  110224
2    Alice   29     Tokyo   80141
3    David   24     Paris   59740
4    David   44     Paris   62998
..     ...  ...       ...     ...
995    Eve   24  New York   77062
996    NaN   30     Tokyo   76061
997    NaN   26     Tokyo   55645
998  David   28     Seoul  110482
999  Alice   28    Berlin   61768

[1000 rows x 4 columns]
      Name  Age      City  Salary
0      Eve   35     Seoul   50889
1     None   40    Berlin  110224
2    Alice   29     Tokyo   8014

In [40]:
# 데이터 정제
df = pd.read_csv("sample_data.csv")

older_than30 = df[df["Age"] >= 30]
print(older_than30)

older_in_seoul = df[(df["Age"] >= 30) & (df["City"] == "Seoul")]
print(older_in_seoul.head()) # 5개만 출력

df_not_null = df[df["Name"].notna()]
df_is_null = df[df["Name"].isna()]

# 이름에 'a' 포함
df_with_a = df[df["Name"].str.contains("a", na=False)]
print(df_with_a.head()) # 5개만 출력


        Name  Age      City  Salary
0        Eve   35     Seoul   50889
1        NaN   40    Berlin  110224
4      David   44     Paris   62998
5      David   50     Tokyo  104176
7      David   39  New York  117249
..       ...  ...       ...     ...
989  Charlie   37  New York  116816
990      NaN   42    London   81501
991      Eve   33  New York   75625
993      NaN   31     Tokyo   60565
996      NaN   30     Tokyo   76061

[769 rows x 4 columns]
   Name  Age   City  Salary
0   Eve   35  Seoul   50889
17  Bob   41  Seoul  119063
18  NaN   40  Seoul   67344
20  NaN   57  Seoul   85779
32  Bob   45  Seoul  108539
      Name  Age      City  Salary
3    David   24     Paris   59740
4    David   44     Paris   62998
5    David   50     Tokyo  104176
7    David   39  New York  117249
9  Charlie   33    Berlin   50075


In [32]:
df["Age"] >= 30

0       True
1       True
2      False
3      False
4       True
       ...  
995    False
996     True
997    False
998    False
999    False
Name: Age, Length: 1000, dtype: bool

In [42]:
# 데이터 변환

# 예) 대분자 변환
df["Name"] = df["Name"].apply(lambda x: x.upper() if pd.notna(x) else x)
print(df)

# 예) 이름과 City를 결합한 새로운 컬럼 생성
def process(row: pd.Series) -> str:
    if pd.notna(row['Name']):
        return f"{row['Name']} from {row['Name']}"
    return f"Unknown from {row['City']}"
df["Name_City"] = df.apply(process, axis=1)
print(df)

      Name  Age      City  Salary
0      EVE   35     Seoul   50889
1      NaN   40    Berlin  110224
2    ALICE   29     Tokyo   80141
3    DAVID   24     Paris   59740
4    DAVID   44     Paris   62998
..     ...  ...       ...     ...
995    EVE   24  New York   77062
996    NaN   30     Tokyo   76061
997    NaN   26     Tokyo   55645
998  DAVID   28     Seoul  110482
999  ALICE   28    Berlin   61768

[1000 rows x 4 columns]
      Name  Age      City  Salary            Name_City
0      EVE   35     Seoul   50889         EVE from EVE
1      NaN   40    Berlin  110224  Unknown from Berlin
2    ALICE   29     Tokyo   80141     ALICE from ALICE
3    DAVID   24     Paris   59740     DAVID from DAVID
4    DAVID   44     Paris   62998     DAVID from DAVID
..     ...  ...       ...     ...                  ...
995    EVE   24  New York   77062         EVE from EVE
996    NaN   30     Tokyo   76061   Unknown from Tokyo
997    NaN   26     Tokyo   55645   Unknown from Tokyo
998  DAVID   28  

In [None]:
# 데이터 집계/조인
average_salary = df["Salary"].mean()
print(average_salary)

average_age_by_city = df.groupby("City")["Age"].mean()
print(average_age_by_city)

aggregated_data = df.agg({"Age": ["mean", "min", "max"], "Salary": ["mean", "min", "max"]})
print(aggregated_data)

df = pd.read_csv("sample_data.csv")
df2 = pd.read_csv("sample_data2.csv")
# Inner Join (일치된 것들만 병합)
inner_joined = pd.merge(df, df2, on="Name", how="inner")
print(inner_joined)

# Outer Join
outer_joined = pd.merge(df, df2, on="Name", how="outer") # 예시에서는 Name=NaN인 경우에 Department = NaN

84173.79
City
Berlin      40.493976
London      39.914439
New York    39.987261
Paris       38.596491
Seoul       40.320261
Tokyo       38.686747
Name: Age, dtype: float64
         Age     Salary
mean  39.655   84173.79
min   20.000   50010.00
max   59.000  119939.00
      Name  Age      City  Salary Department
0      Eve   35     Seoul   50889    Finance
1    Alice   29     Tokyo   80141         HR
2    David   24     Paris   59740         IT
3    David   44     Paris   62998         IT
4    David   50     Tokyo  104176         IT
..     ...  ...       ...     ...        ...
824    Bob   22     Paris   59307         IT
825  David   29     Tokyo   82691         IT
826    Eve   24  New York   77062    Finance
827  David   28     Seoul  110482         IT
828  Alice   28    Berlin   61768         HR

[829 rows x 5 columns]


# Dask, 대용량 데이터 처리
- Dask
  - 2014년 출시, Matthew Rocklin 개발
  - 목적: 대규모 계산을 위한 병렬 처리, 지연 실행 및 대규모 데이터셋 처리
  - 특징
    - 동적 작업 스케쥴링
    - 대규모 배열
    - 데이터 프레임
    - 리스트 처리
    - 머신 러닝
    - 병렬 처리 지원
    - Pandas 호환 API
  - 구조
    - 자료구조
      - Dask Array
      - Dask DataFrame
      - Dask Bag
      - Dask Delayed
      - Futures
    - 작업 그래프
    - 스케줄러
      - 단일 머신 (thread, process, 동기화)
      - 분산 처리

In [2]:
%pip install dask
%pip install --no-cache-dir pyarrow>=10.0.1

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


In [9]:
%pip uninstall -y pyarrow

Found existing installation: pyarrow 10.0.1
Uninstalling pyarrow-10.0.1:
  Successfully uninstalled pyarrow-10.0.1
[0mNote: you may need to restart the kernel to use updated packages.


In [15]:
%pip install pybind11
%pip install --no-cache-dir pyarrow

Collecting pybind11
  Downloading pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Downloading pybind11-2.13.6-py3-none-any.whl (243 kB)
Installing collected packages: pybind11
Successfully installed pybind11-2.13.6
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


In [18]:
%pip install --upgrade pybind11
%pip install --upgrade numpy

[0mNote: you may need to restart the kernel to use updated packages.
Collecting numpy
  Downloading numpy-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Downloading numpy-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.1
    Uninstalling numpy-2.2.1:
      Successfully uninstalled numpy-2.2.1
Successfully installed numpy-2.2.2
[0mNote: you may need to restart the kernel to use updated packages.


In [20]:
%conda install pyarrow

Retrieving notices: done
Channels:
 - defaults
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - pyarrow


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    arrow-cpp-19.0.0           |       h865e1df_0        12.5 MB
    aws-c-auth-0.6.19          |       h5eee18b_0          99 KB
    aws-c-cal-0.5.20           |       hdbd6064_0          42 KB
    aws-c-common-0.8.5         |       h5eee18b_0         207 KB
    aws-c-compression-0.2.16   |       h5eee18b_0          18 KB
    aws-c-event-stream-0.2.15  |       h6a678d5_0          50 KB
    aws-c-http-0.6.25          |       h5eee18b_0         200 KB
    aws-c-io-0.13.10           |       h5eee18b_0         150 KB
    aws-c-mqtt-0.7.13          |       h5eee18b_0          67 KB
    aws-c-s3-0.1.51            

In [21]:
import time
import pandas as pd
import numpy as np
import dask.dataframe as dd

def create_dataset(nrows: int, ncols: int) -> tuple[pd.DataFrame, pd.DataFrame]:
    main_data = {f"col_{i}": np.random.rand(nrows) for i in range(ncols)}
    ref_data = {f"col_{i}": np.random.rand(nrows // 10) for i in range(ncols)}
    main_df = pd.DataFrame(main_data)
    ref_df = pd.DataFrame(ref_data)
    return main_df, ref_df

def pandas_operations(main_df: pd.DataFrame, ref_df: pd.DataFrame) -> tuple[float, float]:
    start_time_agg = time.time()
    grouped = main_df.groupby("col_0").mean()
    end_time_agg = time.time()

    start_time_join = time.time()
    joined = main_df.merge(ref_df, on="col_0", how="left")
    end_time_join = time.time()

    return end_time_agg - start_time_agg, end_time_join - start_time_join

def dask_operations(main_df: pd.DataFrame, ref_df: pd.DataFrame, npartitions: int) -> tuple[float, float]:
    dmain_df = dd.from_pandas(main_df, npartitions=npartitions)
    dref_df = dd.from_pandas(ref_df, npartitions=npartitions)

    start_time_agg = time.time()
    grouped_task = dmain_df.groupby("col_0").mean()
    grouped = grouped_task.compute()
    end_time_agg = time.time()
    grouped_task.visualize("grouped.svg")

    start_time_join = time.time()
    joined_task = dmain_df.merge(dref_df, on="col_0", how="left")
    joined = joined_task.compute()
    end_time_join = time.time()
    joined_task.visualize("joined.svg")

    return end_time_agg - start_time_agg, end_time_join - start_time_join

main_df, ref_df = create_dataset(10_000_000, 5)
pandas_agg_time, pandas_join_time = pandas_operations(main_df, ref_df)
dask_agg_time, dask_join_time = dask_operations(main_df, ref_df, npartitions=10)

print("Pandas 집계 시간: ", pandas_agg_time, "초")
print("Pandas 조인 시간: ", pandas_join_time, "초")
print("Dask 집계 시간: ", dask_agg_time, "초")
print("Dask 조인 시간: ", dask_join_time, "초")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/site-packages/tornado

AttributeError: _ARRAY_API not found

ImportError: Missing optional dependency 'pyarrow'.  Use pip or conda to install pyarrow.

# Lastest