<a href="https://colab.research.google.com/github/Oatthapong/Python/blob/main/Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Cleaning (การทำความสะอาดข้อมูล)

# นำเข้า Library และสร้าง Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Build a small, deliberately messy sample dataset to clean in the steps below.
data = dict(
    # One ID is missing (NaN) and 106 appears twice.
    customer_id=[101, 102, 103, 104, np.nan, 106, 106],
    # One name is missing (None).
    name=["Alice", "Bob", "Charlie", None, "Eve", "Frank", "Frank"],
    # One missing value and one impossible value (-5).
    age=[25, 30, np.nan, 22, 28, -5, 40],
    # Stored as strings; the last entry is not a number.
    salary=["50000", "55000", "60000", "65000", "70000", "75000", "invalid"],
)

df = pd.DataFrame.from_dict(data)
print(df)

   customer_id     name   age   salary
0        101.0    Alice  25.0    50000
1        102.0      Bob  30.0    55000
2        103.0  Charlie   NaN    60000
3        104.0     None  22.0    65000
4          NaN      Eve  28.0    70000
5        106.0    Frank  -5.0    75000
6        106.0    Frank  40.0  invalid


# จัดการค่าหาย (Missing Data)

In [None]:
# Replace missing ages with the mean of the age column.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df["age"]: the chained inplace form acts on an
# intermediate object, raises a FutureWarning, and no longer updates df in
# pandas 3.0.
# NOTE(review): the mean is taken while the invalid age (-5) is still in the
# column, so that value is included in the average.
df["age"] = df["age"].fillna(df["age"].mean())
print(df)

   customer_id     name        age   salary
0        101.0    Alice  25.000000    50000
1        102.0      Bob  30.000000    55000
2        103.0  Charlie  23.333333    60000
3        104.0     None  22.000000    65000
4          NaN      Eve  28.000000    70000
5        106.0    Frank  -5.000000    75000
6        106.0    Frank  40.000000  invalid


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["age"].fillna(df["age"].mean(), inplace=True)


In [None]:
# Remove every row whose customer_id is missing (NaN); df is modified in place.
df.dropna(axis="index", subset=["customer_id"], inplace=True)
print(df)

   customer_id     name        age   salary
0        101.0    Alice  25.000000    50000
1        102.0      Bob  30.000000    55000
2        103.0  Charlie  23.333333    60000
3        104.0     None  22.000000    65000
5        106.0    Frank  -5.000000    75000
6        106.0    Frank  40.000000  invalid


In [None]:
# Replace missing names with the placeholder "Unknown".
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df["name"]: the chained inplace form acts on an
# intermediate object, raises a FutureWarning, and no longer updates df in
# pandas 3.0.
df["name"] = df["name"].fillna("Unknown")
print(df)

   customer_id     name        age   salary
0        101.0    Alice  25.000000    50000
1        102.0      Bob  30.000000    55000
2        103.0  Charlie  23.333333    60000
3        104.0  Unknown  22.000000    65000
5        106.0    Frank  -5.000000    75000
6        106.0    Frank  40.000000  invalid


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["name"].fillna("Unknown",inplace=True)


# แปลงประเภทข้อมูล (Data Type Conversion)

In [None]:
# Convert salary to a numeric column; entries that cannot be parsed become NaN.
salary_numeric = pd.to_numeric(df["salary"], errors="coerce")
df["salary"] = salary_numeric
print(df)

   customer_id     name        age   salary
0        101.0    Alice  25.000000  50000.0
1        102.0      Bob  30.000000  55000.0
2        103.0  Charlie  23.333333  60000.0
3        104.0  Unknown  22.000000  65000.0
5        106.0    Frank  -5.000000  75000.0
6        106.0    Frank  40.000000      NaN


In [None]:
# Cast customer_id to int (rows with a missing customer_id were dropped earlier).
customer_ids = df["customer_id"]
df["customer_id"] = customer_ids.astype(int)
print(df)

   customer_id     name        age   salary
0          101    Alice  25.000000  50000.0
1          102      Bob  30.000000  55000.0
2          103  Charlie  23.333333  60000.0
3          104  Unknown  22.000000  65000.0
5          106    Frank  -5.000000  75000.0
6          106    Frank  40.000000      NaN


# ลบค่าผิดปกติ (outliers)

In [None]:
# Keep only rows whose age is greater than 0 (drops zero and negative ages).
has_valid_age = df["age"] > 0
df = df[has_valid_age]
print(df)

   customer_id     name        age   salary
0          101    Alice  25.000000  50000.0
1          102      Bob  30.000000  55000.0
2          103  Charlie  23.333333  60000.0
3          104  Unknown  22.000000  65000.0
6          106    Frank  40.000000      NaN


# ลบข้อมูลซ้ำ (Duplicates)

In [None]:
# Remove rows that exactly duplicate an earlier row, keeping the first occurrence.
df.drop_duplicates(keep="first", inplace=True)
print(df)

   customer_id     name        age   salary
0          101    Alice  25.000000  50000.0
1          102      Bob  30.000000  55000.0
2          103  Charlie  23.333333  60000.0
3          104  Unknown  22.000000  65000.0
6          106    Frank  40.000000      NaN
