In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Read the input CSV into a DataFrame
file_path = 'train_null_filtered.csv'
data = pd.read_csv(file_path)

# Inspect the data: first rows, then summary statistics
head_rows = data.head()
print("Data Preview:\n", head_rows)
summary_stats = data.describe()
print("Data Description:\n", summary_stats)

# 이상치 확인을 위한 함수 정의
def find_outliers_iqr(df):
    """Return the numeric columns of ``df`` that contain at least one IQR outlier.

    A value counts as an outlier when it lies strictly below
    ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``, where ``Q1`` and
    ``Q3`` are the column's 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Only columns selected by
        ``select_dtypes(include=[np.number])`` are examined.

    Returns
    -------
    list
        Labels of the numeric columns with at least one outlier, in the
        frame's column order. Empty when no column qualifies.
    """
    outlier_cols = []
    for col in df.select_dtypes(include=[np.number]).columns:  # numeric columns only
        values = df[col]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Test the boolean mask itself. Calling .any() on the filtered rows
        # would test the truthiness of the row *values*, so an outlier row
        # holding only zeros/NaN would go undetected.
        is_outlier = (values < lower_bound) | (values > upper_bound)
        if is_outlier.any():
            outlier_cols.append(col)

    return outlier_cols

# Find the numeric columns that contain IQR outliers
outlier_columns = find_outliers_iqr(data)
print("Columns with outliers based on IQR:\n", outlier_columns)

# Min-max scale the flagged columns.
# NOTE: min-max scaling maps each column linearly onto [0, 1]; it rescales
# the values but does not remove or clip the outliers themselves.
scaler = MinMaxScaler()

# Apply the scaler only to the columns that have outliers. Skip the step when
# nothing was flagged: fit_transform rejects an input with zero columns.
if outlier_columns:
    data[outlier_columns] = scaler.fit_transform(data[outlier_columns])

# Inspect the data after scaling
print("Data after MinMax Scaling:\n", data.describe())

Data Preview:
   Wip Line_Dam Process Desc._Dam     Equipment_Dam Model.Suffix_Dam  \
0      IVI-OB6     Dam Dispenser  Dam dispenser #1      AJX75334505   
1      IVI-OB6     Dam Dispenser  Dam dispenser #1      AJX75334505   
2      IVI-OB6     Dam Dispenser  Dam dispenser #2      AJX75334501   
3      IVI-OB6     Dam Dispenser  Dam dispenser #2      AJX75334501   
4      IVI-OB6     Dam Dispenser  Dam dispenser #1      AJX75334501   

  Workorder_Dam  Insp. Seq No._Dam Insp Judge Code_Dam  \
0    4F1XA938-1                  1                  OK   
1    3KPM0016-2                  1                  OK   
2    4E1X9167-1                  1                  OK   
3    3K1X0057-1                  1                  OK   
4    3HPM0007-1                  1                  OK   

   CURE END POSITION X Collect Result_Dam  \
0                                     240   
1                                     240   
2                                    1000   
3                            

In [13]:
# Save the result to a CSV file (optional)
output_path = 'train_null_filtered_scaled.csv'
data.to_csv(output_path, index=False)
