# 데이터 불러오기

In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans, DBSCAN
from sklearn.svm import SVC
import optuna
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [111]:
train = pd.read_csv("./data.csv")
test = pd.read_csv("./test.csv")

# 이상치 판단 (파생변수 `outInd` 추가)

In [112]:
# 이상치를 찾을 함수 정의
def find_outliers(column):
    q1 = column.quantile(0.999)
    q3 = column.quantile(0.001)
    
    # 0.1%보다 작거나 99.9%보다 큰 값을 가지는 인덱스를 반환
    outliers = (column > q1) | (column < q3)
    
    return outliers


# 각 열에 대해 이상치를 찾아내고 인덱스를 저장
outlier_indices = []

for column in train.columns[1:]:
    outliers = find_outliers(train[column])
    outlier_indices.extend(train.index[outliers].tolist())

# 중복된 인덱스 제거
outlier_indices = list(set(outlier_indices))

# outlier이면 1, 아니면 0인 열을 생성
outliers_df = pd.DataFrame(0, index=train.index, columns=['outInd'], dtype=int)
outliers_df.loc[outlier_indices, 'outInd'] = 1

# train_ori 데이터프레임에 outInd 열 추가
train = pd.concat([train, outliers_df], axis=1)

In [113]:
# test에 같은 작업 반복

outlier_indices = []

for column in test.columns[1:]:
    outliers = find_outliers(test[column])
    outlier_indices.extend(test.index[outliers].tolist())

outlier_indices = list(set(outlier_indices))

outliers_df = pd.DataFrame(0, index=test.index, columns=['outInd'], dtype=int)
outliers_df.loc[outlier_indices, 'outInd'] = 1

test = pd.concat([test, outliers_df], axis=1)

# 1차 칼럼 제거

* 결측치 존재 컬럼, 동일 컬럼, 값이 하나인 컬럼

In [114]:
index1 = ["X7", "X51", "X77", "X78", "X85", "X94", "X1", "X19", "X36"]
train = train.drop(index1, axis=1)

In [115]:
test = test.drop(index1, axis=1)

In [116]:
sum(train["outInd"] == 1)

413

# 2차 칼럼 제거

* 상관계수가 매우 높은 (> 0.95) 컬럼 제거

In [117]:
index2 = ["X4", "X89", "X16", "X18", "X27", "X38", "X42", "X43", "X75"]
train = train.drop(index2, axis=1)

In [118]:
test = test.drop(index2, axis=1)

# PCA

* [X2, X3, X86] -> P1
* [X6, X8, X10] -> P2
* [X40, X66, X91] -> P3


### train

In [119]:
# PCA할 칼럼 추출
columns_to_pca = ['X2', 'X3', 'X86']  
data_for_pca = train[columns_to_pca]

# 로버스트 스케일링
robust_scaler = RobustScaler()
scaled_data = robust_scaler.fit_transform(data_for_pca)

# PCA 수행
pca = PCA(n_components=1)
principal_component = pca.fit_transform(scaled_data)
train['P1'] = principal_component[:, 0]  

# 기존 칼럼 제거
train.drop(columns=columns_to_pca, inplace=True)

In [120]:
# PCA할 칼럼 추출
columns_to_pca = ['X6', 'X8', 'X10']  
data_for_pca = train[columns_to_pca]

# 로버스트 스케일링
robust_scaler = RobustScaler()
scaled_data = robust_scaler.fit_transform(data_for_pca)

# PCA 수행
pca = PCA(n_components=1)
principal_component = pca.fit_transform(scaled_data)
train['P2'] = principal_component[:, 0]  

# 기존 칼럼 제거
train.drop(columns=columns_to_pca, inplace=True)

In [121]:
# PCA할 칼럼 추출
columns_to_pca = ['X40', 'X66', 'X91']  
data_for_pca = train[columns_to_pca]

# 로버스트 스케일링
robust_scaler = RobustScaler()
scaled_data = robust_scaler.fit_transform(data_for_pca)

# PCA 수행
pca = PCA(n_components=1)
principal_component = pca.fit_transform(scaled_data)
train['P3'] = principal_component[:, 0]  

# 기존 칼럼 제거
train.drop(columns=columns_to_pca, inplace=True)

### test

In [122]:
# PCA할 칼럼 추출
columns_to_pca = ['X2', 'X3', 'X86']  
data_for_pca = test[columns_to_pca]

# 로버스트 스케일링
robust_scaler = RobustScaler()
scaled_data = robust_scaler.fit_transform(data_for_pca)

# PCA 수행
pca = PCA(n_components=1)
principal_component = pca.fit_transform(scaled_data)
test['P1'] = principal_component[:, 0]  

# 기존 칼럼 제거
test.drop(columns=columns_to_pca, inplace=True)

In [123]:
# PCA할 칼럼 추출
columns_to_pca = ['X6', 'X8', 'X10']  
data_for_pca = test[columns_to_pca]

# 로버스트 스케일링
robust_scaler = RobustScaler()
scaled_data = robust_scaler.fit_transform(data_for_pca)

# PCA 수행
pca = PCA(n_components=1)
principal_component = pca.fit_transform(scaled_data)
test['P2'] = principal_component[:, 0]  

# 기존 칼럼 제거
test.drop(columns=columns_to_pca, inplace=True)

In [124]:
# PCA할 칼럼 추출
columns_to_pca = ['X40', 'X66', 'X91']  
data_for_pca = test[columns_to_pca]

# 로버스트 스케일링
robust_scaler = RobustScaler()
scaled_data = robust_scaler.fit_transform(data_for_pca)

# PCA 수행
pca = PCA(n_components=1)
principal_component = pca.fit_transform(scaled_data)
test['P3'] = principal_component[:, 0]  

# 기존 칼럼 제거
test.drop(columns=columns_to_pca, inplace=True)

# 파생변수 `dispersion` 추가

In [125]:
# outInd 제외

train_no_out = train.drop('outInd', axis=1)

train_m = train_no_out.median()
train_std = train_no_out.std()

train_new = (train_no_out - train_m)/train_std
train_new

train_new["dispersion"] = train_new.apply(lambda x: sum(abs(x)), axis=1)

train["dispersion"] = train_new["dispersion"]

In [126]:
# test에 같은 작업 반복

test_no_out = test.drop('outInd', axis=1)

test_m = test_no_out.median()
test_std = test_no_out.std()

test_new = (test_no_out - test_m)/test_std
test_new

test_new["dispersion"] = test_new.apply(lambda x: sum(abs(x)), axis=1)

test["dispersion"] = test_new["dispersion"]

# RobustScaling

In [127]:
train

Unnamed: 0,Target,X5,X9,X11,X12,X13,X14,X15,X17,X20,...,X88,X90,X92,X93,X95,outInd,P1,P2,P3,dispersion
0,0,0.594077,0.303674,1.757675e-04,7.010000e+09,0.461902,0.009626,0.000000,0.166323,0.318632,...,0.623635,0.838672,0.026738,0.564893,0.027785,0,1.864980,-1.377682,-0.274130,22.809811
1,0,0.599000,0.303579,3.710000e+09,8.660000e+08,0.455518,0.000124,0.242231,0.182420,0.305192,...,0.624578,0.842348,0.026795,0.565179,0.029882,1,-0.870147,-3.689031,-0.298622,25.647179
2,0,0.613500,0.303461,1.584220e-04,6.040000e+09,0.482704,0.000362,0.197287,0.203278,0.337802,...,0.624640,0.841584,0.027058,0.566173,0.023145,0,-0.232152,-4.617759,0.288267,22.122375
3,0,0.607208,0.303966,1.795835e-04,0.000000e+00,0.468311,0.000175,0.063772,0.229615,0.379536,...,0.624431,0.843514,0.026862,0.565475,0.016974,0,-1.235467,-4.925382,3.257469,47.116747
4,0,0.605630,0.303097,2.330075e-04,4.380000e+09,0.466123,0.000401,0.000000,0.147613,0.321497,...,0.623780,0.833008,0.026671,0.564511,0.016055,0,3.025280,-0.975379,2.513752,25.034120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5450,0,0.599684,0.303375,1.945006e-04,0.000000e+00,0.481516,0.000338,0.000000,0.177616,0.327120,...,0.623414,0.838109,0.026565,0.563786,0.033673,0,1.582590,-1.056262,-0.595114,19.405067
5451,0,0.616238,0.303571,3.021582e-04,2.867675e-04,0.465169,0.000483,0.112461,0.178838,0.321461,...,0.623910,0.841515,0.026826,0.565320,0.046812,0,-0.471688,-3.835096,-1.070178,15.611162
5452,0,0.599410,0.303522,6.560000e+09,2.950000e+09,0.462404,0.000826,0.000000,0.160802,0.323159,...,0.623537,0.842439,0.026965,0.565869,0.017679,0,-0.527797,-3.359277,2.399194,36.928504
5453,0,0.597955,0.303528,7.470000e+09,0.000000e+00,0.462022,0.000400,0.180889,0.165101,0.319834,...,0.623869,0.840250,0.029204,0.568663,0.018015,0,0.746577,-2.916972,2.812620,23.400160


In [135]:
to_scale_cols = list(train.columns[:-5])
to_scale_cols.append("dispersion")

In [140]:
pd.DataFrame(to_scale)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60,61,62,63,64,65,66,67,68,69
0,0.0,-0.886425,1.258956,-2.479094e-14,1.930178,-0.332411,28.359756,-0.362475,-0.691176,-0.347541,...,-0.362031,3.073898,0.171620,-0.175770,-0.455461,-1.109691,-0.570769,-0.634898,-0.212908,0.028340
1,0.0,-0.517036,0.454452,8.750000e-01,0.112426,-1.011872,-0.606707,0.812868,-0.066993,-1.593443,...,-1.694008,-1.188970,0.074656,0.200130,1.344657,0.525086,-0.101019,-0.129617,-0.137632,0.277854
2,0.0,0.571120,-0.540430,-2.888185e-14,1.643195,1.881282,0.118902,0.594794,0.741830,1.429508,...,0.711794,-0.073865,-0.895917,0.472156,1.462505,0.185490,2.050112,1.625715,-0.379508,-0.032113
3,0.0,0.098972,3.735926,-2.389094e-14,-0.143787,0.349549,-0.451220,-0.053045,1.763072,5.298361,...,1.357237,0.324837,8.564674,0.034638,1.064600,1.043473,0.445204,0.393411,-0.601051,2.165857
4,0.0,-0.019470,-3.629478,-1.129092e-14,1.152071,0.116754,0.237805,-0.362475,-1.416667,-0.081967,...,0.015765,0.061905,-0.062220,2.817076,-0.179845,-3.628299,-1.124211,-1.309474,-0.634029,0.223942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5450,0.0,-0.465657,-1.275333,-2.037275e-14,-0.143787,1.754887,0.045732,-0.362475,-0.253268,0.439344,...,0.649665,0.117365,-0.394001,-0.336710,-0.877720,-1.360152,-1.992411,-2.588435,-0.001530,-0.271069
5451,0.0,0.776636,0.384852,5.018187e-15,-0.143787,0.015175,0.487805,0.183202,-0.205882,-0.085246,...,0.036506,-0.890497,-0.393903,0.132156,0.069271,0.154650,0.149895,0.118563,0.470170,-0.604699
5452,0.0,-0.486209,-0.026612,1.547170e+00,0.728994,-0.279032,1.533537,-0.362475,-0.905229,0.072131,...,-0.144994,7.314738,0.335470,-0.224355,-0.642560,0.565525,1.287300,1.088859,-0.575721,1.269917
5453,0.0,-0.595457,0.026612,1.761792e+00,-0.143787,-0.319736,0.234756,0.515226,-0.738562,-0.236066,...,-0.280098,-0.520251,0.454856,-0.246285,-0.009286,-0.407958,19.663315,6.021014,-0.563659,0.080254


In [141]:
# PC, outInd 제외
to_scale = train[to_scale_cols]
not_scale = train.loc[:, ["P1", "P2", "P3", "outInd"]]

# 로버스트 스케일링
robust_scaler = RobustScaler()
to_scale = robust_scaler.fit_transform(to_scale)
to_scale = pd.DataFrame(to_scale)
to_scale.columns = to_scale_cols

train = pd.concat([to_scale, not_scale], axis=1)
train

Unnamed: 0,Target,X5,X9,X11,X12,X13,X14,X15,X17,X20,...,X88,X90,X92,X93,X95,dispersion,P1,P2,P3,outInd
0,0.0,-0.886425,1.258956,-2.479094e-14,1.930178,-0.332411,28.359756,-0.362475,-0.691176,-0.347541,...,-0.455461,-1.109691,-0.570769,-0.634898,-0.212908,0.028340,1.864980,-1.377682,-0.274130,0
1,0.0,-0.517036,0.454452,8.750000e-01,0.112426,-1.011872,-0.606707,0.812868,-0.066993,-1.593443,...,1.344657,0.525086,-0.101019,-0.129617,-0.137632,0.277854,-0.870147,-3.689031,-0.298622,1
2,0.0,0.571120,-0.540430,-2.888185e-14,1.643195,1.881282,0.118902,0.594794,0.741830,1.429508,...,1.462505,0.185490,2.050112,1.625715,-0.379508,-0.032113,-0.232152,-4.617759,0.288267,0
3,0.0,0.098972,3.735926,-2.389094e-14,-0.143787,0.349549,-0.451220,-0.053045,1.763072,5.298361,...,1.064600,1.043473,0.445204,0.393411,-0.601051,2.165857,-1.235467,-4.925382,3.257469,0
4,0.0,-0.019470,-3.629478,-1.129092e-14,1.152071,0.116754,0.237805,-0.362475,-1.416667,-0.081967,...,-0.179845,-3.628299,-1.124211,-1.309474,-0.634029,0.223942,3.025280,-0.975379,2.513752,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5450,0.0,-0.465657,-1.275333,-2.037275e-14,-0.143787,1.754887,0.045732,-0.362475,-0.253268,0.439344,...,-0.877720,-1.360152,-1.992411,-2.588435,-0.001530,-0.271069,1.582590,-1.056262,-0.595114,0
5451,0.0,0.776636,0.384852,5.018187e-15,-0.143787,0.015175,0.487805,0.183202,-0.205882,-0.085246,...,0.069271,0.154650,0.149895,0.118563,0.470170,-0.604699,-0.471688,-3.835096,-1.070178,0
5452,0.0,-0.486209,-0.026612,1.547170e+00,0.728994,-0.279032,1.533537,-0.362475,-0.905229,0.072131,...,-0.642560,0.565525,1.287300,1.088859,-0.575721,1.269917,-0.527797,-3.359277,2.399194,0
5453,0.0,-0.595457,0.026612,1.761792e+00,-0.143787,-0.319736,0.234756,0.515226,-0.738562,-0.236066,...,-0.009286,-0.407958,19.663315,6.021014,-0.563659,0.080254,0.746577,-2.916972,2.812620,0


In [143]:
to_scale_cols.remove("Target")

In [144]:
# test에 같은 작업 반복
to_scale = test[to_scale_cols]
not_scale = test.loc[:, ["P1", "P2", "P3", "outInd"]]

robust_scaler = RobustScaler()
to_scale = robust_scaler.fit_transform(to_scale)
to_scale = pd.DataFrame(to_scale)
to_scale.columns = to_scale_cols

test = pd.concat([to_scale, not_scale], axis=1)
test

Unnamed: 0,X5,X9,X11,X12,X13,X14,X15,X17,X20,X21,...,X88,X90,X92,X93,X95,dispersion,P1,P2,P3,outInd
0,1.130597,2.024802,-1.463030e-14,-0.160000,2.357193,0.152466,0.623454,-0.066561,0.621138,-0.531835,...,16.206406,0.447264,0.005189,-0.008954,1.391141,0.001985,-1.411827,-3.730843,-1.491302,0
1,-0.183074,-0.122024,1.258517e-01,-0.153014,0.007087,0.059791,0.886195,1.454834,0.227642,-0.139326,...,-11.325105,-0.227483,1.425837,1.221195,0.034652,0.178615,0.330221,-1.480296,-0.677878,1
2,-0.082636,-0.510913,2.527722e+00,0.681096,2.284904,0.544096,0.151905,-0.535658,0.152846,-0.546317,...,-6.889754,-0.270745,5.544565,3.401498,-0.152552,0.242805,0.261365,-1.264733,-0.713613,0
3,-0.581108,-0.120040,1.824983e-01,-0.160000,-1.124203,0.194320,1.348342,-0.412044,-1.404878,0.365044,...,-1.129262,-0.435487,7.233426,3.969819,-0.065271,-0.462762,0.554547,-0.653684,-0.278867,0
4,-0.587485,-0.050595,5.317301e-01,-0.160000,-0.743799,-0.266069,0.979218,0.095087,-1.655285,4.499875,...,-1.976337,1.344832,0.165567,0.149641,-0.426249,0.901968,-1.835425,-1.089270,1.216211,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1359,0.574200,-0.108135,-1.586622e-14,-0.160000,0.411233,-0.089686,0.571499,1.488114,0.598374,-0.103371,...,-0.329129,-0.108984,0.449081,0.416415,0.248296,-0.619389,-0.033326,-1.757153,-0.946026,0
1360,0.187857,0.280754,-3.307652e-14,-0.160000,0.722360,0.017937,-0.350816,-0.438986,0.000000,-0.499376,...,1.118831,0.117920,0.044333,0.030291,1.319254,-0.431531,-0.621173,-2.130268,-1.497297,0
1361,-0.381825,-0.256944,1.360053e+00,-0.160000,0.567860,-0.116592,-0.211282,-0.169572,0.230894,-0.414981,...,-0.259977,-0.249547,1.668712,1.397787,0.299005,-0.790940,0.355781,-1.253288,-1.035687,0
1362,1.302777,0.433532,2.337448e-14,-0.160000,1.122254,-0.941704,-0.347848,0.462758,0.588618,-0.226717,...,0.098828,0.273728,-0.143058,-0.160837,1.404245,-0.460922,-0.987626,-2.390689,-1.512373,0


# 파일 I/O

In [146]:
# 전처리된 최종 파일 만들기!
train.to_csv("data_p.csv", index=False)
test.to_csv("test_p.csv", index=False)