# Introduction

그동안 진행한 EDA와 Outlier Detection 결과를 이용해 분석용 데이터 전처리를 수행한다. 단계는 다음과 같다.

1. Outlier Detection
2. Create Summary Statistic Feature
3. PCA & Clustering

# Import Module

In [45]:
import numpy as np
import pandas as pd

from sklearn.ensemble import IsolationForest

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import warnings
warnings.filterwarnings('ignore')

# Load Data

In [46]:
# Read the three competition CSVs; in each file the first column is used as
# the row index.
train, test, sample_submission = (
    pd.read_csv(csv_path, index_col = 0)
    for csv_path in (
        '../dataset/train.csv',
        '../dataset/test.csv',
        '../dataset/sample_submission.csv',
    )
)

# Preprocessing

## Outlier Detection

In [47]:
# Separate the target column from the predictors.
y_train = train['type']
X_train = train.drop(columns = 'type')

# fiberID is excluded from the frames handed to the outlier detector.
# num_X_test is prepared the same way but is not used further in this cell.
num_X_train = X_train.drop(columns = 'fiberID')
num_X_test = test.drop(columns = 'fiberID')

# Isolation Forest with a 3% contamination rate; fit() returns the estimator.
model = IsolationForest(contamination = 0.03, random_state = 42).fit(num_X_train)

# predict() labels each training row 1 (inlier) or -1 (outlier).
outlier_pred = model.predict(num_X_train)
train['if_outlier'] = outlier_pred

# Show how many rows fall in each class.
train['if_outlier'].value_counts()

 1    193992
-1      5999
Name: if_outlier, dtype: int64

In [48]:
# Keep only the rows Isolation Forest labelled as inliers (if_outlier == 1)
# and remove the helper column again.
# drop() is chained and returns a new frame instead of being applied in place
# to a .loc slice of `train`; the in-place form is the chained-assignment
# pattern that raises SettingWithCopyWarning.
train_no_outlier = train.loc[train['if_outlier'] == 1].drop(columns = ['if_outlier'])

## Create Summary Statistic Feature

### Train

In [49]:
# Per-fiberID summary statistics (mean / median / std) of the feature columns,
# computed on the outlier-free training rows.
#
# The target column 'type' is dropped before grouping: it is a label, not a
# feature to aggregate, and leaving a non-numeric column in the frame makes
# groupby().mean()/median()/std() raise on pandas >= 2.0 (older versions
# silently discarded it).
fiberID_groups = train_no_outlier.drop(columns = 'type').groupby('fiberID')

# Each block below: aggregate, suffix the column names with the statistic,
# then turn the fiberID index back into a regular column so it can be used as
# a merge key.
fiberID_mean = fiberID_groups.mean()
mean_cols = ["{}_mean".format(x) for x in fiberID_mean.columns]
fiberID_mean.columns = mean_cols
fiberID_mean = fiberID_mean.reset_index()

fiberID_median = fiberID_groups.median()
median_cols = ["{}_median".format(x) for x in fiberID_median.columns]
fiberID_median.columns = median_cols
fiberID_median = fiberID_median.reset_index()

fiberID_std = fiberID_groups.std()
std_cols = ["{}_std".format(x) for x in fiberID_std.columns]
fiberID_std.columns = std_cols
fiberID_std = fiberID_std.reset_index()

In [50]:
# Attach the per-fiberID mean, median and std columns to every outlier-free
# training row. Left joins keep all training rows.
train = (
    train_no_outlier
    .merge(fiberID_mean, on = 'fiberID', how = 'left')
    .merge(fiberID_median, on = 'fiberID', how = 'left')
    .merge(fiberID_std, on = 'fiberID', how = 'left')
)

### Test

In [51]:
# Per-fiberID summary statistics of the test set, computed from the test rows
# themselves. The names fiberID_mean / fiberID_median / fiberID_std are reused,
# so they now refer to the test-set aggregates.
test_by_fiber = test.groupby('fiberID')

# Mean of every column per fiberID, columns suffixed with "_mean".
fiberID_mean = test_by_fiber.mean()
mean_cols = ["{}_mean".format(col) for col in fiberID_mean.columns]
fiberID_mean.columns = mean_cols
fiberID_mean = fiberID_mean.reset_index()

# Median of every column per fiberID, columns suffixed with "_median".
fiberID_median = test_by_fiber.median()
median_cols = ["{}_median".format(col) for col in fiberID_median.columns]
fiberID_median.columns = median_cols
fiberID_median = fiberID_median.reset_index()

# Standard deviation of every column per fiberID, columns suffixed with "_std".
fiberID_std = test_by_fiber.std()
std_cols = ["{}_std".format(col) for col in fiberID_std.columns]
fiberID_std.columns = std_cols
fiberID_std = fiberID_std.reset_index()

In [52]:
# Attach the per-fiberID mean, median and std columns to every test row.
# Left joins keep all test rows.
test = (
    test
    .merge(fiberID_mean, on = 'fiberID', how = 'left')
    .merge(fiberID_median, on = 'fiberID', how = 'left')
    .merge(fiberID_std, on = 'fiberID', how = 'left')
)

# k-Means Clustering

## psfMag

### Train

In [53]:
# Split the target column from the predictors.
X_train = train.drop(columns = 'type')
y_train = train['type']

# Standardise the five psfMag bands. `scaler` is fitted on the training rows
# only and is reused by the following test cell.
psfMag_train = X_train[['psfMag_u', 'psfMag_g', 'psfMag_r', 'psfMag_i', 'psfMag_z']]
scaler = StandardScaler()
psfMag_train_scaler = scaler.fit_transform(psfMag_train)

# Project onto the first two principal components.
# random_state is fixed because svd_solver='auto' may select the randomized
# solver, which is otherwise not reproducible between runs; this matches the
# seeding already used for IsolationForest and KMeans.
pca = PCA(n_components = 2, random_state = 42)
psf_pca = pca.fit_transform(psfMag_train_scaler)
print(pca.explained_variance_ratio_)
psf_pca_scaler_df = pd.DataFrame(psf_pca)

x = psf_pca_scaler_df.values

# Cluster the 2-D projection into five groups and store the label as a feature.
km = KMeans(n_clusters = 5, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 42)
y_means = km.fit_predict(x)

train['psf_cluster'] = y_means

[0.70151416 0.19999788]


### Test

In [54]:
X_test = test

# Apply the scaler, PCA and KMeans objects currently bound to `scaler`, `pca`
# and `km` to the test rows; nothing is refitted here. These names are reused
# by later sections, so this cell must run right after the psfMag train cell.
psf_cols = ['psfMag_u', 'psfMag_g', 'psfMag_r', 'psfMag_i', 'psfMag_z']
psfMag_test = X_test.loc[:, psf_cols]
psfMag_test_scaler = scaler.transform(psfMag_test)

psf_pca = pca.transform(psfMag_test_scaler)
# Prints the ratios stored on the already-fitted PCA object.
print(pca.explained_variance_ratio_)

psf_pca_scaler_df = pd.DataFrame(psf_pca)
x = psf_pca_scaler_df.values

# Assign each test row to the nearest of the fitted cluster centres.
y_means = km.predict(x)
test['psf_cluster'] = y_means

[0.70151416 0.19999788]


## fiberMag
### Train

In [55]:
# Split the target column from the predictors.
X_train = train.drop(columns = 'type')
y_train = train['type']

# Standardise the five fiberMag bands. `scaler` is refitted here on the
# training rows and is reused by the following test cell.
fiberMag_train = X_train[['fiberMag_u', 'fiberMag_g', 'fiberMag_r', 'fiberMag_i', 'fiberMag_z']]
scaler = StandardScaler()
fiberMag_train_scaler = scaler.fit_transform(fiberMag_train)

# Project onto the first two principal components.
# random_state is fixed because svd_solver='auto' may select the randomized
# solver, which is otherwise not reproducible between runs; this matches the
# seeding already used for IsolationForest and KMeans.
pca = PCA(n_components = 2, random_state = 42)
fiber_pca = pca.fit_transform(fiberMag_train_scaler)
print(pca.explained_variance_ratio_)
fiber_pca_scaler_df = pd.DataFrame(fiber_pca)

x = fiber_pca_scaler_df.values

# Cluster the 2-D projection into five groups and store the label as a feature.
km = KMeans(n_clusters = 5, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 42)
y_means = km.fit_predict(x)

train['fiber_cluster'] = y_means

[0.51879584 0.19999793]


### Test

In [56]:
X_test = test

# Apply the scaler, PCA and KMeans objects currently bound to `scaler`, `pca`
# and `km` to the test rows; nothing is refitted here. These names are reused
# by other sections, so this cell must run right after the fiberMag train cell.
fiber_cols = ['fiberMag_u', 'fiberMag_g', 'fiberMag_r', 'fiberMag_i', 'fiberMag_z']
fiberMag_test = X_test.loc[:, fiber_cols]
fiberMag_test_scaler = scaler.transform(fiberMag_test)

fiber_pca = pca.transform(fiberMag_test_scaler)
# Prints the ratios stored on the already-fitted PCA object.
print(pca.explained_variance_ratio_)

fiber_pca_scaler_df = pd.DataFrame(fiber_pca)
x = fiber_pca_scaler_df.values

# Assign each test row to the nearest of the fitted cluster centres.
y_means = km.predict(x)
test['fiber_cluster'] = y_means

[0.51879584 0.19999793]


## modelMag
### Train

In [57]:
# Split the target column from the predictors.
X_train = train.drop(columns = 'type')
y_train = train['type']

# Standardise the five modelMag bands. `scaler` is refitted here on the
# training rows and is reused by the following test cell.
modelMag_train = X_train[['modelMag_u', 'modelMag_g', 'modelMag_r', 'modelMag_i', 'modelMag_z']]
scaler = StandardScaler()
modelMag_train_scaler = scaler.fit_transform(modelMag_train)

# Project onto the first two principal components.
# random_state is fixed because svd_solver='auto' may select the randomized
# solver, which is otherwise not reproducible between runs; this matches the
# seeding already used for IsolationForest and KMeans.
pca = PCA(n_components = 2, random_state = 42)
model_pca = pca.fit_transform(modelMag_train_scaler)
print(pca.explained_variance_ratio_)
model_pca_scaler_df = pd.DataFrame(model_pca)

x = model_pca_scaler_df.values

# Cluster the 2-D projection into five groups and store the label as a feature.
km = KMeans(n_clusters = 5, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 42)
y_means = km.fit_predict(x)

train['model_cluster'] = y_means

[0.82275801 0.15646262]


### Test

In [58]:
X_test = test

# Apply the scaler, PCA and KMeans objects currently bound to `scaler`, `pca`
# and `km` to the test rows; nothing is refitted here. These names are reused
# by other sections, so this cell must run right after the modelMag train cell.
model_cols = ['modelMag_u', 'modelMag_g', 'modelMag_r', 'modelMag_i', 'modelMag_z']
modelMag_test = X_test.loc[:, model_cols]
modelMag_test_scaler = scaler.transform(modelMag_test)

model_pca = pca.transform(modelMag_test_scaler)
# Prints the ratios stored on the already-fitted PCA object.
print(pca.explained_variance_ratio_)

model_pca_scaler_df = pd.DataFrame(model_pca)
x = model_pca_scaler_df.values

# Assign each test row to the nearest of the fitted cluster centres.
y_means = km.predict(x)
test['model_cluster'] = y_means

[0.82275801 0.15646262]


In [59]:
# Preview the first rows of the test frame, which now carries the per-fiberID
# statistic columns and the three cluster-label columns.
test.head()

Unnamed: 0,fiberID,psfMag_u,psfMag_g,psfMag_r,psfMag_i,psfMag_z,fiberMag_u,fiberMag_g,fiberMag_r,fiberMag_i,...,petroMag_i_std,petroMag_z_std,modelMag_u_std,modelMag_g_std,modelMag_r_std,modelMag_i_std,modelMag_z_std,psf_cluster,fiber_cluster,model_cluster
0,251,23.817399,22.508963,20.981106,18.517316,17.076079,25.05389,23.167848,21.335901,18.835858,...,1.893139,1.793287,2.367397,2.213415,2.036589,1.887005,1.833027,0,1,2
1,386,22.806983,21.937111,20.33577,20.000512,19.527369,22.498565,22.186,20.618879,20.301204,...,1.347009,1.1211,2.04308,2.108371,1.781324,1.358582,1.112072,0,3,4
2,232,21.02425,19.235669,18.304061,17.808608,17.380113,21.205546,19.439533,18.344433,17.90969,...,1.591555,1.609979,3.038135,1.93275,1.771562,1.583241,1.574938,2,0,0
3,557,20.503424,20.286261,20.197204,20.162419,20.059832,20.976132,20.611498,20.567262,20.479318,...,1.813364,1.981834,2.144815,1.829022,1.887076,1.82598,1.913923,0,1,1
4,75,24.244851,22.668237,21.239333,19.284777,18.235939,25.68186,22.935289,21.642456,19.624926,...,1.627857,1.491637,2.775471,2.76291,2.167265,1.685952,1.623274,0,3,2


## fiberID ≥ 641 flag (`over_641`)

In [60]:
# Binary indicator: 1 where fiberID is 641 or higher, 0 otherwise.
# NOTE(review): the reason for the 641 cut-off is not stated in this notebook;
# presumably it comes from the earlier EDA -- confirm.
train['over_641'] = (train['fiberID'] >= 641).astype('int64')
test['over_641'] = (test['fiberID'] >= 641).astype('int64')

In [61]:
# Final (rows, columns) of the train and test frames, one per line.
print(train.shape, test.shape, sep = '\n')

(193992, 86)
(10009, 85)


In [62]:
# Descriptive statistics of the numeric columns of the final training frame,
# as a last sanity check before export.
train.describe()

Unnamed: 0,fiberID,psfMag_u,psfMag_g,psfMag_r,psfMag_i,psfMag_z,fiberMag_u,fiberMag_g,fiberMag_r,fiberMag_i,...,petroMag_z_std,modelMag_u_std,modelMag_g_std,modelMag_r_std,modelMag_i_std,modelMag_z_std,psf_cluster,fiber_cluster,model_cluster,over_641
count,193992.0,193992.0,193992.0,193992.0,193992.0,193992.0,193992.0,193992.0,193992.0,193992.0,...,193992.0,193992.0,193992.0,193992.0,193992.0,193992.0,193992.0,193992.0,193992.0,193992.0
mean,361.683528,-6.397396,19.848803,19.282599,18.892145,18.64481,11.56141,20.082393,19.503786,19.007427,...,2.128541,2.05135,1.741862,1.640903,1.597057,1.670811,1.524661,1.655305,1.695838,0.078601
std,226.027489,12057.85,1.746506,1.601831,1.574998,1.63769,4233.87,1.724449,1.603436,44.84657,...,5.322825,0.23524,0.278681,0.247652,0.239333,0.242967,1.487771,1.485863,1.439177,0.269116
min,1.0,-5310802.0,3.70147,8.605801,-30.71189,3.888018,-1864766.0,-0.770183,8.231789,-19721.172166,...,0.583952,0.679973,0.452045,0.411013,0.333267,0.465581,0.0,0.0,0.0,0.0
25%,174.0,19.68792,18.759996,18.087935,17.785838,17.463377,19.97887,18.950728,18.292905,17.941803,...,1.691051,2.02156,1.763357,1.644553,1.557283,1.624803,0.0,0.0,0.0,0.0
50%,350.0,20.86932,19.90351,19.454039,19.044438,18.635965,21.04946,20.070619,19.63465,19.193343,...,1.825125,2.098079,1.812434,1.705137,1.655037,1.726669,1.0,1.0,1.0,0.0
75%,527.0,22.10793,21.098885,20.481026,20.053615,19.898821,22.29554,21.341838,20.746834,20.311863,...,1.945569,2.172157,1.858977,1.75586,1.737751,1.809578,2.0,3.0,3.0,0.0
max,1000.0,40.74148,102.124043,54.501228,61.241476,32.130146,95.39492,33.362551,28.881709,32.171369,...,129.387047,2.682186,2.101853,1.944007,1.862731,1.97895,4.0,4.0,4.0,1.0


In [63]:
# Write the preprocessed train and test frames next to the raw data.
# NOTE(review): to_csv writes the current index as the first column. The
# merges on fiberID earlier in this notebook return frames with a fresh
# 0..n-1 index, so the row index originally read with index_col = 0 does not
# appear to be carried into these files -- confirm downstream code does not
# rely on it.
train.to_csv('../dataset/pr_train.csv')
test.to_csv('../dataset/pr_test.csv')