# Introduction

그동안 진행했던 EDA와 Outlier Detection을 이용해 분석용 데이터 전처리를 수행한다. 단계는 다음과 같다.

1. ugriz to UBVRcIc
2. Outlier Detection
3. Create Summary Statistic Feature
4. PCA & Clustering

# Import Module

In [85]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import PolynomialFeatures

from sklearn.ensemble import IsolationForest

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import warnings
warnings.filterwarnings('ignore')

# Load Data

In [86]:
# Read the three competition CSVs; the first column of each file becomes the index.
train, test, sample_submission = (
    pd.read_csv(csv_path, index_col = 0)
    for csv_path in (
        '../dataset/lee_train.csv',
        '../dataset/lee_test.csv',
        '../dataset/sample_submission.csv',
    )
)

# Preprocessing

## ugriz to UBVRcIc

### B

In [87]:
train['psfMag_B'] = train['psfMag_g'] - 0.39 * (train['psfMag_u'] - train['psfMag_g']) + 0.21
train['fiberMag_B'] = train['fiberMag_g'] - 0.39 * (train['fiberMag_u'] - train['fiberMag_g']) + 0.21
train['petroMag_B'] = train['petroMag_g'] - 0.39 * (train['petroMag_u'] - train['petroMag_g']) + 0.21
train['modelMag_B'] = train['modelMag_g'] - 0.39 * (train['modelMag_u'] - train['modelMag_g']) + 0.21

In [88]:
test['psfMag_B'] = test['psfMag_g'] - 0.39 * (test['psfMag_u'] - test['psfMag_g']) + 0.21
test['fiberMag_B'] = test['fiberMag_g'] - 0.39 * (test['fiberMag_u'] - test['fiberMag_g']) + 0.21
test['petroMag_B'] = test['petroMag_g'] - 0.39 * (test['petroMag_u'] - test['petroMag_g']) + 0.21
test['modelMag_B'] = test['modelMag_g'] - 0.39 * (test['modelMag_u'] - test['modelMag_g']) + 0.21

### V

In [89]:
# V magnitude from the g and r bands: V = g - 0.59 * (g - r) - 0.01,
# computed once per magnitude type.
for mag_kind in ('psfMag', 'fiberMag', 'petroMag', 'modelMag'):
    train[mag_kind + '_V'] = train[mag_kind + '_g'] - 0.59 * (train[mag_kind + '_g'] - train[mag_kind + '_r']) - 0.01

In [90]:
# V magnitude for the test frame: V = g - 0.59 * (g - r) - 0.01.
for mag_kind in ('psfMag', 'fiberMag', 'petroMag', 'modelMag'):
    test[mag_kind + '_V'] = test[mag_kind + '_g'] - 0.59 * (test[mag_kind + '_g'] - test[mag_kind + '_r']) - 0.01

### U-B

In [91]:
# U-B colour from the u and g bands: U-B = 0.78 * (u - g) - 0.88.
for mag_kind in ('psfMag', 'fiberMag', 'petroMag', 'modelMag'):
    train[mag_kind + '_u_b'] = 0.78*(train[mag_kind + '_u'] - train[mag_kind + '_g'])- 0.88

In [92]:
# U-B colour for the test frame: U-B = 0.78 * (u - g) - 0.88.
for mag_kind in ('psfMag', 'fiberMag', 'petroMag', 'modelMag'):
    test[mag_kind + '_u_b'] = 0.78*(test[mag_kind + '_u'] - test[mag_kind + '_g'])- 0.88

### U

In [93]:
# U magnitude recovered as (U-B) + B from the columns built earlier in the notebook.
for mag_kind in ('psfMag', 'fiberMag', 'petroMag', 'modelMag'):
    train[mag_kind + '_U'] = train[mag_kind + '_u_b'] + train[mag_kind + '_B']

In [94]:
# U magnitude for the test frame: (U-B) + B.
for mag_kind in ('psfMag', 'fiberMag', 'petroMag', 'modelMag'):
    test[mag_kind + '_U'] = test[mag_kind + '_u_b'] + test[mag_kind + '_B']

### B-V

In [95]:
# B-V colour from the g and r bands: B-V = 0.98 * (g - r) + 0.22.
for mag_kind in ('psfMag', 'fiberMag', 'petroMag', 'modelMag'):
    train[mag_kind + '_b_v'] = 0.98*(train[mag_kind + '_g'] - train[mag_kind + '_r']) + 0.22

In [96]:
# B-V colour for the test frame: B-V = 0.98 * (g - r) + 0.22.
for mag_kind in ('psfMag', 'fiberMag', 'petroMag', 'modelMag'):
    test[mag_kind + '_b_v'] = 0.98*(test[mag_kind + '_g'] - test[mag_kind + '_r']) + 0.22

### u - g

In [97]:
# Raw u - g colour per magnitude type.
# The output column names keep their existing '...mMag_u_g' spelling.
for mag_kind, out_col in (
    ('psfMag', 'psfmMag_u_g'),
    ('fiberMag', 'fibermMag_u_g'),
    ('petroMag', 'petromMag_u_g'),
    ('modelMag', 'modelmMag_u_g'),
):
    train[out_col] = train[mag_kind + '_u'] - train[mag_kind + '_g']

In [98]:
# Raw u - g colour for the test frame.
# The output column names keep their existing '...mMag_u_g' spelling.
for mag_kind, out_col in (
    ('psfMag', 'psfmMag_u_g'),
    ('fiberMag', 'fibermMag_u_g'),
    ('petroMag', 'petromMag_u_g'),
    ('modelMag', 'modelmMag_u_g'),
):
    test[out_col] = test[mag_kind + '_u'] - test[mag_kind + '_g']

### g - r

In [99]:
# Raw g - r colour per magnitude type.
for mag_kind in ('psfMag', 'fiberMag', 'petroMag', 'modelMag'):
    train[mag_kind + '_g_r'] = train[mag_kind + '_g'] - train[mag_kind + '_r']

In [100]:
# Raw g - r colour for the test frame.
for mag_kind in ('psfMag', 'fiberMag', 'petroMag', 'modelMag'):
    test[mag_kind + '_g_r'] = test[mag_kind + '_g'] - test[mag_kind + '_r']

### g - i

In [101]:
# Raw g - i colour per magnitude type.
for mag_kind in ('psfMag', 'fiberMag', 'petroMag', 'modelMag'):
    train[mag_kind + '_g_i'] = train[mag_kind + '_g'] - train[mag_kind + '_i']

In [102]:
# Raw g - i colour for the test frame.
for mag_kind in ('psfMag', 'fiberMag', 'petroMag', 'modelMag'):
    test[mag_kind + '_g_i'] = test[mag_kind + '_g'] - test[mag_kind + '_i']

### i -  z

In [103]:
# Raw i - z colour per magnitude type.
for mag_kind in ('psfMag', 'fiberMag', 'petroMag', 'modelMag'):
    train[mag_kind + '_i_z'] = train[mag_kind + '_i'] - train[mag_kind + '_z']

In [104]:
# Raw i - z colour for the test frame.
for mag_kind in ('psfMag', 'fiberMag', 'petroMag', 'modelMag'):
    test[mag_kind + '_i_z'] = test[mag_kind + '_i'] - test[mag_kind + '_z']

# Polynomial Feature
## TO - DO

# 색지수(B-V)를 이용한 Spectral Class 변수 생성

<img src="https://www.scienceall.com/nas/image/201008/20100818_Dwh30q09.jpg">

In [105]:
def SpectralClass(x):
    """Map a B-V colour index to a spectral class letter.

    The classes are half-open intervals on ``x``:
    O: x < -0.3, B: [-0.3, 0), A: [0, 0.33), F: [0.33, 0.6),
    G: [0.6, 0.81), K: [0.81, 1.4), M: everything else.

    A value that compares False against every bound (e.g. NaN) ends up in
    the final "M" bucket.
    """
    # (exclusive upper bound, label) pairs in ascending order; the first
    # bound that x falls below decides the class.
    upper_bounds = (
        (-0.3, "O"),
        (0, "B"),
        (0.33, "A"),
        (0.6, "F"),
        (0.81, "G"),
        (1.4, "K"),
    )
    for bound, label in upper_bounds:
        if x < bound:
            return label
    return "M"

## Train

In [106]:
# Label every training row with a spectral class, once per B-V column.
for short_name in ('psf', 'fiber', 'petro', 'model'):
    train['star_spectrum_' + short_name] = train[short_name + 'Mag_b_v'].apply(SpectralClass)

In [83]:
# One-hot encode the four spectral-class columns against a fixed label set.
# Train and test are encoded in separate cells; pinning the categories makes
# the resulting columns independent of which classes happen to occur in this
# frame. The labels are listed alphabetically, which is the column order
# get_dummies produces for plain string columns.
spectral_dtype = pd.CategoricalDtype(categories = ['A', 'B', 'F', 'G', 'K', 'M', 'O'])

# The prefixes end in '_' and get_dummies adds its own '_' separator, so the
# columns are named e.g. 'star_spectrum_psf__A' (existing naming, kept as is).
sspsf = pd.get_dummies(train['star_spectrum_psf'].astype(spectral_dtype), prefix = 'star_spectrum_psf_')
ssf = pd.get_dummies(train['star_spectrum_fiber'].astype(spectral_dtype), prefix = 'star_spectrum_fiber_')
sspetro = pd.get_dummies(train['star_spectrum_petro'].astype(spectral_dtype), prefix = 'star_spectrum_petro_')
ssm = pd.get_dummies(train['star_spectrum_model'].astype(spectral_dtype), prefix = 'star_spectrum_model_')

star_spectrum_train = pd.concat([sspsf, ssf, sspetro, ssm], axis = 1)
train = pd.concat([train,star_spectrum_train], axis = 1)
# The string label columns are no longer needed once encoded.
train = train.drop(columns = ['star_spectrum_psf', 'star_spectrum_fiber', 'star_spectrum_petro', 'star_spectrum_model'])

## Test

In [84]:
# Label every test row with a spectral class, once per B-V column.
for short_name in ('psf', 'fiber', 'petro', 'model'):
    test['star_spectrum_' + short_name] = test[short_name + 'Mag_b_v'].apply(SpectralClass)

In [57]:
# One-hot encode the four spectral-class columns against a fixed label set.
# Train and test are encoded in separate cells; pinning the categories makes
# the resulting columns independent of which classes happen to occur in this
# frame. The labels are listed alphabetically, which is the column order
# get_dummies produces for plain string columns.
spectral_dtype = pd.CategoricalDtype(categories = ['A', 'B', 'F', 'G', 'K', 'M', 'O'])

# The prefixes end in '_' and get_dummies adds its own '_' separator, so the
# columns are named e.g. 'star_spectrum_psf__A' (existing naming, kept as is).
sspsf = pd.get_dummies(test['star_spectrum_psf'].astype(spectral_dtype), prefix = 'star_spectrum_psf_')
ssf = pd.get_dummies(test['star_spectrum_fiber'].astype(spectral_dtype), prefix = 'star_spectrum_fiber_')
sspetro = pd.get_dummies(test['star_spectrum_petro'].astype(spectral_dtype), prefix = 'star_spectrum_petro_')
ssm = pd.get_dummies(test['star_spectrum_model'].astype(spectral_dtype), prefix = 'star_spectrum_model_')

star_spectrum_test = pd.concat([sspsf, ssf, sspetro, ssm], axis = 1)
test = pd.concat([test,star_spectrum_test], axis = 1)
# The string label columns are no longer needed once encoded.
test = test.drop(columns = ['star_spectrum_psf', 'star_spectrum_fiber', 'star_spectrum_petro', 'star_spectrum_model'])

In [58]:
# Persist the feature-engineered frames (index included) as a checkpoint.
for frame, csv_path in (
    (train, '../dataset/train_20200226.csv'),
    (test, '../dataset/test_20200226.csv'),
):
    frame.to_csv(csv_path)

## Outlier Detection

In [47]:
# Split the target off from the predictors.
y_train = train['type']
X_train = train.drop(columns = 'type')

# fiberID is left out of the matrices handed to the outlier detector.
num_X_train = X_train.drop(columns = 'fiberID')
num_X_test = test.drop(columns = 'fiberID')

# Isolation Forest with 3% assumed contamination; fit() returns the estimator,
# so fitting and scoring the training rows can be chained.
model = IsolationForest(random_state = 42, contamination = 0.03)
outlier_pred = model.fit(num_X_train).predict(num_X_train)

# Store the per-row prediction (the cell output shows the values 1 and -1).
train['if_outlier'] = outlier_pred

train['if_outlier'].value_counts()

 1    193992
-1      5999
Name: if_outlier, dtype: int64

In [48]:
# Keep only the rows flagged 1 by the Isolation Forest and drop the helper
# column. Doing both in one chained expression produces an independent frame
# rather than mutating a filtered slice in place, which would raise a
# SettingWithCopyWarning (silenced here by the global warnings filter).
train_no_outlier = train.loc[train['if_outlier'] == 1].drop(columns = ['if_outlier'])

## Create Summary Statistic Feature

### Train

In [49]:
# Per-fiberID mean / median / standard deviation of the feature columns.
# The target column 'type' is excluded before aggregating: the test frame has
# no such column, so statistics of the target could never be reproduced there,
# and a non-numeric target makes the aggregations fail on recent pandas.
fiber_groups = train_no_outlier.drop(columns = 'type').groupby('fiberID')

fiberID_mean = fiber_groups.mean()
mean_cols = ["{}_mean".format(x) for x in fiberID_mean.columns]
fiberID_mean.columns = mean_cols
# Turn fiberID back into a regular column so it can be used as a merge key.
fiberID_mean = fiberID_mean.reset_index()

fiberID_median = fiber_groups.median()
median_cols = ["{}_median".format(x) for x in fiberID_median.columns]
fiberID_median.columns = median_cols
fiberID_median = fiberID_median.reset_index()

fiberID_std = fiber_groups.std()
std_cols = ["{}_std".format(x) for x in fiberID_std.columns]
fiberID_std.columns = std_cols
fiberID_std = fiberID_std.reset_index()

In [50]:
# Attach the three per-fiberID statistic tables to the outlier-free rows.
# NOTE(review): merging on a column returns a frame with a fresh integer
# index, so the index loaded from the CSV is not carried through.
train = (
    train_no_outlier
    .merge(fiberID_mean, on = 'fiberID', how = 'left')
    .merge(fiberID_median, on = 'fiberID', how = 'left')
    .merge(fiberID_std, on = 'fiberID', how = 'left')
)

### Test

In [51]:
# Per-fiberID mean / median / standard deviation, computed from the test rows
# themselves. Each table gets a suffix on its columns and fiberID restored as
# a regular column for merging.
fiber_groups = test.groupby('fiberID')

fiberID_mean = fiber_groups.mean()
mean_cols = [str.format("{}_mean", col) for col in fiberID_mean.columns]
fiberID_mean.columns = mean_cols
fiberID_mean = fiberID_mean.reset_index()

fiberID_median = fiber_groups.median()
median_cols = [str.format("{}_median", col) for col in fiberID_median.columns]
fiberID_median.columns = median_cols
fiberID_median = fiberID_median.reset_index()

fiberID_std = fiber_groups.std()
std_cols = [str.format("{}_std", col) for col in fiberID_std.columns]
fiberID_std.columns = std_cols
fiberID_std = fiberID_std.reset_index()

In [52]:
# Attach the three per-fiberID statistic tables to the test rows.
# NOTE(review): merging on a column returns a frame with a fresh integer
# index, so the index loaded from the CSV is not carried through.
test = (
    test
    .merge(fiberID_mean, on = 'fiberID', how = 'left')
    .merge(fiberID_median, on = 'fiberID', how = 'left')
    .merge(fiberID_std, on = 'fiberID', how = 'left')
)

# k-Means Clustering

## psfMag

### Train

In [53]:
# Split target and predictors from the current training frame.
y_train = train['type']
X_train = train.drop(columns = 'type')

# Standardise the five psfMag bands. fit() returns the scaler itself, so
# fitting and transforming the same data are chained.
psf_bands = ['psfMag_u', 'psfMag_g', 'psfMag_r', 'psfMag_i', 'psfMag_z']
psfMag_train = X_train[psf_bands]
scaler = StandardScaler().fit(psfMag_train)
psfMag_train_scaler = scaler.transform(psfMag_train)

# Reduce to two principal components and report how much variance they carry.
pca = PCA(n_components = 2)
psf_pca = pca.fit_transform(psfMag_train_scaler)
print(pca.explained_variance_ratio_)
psf_pca_scaler_df = pd.DataFrame(psf_pca)

x = psf_pca_scaler_df.values

# Five k-means clusters in the 2-D PCA space; the labels become a new feature.
km = KMeans(n_clusters = 5, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 42)
y_means = km.fit_predict(x)

train['psf_cluster'] = y_means

[0.70151416 0.19999788]


### Test

In [54]:
# Reuse the `scaler`, `pca` and `km` objects currently in scope (nothing is
# refitted here) to assign psfMag clusters to the test rows.
X_test = test

psf_bands = ['psfMag_u', 'psfMag_g', 'psfMag_r', 'psfMag_i', 'psfMag_z']
psfMag_test = X_test[psf_bands]
psfMag_test_scaler = scaler.transform(psfMag_test)

psf_pca = pca.transform(psfMag_test_scaler)
# transform() does not refit, so this prints the ratios of the existing fit.
print(pca.explained_variance_ratio_)
psf_pca_scaler_df = pd.DataFrame(psf_pca)

x = psf_pca_scaler_df.values
y_means = km.predict(x)

test['psf_cluster'] = y_means

[0.70151416 0.19999788]


## fiberMag
### Train

In [55]:
# Split target and predictors from the current training frame.
y_train = train['type']
X_train = train.drop(columns = 'type')

# Standardise the five fiberMag bands. fit() returns the scaler itself, so
# fitting and transforming the same data are chained.
fiber_bands = ['fiberMag_u', 'fiberMag_g', 'fiberMag_r', 'fiberMag_i', 'fiberMag_z']
fiberMag_train = X_train[fiber_bands]
scaler = StandardScaler().fit(fiberMag_train)
fiberMag_train_scaler = scaler.transform(fiberMag_train)

# Reduce to two principal components and report how much variance they carry.
pca = PCA(n_components = 2)
fiber_pca = pca.fit_transform(fiberMag_train_scaler)
print(pca.explained_variance_ratio_)
fiber_pca_scaler_df = pd.DataFrame(fiber_pca)

x = fiber_pca_scaler_df.values

# Five k-means clusters in the 2-D PCA space; the labels become a new feature.
km = KMeans(n_clusters = 5, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 42)
y_means = km.fit_predict(x)

train['fiber_cluster'] = y_means

[0.51879584 0.19999793]


### Test

In [56]:
# Reuse the `scaler`, `pca` and `km` objects currently in scope (nothing is
# refitted here) to assign fiberMag clusters to the test rows.
X_test = test

fiber_bands = ['fiberMag_u', 'fiberMag_g', 'fiberMag_r', 'fiberMag_i', 'fiberMag_z']
fiberMag_test = X_test[fiber_bands]
fiberMag_test_scaler = scaler.transform(fiberMag_test)

fiber_pca = pca.transform(fiberMag_test_scaler)
# transform() does not refit, so this prints the ratios of the existing fit.
print(pca.explained_variance_ratio_)
fiber_pca_scaler_df = pd.DataFrame(fiber_pca)

x = fiber_pca_scaler_df.values
y_means = km.predict(x)

test['fiber_cluster'] = y_means

[0.51879584 0.19999793]


## modelMag
### Train

In [57]:
# Split target and predictors from the current training frame.
y_train = train['type']
X_train = train.drop(columns = 'type')

# Standardise the five modelMag bands. fit() returns the scaler itself, so
# fitting and transforming the same data are chained.
model_bands = ['modelMag_u', 'modelMag_g', 'modelMag_r', 'modelMag_i', 'modelMag_z']
modelMag_train = X_train[model_bands]
scaler = StandardScaler().fit(modelMag_train)
modelMag_train_scaler = scaler.transform(modelMag_train)

# Reduce to two principal components and report how much variance they carry.
pca = PCA(n_components = 2)
model_pca = pca.fit_transform(modelMag_train_scaler)
print(pca.explained_variance_ratio_)
model_pca_scaler_df = pd.DataFrame(model_pca)

x = model_pca_scaler_df.values

# Five k-means clusters in the 2-D PCA space; the labels become a new feature.
km = KMeans(n_clusters = 5, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 42)
y_means = km.fit_predict(x)

train['model_cluster'] = y_means

[0.82275801 0.15646262]


### Test

In [58]:
# Reuse the `scaler`, `pca` and `km` objects currently in scope (nothing is
# refitted here) to assign modelMag clusters to the test rows.
X_test = test

model_bands = ['modelMag_u', 'modelMag_g', 'modelMag_r', 'modelMag_i', 'modelMag_z']
modelMag_test = X_test[model_bands]
modelMag_test_scaler = scaler.transform(modelMag_test)

model_pca = pca.transform(modelMag_test_scaler)
# transform() does not refit, so this prints the ratios of the existing fit.
print(pca.explained_variance_ratio_)
model_pca_scaler_df = pd.DataFrame(model_pca)

x = model_pca_scaler_df.values
y_means = km.predict(x)

test['model_cluster'] = y_means

[0.82275801 0.15646262]


In [59]:
# Preview the first rows of the test frame after the cluster columns were added.
test.head()

Unnamed: 0,fiberID,psfMag_u,psfMag_g,psfMag_r,psfMag_i,psfMag_z,fiberMag_u,fiberMag_g,fiberMag_r,fiberMag_i,...,petroMag_i_std,petroMag_z_std,modelMag_u_std,modelMag_g_std,modelMag_r_std,modelMag_i_std,modelMag_z_std,psf_cluster,fiber_cluster,model_cluster
0,251,23.817399,22.508963,20.981106,18.517316,17.076079,25.05389,23.167848,21.335901,18.835858,...,1.893139,1.793287,2.367397,2.213415,2.036589,1.887005,1.833027,0,1,2
1,386,22.806983,21.937111,20.33577,20.000512,19.527369,22.498565,22.186,20.618879,20.301204,...,1.347009,1.1211,2.04308,2.108371,1.781324,1.358582,1.112072,0,3,4
2,232,21.02425,19.235669,18.304061,17.808608,17.380113,21.205546,19.439533,18.344433,17.90969,...,1.591555,1.609979,3.038135,1.93275,1.771562,1.583241,1.574938,2,0,0
3,557,20.503424,20.286261,20.197204,20.162419,20.059832,20.976132,20.611498,20.567262,20.479318,...,1.813364,1.981834,2.144815,1.829022,1.887076,1.82598,1.913923,0,1,1
4,75,24.244851,22.668237,21.239333,19.284777,18.235939,25.68186,22.935289,21.642456,19.624926,...,1.627857,1.491637,2.775471,2.76291,2.167265,1.685952,1.623274,0,3,2


## fiberID >= 641 indicator

In [121]:
# Binary flag: 1 where fiberID is 641 or higher, 0 otherwise (stored as int64).
train['over_641'] = (train['fiberID'] >= 641).astype('int64')

test['over_641'] = (test['fiberID'] >= 641).astype('int64')

In [146]:
# Print (rows, columns) for the train frame, then the test frame.
for frame in (train, test):
    print(frame.shape)

(199991, 58)
(10009, 57)


In [147]:
# Summary statistics (count, mean, std, min, quartiles, max) for the training columns.
train.describe()

Unnamed: 0,fiberID,psfMag_u,psfMag_g,psfMag_r,psfMag_i,psfMag_z,fiberMag_u,fiberMag_g,fiberMag_r,fiberMag_i,...,petroMag_g_r,modelMag_g_r,psfMag_g_i,fiberMag_g_i,petroMag_g_i,modelMag_g_i,psfMag_i_z,fiberMag_i_z,petroMag_i_z,modelMag_i_z
count,199991.0,199991.0,199991.0,199991.0,199991.0,199991.0,199991.0,199991.0,199991.0,199991.0,...,199991.0,199991.0,199991.0,199991.0,199991.0,199991.0,199991.0,199991.0,199991.0,199991.0
mean,360.830152,20.984196,19.860931,19.286934,18.871288,18.612435,21.17129,20.089212,19.504481,19.085871,...,0.593802,0.591417,0.989643,1.003341,0.997511,1.011917,0.258853,0.258324,0.209735,0.24299
std,225.30589,2.077561,1.862845,1.706566,1.622895,1.683887,1.965332,1.843168,1.707691,1.628083,...,0.891487,0.662537,1.198534,1.183197,1.375375,1.203724,0.502551,0.491547,0.908188,0.502974
min,1.0,13.97959,13.005124,12.888487,12.770241,12.260415,14.287634,13.322913,13.25155,13.118824,...,-17.298728,-13.786538,-12.276446,-11.80629,-17.605725,-12.27935,-12.985449,-12.634583,-18.871631,-14.105615
25%,174.0,19.65259,18.70118,18.048572,17.747663,17.425523,19.940403,18.902851,18.259352,17.903615,...,0.126454,0.146714,0.254183,0.252611,0.239192,0.261862,0.012658,0.015527,-0.000725,0.015634
50%,349.0,20.871359,19.904235,19.454492,19.043895,18.611799,21.049096,20.069038,19.631419,19.188763,...,0.379964,0.378308,0.532209,0.539404,0.543932,0.539929,0.181018,0.194561,0.163743,0.171908
75%,526.0,22.160432,21.150297,20.515936,20.073528,19.88376,22.33754,21.38583,20.773911,20.331419,...,0.978696,1.012924,1.435658,1.502386,1.452252,1.480307,0.430213,0.400972,0.433092,0.377001
max,1000.0,29.368159,28.024363,26.737526,25.353447,25.821215,29.197543,28.030256,26.855783,25.206105,...,19.820782,15.927723,15.2378,10.629771,20.16446,16.331576,13.055728,12.493474,18.926626,13.598584


In [149]:
# Write the final train and test frames (index included) to CSV.
for frame, csv_path in (
    (train, '../dataset/train20200225.csv'),
    (test, '../dataset/test20200225.csv'),
):
    frame.to_csv(csv_path)