# Experiment 2-4
> `family_size`, `child_num` 은 거의 유사한 값을 가진다. 위 두 feature를 이용한 실험
- outlier처리
- drop
- 위 둘 중 성능이 좋아진 것이 있다면 거기에 `family_size` - `child_num` feature를 추가해보자
  

In [1]:
from urllib.request import urlretrieve

urlretrieve('https://drive.google.com/uc?export=download&id=1XLVFI_sK0smRVVuT8XU2s-M3lJT-68sN', './open.zip')

('./open.zip', <http.client.HTTPMessage at 0x7f16ecef2c90>)

In [2]:
!unzip ./open.zip

Archive:  ./open.zip
   creating: open/
  inflating: open/train.csv          
  inflating: open/sample_submission.csv  
  inflating: open/test.csv           


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import cross_validate
from sklearn.decomposition import PCA

In [4]:
train = pd.read_csv('./open/train.csv')
test = pd.read_csv('./open/test.csv')

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          26457 non-null  int64  
 1   gender         26457 non-null  object 
 2   car            26457 non-null  object 
 3   reality        26457 non-null  object 
 4   child_num      26457 non-null  int64  
 5   income_total   26457 non-null  float64
 6   income_type    26457 non-null  object 
 7   edu_type       26457 non-null  object 
 8   family_type    26457 non-null  object 
 9   house_type     26457 non-null  object 
 10  DAYS_BIRTH     26457 non-null  int64  
 11  DAYS_EMPLOYED  26457 non-null  int64  
 12  FLAG_MOBIL     26457 non-null  int64  
 13  work_phone     26457 non-null  int64  
 14  phone          26457 non-null  int64  
 15  email          26457 non-null  int64  
 16  occyp_type     18286 non-null  object 
 17  family_size    26457 non-null  float64
 18  begin_

In [6]:
train.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


## Data Preprocessing

In [7]:
train.fillna('NAN', inplace=True)
test.fillna('NAN', inplace=True)

In [8]:
train.drop('index', axis=1, inplace=True)
test.drop('index', axis=1, inplace=True)

In [9]:
train.drop('FLAG_MOBIL', axis=1, inplace=True)
test.drop('FLAG_MOBIL', axis=1, inplace=True)

In [10]:
binary_col = ['gender',
              'car',
              'reality']

binary_encoder = OrdinalEncoder(categories=[['F', 'M'],
                                            ['N', 'Y'],
                                            ['N', 'Y']],
                                dtype=np.int8)\
                              .fit(train.loc[:,binary_col])

train.loc[:,binary_col] = binary_encoder.transform(train.loc[:,binary_col])

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         26457 non-null  int64  
 1   car            26457 non-null  int64  
 2   reality        26457 non-null  int64  
 3   child_num      26457 non-null  int64  
 4   income_total   26457 non-null  float64
 5   income_type    26457 non-null  object 
 6   edu_type       26457 non-null  object 
 7   family_type    26457 non-null  object 
 8   house_type     26457 non-null  object 
 9   DAYS_BIRTH     26457 non-null  int64  
 10  DAYS_EMPLOYED  26457 non-null  int64  
 11  work_phone     26457 non-null  int64  
 12  phone          26457 non-null  int64  
 13  email          26457 non-null  int64  
 14  occyp_type     26457 non-null  object 
 15  family_size    26457 non-null  float64
 16  begin_month    26457 non-null  float64
 17  credit         26457 non-null  float64
dtypes: flo

In [12]:
onehot_col = ['income_type',
              'edu_type',
              'family_type',
              'house_type',
              'occyp_type']


onehot_encoder = OneHotEncoder()
onehot_encoder.fit(train.loc[:,onehot_col])


train_onehot_df = pd.DataFrame(onehot_encoder.transform(train.loc[:,onehot_col]).toarray(), 
             columns=onehot_encoder.get_feature_names(onehot_col))
train.drop(onehot_col, axis=1, inplace=True)
train = pd.concat([train, train_onehot_df], axis=1)

In [13]:
X_train = train.drop('credit', axis=1)
y_train = train['credit']

In [14]:
# 아래 md에 기록되어있는 것은 이 feature 없이 학습한 것
# 아래 코드를 추가해줌으로써 기존 실험에 해당 feature가 추가된 실험을 진행할 수 있었다.
X_train['fc'] = X_train['family_size'] - X_train['child_num']

## outlier 처리

### `child_num`
- 4: 0.7611898663220819
- 3: 0.7606424909637512
- 2: 0.7608976274578698
- 1: 0.7599246928293375

In [None]:
X_train['child_num'].value_counts()

0     18340
1      5386
2      2362
3       306
4        47
5        10
14        3
7         2
19        1
Name: child_num, dtype: int64

In [None]:
for child_num_cut in range(4 + 1):
  X_train1 = X_train.copy()
  X_train1.loc[X_train1['child_num'] > child_num_cut, 'child_num'] = child_num_cut
  scores = cross_validate(LGBMClassifier(), X_train1, y_train,
                        scoring='neg_log_loss', return_train_score = True)
  mean_score = np.mean(scores['test_score'])
  print(f"c: {child_num_cut} >>> {mean_score}")

c: 0 >>> -0.7606751978574816
c: 1 >>> -0.7611189418990956
c: 2 >>> -0.7611066439833934
c: 3 >>> -0.7602247936797507
c: 4 >>> -0.7607514496600121


### `family_size`
- 6: 0.7611898663220819
- 5: 0.7608417340664374
- 4: 0.760812025635077
- 3: 0.760449725108483
- 2: 0.7612701386832864



In [None]:
X_train['family_size'].value_counts()

2.0     14106
1.0      5109
3.0      4632
4.0      2260
5.0       291
6.0        44
7.0         9
15.0        3
9.0         2
20.0        1
Name: family_size, dtype: int64

In [None]:
for family_size_cut in range(6 + 1):
  X_train2 = X_train.copy()
  X_train2.loc[X_train2['family_size'] > family_size_cut, 'family_size'] = family_size_cut
  scores = cross_validate(LGBMClassifier(), X_train2, y_train,
                        scoring='neg_log_loss', return_train_score = True)
  mean_score = np.mean(scores['test_score'])
  print(f"f: {family_size_cut} >>> {mean_score}")

f: 0 >>> -0.761219668371244
f: 1 >>> -0.761219668371244
f: 2 >>> -0.7613133737958269
f: 3 >>> -0.7604942631590917
f: 4 >>> -0.7604917575600143
f: 5 >>> -0.7611038966473103
f: 6 >>> -0.7607514496600121


### `family_size`, `child_num`



In [None]:
f_c_scores = []
verbose = True
for family_size_cut in range(6 + 1):
  for child_num_cut in range(4 + 1):
    X_train3 = X_train.copy()
    X_train3.loc[X_train3['family_size'] > family_size_cut, 'family_size'] = family_size_cut
    X_train3.loc[X_train3['child_num'] > child_num_cut, 'child_num'] = child_num_cut
    scores = cross_validate(LGBMClassifier(), X_train3, y_train,
                        scoring='neg_log_loss', return_train_score = True)
    mean_score = np.mean(scores['test_score'])
    if verbose:
      print(f"f: {family_size_cut}, c: {child_num_cut} >>> {mean_score}")
    f_c_scores.append([family_size_cut, child_num_cut, mean_score])

f: 0, c: 0 >>> -0.761827307135404
f: 0, c: 1 >>> -0.7617770973386897
f: 0, c: 2 >>> -0.7605980175723209
f: 0, c: 3 >>> -0.7610081945881834
f: 0, c: 4 >>> -0.761219668371244
f: 1, c: 0 >>> -0.761827307135404
f: 1, c: 1 >>> -0.7617770973386897
f: 1, c: 2 >>> -0.7605980175723209
f: 1, c: 3 >>> -0.7610081945881834
f: 1, c: 4 >>> -0.761219668371244
f: 2, c: 0 >>> -0.7625729910361008
f: 2, c: 1 >>> -0.7620379938857402
f: 2, c: 2 >>> -0.7622515488414463
f: 2, c: 3 >>> -0.7614174909632045
f: 2, c: 4 >>> -0.7613133737958269
f: 3, c: 0 >>> -0.7625924027942863
f: 3, c: 1 >>> -0.7616794294453341
f: 3, c: 2 >>> -0.7606290313274234
f: 3, c: 3 >>> -0.760641528149405
f: 3, c: 4 >>> -0.7604942631590917
f: 4, c: 0 >>> -0.7609506515754874
f: 4, c: 1 >>> -0.7604928553623018
f: 4, c: 2 >>> -0.760698439000106
f: 4, c: 3 >>> -0.7608835726669898
f: 4, c: 4 >>> -0.7604917575600143
f: 5, c: 0 >>> -0.7607280982570369
f: 5, c: 1 >>> -0.761007014948043
f: 5, c: 2 >>> -0.761151969406379
f: 5, c: 3 >>> -0.7608557064

In [None]:
for f_c_score in sorted(f_c_scores,
                        key=lambda x: x[2],
                        reverse=True):
  print(f_c_score)

[6, 3, -0.7602247936797507]
[4, 4, -0.7604917575600143]
[4, 1, -0.7604928553623018]
[3, 4, -0.7604942631590917]
[0, 2, -0.7605980175723209]
[1, 2, -0.7605980175723209]
[3, 2, -0.7606290313274234]
[3, 3, -0.760641528149405]
[6, 0, -0.7606751978574816]
[4, 2, -0.760698439000106]
[5, 0, -0.7607280982570369]
[6, 4, -0.7607514496600121]
[5, 3, -0.7608557064461273]
[4, 3, -0.7608835726669898]
[4, 0, -0.7609506515754874]
[5, 1, -0.761007014948043]
[0, 3, -0.7610081945881834]
[1, 3, -0.7610081945881834]
[5, 4, -0.7611038966473103]
[6, 2, -0.7611066439833934]
[6, 1, -0.7611189418990956]
[5, 2, -0.761151969406379]
[0, 4, -0.761219668371244]
[1, 4, -0.761219668371244]
[2, 4, -0.7613133737958269]
[2, 3, -0.7614174909632045]
[3, 1, -0.7616794294453341]
[0, 1, -0.7617770973386897]
[1, 1, -0.7617770973386897]
[0, 0, -0.761827307135404]
[1, 0, -0.761827307135404]
[2, 1, -0.7620379938857402]
[2, 2, -0.7622515488414463]
[2, 0, -0.7625729910361008]
[3, 0, -0.7625924027942863]


## drop 
> 위 outlier 제거에서 좋았던 결과이용

### `child_num`

In [None]:
for family_size_cut in range(6 + 1):
  X_train4 = X_train.copy()
  X_train4.drop('child_num', axis=1, inplace=True)
  X_train4.loc[X_train4['family_size'] > family_size_cut, 'family_size'] = family_size_cut
  scores = cross_validate(LGBMClassifier(), X_train4, y_train,
                        scoring='neg_log_loss', return_train_score = True)
  mean_score = np.mean(scores['test_score'])
  print(f"f: {family_size_cut} >>> {mean_score}")

f: 0 >>> -0.761827307135404
f: 1 >>> -0.761827307135404
f: 2 >>> -0.7625729910361008
f: 3 >>> -0.7625924027942863
f: 4 >>> -0.7609506515754874
f: 5 >>> -0.7607280982570369
f: 6 >>> -0.7606751978574816


### `family_size`

In [None]:
for child_num_cut in range(4 + 1):
  X_train5 = X_train.copy()
  X_train5.drop('family_size', axis=1, inplace=True)
  X_train5.loc[X_train5['child_num'] > child_num_cut, 'child_num'] = child_num_cut
  scores = cross_validate(LGBMClassifier(), X_train5, y_train,
                        scoring='neg_log_loss', return_train_score = True)
  mean_score = np.mean(scores['test_score'])
  print(f"c: {child_num_cut} >>> {mean_score}")

c: 0 >>> -0.761827307135404
c: 1 >>> -0.7617770973386897
c: 2 >>> -0.7605980175723209
c: 3 >>> -0.7610081945881834
c: 4 >>> -0.761219668371244


### `family_size`, `child_num`

In [None]:
X_train6 = X_train.copy()

In [None]:
X_train6.drop(['family_size', 'child_num'], axis=1, inplace=True)

In [None]:
scores = cross_validate(LGBMClassifier(), X_train6, y_train,
                        scoring='neg_log_loss', return_train_score = True)
scores

{'fit_time': array([0.9566443 , 0.9418478 , 0.92449498, 0.947258  , 0.93635368]),
 'score_time': array([0.07845855, 0.08253956, 0.09019876, 0.08136678, 0.08195996]),
 'test_score': array([-0.75979331, -0.76272134, -0.76425039, -0.76251465, -0.75985685]),
 'train_score': array([-0.68091979, -0.6801855 , -0.67880311, -0.68128271, -0.67846869])}

In [None]:
np.mean(scores['test_score'])

-0.761827307135404

## `family_size` - `child_num` feature를 추가
> 성능 향상이 있다. 위에 다 이 피처를 추가해서 실험해보자

In [None]:
X_train7 = X_train.copy()

In [None]:
X_train7['fc'] = X_train7['family_size'] - X_train7['child_num']

In [None]:
scores = cross_validate(LGBMClassifier(), X_train7, y_train,
                        scoring='neg_log_loss', return_train_score = True)
scores

{'fit_time': array([1.01455283, 0.96858191, 0.98320007, 0.9779036 , 0.9881382 ]),
 'score_time': array([0.0812192 , 0.08094454, 0.08763242, 0.08286309, 0.08171439]),
 'test_score': array([-0.7596808 , -0.7621518 , -0.76162517, -0.7616678 , -0.75863166]),
 'train_score': array([-0.67950466, -0.6781597 , -0.6762141 , -0.679316  , -0.67592511])}

In [None]:
np.mean(scores['test_score'])

-0.7607514496600121

## `family_size`, `child_num` 를 pca로 1차원으로 축소
> 두 feature를 pca로 1차원으로 축소 후 두 feature는 삭제
- `fc` feature 추가 X : `0.760850247070009`
- `fc` feature 추가 O : `0.760534246196231`

In [None]:
from sklearn.decomposition import PCA

In [None]:
X_train8 = X_train.copy()

In [None]:
pca = PCA(n_components=1).fit(X_train8.loc[:, ['family_size', 'child_num']])

X_train8['fc-pca'] = pca.transform(X_train8.loc[:, ['family_size', 'child_num']])
X_train8.drop(['family_size', 'child_num'], axis=1, inplace=True)

In [None]:
scores = cross_validate(LGBMClassifier(), X_train8, y_train,
                        scoring='neg_log_loss', return_train_score = True)
scores

{'fit_time': array([1.01806879, 0.95372629, 1.01737976, 0.98158693, 0.96076345]),
 'score_time': array([0.07166815, 0.07103014, 0.06161547, 0.06137872, 0.05985355]),
 'test_score': array([-0.75872168, -0.76171117, -0.7621631 , -0.76089365, -0.75918163]),
 'train_score': array([-0.67928098, -0.67754291, -0.67633875, -0.67975345, -0.67712197])}

In [None]:
np.mean(scores['test_score'])

-0.760534246196231

### outlier 처리 후 차원 축소
> 위 결과가 좋아서 추가적인 실험을 해본다. `fc` feature가 있는 상태에서 더 좋은 결과가 있었으므로 해당 feature 추가된 상태에서 실험 진행


In [None]:
f_c_scores = []
verbose = True
for family_size_cut in range(6 + 1):
  for child_num_cut in range(4 + 1):
    X_train9 = X_train.copy()
    X_train9.loc[X_train9['family_size'] > family_size_cut, 'family_size'] = family_size_cut
    X_train9.loc[X_train9['child_num'] > child_num_cut, 'child_num'] = child_num_cut
    
    pca = PCA(n_components=1).fit(X_train9.loc[:, ['family_size', 'child_num']])
    X_train9['fc-pca'] = pca.transform(X_train9.loc[:, ['family_size', 'child_num']])
    X_train9.drop(['family_size', 'child_num'], axis=1, inplace=True)

    scores = cross_validate(LGBMClassifier(), X_train9, y_train,
                        scoring='neg_log_loss', return_train_score = True)
    mean_score = np.mean(scores['test_score'])
    if verbose:
      print(f"f: {family_size_cut}, c: {child_num_cut} >>> {mean_score}")
    f_c_scores.append([family_size_cut, child_num_cut, mean_score])

  self.explained_variance_ / total_var.sum()


f: 0, c: 0 >>> -0.761827307135404
f: 0, c: 1 >>> -0.7617770973386897
f: 0, c: 2 >>> -0.7605980175723209
f: 0, c: 3 >>> -0.7610081945881834
f: 0, c: 4 >>> -0.761219668371244


  self.explained_variance_ / total_var.sum()


f: 1, c: 0 >>> -0.761827307135404
f: 1, c: 1 >>> -0.7617770973386897
f: 1, c: 2 >>> -0.7605980175723209
f: 1, c: 3 >>> -0.7610081945881834
f: 1, c: 4 >>> -0.761219668371244
f: 2, c: 0 >>> -0.7625684301044725
f: 2, c: 1 >>> -0.7615971940131588
f: 2, c: 2 >>> -0.7610315749213907
f: 2, c: 3 >>> -0.760468695243959
f: 2, c: 4 >>> -0.7610946793121112
f: 3, c: 0 >>> -0.7625931219868649
f: 3, c: 1 >>> -0.7621868393178346
f: 3, c: 2 >>> -0.761018927090302
f: 3, c: 3 >>> -0.7605179763336499
f: 3, c: 4 >>> -0.7602061397247105
f: 4, c: 0 >>> -0.7609521182575179
f: 4, c: 1 >>> -0.7607092382634991
f: 4, c: 2 >>> -0.7605597999540878
f: 4, c: 3 >>> -0.7609823841248061
f: 4, c: 4 >>> -0.7601912868737516
f: 5, c: 0 >>> -0.7607250255613991
f: 5, c: 1 >>> -0.7616307497815447
f: 5, c: 2 >>> -0.7606371450658229
f: 5, c: 3 >>> -0.7604743669620061
f: 5, c: 4 >>> -0.7604454825350792
f: 6, c: 0 >>> -0.7606733820640248
f: 6, c: 1 >>> -0.760487756542561
f: 6, c: 2 >>> -0.7611564828661839
f: 6, c: 3 >>> -0.7609031

In [None]:
for f_c_score in sorted(f_c_scores,
                        key=lambda x: x[2],
                        reverse=True):
  print(f_c_score)

[4, 4, -0.7601912868737516]
[3, 4, -0.7602061397247105]
[5, 4, -0.7604454825350792]
[2, 3, -0.760468695243959]
[5, 3, -0.7604743669620061]
[6, 1, -0.760487756542561]
[3, 3, -0.7605179763336499]
[6, 4, -0.760534246196231]
[4, 2, -0.7605597999540878]
[0, 2, -0.7605980175723209]
[1, 2, -0.7605980175723209]
[5, 2, -0.7606371450658229]
[6, 0, -0.7606733820640248]
[4, 1, -0.7607092382634991]
[5, 0, -0.7607250255613991]
[6, 3, -0.7609031884937945]
[4, 0, -0.7609521182575179]
[4, 3, -0.7609823841248061]
[0, 3, -0.7610081945881834]
[1, 3, -0.7610081945881834]
[3, 2, -0.761018927090302]
[2, 2, -0.7610315749213907]
[2, 4, -0.7610946793121112]
[6, 2, -0.7611564828661839]
[0, 4, -0.761219668371244]
[1, 4, -0.761219668371244]
[2, 1, -0.7615971940131588]
[5, 1, -0.7616307497815447]
[0, 1, -0.7617770973386897]
[1, 1, -0.7617770973386897]
[0, 0, -0.761827307135404]
[1, 0, -0.761827307135404]
[3, 1, -0.7621868393178346]
[2, 0, -0.7625684301044725]
[3, 0, -0.7625931219868649]


### `family_size`, `child_num`, `fc` 모두 합쳐 pca
> 위 data preprocessing 의 `fc` feature를 추가해준 후 실행해야한다.

In [None]:
X_train10 = X_train.copy()

In [None]:
pca = PCA(n_components=1).fit(X_train10.loc[:, ['family_size', 'child_num', 'fc']])

X_train10['fc-pca'] = pca.transform(X_train10.loc[:, ['family_size', 'child_num', 'fc']])
X_train10.drop(['family_size', 'child_num', 'fc'], axis=1, inplace=True)

In [None]:
scores = cross_validate(LGBMClassifier(), X_train10, y_train,
                        scoring='neg_log_loss', return_train_score = True)
scores

{'fit_time': array([0.9714396 , 0.98427534, 0.98744297, 1.00542951, 0.95881343]),
 'score_time': array([0.07624555, 0.06928778, 0.07170916, 0.07034445, 0.06896138]),
 'test_score': array([-0.7589056 , -0.76187609, -0.76304733, -0.76328032, -0.75832377]),
 'train_score': array([-0.67818817, -0.67733292, -0.67682773, -0.68028636, -0.67643711])}

In [None]:
np.mean(scores['test_score'])

-0.7610866224954469

#### `family_size`, `child_num` outlier 제거 후 `fc` 모두 합쳐 pca

In [None]:
f_c_scores = []
verbose = True
for family_size_cut in range(6 + 1):
  for child_num_cut in range(4 + 1):
    X_train11 = X_train.copy()
    X_train11.loc[X_train11['family_size'] > family_size_cut, 'family_size'] = family_size_cut
    X_train11.loc[X_train11['child_num'] > child_num_cut, 'child_num'] = child_num_cut
    
    pca = PCA(n_components=1).fit(X_train11.loc[:, ['family_size', 'child_num', 'fc']])
    X_train11['fc-pca'] = pca.transform(X_train11.loc[:, ['family_size', 'child_num', 'fc']])
    X_train11.drop(['family_size', 'child_num', 'fc'], axis=1, inplace=True)

    scores = cross_validate(LGBMClassifier(), X_train11, y_train,
                        scoring='neg_log_loss', return_train_score = True)
    mean_score = np.mean(scores['test_score'])
    if verbose:
      print(f"f: {family_size_cut}, c: {child_num_cut} >>> {mean_score}")
    f_c_scores.append([family_size_cut, child_num_cut, mean_score])

f: 0, c: 0 >>> -0.7618298668020479
f: 0, c: 1 >>> -0.7614232348113211
f: 0, c: 2 >>> -0.7605454500078801
f: 0, c: 3 >>> -0.7607741768584266
f: 0, c: 4 >>> -0.7604711560721625
f: 1, c: 0 >>> -0.7618298668020479
f: 1, c: 1 >>> -0.7614232348113211
f: 1, c: 2 >>> -0.7605454500078801
f: 1, c: 3 >>> -0.7607741768584266
f: 1, c: 4 >>> -0.7604711560721625
f: 2, c: 0 >>> -0.7625624014218928
f: 2, c: 1 >>> -0.7625620170251575
f: 2, c: 2 >>> -0.7610771356149122
f: 2, c: 3 >>> -0.7606100476266411
f: 2, c: 4 >>> -0.7610827325272145
f: 3, c: 0 >>> -0.7623271676752111
f: 3, c: 1 >>> -0.7612095793453415
f: 3, c: 2 >>> -0.7612702314760134
f: 3, c: 3 >>> -0.7606316884399701
f: 3, c: 4 >>> -0.7600580763343784
f: 4, c: 0 >>> -0.761378730216034
f: 4, c: 1 >>> -0.760361712706606
f: 4, c: 2 >>> -0.7604649642691491
f: 4, c: 3 >>> -0.7606316884399701
f: 4, c: 4 >>> -0.7600580763343784
f: 5, c: 0 >>> -0.7612052812590567
f: 5, c: 1 >>> -0.7611781161247394
f: 5, c: 2 >>> -0.7613626118477255
f: 5, c: 3 >>> -0.7605

In [None]:
for f_c_score in sorted(f_c_scores,
                        key=lambda x: x[2],
                        reverse=True):
  print(f_c_score)

[3, 4, -0.7600580763343784]
[4, 4, -0.7600580763343784]
[5, 4, -0.7600580763343784]
[6, 4, -0.7600580763343784]
[6, 2, -0.7601985900920513]
[4, 1, -0.760361712706606]
[4, 2, -0.7604649642691491]
[0, 4, -0.7604711560721625]
[1, 4, -0.7604711560721625]
[5, 3, -0.7605177956886361]
[0, 2, -0.7605454500078801]
[1, 2, -0.7605454500078801]
[2, 3, -0.7606100476266411]
[3, 3, -0.7606316884399701]
[4, 3, -0.7606316884399701]
[6, 3, -0.7606918294157344]
[0, 3, -0.7607741768584266]
[1, 3, -0.7607741768584266]
[6, 0, -0.7609073473004726]
[2, 2, -0.7610771356149122]
[2, 4, -0.7610827325272145]
[5, 1, -0.7611781161247394]
[5, 0, -0.7612052812590567]
[3, 1, -0.7612095793453415]
[3, 2, -0.7612702314760134]
[5, 2, -0.7613626118477255]
[4, 0, -0.761378730216034]
[0, 1, -0.7614232348113211]
[1, 1, -0.7614232348113211]
[6, 1, -0.7615265868382421]
[0, 0, -0.7618298668020479]
[1, 0, -0.7618298668020479]
[3, 0, -0.7623271676752111]
[2, 1, -0.7625620170251575]
[2, 0, -0.7625624014218928]


## `fc` outlier
> 0, -1 값을 없앤다.
- 0, -1 값을 없애본다
  - 성능에는 영향이 없었으나 너무 적으므로 없애는 것이 좋을 것 같다.
- `family_size`, `child_num` outlier 제거한 feature로 `fc` 생성해본다 - 성능 저하


In [None]:
X_train['fc'].value_counts()

 2.0    20331
 1.0     6120
 0.0        5
-1.0        1
Name: fc, dtype: int64

In [17]:
fc_scores = []
verbose = True
family_size_cut = 6
child_num_cut = 4

for fc_cut in range(1 + 1):
  X_train12 = X_train.copy()
  X_train12.loc[X_train12['fc'] < fc_cut, 'fc'] = fc_cut
  X_train12.loc[X_train12['family_size'] > family_size_cut, 'family_size'] = family_size_cut
  X_train12.loc[X_train12['child_num'] > child_num_cut, 'child_num'] = child_num_cut

  
  pca = PCA(n_components=1).fit(X_train12.loc[:, ['family_size', 'child_num', 'fc']])
  X_train12['fc-pca'] = pca.transform(X_train12.loc[:, ['family_size', 'child_num', 'fc']])
  X_train12.drop(['family_size', 'child_num', 'fc'], axis=1, inplace=True)

  scores = cross_validate(LGBMClassifier(), X_train12, y_train,
                      scoring='neg_log_loss', return_train_score = True)
  mean_score = np.mean(scores['test_score'])
  if verbose:
    print(f"f_cut: {family_size_cut}, c_cut: {child_num_cut}, fc_cut: {fc_cut} >>> {mean_score}")
  fc_scores.append([fc_cut, mean_score])

f_cut: 6, c_cut: 4, fc_cut: 0 >>> -0.7602282929809074
f_cut: 6, c_cut: 4, fc_cut: 1 >>> -0.7602282929809074


In [None]:
fc_scores = []
verbose = True
family_size_cut = 6
child_num_cut = 4

X_train12 = X_train.copy()
X_train12.loc[X_train12['family_size'] > family_size_cut, 'family_size'] = family_size_cut
X_train12.loc[X_train12['child_num'] > child_num_cut, 'child_num'] = child_num_cut
X_train12['fc'] = X_train12['family_size'] - X_train12['child_num']


pca = PCA(n_components=1).fit(X_train12.loc[:, ['family_size', 'child_num', 'fc']])
X_train12['fc-pca'] = pca.transform(X_train12.loc[:, ['family_size', 'child_num', 'fc']])
X_train12.drop(['family_size', 'child_num', 'fc'], axis=1, inplace=True)

scores = cross_validate(LGBMClassifier(), X_train12, y_train,
                    scoring='neg_log_loss', return_train_score = True)
scores

{'fit_time': array([0.95069242, 0.87520242, 0.88980675, 0.85541773, 0.85921788]),
 'score_time': array([0.05568337, 0.06675601, 0.05874467, 0.05781913, 0.06091022]),
 'test_score': array([-0.7589056 , -0.76187609, -0.76304733, -0.76328032, -0.75832377]),
 'train_score': array([-0.67818817, -0.67733292, -0.67682773, -0.68028636, -0.67643711])}

In [None]:
np.mean(scores['test_score'])

-0.7610866224954469

In [19]:
fc_scores = []
verbose = True
family_size_cut = 6
child_num_cut = 4
fc_cut = 1

X_train12 = X_train.copy()
X_train12.loc[X_train12['fc'] < fc_cut, 'fc'] = fc_cut
X_train12.loc[X_train12['family_size'] > family_size_cut, 'family_size'] = family_size_cut
X_train12.loc[X_train12['child_num'] > child_num_cut, 'child_num'] = child_num_cut
X_train12['fc'] -= 1

pca = PCA(n_components=1).fit(X_train12.loc[:, ['family_size', 'child_num', 'fc']])
X_train12['fc-pca'] = pca.transform(X_train12.loc[:, ['family_size', 'child_num', 'fc']])
X_train12.drop(['family_size', 'child_num', 'fc'], axis=1, inplace=True)

scores = cross_validate(LGBMClassifier(), X_train12, y_train,
                    scoring='neg_log_loss', return_train_score = True)
scores

{'fit_time': array([0.91905665, 0.88347316, 0.89303112, 0.86074018, 0.8688159 ]),
 'score_time': array([0.06033993, 0.06833315, 0.06067729, 0.0599575 , 0.05946851]),
 'test_score': array([-0.75840581, -0.76172716, -0.76034956, -0.76181151, -0.75884743]),
 'train_score': array([-0.67829986, -0.67840645, -0.67451047, -0.68036836, -0.67653073])}

In [20]:
np.mean(scores['test_score'])

-0.7602282929809074