# Experiment 2



In [1]:
from urllib.request import urlretrieve

urlretrieve('https://drive.google.com/uc?export=download&id=1XLVFI_sK0smRVVuT8XU2s-M3lJT-68sN', './open.zip')

('./open.zip', <http.client.HTTPMessage at 0x7f5a3ca919d0>)

In [2]:
!unzip ./open.zip

Archive:  ./open.zip
   creating: open/
  inflating: open/train.csv          
  inflating: open/sample_submission.csv  
  inflating: open/test.csv           


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import cross_validate
from sklearn.decomposition import PCA

In [4]:
train = pd.read_csv('./open/train.csv')

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          26457 non-null  int64  
 1   gender         26457 non-null  object 
 2   car            26457 non-null  object 
 3   reality        26457 non-null  object 
 4   child_num      26457 non-null  int64  
 5   income_total   26457 non-null  float64
 6   income_type    26457 non-null  object 
 7   edu_type       26457 non-null  object 
 8   family_type    26457 non-null  object 
 9   house_type     26457 non-null  object 
 10  DAYS_BIRTH     26457 non-null  int64  
 11  DAYS_EMPLOYED  26457 non-null  int64  
 12  FLAG_MOBIL     26457 non-null  int64  
 13  work_phone     26457 non-null  int64  
 14  phone          26457 non-null  int64  
 15  email          26457 non-null  int64  
 16  occyp_type     18286 non-null  object 
 17  family_size    26457 non-null  float64
 18  begin_

## Data Preprocessing

In [8]:
def train_preprocessing(train):
  train.fillna('NAN', inplace=True)
  train.drop('index', axis=1, inplace=True)
  train.drop('FLAG_MOBIL', axis=1, inplace=True)

  binary_col = ['gender',
                'car',
                'reality']

  binary_encoder = OrdinalEncoder(categories=[['F', 'M'],
                                              ['N', 'Y'],
                                              ['N', 'Y']],
                                  dtype=np.int8)\
                                .fit(train.loc[:,binary_col])

  train.loc[:,binary_col] = binary_encoder.transform(train.loc[:,binary_col])

  onehot_col = ['income_type',
                'edu_type',
                'family_type',
                'house_type',
                'occyp_type']


  onehot_encoder = OneHotEncoder()
  onehot_encoder.fit(train.loc[:,onehot_col])


  train_onehot_df = pd.DataFrame(onehot_encoder.transform(train.loc[:,onehot_col]).toarray(), 
              columns=onehot_encoder.get_feature_names(onehot_col))
  train.drop(onehot_col, axis=1, inplace=True)
  train = pd.concat([train, train_onehot_df], axis=1)

  train['parent'] = train['family_size'] - train['child_num']

  family_size_cut = 6
  child_num_cut = 4

  train.loc[train['family_size'] > family_size_cut, 'family_size'] = family_size_cut
  train.loc[train['child_num'] > child_num_cut, 'child_num'] = child_num_cut

  pca_col = ['family_size', 'child_num', 'parent']
  pca = PCA(n_components=1).fit(train.loc[:, pca_col])

  train['fcp-pca'] = pca.transform(train.loc[:, pca_col])
  train.drop(pca_col, axis=1, inplace=True)

  train['income_total'] = np.log1p(train['income_total'])

  X_train = train.drop('credit', axis=1)
  y_train = train['credit']

  return X_train, y_train

## 7. `edu_type` 교육 수준별로 라벨링
> bad

In [16]:
train['edu_type'].unique()

array(['Higher education', 'Secondary / secondary special',
       'Incomplete higher', 'Lower secondary', 'Academic degree'],
      dtype=object)

In [17]:
def train_preprocessing1(train):
  train.fillna('NAN', inplace=True)
  train.drop('index', axis=1, inplace=True)
  train.drop('FLAG_MOBIL', axis=1, inplace=True)

  binary_col = ['gender',
                'car',
                'reality',
                'edu_type']

  binary_encoder = OrdinalEncoder(categories=[['F', 'M'],
                                              ['N', 'Y'],
                                              ['N', 'Y'],
                                              ['Academic degree', 'Lower secondary',
                                               'Secondary / secondary special',
                                               'Incomplete higher', 'Higher education']],
                                  dtype=np.int8)\
                                  .fit(train.loc[:,binary_col])

  train.loc[:,binary_col] = binary_encoder.transform(train.loc[:,binary_col])

  onehot_col = ['income_type',
                'family_type',
                'house_type',
                'occyp_type']


  onehot_encoder = OneHotEncoder()
  onehot_encoder.fit(train.loc[:,onehot_col])


  train_onehot_df = pd.DataFrame(onehot_encoder.transform(train.loc[:,onehot_col]).toarray(), 
              columns=onehot_encoder.get_feature_names(onehot_col))
  train.drop(onehot_col, axis=1, inplace=True)
  train = pd.concat([train, train_onehot_df], axis=1)

  train['parent'] = train['family_size'] - train['child_num']

  family_size_cut = 6
  child_num_cut = 4

  train.loc[train['family_size'] > family_size_cut, 'family_size'] = family_size_cut
  train.loc[train['child_num'] > child_num_cut, 'child_num'] = child_num_cut

  pca_col = ['family_size', 'child_num', 'parent']
  pca = PCA(n_components=1).fit(train.loc[:, pca_col])

  train['fcp-pca'] = pca.transform(train.loc[:, pca_col])
  train.drop(pca_col, axis=1, inplace=True)

  train['income_total'] = np.log1p(train['income_total'])

  X_train = train.drop('credit', axis=1)
  y_train = train['credit']

  return X_train, y_train

In [23]:
X_train_exp, y_train = train_preprocessing1(train.copy())

In [24]:
X_train_exp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 47 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   gender                            26457 non-null  int64  
 1   car                               26457 non-null  int64  
 2   reality                           26457 non-null  int64  
 3   income_total                      26457 non-null  float64
 4   edu_type                          26457 non-null  int64  
 5   DAYS_BIRTH                        26457 non-null  int64  
 6   DAYS_EMPLOYED                     26457 non-null  int64  
 7   work_phone                        26457 non-null  int64  
 8   phone                             26457 non-null  int64  
 9   email                             26457 non-null  int64  
 10  begin_month                       26457 non-null  float64
 11  income_type_Commercial associate  26457 non-null  float64
 12  inco

In [25]:
scores = cross_validate(LGBMClassifier(), X_train_exp, y_train,
                        scoring='neg_log_loss', return_train_score = True)
scores

{'fit_time': array([0.95021152, 0.93656373, 0.97658181, 0.93772435, 0.95535517]),
 'score_time': array([0.07377243, 0.07136631, 0.06446457, 0.06120062, 0.06377602]),
 'test_score': array([-0.75873393, -0.76133012, -0.76183258, -0.76075993, -0.7578092 ]),
 'train_score': array([-0.67907687, -0.67773229, -0.67626229, -0.68004292, -0.67688604])}

In [26]:
np.mean(scores['test_score'])

-0.7600931525396223

## 8. `edu_type` credit에 따라 순차적으로 라벨링

In [17]:
train_exp = train.copy()
train_exp.fillna('NAN', inplace=True)

In [27]:
edu_type_weights = {}

for n, edu in enumerate(train_exp['edu_type'].unique()):
    quant = train_exp.loc[train_exp['edu_type']==edu, 'credit'].value_counts().values
    edu_type_weights[edu] = (quant/quant.sum()).round(3)*100

In [28]:
edu_type_weights

{'Academic degree': array([60.9, 30.4,  8.7]),
 'Higher education': array([62.9, 24.4, 12.7]),
 'Incomplete higher': array([64.7, 24.1, 11.2]),
 'Lower secondary': array([66.1, 23. , 10.9]),
 'Secondary / secondary special': array([64.6, 23.4, 12.1])}

In [29]:
for edu_type in edu_type_weights.keys():
  edu_type_weights[edu_type] = \
    edu_type_weights[edu_type][0] * 3 +\
    edu_type_weights[edu_type][1] * 2 +\
    edu_type_weights[edu_type][2] * 1

In [30]:
edu_type_weights

{'Academic degree': 252.2,
 'Higher education': 250.2,
 'Incomplete higher': 253.5,
 'Lower secondary': 255.20000000000002,
 'Secondary / secondary special': 252.70000000000002}

In [31]:
sorted_edu_type_weights_list = sorted(list(edu_type_weights.values()))

In [33]:
for edu_type in edu_type_weights.keys():
  edu_type_weights[edu_type] = \
    sorted_edu_type_weights_list.index(edu_type_weights[edu_type])

In [35]:
sorted(list(edu_type_weights.items()), key=lambda x: x[1])

[('Higher education', 0),
 ('Academic degree', 1),
 ('Secondary / secondary special', 2),
 ('Incomplete higher', 3),
 ('Lower secondary', 4)]

In [36]:
def train_preprocessing2(train):
  train.fillna('NAN', inplace=True)
  train.drop('index', axis=1, inplace=True)
  train.drop('FLAG_MOBIL', axis=1, inplace=True)

  binary_col = ['gender',
                'car',
                'reality']

  binary_encoder = OrdinalEncoder(categories=[['F', 'M'],
                                              ['N', 'Y'],
                                              ['N', 'Y']],
                                  dtype=np.int8)\
                                  .fit(train.loc[:,binary_col])

  train.loc[:,binary_col] = binary_encoder.transform(train.loc[:,binary_col])

  ordinal_col = ['edu_type']

  ordinal_encoder = OrdinalEncoder(categories=[['Higher education', 'Academic degree',
                                                'Secondary / secondary special',
                                                'Incomplete higher', 'Lower secondary']],
                                    dtype=np.int8)\
                                    .fit(train.loc[:,ordinal_col])

  train.loc[:,ordinal_col] = ordinal_encoder.transform(train.loc[:,ordinal_col])


  onehot_col = ['income_type',
                'family_type',
                'house_type',
                'occyp_type']


  onehot_encoder = OneHotEncoder()
  onehot_encoder.fit(train.loc[:,onehot_col])


  train_onehot_df = pd.DataFrame(onehot_encoder.transform(train.loc[:,onehot_col]).toarray(), 
              columns=onehot_encoder.get_feature_names(onehot_col))
  train.drop(onehot_col, axis=1, inplace=True)
  train = pd.concat([train, train_onehot_df], axis=1)

  train['parent'] = train['family_size'] - train['child_num']

  family_size_cut = 6
  child_num_cut = 4

  train.loc[train['family_size'] > family_size_cut, 'family_size'] = family_size_cut
  train.loc[train['child_num'] > child_num_cut, 'child_num'] = child_num_cut

  pca_col = ['family_size', 'child_num', 'parent']
  pca = PCA(n_components=1).fit(train.loc[:, pca_col])

  train['fcp-pca'] = pca.transform(train.loc[:, pca_col])
  train.drop(pca_col, axis=1, inplace=True)

  train['income_total'] = np.log1p(train['income_total'])

  X_train = train.drop('credit', axis=1)
  y_train = train['credit']

  return X_train, y_train

In [37]:
X_train_exp, y_train = train_preprocessing2(train.copy())

In [38]:
scores = cross_validate(LGBMClassifier(), X_train_exp, y_train,
                        scoring='neg_log_loss', return_train_score = True)
scores

{'fit_time': array([0.9331162 , 0.88539767, 0.9522965 , 0.88887119, 0.88162065]),
 'score_time': array([0.06028128, 0.05853701, 0.062392  , 0.0615108 , 0.07072783]),
 'test_score': array([-0.75960509, -0.76188516, -0.76233361, -0.76192716, -0.75883264]),
 'train_score': array([-0.67948218, -0.67690544, -0.67759518, -0.68100993, -0.67623578])}

In [39]:
np.mean(scores['test_score'])

-0.7609167321942225

## validation code

In [10]:
scores = cross_validate(LGBMClassifier(), X_train_exp, y_train,
                        scoring='neg_log_loss', return_train_score = True)
scores

{'fit_time': array([0.95071197, 0.91329908, 0.96192431, 0.93059611, 0.92720604]),
 'score_time': array([0.07212067, 0.06145167, 0.06224227, 0.06157851, 0.06124687]),
 'test_score': array([-0.75801923, -0.76170773, -0.76034956, -0.76089536, -0.75931595]),
 'train_score': array([-0.67798479, -0.67827494, -0.67451047, -0.6794452 , -0.6779453 ])}

In [11]:
np.mean(scores['test_score'])

-0.7600575656819766