In [1]:
from urllib.request import urlretrieve

# Fetch the competition archive into the working directory; the cell displays
# the (local filename, response headers) tuple returned by urlretrieve.
urlretrieve(
    url='https://drive.google.com/uc?export=download&id=1XLVFI_sK0smRVVuT8XU2s-M3lJT-68sN',
    filename='./open.zip',
)

('./open.zip', <http.client.HTTPMessage at 0x7f19307ada10>)

In [2]:
!unzip ./open.zip

Archive:  ./open.zip
   creating: open/
  inflating: open/train.csv          
  inflating: open/sample_submission.csv  
  inflating: open/test.csv           


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import cross_validate
from sklearn.decomposition import PCA

In [20]:
def _clean_frame(frame):
  """Return a copy with missing values set to the 'NAN' sentinel and unused columns dropped."""
  return frame.fillna('NAN').drop(['index', 'FLAG_MOBIL'], axis=1)


def _encode_categoricals(frame, binary_col, binary_encoder, onehot_col, onehot_encoder):
  """Return a copy with binary columns ordinal-encoded and multi-class columns one-hot expanded."""
  frame = frame.copy()

  # Plain column assignment replaces the columns, so they take the encoder's
  # int8 dtype. `.loc[:, cols] = ...` writes into the existing object-dtype
  # columns on recent pandas and would leave them as object.
  frame[binary_col] = binary_encoder.transform(frame[binary_col])

  # `get_feature_names` was removed from recent scikit-learn releases; fall
  # back to it only when the newer accessor is unavailable.
  if hasattr(onehot_encoder, 'get_feature_names_out'):
    onehot_names = onehot_encoder.get_feature_names_out(onehot_col)
  else:
    onehot_names = onehot_encoder.get_feature_names(onehot_col)

  # Reuse the frame's index: concat(axis=1) aligns on index labels, so a fresh
  # RangeIndex would misalign rows whenever the input index is not 0..n-1.
  onehot_df = pd.DataFrame(onehot_encoder.transform(frame[onehot_col]).toarray(),
                           columns=onehot_names,
                           index=frame.index)

  return pd.concat([frame.drop(onehot_col, axis=1), onehot_df], axis=1)


def _add_family_features(frame, family_size_cut, child_num_cut):
  """Return a copy with a 'parent' column added and family_size / child_num capped."""
  frame = frame.copy()
  # 'parent' is derived from the uncapped values, before the caps are applied.
  frame['parent'] = frame['family_size'] - frame['child_num']
  frame['family_size'] = frame['family_size'].clip(upper=family_size_cut)
  frame['child_num'] = frame['child_num'].clip(upper=child_num_cut)
  return frame


def _finalize_numeric(frame, pca_col, pca):
  """Replace `pca_col` with their single PCA component and log1p-transform income_total."""
  component = pca.transform(frame[pca_col]).ravel()
  frame = frame.drop(pca_col, axis=1)
  frame['fcp-pca'] = component
  frame['income_total'] = np.log1p(frame['income_total'])
  return frame


def preproces(train, test=None):
  """Build model-ready features from the raw train (and optionally test) frame.

  Steps, all fitted on `train` only and then re-applied unchanged to `test`:
    1. fill missing values with the string 'NAN'; drop 'index' and 'FLAG_MOBIL';
    2. ordinal-encode 'gender', 'car', 'reality' (F/M and N/Y -> 0/1, int8);
    3. one-hot encode 'income_type', 'edu_type', 'family_type', 'house_type',
       'occyp_type' (categories unseen during fit become all-zero rows);
    4. add parent = family_size - child_num, then cap family_size at 6 and
       child_num at 4;
    5. replace family_size / child_num / parent with one PCA component
       ('fcp-pca');
    6. apply log1p to 'income_total'.

  Neither input frame is modified.

  Parameters
  ----------
  train : pandas.DataFrame
    Raw training data; must contain the columns named above plus 'credit'.
  test : pandas.DataFrame, optional
    Raw test data with the same feature columns (no 'credit' column needed).

  Returns
  -------
  (X_train, y_train) when `test` is None, otherwise (X_train, y_train, X_test).
  `y_train` is the 'credit' column of `train`.
  """
  binary_col = ['gender',
                'car',
                'reality']

  onehot_col = ['income_type',
                'edu_type',
                'family_type',
                'house_type',
                'occyp_type']

  pca_col = ['family_size', 'child_num', 'parent']

  family_size_cut = 6
  child_num_cut = 4

  train = _clean_frame(train)

  binary_encoder = OrdinalEncoder(categories=[['F', 'M'],
                                              ['N', 'Y'],
                                              ['N', 'Y']],
                                  dtype=np.int8)\
                                  .fit(train[binary_col])

  # handle_unknown='ignore': a category present only in `test` is encoded as an
  # all-zero row instead of raising at transform time.
  onehot_encoder = OneHotEncoder(handle_unknown='ignore')
  onehot_encoder.fit(train[onehot_col])

  train = _encode_categoricals(train, binary_col, binary_encoder, onehot_col, onehot_encoder)
  train = _add_family_features(train, family_size_cut, child_num_cut)

  # The PCA is fitted on the capped training values only.
  pca = PCA(n_components=1).fit(train[pca_col])
  train = _finalize_numeric(train, pca_col, pca)

  X_train = train.drop('credit', axis=1)
  y_train = train['credit']

  if test is None:
    return X_train, y_train

  test = _clean_frame(test)
  test = _encode_categoricals(test, binary_col, binary_encoder, onehot_col, onehot_encoder)
  test = _add_family_features(test, family_size_cut, child_num_cut)
  test = _finalize_numeric(test, pca_col, pca)

  return X_train, y_train, test

In [22]:
# Read from the directory the archive was extracted into (relative to the
# working directory, like the download and unzip cells) rather than a
# machine-specific absolute path.
train = pd.read_csv('./open/train.csv')
test = pd.read_csv('./open/test.csv')

In [24]:
# Call the preprocessing function under the name it is defined with in this
# notebook (`preproces`); `data_preprocessing` does not exist on a fresh kernel.
X_train, y_train, X_test = preproces(train, test)

In [18]:
# Cross-validate an untuned LightGBM classifier on the preprocessed training
# set, scoring by negative log loss and keeping the train-fold scores too.
scores = cross_validate(
    estimator=LGBMClassifier(),
    X=X_train,
    y=y_train,
    scoring='neg_log_loss',
    return_train_score=True,
)
scores

{'fit_time': array([0.95561504, 0.91039133, 0.90002084, 0.87949419, 0.86666751]),
 'score_time': array([0.05862617, 0.05799842, 0.0608809 , 0.06087852, 0.06041002]),
 'test_score': array([-0.75801923, -0.76170773, -0.76034956, -0.76089536, -0.75931595]),
 'train_score': array([-0.67798479, -0.67827494, -0.67451047, -0.6794452 , -0.6779453 ])}

In [19]:
# Average held-out score across the folds (negative log loss, so closer to 0 is better).
scores['test_score'].mean()

-0.7600575656819766