# Library

In [1]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import mutual_info_regression
import random
import platform
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from matplotlib import font_manager , rc
plt.rcParams['axes.unicode_minus'] = False

if platform.system() == 'Windows' :
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname = path).get_name()
    rc('font' , family = font_name)

# Data Load & Preprocessing
- 훈련에 필요없는 index 컬럼 삭제.
- missing value를 모두 NAN 문자열로 대체
- dtype object 인 컬럼들을 onehot encoding

In [2]:
train = pd.read_csv('dataset/train.csv')
train = train.drop(['index'], axis=1)
train.fillna('NAN', inplace=True) 

X = train.copy()
"""credit = X.pop("credit")
discrete_features = X.dtypes == float"""
test = pd.read_csv('dataset/test.csv')
test = test.drop(['index'], axis=1)
test.fillna('NAN', inplace=True)

submit = pd.read_csv('dataset/sample_submission.csv')
def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores 
Y = test.copy()

In [3]:
train.dtypes

gender            object
car               object
reality           object
child_num          int64
income_total     float64
income_type       object
edu_type          object
family_type       object
house_type        object
DAYS_BIRTH         int64
DAYS_EMPLOYED      int64
FLAG_MOBIL         int64
work_phone         int64
phone              int64
email              int64
occyp_type        object
family_size      float64
begin_month      float64
credit           float64
dtype: object

In [4]:
from sklearn import preprocessing
label_encoder=preprocessing.LabelEncoder()
X['income_type']=label_encoder.fit_transform(X['income_type'])
X['edu_type']=label_encoder.fit_transform(X['edu_type'])
X['family_type']=label_encoder.fit_transform(X['family_type'])
X['house_type']=label_encoder.fit_transform(X['house_type'])
X['income_total']=label_encoder.fit_transform(X['income_total'])
#X['occyp_type']=label_encoder.fit_transform(X['occyp_type'])
X = X.loc[:,["income_type" ,"edu_type" , "family_type" , "house_type", "income_total"   ]]
Y['income_type']=label_encoder.fit_transform(Y['income_type'])
Y['edu_type']=label_encoder.fit_transform(Y['edu_type'])
Y['family_type']=label_encoder.fit_transform(Y['family_type'])
Y['house_type']=label_encoder.fit_transform(Y['house_type'])
Y['income_total']=label_encoder.fit_transform(Y['income_total'])
#Y['occyp_type']=label_encoder.fit_transform(Y['occyp_type'])
Y = Y.loc[:,["income_type" ,"edu_type" , "family_type" , "house_type", "income_total"   ]]
X.head()

Unnamed: 0,income_type,edu_type,family_type,house_type,income_total
0,0,1,1,2,145
1,0,4,0,1,165
2,4,1,1,1,214
3,0,4,1,1,145
4,2,1,1,1,111


In [5]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 3)
X["Cluster"] = kmeans.fit_predict(X)
Y["Cluster"] = kmeans.fit_predict(Y)
X.head()

Unnamed: 0,income_type,edu_type,family_type,house_type,income_total,Cluster
0,0,1,1,2,145,1
1,0,4,0,1,165,1
2,4,1,1,1,214,1
3,0,4,1,1,145,1
4,2,1,1,1,111,2


In [6]:
object_col = []
for col in train.columns:
    if train[col].dtype == 'object':
        object_col.append(col)

In [7]:
enc = OneHotEncoder()
enc.fit(train.loc[:,object_col])


train_onehot_df = pd.DataFrame(enc.transform(train.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
train.drop(object_col, axis=1, inplace=True)
train = pd.concat([train, train_onehot_df], axis=1)

In [8]:
test_onehot_df = pd.DataFrame(enc.transform(test.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
test.drop(object_col, axis=1, inplace=True)
test = pd.concat([test, test_onehot_df], axis=1)

In [9]:
train['Cluster'] = X['Cluster']
test['Cluster'] = Y['Cluster']

In [10]:
train.dtypes

child_num                                   int64
income_total                              float64
DAYS_BIRTH                                  int64
DAYS_EMPLOYED                               int64
FLAG_MOBIL                                  int64
work_phone                                  int64
phone                                       int64
email                                       int64
family_size                               float64
begin_month                               float64
credit                                    float64
gender_F                                  float64
gender_M                                  float64
car_N                                     float64
car_Y                                     float64
reality_N                                 float64
reality_Y                                 float64
income_type_Commercial associate          float64
income_type_Pensioner                     float64
income_type_State servant                 float64


In [11]:
from sklearn.datasets import load_iris
#train['credit']
iris = load_iris()
clf = DecisionTreeClassifier(random_state = 0)
#iris.data
#iris.target
#cross_val_score(clf, iris.data,iris.target, cv = 10)
#train.drop(['credit'],axis=1)
#cross_val_score(clf,train.drop(['credit'],axis=1),train['credit'] , cv =15)

clf.fit(train.drop(['credit'],axis=1),train['credit'])
clf.feature_importances_
train['decision_tree'] = clf.predict(train.drop(['credit'],axis=1))
test['decision_tree'] = clf.predict(test)
clf = MLPClassifier(random_state = 23 , max_iter = 1000).fit(train.drop(['credit'],axis=1) , train['credit'])
train['MLP'] = clf.predict(train.drop(['credit'],axis=1))
test['MLP'] = clf.predict(test)

In [12]:
#train.drop('child_num', axis=1, inplace=True)
#test.drop('child_num', axis=1, inplace=True)
#train.drop('DAYS_BIRTH' , axis = 1 ,inplace = True)
#test.drop('DAYS_BIRTH' , axis = 1 ,inplace = True)
#train.drop('FLAG_MOBIL' , axis = 1 ,inplace = True)
#test.drop('FLAG_MOBIL' , axis = 1 ,inplace = True)
#train.drop('work_phone' , axis = 1 ,inplace = True)
#test.drop('work_phone' , axis = 1 ,inplace = True)
#train.drop('begin_month' , axis = 1 ,inplace = True)
#test.drop('begin_month' , axis = 1 ,inplace = True)

In [13]:
train.head()

Unnamed: 0,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,...,occyp_type_NAN,occyp_type_Private service staff,occyp_type_Realty agents,occyp_type_Sales staff,occyp_type_Secretaries,occyp_type_Security staff,occyp_type_Waiters/barmen staff,Cluster,decision_tree,MLP
0,0,202500.0,-13899,-4709,1,0,0,0,2.0,-6.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,0.0
1,1,247500.0,-11380,-1540,1,0,0,1,3.0,-5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0
2,0,450000.0,-19087,-4434,1,0,1,0,2.0,-22.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,0.0
3,0,202500.0,-15088,-2092,1,0,1,0,2.0,-37.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0.0,0.0
4,0,157500.0,-15037,-2105,1,0,0,0,2.0,-26.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,2.0,0.0


In [14]:
#from sklearn.feature_selection import GenericUnivariateSelect ,  chi2
#train.shape
#transformer = GenericUnivariateSelect(chi2, mode='fdr', param=20)
#train_new = transformer.fit_transform(train.drop(['credit' , 'DAYS_BIRTH' , 'DAYS_EMPLOYED' , 'begin_month'],axis=1),train['credit'])
#train_new.shape

In [15]:
#train_new
#test.shape

# Training
- 데이터 분리는 StratifiedKFold 를 사용하여 y값 분포를 비슷하게 분리시킴. -> 5-fold
- lightgbm의 default parameter로 훈련.
- 30번 이상 개선 없을 경우 중단.
- 각 5개의 fold를 훈련하여 저장

In [16]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds=[]
for train_idx, valid_idx in skf.split(train, train['credit']):
    folds.append((train_idx, valid_idx))

In [17]:
random.seed(42)
lgb_models={}
for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values 
    lgb = LGBMClassifier(n_estimators=1000)
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], 
            early_stopping_rounds=30,
           verbose=100)
    lgb_models[fold]=lgb
    print(f'================================================================================\n\n')


Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[38]	training's multi_logloss: 0.0858827	valid_1's multi_logloss: 0.122129


Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[40]	training's multi_logloss: 0.0877383	valid_1's multi_logloss: 0.108568


Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[39]	training's multi_logloss: 0.0874838	valid_1's multi_logloss: 0.113001


Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[38]	training's multi_logloss: 0.0858416	valid_1's multi_logloss: 0.118866


Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[39]	training's multi_logloss: 0.0873788	valid_1's multi_logloss: 0.116395




# Test inference
- 각 fold를 훈련시킨 lightgbm model로 predict.
- 해당 대회는 logloss score를 겨루는 것이기 때문에 각 class의 probability를 얻어야함.
- 대부분의 머신러닝 모델에서 predict, predict_proba를 구분하여 사용함.
- predict는 class 출력을 해주고 predict_proba는 class별 probability를 출력해줌.
- predict_proba를 사용하여 예측한 것을 5-fold 더하여 평균내어 앙상블.

In [18]:
submit.iloc[:,1:]=0
for fold in range(5):
    submit.iloc[:,1:] += lgb_models[fold].predict_proba(test)/5

In [19]:
submit.to_csv('submit/20210502_test_submit_K_MLP_Decision.csv', index=False) # 0.7272812144

In [20]:
submit.head(20)

Unnamed: 0,index,0,1,2
0,26457,0.001618,0.003007,0.995375
1,26458,0.922822,0.018897,0.058281
2,26459,0.001593,0.002571,0.995836
3,26460,0.003954,0.007066,0.988979
4,26461,0.001261,0.003224,0.995515
5,26462,0.001539,0.002483,0.995978
6,26463,0.953142,0.035152,0.011706
7,26464,0.002019,0.003917,0.994064
8,26465,0.001183,0.002277,0.996539
9,26466,0.925612,0.019311,0.055077


In [21]:
submit_ = pd.read_csv('dataset/sample_submission.csv')
#submit.iloc[:,1:]=0
submit_.iloc[:,1:] = 0
submit_.head()
#int(submit.iloc[:,1:].idxmax(1)[2])
#submit_.iloc[0,1:]
#submit.head()
#submit.iloc[:,1:].idxmax(1)
a = []
a= submit.iloc[:,1:].idxmax(1)
#submit_.iloc[3,1:]
for i in range(0,len(submit_)):
    submit_.iloc[i,int(a[i])+1] = 1
submit_.head()
submit_.to_csv('submit/20210502_test_submit_one_hot.csv', index=False)
submit_.head()
#submit_.head()
#len(submit_)
#submit_.iloc[:,submit.iloc[:,1:].idxmax(1)]=1
#submit.idxmax(1)
#submit_.head()
#for i in range(submit):
    
#submit.iloc[:,1:].idxmax(1)

Unnamed: 0,index,0,1,2
0,26457,0,0,1
1,26458,1,0,0
2,26459,0,0,1
3,26460,0,0,1
4,26461,0,0,1


In [22]:
#submit_ = pd.read_csv('dataset/sample_submission.csv')
#submit_.iloc[:,1:] = 0


#submit_.iloc[:,1:] = clf.predict_proba(test)
#submit_.head()
#np.round(submit_,5)
#submit_.to_csv('submit/20210502_test_submit_MLP.csv', index=False)

In [23]:
#submit_ = pd.read_csv('dataset/sample_submission.csv')
#a = clf.predict(test)
#submit_.iloc[:,1:] = 0
#for i in range(len(test)):
    #submit_.iloc[i , int(a [i]) +1] = 1
#submit_.head()
#submit_.to_csv('submit/20210502_test_submit_MLP.csv', index=False)

In [24]:
train.head()

Unnamed: 0,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,...,occyp_type_NAN,occyp_type_Private service staff,occyp_type_Realty agents,occyp_type_Sales staff,occyp_type_Secretaries,occyp_type_Security staff,occyp_type_Waiters/barmen staff,Cluster,decision_tree,MLP
0,0,202500.0,-13899,-4709,1,0,0,0,2.0,-6.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,0.0
1,1,247500.0,-11380,-1540,1,0,0,1,3.0,-5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0
2,0,450000.0,-19087,-4434,1,0,1,0,2.0,-22.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,0.0
3,0,202500.0,-15088,-2092,1,0,1,0,2.0,-37.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0.0,0.0
4,0,157500.0,-15037,-2105,1,0,0,0,2.0,-26.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,2.0,0.0
