# Library

In [1]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import mutual_info_regression
import random
import platform
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from matplotlib import font_manager , rc
from sklearn.ensemble import RandomForestClassifier
plt.rcParams['axes.unicode_minus'] = False

if platform.system() == 'Windows' :
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname = path).get_name()
    rc('font' , family = font_name)

# Data Load & Preprocessing
- 훈련에 필요없는 index 컬럼 삭제.
- missing value를 모두 NAN 문자열로 대체
- dtype object 인 컬럼들을 onehot encoding

In [2]:
train = pd.read_csv('dataset/train.csv')
train = train.drop(['index'], axis=1)
train.fillna('NAN', inplace=True) 
train = train.drop(['occyp_type'] , axis = 1)
X = train.copy()
"""credit = X.pop("credit")
discrete_features = X.dtypes == float"""
test = pd.read_csv('dataset/test.csv')
test = test.drop(['index'], axis=1)
test.fillna('NAN', inplace=True)
test  = test.drop(['occyp_type'] , axis = 1)
submit = pd.read_csv('dataset/sample_submission.csv')
def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores 
Y = test.copy()

In [3]:
train.dtypes

gender            object
car               object
reality           object
child_num          int64
income_total     float64
income_type       object
edu_type          object
family_type       object
house_type        object
DAYS_BIRTH         int64
DAYS_EMPLOYED      int64
FLAG_MOBIL         int64
work_phone         int64
phone              int64
email              int64
family_size      float64
begin_month      float64
credit           float64
dtype: object

In [4]:
from sklearn import preprocessing
label_encoder=preprocessing.LabelEncoder()
X['income_type']=label_encoder.fit_transform(X['income_type'])
X['edu_type']=label_encoder.fit_transform(X['edu_type'])
X['family_type']=label_encoder.fit_transform(X['family_type'])
X['house_type']=label_encoder.fit_transform(X['house_type'])
X['income_total']=label_encoder.fit_transform(X['income_total'])
#X['occyp_type']=label_encoder.fit_transform(X['occyp_type'])
X = X.loc[:,["income_type" ,"edu_type" , "family_type" , "house_type", "income_total"   ]]
Y['income_type']=label_encoder.fit_transform(Y['income_type'])
Y['edu_type']=label_encoder.fit_transform(Y['edu_type'])
Y['family_type']=label_encoder.fit_transform(Y['family_type'])
Y['house_type']=label_encoder.fit_transform(Y['house_type'])
Y['income_total']=label_encoder.fit_transform(Y['income_total'])
#Y['occyp_type']=label_encoder.fit_transform(Y['occyp_type'])
Y = Y.loc[:,["income_type" ,"edu_type" , "family_type" , "house_type", "income_total"   ]]
X.head()

Unnamed: 0,income_type,edu_type,family_type,house_type,income_total
0,0,1,1,2,145
1,0,4,0,1,165
2,4,1,1,1,214
3,0,4,1,1,145
4,2,1,1,1,111


In [5]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 7)
X["Cluster"] = kmeans.fit_predict(X)
Y["Cluster"] = kmeans.fit_predict(Y)
X.head()

Unnamed: 0,income_type,edu_type,family_type,house_type,income_total,Cluster
0,0,1,1,2,145,1
1,0,4,0,1,165,1
2,4,1,1,1,214,6
3,0,4,1,1,145,1
4,2,1,1,1,111,2


In [6]:
clf = DecisionTreeClassifier(random_state = 0)
#iris.data
#iris.target
#cross_val_score(clf, iris.data,iris.target, cv = 10)
#train.drop(['credit'],axis=1)
#cross_val_score(clf,train.drop(['credit'],axis=1),train['credit'] , cv =15)

clf.fit(X,train['credit'] )
clf.feature_importances_

array([0.11716867, 0.11044328, 0.11054985, 0.11530533, 0.49380993,
       0.05272294])

In [7]:
object_col = []
for col in train.columns:
    if train[col].dtype == 'object':
        object_col.append(col)

In [8]:
enc = OneHotEncoder()
enc.fit(train.loc[:,object_col])


train_onehot_df = pd.DataFrame(enc.transform(train.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
train.drop(object_col, axis=1, inplace=True)
train = pd.concat([train, train_onehot_df], axis=1)

In [9]:
test_onehot_df = pd.DataFrame(enc.transform(test.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
test.drop(object_col, axis=1, inplace=True)
test = pd.concat([test, test_onehot_df], axis=1)

In [10]:
#train['Cluster'] = X['Cluster']
#test['Cluster'] = Y['Cluster']

In [11]:
train.dtypes

child_num                                   int64
income_total                              float64
DAYS_BIRTH                                  int64
DAYS_EMPLOYED                               int64
FLAG_MOBIL                                  int64
work_phone                                  int64
phone                                       int64
email                                       int64
family_size                               float64
begin_month                               float64
credit                                    float64
gender_F                                  float64
gender_M                                  float64
car_N                                     float64
car_Y                                     float64
reality_N                                 float64
reality_Y                                 float64
income_type_Commercial associate          float64
income_type_Pensioner                     float64
income_type_State servant                 float64


In [12]:
from sklearn.datasets import load_iris
#train['credit']
iris = load_iris()
clf = DecisionTreeClassifier(random_state = 0)
#iris.data
#iris.target
#cross_val_score(clf, iris.data,iris.target, cv = 10)
#train.drop(['credit'],axis=1)
#cross_val_score(clf,train.drop(['credit'],axis=1),train['credit'] , cv =15)

clf.fit(train.drop(['credit'],axis=1),train['credit'])
clf.feature_importances_
#train['decision_tree'] = clf.predict(train.drop(['credit'],axis=1))
#test['decision_tree'] = clf.predict(test)
#clf = MLPClassifier(random_state = 23 , max_iter = 1000).fit(train.drop(['credit'],axis=1) , train['credit'])
#train['MLP'] = clf.predict(train.drop(['credit'],axis=1))
#test['MLP'] = clf.predict(test)

array([1.28385096e-02, 1.04455091e-01, 1.68185955e-01, 1.24501059e-01,
       0.00000000e+00, 1.25254992e-02, 1.39930783e-02, 9.58714416e-03,
       1.91890752e-02, 3.82920767e-01, 1.04104168e-02, 8.70782775e-03,
       1.07477119e-02, 9.88635335e-03, 1.11137194e-02, 1.00244887e-02,
       8.05212109e-03, 1.44118449e-03, 5.44934903e-03, 0.00000000e+00,
       8.86206285e-03, 3.11563566e-04, 7.42366143e-03, 3.59789803e-03,
       1.71765787e-03, 7.75843890e-03, 5.24806035e-03, 5.96787762e-03,
       5.65571351e-03, 6.76646263e-03, 3.18728972e-03, 1.40887581e-03,
       5.45879162e-03, 3.61374693e-03, 1.31775860e-03, 3.89091349e-03,
       3.78387510e-03])

In [13]:
#train.drop('child_num', axis=1, inplace=True)
#test.drop('child_num', axis=1, inplace=True)
#train.drop('DAYS_BIRTH' , axis = 1 ,inplace = True)
#test.drop('DAYS_BIRTH' , axis = 1 ,inplace = True)
#train.drop('FLAG_MOBIL' , axis = 1 ,inplace = True)
#test.drop('FLAG_MOBIL' , axis = 1 ,inplace = True)
#train.drop('work_phone' , axis = 1 ,inplace = True)
#test.drop('work_phone' , axis = 1 ,inplace = True)
#train.drop('begin_month' , axis = 1 ,inplace = True)
#test.drop('begin_month' , axis = 1 ,inplace = True)

In [14]:
train.head()

Unnamed: 0,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,...,family_type_Married,family_type_Separated,family_type_Single / not married,family_type_Widow,house_type_Co-op apartment,house_type_House / apartment,house_type_Municipal apartment,house_type_Office apartment,house_type_Rented apartment,house_type_With parents
0,0,202500.0,-13899,-4709,1,0,0,0,2.0,-6.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1,247500.0,-11380,-1540,1,0,0,1,3.0,-5.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0,450000.0,-19087,-4434,1,0,1,0,2.0,-22.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0,202500.0,-15088,-2092,1,0,1,0,2.0,-37.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0,157500.0,-15037,-2105,1,0,0,0,2.0,-26.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [15]:
#from sklearn.feature_selection import GenericUnivariateSelect ,  chi2
#train.shape
#transformer = GenericUnivariateSelect(chi2, mode='fdr', param=20)
#train_new = transformer.fit_transform(train.drop(['credit' , 'DAYS_BIRTH' , 'DAYS_EMPLOYED' , 'begin_month'],axis=1),train['credit'])
#train_new.shape

In [16]:
#train_new
#test.shape

# Training
- 데이터 분리는 StratifiedKFold 를 사용하여 y값 분포를 비슷하게 분리시킴. -> 5-fold
- lightgbm의 default parameter로 훈련.
- 30번 이상 개선 없을 경우 중단.
- 각 5개의 fold를 훈련하여 저장

In [17]:
skf_f = StratifiedKFold(n_splits=5, shuffle=True, random_state=50)
folds_f =[]
for train_idx_f, valid_idx_f in skf_f.split(train, train['credit']):
    folds_f.append((train_idx_f, valid_idx_f))
folds_f

[(array([    0,     3,     4, ..., 26452, 26453, 26455]),
  array([    1,     2,     6, ..., 26448, 26454, 26456])),
 (array([    1,     2,     3, ..., 26454, 26455, 26456]),
  array([    0,     4,    10, ..., 26440, 26449, 26453])),
 (array([    0,     1,     2, ..., 26454, 26455, 26456]),
  array([    3,     8,    14, ..., 26426, 26430, 26446])),
 (array([    0,     1,     2, ..., 26454, 26455, 26456]),
  array([    5,     9,    11, ..., 26444, 26447, 26450])),
 (array([    0,     1,     2, ..., 26453, 26454, 26456]),
  array([    7,    22,    24, ..., 26451, 26452, 26455]))]

In [18]:
random.seed(50)

In [19]:
rf_models = {}
for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds_f[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values 
    clf = RandomForestClassifier(n_estimators = 1000 , max_depth = 30)
    clf.fit(X_train, y_train)
    train_array = clf.predict_proba(train.drop(['credit'],axis=1))/10
    test_array = clf.predict_proba(test)/10
    clf.fit(X_valid , y_valid)
    train_array = clf.predict_proba(train.drop(['credit'],axis=1))/10
    test_array = clf.predict_proba(test)/10
    print(f'================================================================================\n\n')













In [20]:
train_array

array([[0.0144    , 0.01042439, 0.07517561],
       [0.0043    , 0.0174    , 0.0783    ],
       [0.00741484, 0.00861282, 0.08397234],
       ...,
       [0.01780833, 0.01921667, 0.062975  ],
       [0.0042    , 0.0119    , 0.0839    ],
       [0.00391325, 0.01065339, 0.08543337]])

In [21]:
train_array[:,0]

array([0.0144    , 0.0043    , 0.00741484, ..., 0.01780833, 0.0042    ,
       0.00391325])

In [22]:
test_array

array([[0.0067    , 0.0087    , 0.0846    ],
       [0.01746767, 0.0201069 , 0.06242543],
       [0.00397092, 0.0240814 , 0.07194769],
       ...,
       [0.00831579, 0.01212632, 0.07955789],
       [0.02402393, 0.02216203, 0.05381404],
       [0.0132    , 0.016875  , 0.069925  ]])

In [23]:
max_train_idx = np.argmax(train_array , 1)
max_test_idx = np.argmax(test_array , 1)

In [24]:
max_train_idx

array([2, 2, 2, ..., 2, 2, 2], dtype=int64)

In [25]:
max_test_idx

array([2, 2, 2, ..., 2, 2, 2], dtype=int64)

In [26]:
train_array_max = []
test_array_max = []
for i in range(0, len(max_train_idx)):  
    train_array_max.append(max_train_idx[i] + train_array[i,max_train_idx[i]])
for i in range(0, len(max_test_idx)):  
    test_array_max.append(max_test_idx[i] + test_array[i,max_test_idx[i]])

In [27]:
train_array_max

[2.075175606060606,
 2.0783,
 2.083972344322344,
 2.079296305567547,
 2.0729643838256706,
 2.06925,
 2.0803433333333334,
 0.07793928571428571,
 2.0810230248471964,
 2.0690091053141852,
 2.0742295321637427,
 2.053705381355932,
 2.0731181368320892,
 2.0565766666666665,
 2.0534198074771464,
 0.0486,
 1.056,
 2.0559,
 2.07206543035008,
 2.0804573841354723,
 2.0821095991352006,
 2.0561966508281633,
 0.06542481092436973,
 2.066333333333333,
 2.0914366666666666,
 2.0456503761755487,
 2.0656166666666667,
 2.0375,
 2.0554,
 0.06420333333333335,
 2.08026482663044,
 2.0605533333333335,
 1.05085,
 2.0943700920882167,
 2.0720295151515153,
 2.076161403508772,
 2.086672807017544,
 1.0484,
 2.0641408365008287,
 2.0693202439932947,
 2.0764414042984094,
 2.0577666666666667,
 2.0694733333333333,
 2.05825,
 2.080575641025641,
 1.0449766666666667,
 2.071917211797649,
 2.063803677248677,
 1.0640505721634776,
 2.072191666666667,
 1.07725,
 2.055675,
 1.0703687326642244,
 2.0902,
 0.059,
 0.06301330523485281,

In [28]:
test_array_max

[2.0846,
 2.062425431034483,
 2.0719476861154833,
 2.047616923076923,
 2.068608333333333,
 2.094422222222222,
 1.066225,
 2.0846628161956837,
 2.077164285714286,
 2.07125,
 2.081906163136845,
 2.0891389129334748,
 0.0683,
 2.0738991091075665,
 2.0869639879138395,
 2.06215,
 2.0654688241202837,
 1.0641,
 0.05300909090909091,
 2.069222881355932,
 2.057627372665707,
 2.0840996666666665,
 2.0671975,
 2.0581925,
 1.0523855263157895,
 2.0934,
 1.0513181818181818,
 2.0632,
 2.080034051724138,
 2.055892497909594,
 2.086539814814815,
 1.0799,
 1.0572969517365072,
 2.0742270238095237,
 2.0757939654195012,
 2.0690180555555555,
 1.0820116666666666,
 2.071460260080097,
 2.0777052875695734,
 2.083497833133015,
 1.0559,
 0.0432,
 2.0894526635319566,
 2.083477777777778,
 2.0467876344086022,
 2.09125,
 2.0507,
 2.068027916666667,
 2.0882469254658385,
 1.089275,
 1.0799,
 2.073745238095238,
 2.0818171457026966,
 1.0492874637127578,
 2.0940283564814814,
 2.049776959680243,
 2.08519,
 2.0752991482352017,


In [29]:
#train['credit0'] = train_array[:,0]
#train['credit1'] = train_array[:,1]
#train['credit2'] = train_array[:,2]

In [30]:
train['rf_credit'] = train_array_max

In [31]:
test['rf_credit'] = test_array_max

In [32]:
train.head()

Unnamed: 0,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,...,family_type_Separated,family_type_Single / not married,family_type_Widow,house_type_Co-op apartment,house_type_House / apartment,house_type_Municipal apartment,house_type_Office apartment,house_type_Rented apartment,house_type_With parents,rf_credit
0,0,202500.0,-13899,-4709,1,0,0,0,2.0,-6.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.075176
1,1,247500.0,-11380,-1540,1,0,0,1,3.0,-5.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0783
2,0,450000.0,-19087,-4434,1,0,1,0,2.0,-22.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.083972
3,0,202500.0,-15088,-2092,1,0,1,0,2.0,-37.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.079296
4,0,157500.0,-15037,-2105,1,0,0,0,2.0,-26.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.072964


In [33]:
#test['credit0'] = test_array[:,0]
#test['credit1'] = test_array[:,1]
#test['credit2'] = test_array[:,2]

In [34]:
train.dtypes

child_num                                   int64
income_total                              float64
DAYS_BIRTH                                  int64
DAYS_EMPLOYED                               int64
FLAG_MOBIL                                  int64
work_phone                                  int64
phone                                       int64
email                                       int64
family_size                               float64
begin_month                               float64
credit                                    float64
gender_F                                  float64
gender_M                                  float64
car_N                                     float64
car_Y                                     float64
reality_N                                 float64
reality_Y                                 float64
income_type_Commercial associate          float64
income_type_Pensioner                     float64
income_type_State servant                 float64


In [35]:
test.dtypes

child_num                                   int64
income_total                              float64
DAYS_BIRTH                                  int64
DAYS_EMPLOYED                               int64
FLAG_MOBIL                                  int64
work_phone                                  int64
phone                                       int64
email                                       int64
family_size                               float64
begin_month                               float64
gender_F                                  float64
gender_M                                  float64
car_N                                     float64
car_Y                                     float64
reality_N                                 float64
reality_Y                                 float64
income_type_Commercial associate          float64
income_type_Pensioner                     float64
income_type_State servant                 float64
income_type_Student                       float64


In [36]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds=[]
for train_idx, valid_idx in skf.split(train, train['credit']):
    folds.append((train_idx, valid_idx))

In [37]:
folds

[(array([    0,     1,     3, ..., 26453, 26454, 26456]),
  array([    2,    26,    45, ..., 26446, 26452, 26455])),
 (array([    2,     3,     6, ..., 26454, 26455, 26456]),
  array([    0,     1,     4, ..., 26435, 26447, 26450])),
 (array([    0,     1,     2, ..., 26453, 26454, 26455]),
  array([    3,    12,    16, ..., 26438, 26448, 26456])),
 (array([    0,     1,     2, ..., 26454, 26455, 26456]),
  array([    6,    14,    29, ..., 26440, 26442, 26453])),
 (array([    0,     1,     2, ..., 26453, 26455, 26456]),
  array([    7,     9,    10, ..., 26449, 26451, 26454]))]

In [38]:
random.seed(42)
lgb_models={}
for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values 
    lgb = LGBMClassifier(boosting_type='gbdt',
                        learning_rate=0.1,
                         max_depth=30,
                         n_estimators=1500,
                         objective='regression',
                         metric='multi_logloss',
                         is_training_metric=True,
                         num_leaves=150, #중요
                         feature_fraction=0.7,#몇프로feature랜덤하게 학습 열 샘플링
                         bagging_fraction=0.7, #행 샘플링
                         seed=2020,
                         early_stopping_round=100,
                         min_data_in_leaf=5, #중요
                         tree_learner='feature',
                         extra_trees='False'
                        )
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], 
            early_stopping_rounds=30,
           verbose=100)
    lgb_models[fold]=lgb
    print(f'================================================================================\n\n')


Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.293097	valid_1's multi_logloss: 0.640532
Early stopping, best iteration is:
[67]	training's multi_logloss: 0.359866	valid_1's multi_logloss: 0.63216


Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.283746	valid_1's multi_logloss: 0.670158
Early stopping, best iteration is:
[54]	training's multi_logloss: 0.386699	valid_1's multi_logloss: 0.6579


Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.288019	valid_1's multi_logloss: 0.660425
Early stopping, best iteration is:
[51]	training's multi_logloss: 0.401986	valid_1's multi_logloss: 0.647548


Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.291982	valid_1's multi_logloss: 0.648397
Early stopping, best iteration is:
[54]	training's multi_logloss: 0.394075	valid_1's multi_logloss: 0.639286


Training un

# Test inference
- 각 fold를 훈련시킨 lightgbm model로 predict.
- 해당 대회는 logloss score를 겨루는 것이기 때문에 각 class의 probability를 얻어야함.
- 대부분의 머신러닝 모델에서 predict, predict_proba를 구분하여 사용함.
- predict는 class 출력을 해주고 predict_proba는 class별 probability를 출력해줌.
- predict_proba를 사용하여 예측한 것을 5-fold 더하여 평균내어 앙상블.

In [39]:
submit.iloc[:,1:]=0
for fold in range(5):
    submit.iloc[:,1:] += lgb_models[fold].predict_proba(test)/5

In [40]:
submit.to_csv('submit/20210510_test_submit.csv', index=False) # 0.7272812144

In [41]:
submit.head(20)

Unnamed: 0,index,0,1,2
0,26457,0.022172,0.070017,0.90781
1,26458,0.161913,0.226765,0.611322
2,26459,0.098174,0.133272,0.768554
3,26460,0.101585,0.14092,0.757494
4,26461,0.093165,0.187536,0.719299
5,26462,0.010162,0.015485,0.974354
6,26463,0.253244,0.731699,0.015058
7,26464,0.036351,0.049915,0.913735
8,26465,0.066586,0.12523,0.808184
9,26466,0.083333,0.219652,0.697015


In [42]:
submit_ = pd.read_csv('dataset/sample_submission.csv')
#submit.iloc[:,1:]=0
submit_.iloc[:,1:] = 0
submit_.head()
#int(submit.iloc[:,1:].idxmax(1)[2])
#submit_.iloc[0,1:]
#submit.head()
#submit.iloc[:,1:].idxmax(1)
a = []
a= submit.iloc[:,1:].idxmax(1)
#submit_.iloc[3,1:]
for i in range(0,len(submit_)):
    submit_.iloc[i,int(a[i])+1] = 1
submit_.head()
submit_.to_csv('submit/20210502_test_submit_one_hot.csv', index=False)
submit_.head()
#submit_.head()
#len(submit_)
#submit_.iloc[:,submit.iloc[:,1:].idxmax(1)]=1
#submit.idxmax(1)
#submit_.head()
#for i in range(submit):
    
#submit.iloc[:,1:].idxmax(1)

Unnamed: 0,index,0,1,2
0,26457,0,0,1
1,26458,0,0,1
2,26459,0,0,1
3,26460,0,0,1
4,26461,0,0,1


In [43]:
#submit_ = pd.read_csv('dataset/sample_submission.csv')
#submit_.iloc[:,1:] = 0


#submit_.iloc[:,1:] = clf.predict_proba(test)
#submit_.head()
#np.round(submit_,5)
#submit_.to_csv('submit/20210502_test_submit_MLP.csv', index=False)

In [44]:
#submit_ = pd.read_csv('dataset/sample_submission.csv')
#a = clf.predict(test)
#submit_.iloc[:,1:] = 0
#for i in range(len(test)):
    #submit_.iloc[i , int(a [i]) +1] = 1
#submit_.head()
#submit_.to_csv('submit/20210502_test_submit_MLP.csv', index=False)

In [45]:
train.head()

Unnamed: 0,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,...,family_type_Separated,family_type_Single / not married,family_type_Widow,house_type_Co-op apartment,house_type_House / apartment,house_type_Municipal apartment,house_type_Office apartment,house_type_Rented apartment,house_type_With parents,rf_credit
0,0,202500.0,-13899,-4709,1,0,0,0,2.0,-6.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.075176
1,1,247500.0,-11380,-1540,1,0,0,1,3.0,-5.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0783
2,0,450000.0,-19087,-4434,1,0,1,0,2.0,-22.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.083972
3,0,202500.0,-15088,-2092,1,0,1,0,2.0,-37.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.079296
4,0,157500.0,-15037,-2105,1,0,0,0,2.0,-26.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.072964
