In [12]:
import numpy as np  
import matplotlib.pyplot as plt  
import pandas as pd 
import random
import xgboost as xgb
import math
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from xgboost.sklearn import XGBClassifier
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
from sklearn import metrics
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler 
# Default size (width, height in inches) for every matplotlib figure in this notebook.
rcParams['figure.figsize'] = (12, 4)

In [20]:
def my_preprocess(train):
    """One-hot encode Geography/Gender and bucket Age into ordinal ranges.

    The new columns are written onto ``train`` itself and the same frame is
    returned, so both ``df = my_preprocess(df)`` and a bare
    ``my_preprocess(df)`` leave ``df`` with the extra columns.

    Every step is a vectorized, position-independent comparison, so the
    function works for any row index (filtered frames, shuffled frames,
    non-integer labels), not only for a default 0..n-1 index.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns 'Geography', 'Gender' and 'Age'.

    Returns
    -------
    pandas.DataFrame
        ``train`` with these integer columns added (or overwritten):

        * S0, S1, S2 -- Geography indicators. Any value other than 'S0' or
          'S1' (including a missing value) is counted as S2.
        * Male, Female -- Gender indicators. Any value other than 'Male'
          (including a missing value) is counted as Female.
        * Age_range -- 1 for [15, 22), 2 for [22, 35), 3 for [35, 50),
          4 for [50, 65) and 0 for every other age (including missing).
    """
    #one hot encoding
    geography = train['Geography']
    is_s0 = geography == 'S0'
    is_s1 = geography == 'S1'
    is_male = train['Gender'] == 'Male'

    train['S0'] = is_s0.astype(int)
    train['S1'] = is_s1.astype(int)
    # Catch-all bucket: everything that is neither S0 nor S1.
    train['S2'] = (~(is_s0 | is_s1)).astype(int)
    train['Male'] = is_male.astype(int)
    # Catch-all bucket: everything that is not exactly 'Male'.
    train['Female'] = (~is_male).astype(int)

    # Half-open age brackets; comparisons with NaN are False, so a missing
    # age falls through to the default bucket 0.
    age = train['Age'].to_numpy()
    brackets = [
        (age >= 15) & (age < 22),
        (age >= 22) & (age < 35),
        (age >= 35) & (age < 50),
        (age >= 50) & (age < 65),
    ]
    train['Age_range'] = np.select(brackets, [1, 2, 3, 4], default=0)

    return train

In [22]:
#load and preprocessing
train = pd.read_csv('train.csv')
target = 'Exited'
train = my_preprocess(train)
# Everything except identifiers, the label, and the raw fields that
# my_preprocess has already encoded is used as a model feature.
# NOTE(review): 'U' matches no column in the output shown below -- possibly
# meant to be 'Unnamed: 0'; confirm against train.csv.
excluded_columns = ['U', 'RowNumber', 'CustomerId', 'Surname', 'Exited',
                    'Geography', 'Gender', 'Age']
predictors = [x for x in train.columns if x not in excluded_columns]
# The np.float alias was removed in NumPy 1.24; the builtin float is the same
# float64 dtype and works on every NumPy version.
train[predictors] = train[predictors].astype(float)
# Standardize each feature to zero mean / unit variance; the fitted scaler is
# kept so the identical transformation can be applied to the test data.
scaler = StandardScaler()
train[predictors] = scaler.fit_transform(train[predictors])


train[predictors].head()

Unnamed: 0,CreditScore,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,S0,S1,S2,Male,Female,Age_range
0,1.509906,0.339933,0.428978,0.799546,-1.537586,0.966801,0.304927,-1.001501,-0.580814,1.746019,0.911178,-0.911178,0.019645
1,0.866637,-0.445812,-1.214512,-0.917139,0.65037,-1.034339,0.529787,0.998501,-0.580814,-0.572731,-1.097481,1.097481,1.623292
2,0.697356,0.339933,0.615161,-0.917139,0.65037,0.966801,1.339115,0.998501,-0.580814,-0.572731,-1.097481,1.097481,-1.584003
3,-0.656894,-0.445812,0.117359,-0.917139,0.65037,-1.034339,0.438752,-1.001501,-0.580814,1.746019,-1.097481,1.097481,0.019645
4,2.068534,0.339933,0.630562,-0.917139,-1.537586,0.966801,1.072015,-1.001501,-0.580814,1.746019,0.911178,-0.911178,0.019645


In [23]:
#XGBoost parameters
# Starting hyper-parameters for the classifier, gathered in one mapping so
# they can be read and tweaked in a single place.
xgb1_settings = {
    'learning_rate': 0.1,
    'n_estimators': 1000,   # upper bound on boosting rounds
    'max_depth': 7,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'nthread': 4,
    'scale_pos_weight': 1,
    'seed': 27,
}
xgb1 = XGBClassifier(**xgb1_settings)

In [24]:
cv_folds = 5
early_stopping_rounds=50

# Pick the number of boosting rounds by cross-validated AUC: xgb.cv stops once
# the AUC has not improved for `early_stopping_rounds` rounds.
xgb_param = xgb1.get_xgb_params()
xgtrain = xgb.DMatrix(train[predictors].values, label=train[target].values)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb1.get_params()['n_estimators'], nfold=cv_folds,
    metrics='auc', early_stopping_rounds=early_stopping_rounds)
# One row of cvresult per retained round, so its length is the chosen tree count.
xgb1.set_params(n_estimators=cvresult.shape[0])

# eval_metric is intentionally not passed to fit(): without an eval_set it has
# no effect on the trained model, and recent xgboost releases reject the
# argument in fit() altogether.
xgb1.fit(train[predictors], train[target])

#Predict on the training set (NOT a held-out test set): the report below is
#therefore an optimistic, in-sample estimate of model quality.
y_pred = xgb1.predict(train[predictors])

print(classification_report(train[target], y_pred)) 

              precision    recall  f1-score   support

           0       0.87      0.98      0.93      6368
           1       0.88      0.45      0.59      1632

   micro avg       0.87      0.87      0.87      8000
   macro avg       0.88      0.71      0.76      8000
weighted avg       0.88      0.87      0.86      8000



In [11]:
#using gsearch to find the suitable max_depth and min_child_weight
# range(1,2,3) means start=1, stop=2, step=3 and yields only [1], so the
# previous grid contained a single candidate and "searched" nothing.
# range(1,4) tries the values 1, 2 and 3 for each parameter.
# NOTE(review): 1-3 is the presumed intent of the original arguments; widen
# the ranges if deeper trees should be explored.
param_test1 = {
 'max_depth':range(1,4),
 'min_child_weight':range(1,4)
}
# The `iid` argument is omitted: it was removed from GridSearchCV in
# scikit-learn 0.24 and passing it raises a TypeError on current versions.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=5,
        min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test1, scoring='roc_auc', n_jobs=4, cv=5)
gsearch1.fit(train[predictors],train[target])
gsearch1.best_params_

{'max_depth': 1, 'min_child_weight': 1}

In [25]:
#load and preprocess the test data
df_test = pd.read_csv('test.csv')

df_test = my_preprocess(df_test)

# The np.float alias was removed in NumPy 1.24; the builtin float is the same
# float64 dtype and works on every NumPy version.
df_test[predictors] = df_test[predictors].astype(float)
# Apply the scaler that was fitted on the training features (transform only,
# no re-fit) so train and test share the same scaling.
df_test[predictors] = scaler.transform(df_test[predictors])

#predict
y_ans = xgb1.predict(df_test[predictors])

In [26]:
#rewrite the upload file
# Start from the provided template and overwrite its 'Exited' column with the
# model's predictions, cast to plain integers.
# NOTE(review): assumes the template rows are in the same order as test.csv -- confirm.
df_sample = pd.read_csv('sample_upload.csv')
exited_labels = y_ans.astype(int)
df_sample['Exited'] = exited_labels
df_sample.to_csv('the_ans.csv', sep=',', index=False)
# Read the written file back to check what was actually saved.
df_check = pd.read_csv('the_ans.csv')
df_check.head(20)

Unnamed: 0.1,Unnamed: 0,RowNumber,Exited
0,0,2209,0
1,1,9924,0
2,2,4617,0
3,3,6077,1
4,4,9240,0
5,5,4834,0
6,6,8523,0
7,7,2826,0
8,8,871,0
9,9,6698,0
