In [49]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report

# Notebook display setting: show the value of every expression statement in a
# cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [50]:
# Load the labelled training data and preview it.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is pointed at a local copy of train.csv.
source_path = "C:/Users/huang/COMP5511_Project/5. Binary Classification with a Bank Churn Dataset/train.csv"
df = pd.read_csv(filepath_or_buffer=source_path)
df

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,165029,15667085,Meng,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0
165030,165030,15665521,Okechukwu,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,165031,15664752,Hsia,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,165032,15689614,Hsiung,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0


In [51]:
# Use the "id" column as the row index.
# Guarded so the cell can be re-run safely: once "id" has been moved into the
# index it is no longer a column, and an unconditional set_index("id") raises
# KeyError. Reassigning (instead of inplace=True) keeps the step explicit.
if df.index.name != "id":
    df = df.set_index("id")

In [52]:
# Encode the two categorical columns as integer codes.
gender_codes = {'Male': 0, 'Female': 1}
geography_codes = {'France': 0, 'Germany': 1, 'Spain': 2}

# The encoded columns are assigned back to the frame instead of calling
# df['col'].replace(..., inplace=True): an inplace method on a column selection
# is chained assignment, which recent pandas warns about and which does not
# update `df` when Copy-on-Write is enabled.
# Values that are not keys of the mapping are passed through unchanged, so
# re-running the cell on already-encoded data is a no-op. astype('int64') makes
# the dtype explicit and raises if an unmapped label is still present.
df = df.assign(
    Gender=df['Gender'].map(lambda v: gender_codes.get(v, v)).astype('int64'),
    Geography=df['Geography'].map(lambda v: geography_codes.get(v, v)).astype('int64'),
)
df

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,15674932,Okwudilichukwu,668,0,0,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,15749177,Okwudiliolisa,627,0,0,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,15694510,Hsueh,678,0,0,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,15741417,Kao,581,0,0,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,15766172,Chiemenam,716,2,0,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,15667085,Meng,667,2,1,33.0,2,0.00,1,1.0,1.0,131834.75,0
165030,15665521,Okechukwu,792,0,0,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,15664752,Hsia,565,0,0,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,15689614,Hsiung,554,2,1,30.0,7,161533.00,1,0.0,1.0,71173.03,0


In [53]:
# Split the frame into the model inputs (X) and the target column (y).
feature_columns = [
    'CreditScore',
    'Geography',
    'Gender',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
]
X = df.loc[:, feature_columns]
y = df.loc[:, 'Exited']

In [54]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an XGBoost classifier with its default hyper-parameters.
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train, y_train)

# Predict labels for the hold-out rows.
y_pred_xgb = model_xgb.predict(X_test)

# Accuracy = share of hold-out rows whose predicted label equals the true label.
accuracy_xgb = np.mean(y_pred_xgb == y_test)
print(f"XGB accuarcy: {accuracy_xgb:.2%}")

XGB accuarcy: 86.63%


In [55]:
# Micro- and macro-averaged F1 of the hold-out predictions.
micro_f1, macro_f1 = (
    f1_score(y_test, y_pred_xgb, average=averaging)
    for averaging in ('micro', 'macro')
)
print('micro f1: ', micro_f1)
print('marco f1: ', macro_f1)

micro f1:  0.8663313842518253
marco f1:  0.7793042249740518


In [56]:
# Load the unlabelled test data (no 'Exited' column) and preview it.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is pointed at a local copy of test.csv.
source_path = "C:/Users/huang/COMP5511_Project/5. Binary Classification with a Bank Churn Dataset/test.csv"
df_test = pd.read_csv(filepath_or_buffer=source_path)
df_test

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.00,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,France,Female,46.0,2,0.00,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,France,Female,34.0,7,0.00,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.00,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110018,275052,15662091,P'eng,570,Spain,Male,29.0,7,116099.82,1,1.0,1.0,148087.62
110019,275053,15774133,Cox,575,France,Female,36.0,4,178032.53,1,1.0,1.0,42181.68
110020,275054,15728456,Ch'iu,712,France,Male,31.0,2,0.00,2,1.0,0.0,16287.38
110021,275055,15687541,Yegorova,709,France,Female,32.0,3,0.00,1,1.0,1.0,158816.58


In [57]:
# Use the "id" column as the row index of the test frame.
# Guarded so the cell can be re-run safely: once "id" has been moved into the
# index it is no longer a column, and an unconditional set_index("id") raises
# KeyError. Reassigning (instead of inplace=True) keeps the step explicit.
if df_test.index.name != "id":
    df_test = df_test.set_index("id")

In [58]:
# Encode the two categorical columns of the test frame as integer codes, using
# the same codes that were applied to the training frame.
gender_codes = {'Male': 0, 'Female': 1}
geography_codes = {'France': 0, 'Germany': 1, 'Spain': 2}

# The encoded columns are assigned back to the frame instead of calling
# df_test['col'].replace(..., inplace=True): an inplace method on a column
# selection is chained assignment, which recent pandas warns about and which
# does not update `df_test` when Copy-on-Write is enabled.
# Values that are not keys of the mapping are passed through unchanged, so
# re-running the cell on already-encoded data is a no-op. astype('int64') makes
# the dtype explicit and raises if an unmapped label is still present.
df_test = df_test.assign(
    Gender=df_test['Gender'].map(lambda v: gender_codes.get(v, v)).astype('int64'),
    Geography=df_test['Geography'].map(lambda v: geography_codes.get(v, v)).astype('int64'),
)
df_test

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
165034,15773898,Lucchese,586,0,1,23.0,2,0.00,2,0.0,1.0,160976.75
165035,15782418,Nott,683,0,1,46.0,2,0.00,1,1.0,0.0,72549.27
165036,15807120,K?,656,0,1,34.0,7,0.00,2,1.0,0.0,138882.09
165037,15808905,O'Donnell,681,0,0,36.0,8,0.00,1,1.0,0.0,113931.57
165038,15607314,Higgins,752,1,0,38.0,10,121263.62,1,1.0,0.0,139431.00
...,...,...,...,...,...,...,...,...,...,...,...,...
275052,15662091,P'eng,570,2,0,29.0,7,116099.82,1,1.0,1.0,148087.62
275053,15774133,Cox,575,0,1,36.0,4,178032.53,1,1.0,1.0,42181.68
275054,15728456,Ch'iu,712,0,0,31.0,2,0.00,2,1.0,0.0,16287.38
275055,15687541,Yegorova,709,0,1,32.0,3,0.00,1,1.0,1.0,158816.58


In [59]:
# Model inputs for the unlabelled test rows, in the same column order used for
# training.
# NOTE(review): this rebinds X_test, which earlier held the hold-out split; the
# hold-out features are no longer reachable under that name after this cell.
feature_columns = [
    'CreditScore',
    'Geography',
    'Gender',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
]
X_test = df_test.loc[:, feature_columns]

In [60]:
# Predict class labels with the trained model for the feature rows taken from
# df_test (these rows have no 'Exited' labels, so nothing is scored here).
y_pred_xgb_test = model_xgb.predict(X_test)

In [61]:
# Preview the predicted labels.
y_pred_xgb_test

array([0, 1, 0, ..., 0, 0, 0])

In [62]:
# Build the submission table (CustomerId plus predicted Exited label) and write
# it to CSV without the index column.
result_df = df_test[['CustomerId']].assign(Exited=y_pred_xgb_test)
result_df.to_csv(
    'C:/Users/huang/COMP5511_Project/5. Binary Classification with a Bank Churn Dataset/submissions.csv',
    index=False,
)


In [63]:
# Preview the table that was written to submissions.csv.
result_df 

Unnamed: 0_level_0,CustomerId,Exited
id,Unnamed: 1_level_1,Unnamed: 2_level_1
165034,15773898,0
165035,15782418,1
165036,15807120,0
165037,15808905,0
165038,15607314,0
...,...,...
275052,15662091,0
275053,15774133,0
275054,15728456,0
275055,15687541,0
