In [110]:
# General Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier




In [111]:
# Load in dataset to pandas DataFrame
dataset = pd.read_csv("https://github.com/SBodapati11/Tree-Classification/blob/main/Bank%20Customer%20Churn%20Prediction.csv?raw=true")

In [112]:
# Check for any missing values
sum(dataset.isna().sum())

0

In [113]:
# Check for any null values
sum(dataset.isnull().sum())

0

In [114]:
# Find the statistics of the dataset
dataset.describe()

Unnamed: 0,customer_id,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [115]:
dataset.head()

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [116]:
# Variables and their meaning
# customer_id - an identification number for the customer (not used)
# credit_score - the customer's credit score
# country - the location the customer's account originated from (France, Germany, or Spain)
# gender - the customer's gender (Male or Female)
# age - the customer's age
# tenure - how long the customer's account has been active for
# balance - the current balance of the customer's account
# products_number - number of products the customer has bought from the bank
# credit_card - whether the customer owns a credit card from the bank (1=yes, 0=no)
# active_member - whether the customer is an active member (1=yes, 0=no)
# estimated_salary - the estimated salary of the customer
# churn - whether the customer has left the bank during some period (1=yes, 0=no) (This is what we are trying to predict)

In [117]:
# Get attributes

# NOTE, REMOVE LATER: Took out string attributes for now: country, gender, idk what to do with it
X = dataset[['customer_id', 'credit_score', 'age', 'tenure', 'balance', 'products_number', 'credit_card', 'active_member', 'estimated_salary']]
Y = dataset['churn']

In [118]:
# Standardize the attributes
standard_scaler = StandardScaler()
X = pd.DataFrame(standard_scaler.fit(X).fit_transform(X))
X.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,7.583978e-15,-4.824585e-16,2.318146e-16,-1.078249e-16,-6.252776000000001e-17,1.634248e-17,-5.2580160000000004e-17,-7.389644e-17,-2.8776980000000004e-17
std,1.00005,1.00005,1.00005,1.00005,1.00005,1.00005,1.00005,1.00005,1.00005
min,-1.741069,-3.109504,-1.994969,-1.733315,-1.225848,-0.9115835,-1.547768,-1.03067,-1.740268
25%,-0.8676501,-0.6883586,-0.6600185,-0.6959818,-1.225848,-0.9115835,-1.547768,-1.03067,-0.8535935
50%,-0.0028161,0.01522218,-0.1832505,-0.004425957,0.3319639,-0.9115835,0.6460917,0.9702426,0.001802807
75%,0.8659939,0.6981094,0.4842246,0.6871299,0.8199205,0.8077366,0.6460917,0.9702426,0.8572431
max,1.734255,2.063884,5.061197,1.724464,2.795323,4.246377,0.6460917,0.9702426,1.7372


In [119]:
# Split the X and Y into training and testing sets in a 80%:20% split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=5)

print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(8000, 9)
(2000, 9)
(8000,)
(2000,)


In [120]:
dtr = DecisionTreeRegressor()

dtr.fit(X_train, Y_train)

print("R^2 value:", dtr.score(X_train, Y_train))

R^2 value: 1.0


In [121]:
ypred = dtr.predict(X_test)

mse = mean_squared_error(Y_test, ypred)
print("MSE: ", mse)

MSE:  0.208


In [122]:
rdf = RandomForestClassifier()

rdf.fit(X_train, Y_train)

print("R^2 value:", rdf.score(X_train, Y_train))

R^2 value: 0.999875


In [123]:
ypred = rdf.predict(X_test)

mse = mean_squared_error(Y_test, ypred)
print("MSE: ", mse)

MSE:  0.1505


In [124]:
adb = AdaBoostClassifier()

adb.fit(X_train, Y_train)

print("R^2 value:", adb.score(X_train, Y_train))

R^2 value: 0.852375


In [125]:
ypred = adb.predict(X_test)

mse = mean_squared_error(Y_test, ypred)
print("MSE: ", mse)

MSE:  0.1645


In [126]:
gdb = GradientBoostingClassifier()

gdb.fit(X_train, Y_train)

print("R^2 value:", gdb.score(X_train, Y_train))

R^2 value: 0.868625


In [127]:
ypred = gdb.predict(X_test)

mse = mean_squared_error(Y_test, ypred)
print("MSE: ", mse)

MSE:  0.145
