## Importing the relevant libraries

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.classification import LogisticRegression, GBTClassifier, DecisionTreeClassifier, RandomForestClassifier, NaiveBayes, LinearSVC, FMClassifier, GBTClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [0]:
spark = SparkSession.builder.appName('bank_customer_churn').getOrCreate()

## Loading the dataset

In [0]:
df = spark.read.csv("file:/Workspace/Users/n01606417@humber.ca/bank_customer_churn_prediction/train.csv", 
                    inferSchema=True, 
                    header=True)
display(df)

id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0
5,15771669,Genovese,588,Germany,Male,36.0,4,131778.58,1,1.0,0.0,136024.31,1
6,15692819,Ch'ang,593,France,Female,30.0,8,144772.69,1,1.0,0.0,29792.11,0
7,15669611,Chukwuebuka,678,Spain,Male,37.0,1,138476.41,1,1.0,0.0,106851.6,0
8,15691707,Manna,676,France,Male,43.0,4,0.0,2,1.0,0.0,142917.13,0
9,15591721,Cattaneo,583,Germany,Male,40.0,4,81274.33,1,1.0,1.0,170843.07,0


## Data Exploration

In [0]:
df.count()

165034

In [0]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- HasCrCard: double (nullable = true)
 |-- IsActiveMember: double (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)



In [0]:
df.show(5)

+---+----------+--------------+-----------+---------+------+----+------+---------+-------------+---------+--------------+---------------+------+
| id|CustomerId|       Surname|CreditScore|Geography|Gender| Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+---+----------+--------------+-----------+---------+------+----+------+---------+-------------+---------+--------------+---------------+------+
|  0|  15674932|Okwudilichukwu|        668|   France|  Male|33.0|     3|      0.0|            2|      1.0|           0.0|      181449.97|     0|
|  1|  15749177| Okwudiliolisa|        627|   France|  Male|33.0|     1|      0.0|            2|      1.0|           1.0|        49503.5|     0|
|  2|  15694510|         Hsueh|        678|   France|  Male|40.0|    10|      0.0|            2|      1.0|           0.0|      184866.69|     0|
|  3|  15741417|           Kao|        581|   France|  Male|34.0|     2|148882.54|            1|      1.0|           1.0|       84

In [0]:
df.describe().collect()

[Row(summary='count', id='165034', CustomerId='165034', Surname='165034', CreditScore='165034', Geography='165034', Gender='165034', Age='165034', Tenure='165034', Balance='165034', NumOfProducts='165034', HasCrCard='165034', IsActiveMember='165034', EstimatedSalary='165034', Exited='165034'),
 Row(summary='mean', id='82516.5', CustomerId='1.5692005019026382E7', Surname=None, CreditScore='656.454373038283', Geography=None, Gender=None, Age='38.12588787764945', Tenure='5.020353381727402', Balance='55478.086689349235', NumOfProducts='1.5544554455445545', HasCrCard='0.7539537307463916', IsActiveMember='0.49777015645260975', EstimatedSalary='112574.82273434362', Exited='0.21159882206090866'),
 Row(summary='stddev', id='47641.356500069', CustomerId='71397.81679067112', Surname=None, CreditScore='80.1033404871783', Geography=None, Gender=None, Age='8.867204591410792', Tenure='2.8061585665860913', Balance='62817.66327783688', NumOfProducts='0.5471536788441764', HasCrCard='0.4307071240449495',

In [0]:
null_counts = df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns])
null_counts.show()

+---+----------+-------+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+
| id|CustomerId|Surname|CreditScore|Geography|Gender|Age|Tenure|Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+---+----------+-------+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+
|  0|         0|      0|          0|        0|     0|  0|     0|      0|            0|        0|             0|              0|     0|
+---+----------+-------+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+



In [0]:
df_grouped = df.groupBy(df.columns).count() # Group by all columns and count occurrences
duplicates = df_grouped.where("count>1") # Filter rows where count > 1 which indicates the presence of duplicate records in the dataset
duplicates.show()

+---+----------+-------+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+-----+
| id|CustomerId|Surname|CreditScore|Geography|Gender|Age|Tenure|Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|count|
+---+----------+-------+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+-----+
+---+----------+-------+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+-----+



## Exploratory Data Analysis

In [0]:
df.createOrReplaceTempView("customers")

In [0]:
%sql
select * from customers;

id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0
5,15771669,Genovese,588,Germany,Male,36.0,4,131778.58,1,1.0,0.0,136024.31,1
6,15692819,Ch'ang,593,France,Female,30.0,8,144772.69,1,1.0,0.0,29792.11,0
7,15669611,Chukwuebuka,678,Spain,Male,37.0,1,138476.41,1,1.0,0.0,106851.6,0
8,15691707,Manna,676,France,Male,43.0,4,0.0,2,1.0,0.0,142917.13,0
9,15591721,Cattaneo,583,Germany,Male,40.0,4,81274.33,1,1.0,1.0,170843.07,0


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

German customers are substantially wealthy as compared to the customers from France and Spain i.e. they have very high account balances in comparison to their French and Spanish counterparts.

Male customers are generally more active than female customers in terms of their account activity.

The bank has relatively higher number of male customers than female customers.

There was a higher churn rate among customers in France relative to those in Germany and Spain.

A significant proportion of customers had a credit card, contributing to more than 75% of the entire population.

More than half of the total population of customers used 2 products provided by the bank. 

## Feature Engineering

In [0]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- HasCrCard: double (nullable = true)
 |-- IsActiveMember: double (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)



In [0]:
df.groupBy('id').count().show(5)

+---+-----+
| id|count|
+---+-----+
|148|    1|
|463|    1|
|471|    1|
|496|    1|
|833|    1|
+---+-----+
only showing top 5 rows



In [0]:
df.groupBy('CustomerId').count().orderBy('count',ascending=False).show(5)

+----------+-----+
|CustomerId|count|
+----------+-----+
|  15682355|  121|
|  15570194|   99|
|  15585835|   98|
|  15595588|   91|
|  15793331|   90|
+----------+-----+
only showing top 5 rows



In [0]:
df.groupBy('Surname').count().orderBy('count',ascending=False).count()

2797

In [0]:
indexer = StringIndexer(inputCols=["Geography","Gender"],outputCols=["Geography_Index","Gender_Index"])
assembler = VectorAssembler(inputCols=["CreditScore","Age","Tenure","Balance","NumOfProducts","HasCrCard","IsActiveMember","EstimatedSalary","Geography_Index","Gender_Index"],outputCol='features')
scaler = StandardScaler(inputCol="features",outputCol="scaledFeatures")

In [0]:
data_prep_pipeline = Pipeline(stages=[indexer,assembler,scaler]) # Create a data preparation pipeline to convert it into a format suitable for machine learning models
data_prep_pipeline

Pipeline_d1eaa329b86c

In [0]:
fitted_data_prep_pipeline = data_prep_pipeline.fit(df)
final_df = fitted_data_prep_pipeline.transform(df)
final_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- HasCrCard: double (nullable = true)
 |-- IsActiveMember: double (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)
 |-- Geography_Index: double (nullable = false)
 |-- Gender_Index: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- scaledFeatures: vector (nullable = true)



## Splitting the dataset into train and test sets

In [0]:
train_df, test_df = final_df.randomSplit([0.7,0.3],seed=42)

## Model Training & Evaluation

In [0]:
model_names = []
models = []
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
roc_auc_scores = []

In [0]:
def train_and_evaluate_model(model):
    clf = model.fit(train_df)
    results = clf.transform(test_df)
    roc_eval = BinaryClassificationEvaluator(labelCol="Exited",metricName="areaUnderROC")
    pr_eval = BinaryClassificationEvaluator(labelCol="Exited",metricName="areaUnderPR")
    acc_eval = MulticlassClassificationEvaluator(labelCol="Exited",metricName="accuracy")
    prec_eval = MulticlassClassificationEvaluator(labelCol="Exited",metricName="weightedPrecision")
    recall_eval = MulticlassClassificationEvaluator(labelCol="Exited",metricName="weightedRecall")
    f1_eval = MulticlassClassificationEvaluator(labelCol="Exited",metricName="f1")
    print(f"Area under ROC AUC Curve: {roc_eval.evaluate(results): .2f}")
    print(f"Area under PR Curve: {pr_eval.evaluate(results): .2f}")
    print(f"Accuracy: {acc_eval.evaluate(results): .2f}")
    print(f"Weighted Precision: {prec_eval.evaluate(results): .2f}")
    print(f"Weighted Recall: {recall_eval.evaluate(results): .2f}")
    print(f"F1 Score: {f1_eval.evaluate(results): .2f}")
    model_names.append(str(model).split('(')[0])
    accuracy_scores.append(acc_eval.evaluate(results))
    precision_scores.append(prec_eval.evaluate(results))
    recall_scores.append(recall_eval.evaluate(results))
    f1_scores.append(f1_eval.evaluate(results))
    roc_auc_scores.append(roc_eval.evaluate(results))
    models.append(clf)

In [0]:
train_and_evaluate_model(LogisticRegression(labelCol='Exited',featuresCol='scaledFeatures'))

Area under ROC AUC Curve:  0.81
Area under PR Curve:  0.59
Accuracy:  0.83
Weighted Precision:  0.82
Weighted Recall:  0.83
F1 Score:  0.81


In [0]:
train_and_evaluate_model(NaiveBayes(labelCol='Exited',featuresCol='scaledFeatures'))

Area under ROC AUC Curve:  0.45
Area under PR Curve:  0.20
Accuracy:  0.80
Weighted Precision:  0.77
Weighted Recall:  0.80
F1 Score:  0.77


In [0]:
train_and_evaluate_model(DecisionTreeClassifier(labelCol='Exited',featuresCol='scaledFeatures'))

Area under ROC AUC Curve:  0.57
Area under PR Curve:  0.45
Accuracy:  0.86
Weighted Precision:  0.85
Weighted Recall:  0.86
F1 Score:  0.85


In [0]:
train_and_evaluate_model(RandomForestClassifier(labelCol='Exited',featuresCol='scaledFeatures'))

Area under ROC AUC Curve:  0.87
Area under PR Curve:  0.70
Accuracy:  0.86
Weighted Precision:  0.85
Weighted Recall:  0.86
F1 Score:  0.84


In [0]:
train_and_evaluate_model(LinearSVC(labelCol='Exited',featuresCol='scaledFeatures'))

Area under ROC AUC Curve:  0.81
Area under PR Curve:  0.59
Accuracy:  0.83
Weighted Precision:  0.82
Weighted Recall:  0.83
F1 Score:  0.80


In [0]:
train_and_evaluate_model(GBTClassifier(labelCol='Exited',featuresCol='scaledFeatures'))

Area under ROC AUC Curve:  0.88
Area under PR Curve:  0.72
Accuracy:  0.86
Weighted Precision:  0.85
Weighted Recall:  0.86
F1 Score:  0.86


In [0]:
train_and_evaluate_model(FMClassifier(labelCol='Exited',featuresCol='scaledFeatures'))

Area under ROC AUC Curve:  0.78
Area under PR Curve:  0.53
Accuracy:  0.53
Weighted Precision:  0.80
Weighted Recall:  0.53
F1 Score:  0.56


## Baseline Models Performance Comparison

In [0]:
model_perfs = spark.createDataFrame([
  {'Model': model_names[i],
   'Accuracy': accuracy_scores[i],
   'Weighted Precision': precision_scores[i],
   'Weighted Recall': recall_scores[i],
   'F1': f1_scores[i],
   'ROC AUC': roc_auc_scores[i]}
  for i in range(len(model_names))
]).orderBy('Accuracy',ascending=False)
model_perfs.show()

+------------------+------------------+--------------------+-------------------+------------------+------------------+
|          Accuracy|                F1|               Model|            ROC AUC|Weighted Precision|   Weighted Recall|
+------------------+------------------+--------------------+-------------------+------------------+------------------+
|0.8630537076099303|0.8550630942554716|GBTClassifier_cd6...| 0.8843996691023615|0.8548663540231475|0.8630537076099302|
|0.8571109849099464|0.8475787261657939|DecisionTreeClass...|  0.567399885656856|0.8477715478912762|0.8571109849099465|
|0.8557317864676294|0.8438632424572252|RandomForestClass...| 0.8747129541525387|0.8460998615759202|0.8557317864676294|
|0.8339079993509655|0.8138461310719045|LogisticRegressio...| 0.8146321349679835|0.8189734646801862|0.8339079993509654|
|0.8309264968359565|0.8036377150597478|LinearSVC_f541ca4...| 0.8125646039011324|0.8175022948104731|0.8309264968359565|
| 0.803504786629888|0.7717900891913193|NaiveBaye

From the baseline model performance assessment, we can conclude that the GBT Classifier is the best performing model as it achieved an outstanding accuracy of more than 86% on the test set.

## Hyperparameter Tuning and Cross Validation

In [0]:
lr = LogisticRegression(labelCol='Exited',featuresCol='scaledFeatures')

param_grid = ParamGridBuilder() \
                .addGrid(lr.regParam,[0.01,0.1,1.0]) \
                .addGrid(lr.elasticNetParam,[0.0,0.5,1.0]) \
                .build()

acc_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='accuracy')
prec_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='weightedPrecision')
recall_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='weightedRecall')
f1_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='f1')
roc_auc_eval = BinaryClassificationEvaluator(labelCol='Exited',metricName='areaUnderROC')
area_under_pr_eval = BinaryClassificationEvaluator(labelCol='Exited',metricName='areaUnderPR')

lr_cv = CrossValidator(estimator=lr,
                       evaluator=acc_eval,
                       estimatorParamMaps=param_grid,
                       numFolds=4) # Number of cross validation folds

fitted_lr = lr_cv.fit(train_df)
results = fitted_lr.transform(test_df)
print("Test set accuracy: " + str(acc_eval.evaluate(results)))
print("Test set weighted precision: " + str(prec_eval.evaluate(results)))
print("Test set weighted recall: " + str(recall_eval.evaluate(results)))
print("Test set F1: " + str(f1_eval.evaluate(results)))
print("Test set area under ROC: " + str(roc_auc_eval.evaluate(results)))
print("Test set area under PR: " + str(area_under_pr_eval.evaluate(results)))

Test set accuracy: 0.8315755313970469
Test set weighted precision: 0.8166963027907141
Test set weighted recall: 0.8315755313970469
Test set F1: 0.8077437261131738
Test set area under ROC: 0.8150223541637119
Test set area under PR: 0.5845466943092773


In [0]:
fitted_lr.bestModel

LogisticRegressionModel: uid=LogisticRegression_e0b3a7b062c9, numClasses=2, numFeatures=10

In [0]:
fitted_lr.bestModel.getElasticNetParam()

0.0

In [0]:
fitted_lr.bestModel.getRegParam()

0.01

In [0]:
dtc = DecisionTreeClassifier(labelCol='Exited',featuresCol='scaledFeatures')

paramGrid = ParamGridBuilder() \
            .addGrid(dtc.impurity, ['gini','entropy']) \
            .addGrid(dtc.maxDepth,[2,6,12,18]) \
            .addGrid(dtc.minInstancesPerNode, [1, 2, 4]) \
            .build()

acc_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='accuracy')
prec_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='weightedPrecision')
recall_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='weightedRecall')
f1_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='f1')
roc_auc_eval = BinaryClassificationEvaluator(labelCol='Exited',metricName='areaUnderROC')
area_under_pr_eval = BinaryClassificationEvaluator(labelCol='Exited',metricName='areaUnderPR')

dtc_cv = CrossValidator(estimator=dtc,
                       evaluator=acc_eval,
                       estimatorParamMaps=param_grid,
                       numFolds=4) # Number of cross validation folds

fitted_dtc = dtc_cv.fit(train_df)
results = fitted_dtc.transform(test_df)
print("Test set accuracy: " + str(acc_eval.evaluate(results)))
print("Test set weighted precision: " + str(prec_eval.evaluate(results)))
print("Test set weighted recall: " + str(recall_eval.evaluate(results)))
print("Test set F1: " + str(f1_eval.evaluate(results)))
print("Test set area under ROC: " + str(roc_auc_eval.evaluate(results)))
print("Test set area under PR: " + str(area_under_pr_eval.evaluate(results)))

Test set accuracy: 0.8571109849099464
Test set weighted precision: 0.8477715478912762
Test set weighted recall: 0.8571109849099465
Test set F1: 0.8475787261657939
Test set area under ROC: 0.567399885656856
Test set area under PR: 0.4546249630294807


In [0]:
fitted_dtc.bestModel

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6fa693614a50, depth=5, numNodes=25, numClasses=2, numFeatures=10

In [0]:
fitted_dtc.bestModel.getImpurity()

'gini'

In [0]:
fitted_dtc.bestModel.getMaxDepth()

5

In [0]:
fitted_dtc.bestModel.getMinInstancesPerNode()

1

In [0]:
rfc = RandomForestClassifier(labelCol='Exited',featuresCol='scaledFeatures')

paramGrid = ParamGridBuilder() \
            .addGrid(rfc.numTrees, [20,50,100]) \
            .addGrid(rfc.impurity,['gini','entropy']) \
            .addGrid(rfc.maxDepth,[2,6,12,18]) \
            .addGrid(rfc.featureSubsetStrategy,['auto','all','onethird','sqrt','log2']) \
            .addGrid(rfc.bootstrap,[True,False]) \
            .build()

acc_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='accuracy')
prec_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='weightedPrecision')
recall_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='weightedRecall')
f1_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='f1')
roc_auc_eval = BinaryClassificationEvaluator(labelCol='Exited',metricName='areaUnderROC')
area_under_pr_eval = BinaryClassificationEvaluator(labelCol='Exited',metricName='areaUnderPR')

rfc_cv = CrossValidator(estimator=rfc,
                       evaluator=acc_eval,
                       estimatorParamMaps=param_grid,
                       numFolds=3) # Number of cross validation folds

fitted_rfc = rfc_cv.fit(train_df)
results = fitted_rfc.transform(test_df)
print("Test set accuracy: " + str(acc_eval.evaluate(results)))
print("Test set weighted precision: " + str(prec_eval.evaluate(results)))
print("Test set weighted recall: " + str(recall_eval.evaluate(results)))
print("Test set F1: " + str(f1_eval.evaluate(results)))
print("Test set area under ROC: " + str(roc_auc_eval.evaluate(results)))
print("Test set area under PR: " + str(area_under_pr_eval.evaluate(results)))

Test set accuracy: 0.8557317864676294
Test set weighted precision: 0.8460998615759202
Test set weighted recall: 0.8557317864676294
Test set F1: 0.8438632424572252
Test set area under ROC: 0.8747129541525387
Test set area under PR: 0.698291151986226


In [0]:
fitted_rfc.bestModel

RandomForestClassificationModel: uid=RandomForestClassifier_46295da738e7, numTrees=20, numClasses=2, numFeatures=10

In [0]:
fitted_rfc.bestModel.getImpurity()

'gini'

In [0]:
fitted_rfc.bestModel.getFeatureSubsetStrategy()

'auto'

In [0]:
fitted_rfc.bestModel.getMaxDepth()

5

In [0]:
fitted_rfc.bestModel.getBootstrap()

True

In [0]:
svc = LinearSVC(labelCol='Exited',featuresCol='scaledFeatures')

param_grid = ParamGridBuilder() \
                .addGrid(svc.regParam,[0.01,0.1,1.0]) \
                .build()

acc_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='accuracy')
prec_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='weightedPrecision')
recall_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='weightedRecall')
f1_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='f1')
roc_auc_eval = BinaryClassificationEvaluator(labelCol='Exited',metricName='areaUnderROC')
area_under_pr_eval = BinaryClassificationEvaluator(labelCol='Exited',metricName='areaUnderPR')

svc_cv = CrossValidator(estimator=svc,
                       evaluator=acc_eval,
                       estimatorParamMaps=param_grid,
                       numFolds=4) # Number of cross validation folds

fitted_svc = svc_cv.fit(train_df)
results = fitted_svc.transform(test_df)
print("Test set accuracy: " + str(acc_eval.evaluate(results)))
print("Test set weighted precision: " + str(prec_eval.evaluate(results)))
print("Test set weighted recall: " + str(recall_eval.evaluate(results)))
print("Test set F1: " + str(f1_eval.evaluate(results)))
print("Test set area under ROC: " + str(roc_auc_eval.evaluate(results)))
print("Test set area under PR: " + str(area_under_pr_eval.evaluate(results)))

Test set accuracy: 0.8274785007301639
Test set weighted precision: 0.8155241057991958
Test set weighted recall: 0.8274785007301638
Test set F1: 0.7948977644164394
Test set area under ROC: 0.8130730846226704
Test set area under PR: 0.5842634216307871


In [0]:
fitted_svc.bestModel

LinearSVCModel: uid=LinearSVC_f352d7872747, numClasses=2, numFeatures=10

In [0]:
fitted_svc.bestModel.getRegParam()

0.01

In [0]:
nb = NaiveBayes(labelCol='Exited',featuresCol='scaledFeatures')

param_grid = ParamGridBuilder() \
                .addGrid(nb.smoothing,[0.0,0.5,1.0]) \
                .build()

acc_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='accuracy')
prec_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='weightedPrecision')
recall_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='weightedRecall')
f1_eval = MulticlassClassificationEvaluator(labelCol='Exited',metricName='f1')
roc_auc_eval = BinaryClassificationEvaluator(labelCol='Exited',metricName='areaUnderROC')
area_under_pr_eval = BinaryClassificationEvaluator(labelCol='Exited',metricName='areaUnderPR')

nb_cv = CrossValidator(estimator=nb,
                       evaluator=acc_eval,
                       estimatorParamMaps=param_grid,
                       numFolds=4) # Number of cross validation folds

fitted_nb = nb_cv.fit(train_df)
results = fitted_nb.transform(test_df)
print("Test set accuracy: " + str(acc_eval.evaluate(results)))
print("Test set weighted precision: " + str(prec_eval.evaluate(results)))
print("Test set weighted recall: " + str(recall_eval.evaluate(results)))
print("Test set F1: " + str(f1_eval.evaluate(results)))
print("Test set area under ROC: " + str(roc_auc_eval.evaluate(results)))
print("Test set area under PR: " + str(area_under_pr_eval.evaluate(results)))

Test set accuracy: 0.8035250689599222
Test set weighted precision: 0.7741544351815587
Test set weighted recall: 0.803525068959922
Test set F1: 0.7718055802322218
Test set area under ROC: 0.4498173019038865
Test set area under PR: 0.19615605648870912


In [0]:
fitted_nb.bestModel

NaiveBayesModel: uid=NaiveBayes_f6be95f68c63, modelType=multinomial, numClasses=2, numFeatures=10

In [0]:
fitted_nb.bestModel.getSmoothing()

0.0

After performing hyperparameter tuning of all baseline models, we can clearly infer that the GBT Classifier remains the best performing model having achieved a stupendous accuracy score of more than 86% on the test set.

## Saving the GBT Classifier model for deployment

In [0]:
models

[LogisticRegressionModel: uid=LogisticRegression_531c1610cbed, numClasses=2, numFeatures=10,
 NaiveBayesModel: uid=NaiveBayes_3ffb380f44ca, modelType=multinomial, numClasses=2, numFeatures=10,
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_fd101a297fac, depth=5, numNodes=25, numClasses=2, numFeatures=10,
 RandomForestClassificationModel: uid=RandomForestClassifier_450261364f11, numTrees=20, numClasses=2, numFeatures=10,
 LinearSVCModel: uid=LinearSVC_f541ca416fe9, numClasses=2, numFeatures=10,
 GBTClassificationModel: uid = GBTClassifier_cd682624e10d, numTrees=20, numClasses=2, numFeatures=10,
 FMClassificationModel: uid=FMClassifier_159efbc8727f, numClasses=2, numFeatures=10, factorSize=8, fitLinear=true, fitIntercept=true]

In [0]:
models[-2].save("file:/Workspace/Users/n01606417@humber.ca/bank_customer_churn_prediction/bank_customer_churn_classifier")

In [0]:
models[-2]

GBTClassificationModel: uid = GBTClassifier_cd682624e10d, numTrees=20, numClasses=2, numFeatures=10

In [0]:
loaded_model = GBTClassificationModel.load("file:/Workspace/Users/n01606417@humber.ca/bank_customer_churn_prediction/bank_customer_churn_classifier")
loaded_model

GBTClassificationModel: uid = GBTClassifier_cd682624e10d, numTrees=20, numClasses=2, numFeatures=10

## Loading the test dataset

In [0]:
test_df = spark.read.csv("file:/Workspace/Users/n01606417@humber.ca/bank_customer_churn_prediction/test.csv",inferSchema=True,header=True)
test_df.show(5)

+------+----------+---------+-----------+---------+------+----+------+---------+-------------+---------+--------------+---------------+
|    id|CustomerId|  Surname|CreditScore|Geography|Gender| Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|
+------+----------+---------+-----------+---------+------+----+------+---------+-------------+---------+--------------+---------------+
|165034|  15773898| Lucchese|        586|   France|Female|23.0|     2|      0.0|            2|      0.0|           1.0|      160976.75|
|165035|  15782418|     Nott|        683|   France|Female|46.0|     2|      0.0|            1|      1.0|           0.0|       72549.27|
|165036|  15807120|       K?|        656|   France|Female|34.0|     7|      0.0|            2|      1.0|           0.0|      138882.09|
|165037|  15808905|O'Donnell|        681|   France|  Male|36.0|     8|      0.0|            1|      1.0|           0.0|      113931.57|
|165038|  15607314|  Higgins|        752|  Germa

## Making predictions on test data

In [0]:
cleaned_test_df = fitted_data_prep_pipeline.transform(test_df) # transform the test data using the feature transformation pipeline
cleaned_test_df.show(5)

+------+----------+---------+-----------+---------+------+----+------+---------+-------------+---------+--------------+---------------+---------------+------------+--------------------+--------------------+
|    id|CustomerId|  Surname|CreditScore|Geography|Gender| Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Geography_Index|Gender_Index|            features|      scaledFeatures|
+------+----------+---------+-----------+---------+------+----+------+---------+-------------+---------+--------------+---------------+---------------+------------+--------------------+--------------------+
|165034|  15773898| Lucchese|        586|   France|Female|23.0|     2|      0.0|            2|      0.0|           1.0|      160976.75|            0.0|         1.0|[586.0,23.0,2.0,0...|[7.31555009361685...|
|165035|  15782418|     Nott|        683|   France|Female|46.0|     2|      0.0|            1|      1.0|           0.0|       72549.27|            0.0|         1.0|[683.0,4

In [0]:
cleaned_test_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- HasCrCard: double (nullable = true)
 |-- IsActiveMember: double (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Geography_Index: double (nullable = false)
 |-- Gender_Index: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- scaledFeatures: vector (nullable = true)



In [0]:
test_results = loaded_model.transform(cleaned_test_df)
test_results.show(5)

+------+----------+---------+-----------+---------+------+----+------+---------+-------------+---------+--------------+---------------+---------------+------------+--------------------+--------------------+--------------------+--------------------+----------+
|    id|CustomerId|  Surname|CreditScore|Geography|Gender| Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Geography_Index|Gender_Index|            features|      scaledFeatures|       rawPrediction|         probability|prediction|
+------+----------+---------+-----------+---------+------+----+------+---------+-------------+---------+--------------+---------------+---------------+------------+--------------------+--------------------+--------------------+--------------------+----------+
|165034|  15773898| Lucchese|        586|   France|Female|23.0|     2|      0.0|            2|      0.0|           1.0|      160976.75|            0.0|         1.0|[586.0,23.0,2.0,0...|[7.31555009361685...|[1.44297840546

In [0]:
test_results.printSchema()

root
 |-- id: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- HasCrCard: double (nullable = true)
 |-- IsActiveMember: double (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Geography_Index: double (nullable = false)
 |-- Gender_Index: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- scaledFeatures: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [0]:
predictions = test_results.select("prediction")
predictions.show()

+----------+
|prediction|
+----------+
|       0.0|
|       1.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       1.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       1.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
+----------+
only showing top 20 rows



So, the SBT Classifier model successfully made the predictions on the test dataframe with an accuracy of approximately 86.3%.