In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import timeit
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from pyspark.sql import SparkSession
import findspark
import os
from pyspark.sql.functions import col
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import math
from pyspark.ml.tuning import CrossValidator
from random import random
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import MultilayerPerceptronClassifier



os.environ["JAVA_HOME"] = 'C:/Program Files/Java/jre1.8.0_333/'
os.environ["SPARK_HOME"] = 'C:/spark/spark-3.3.0-bin-hadoop2/'
os.environ["HADOOP_HOME"] = 'C:/hadoop/'
#os.environ["PYSPARK_SUBMIT_ARGS"] = '--master local pyspark-shell'

In [2]:
findspark.find()

'C:/spark/spark-3.3.0-bin-hadoop2/'

In [3]:
findspark.init()

In [4]:
spark = SparkSession\
.builder\
.master("local")\
.config("spark.executor.cores", "4")\
.config("spark.executor.memory", "10g")\
.config("spark.driver.memory", "10g")\
.config("spark.executor.memoryOverhead", "10g")\
.appName("Churn Project").getOrCreate()

# Enable Arrow-based columnar data transfers
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [5]:
df = spark.read.csv('C:/Users/paulc/Documents/Data Analysis Courses/MastersChurn/ProjectWork/ChurnModelScaled.csv',inferSchema=True,header=True)
scaled_df = spark.read.csv('C:/Users/paulc/Documents/Data Analysis Courses/MastersChurn/ProjectWork/ChurnModelScaled.csv',inferSchema=True,header=True)

In [6]:
df = df.drop('_c0')
scaled_df = scaled_df.drop('_c0')

In [7]:
numericCols = ['MonthNumber', 'ClientProfileSummary', 'Gender', 'CashActive_YN',
       'L1DSBBetCount', 'L7DSBBetCount', 'L30DSBBetCount', 'L90DSBBetCount',
       'L1DDepositCount', 'L7DDepositCount', 'L30DDepositCount',
       'L90DDepositCount', 'L7DOtherSportsBetCount', 'L30DOtherSportsBetCount',
       'L90DOtherSportsBetCount', 'L90DUnsuccessfulDepositCount',
       'DaysSinceLastSBCashAPD', 'DaysSinceLastSBAPD', 'OlderMale40',
       'CustomerConcession90days',
       'L7DSBTurnover', 'US_SportsTurnover7D', 'L30DSBTurnover',
       'US_SportsTurnover30D', 'CustomerConcession30days', 'L90DSBTurnover',
       'L1DSBTurnover', 'US_SportsTurnover1D', 'L30DSBFreeBetsHandle',
       'US_SportsTurnover90D', 'L7DSBFreeBetsHandle']




assembler = VectorAssembler(inputCols=numericCols, outputCol="features")
df = assembler.transform(df)
scaled_df = assembler.transform(scaled_df)

Train/Test Split

In [8]:
train,test = df.randomSplit([0.7,0.3],seed=42)
train_scaled,test_scaled = scaled_df.randomSplit([0.7,0.3],seed=42)

In [15]:
train.columns

['MonthNumber',
 'ClientProfileSummary',
 'Gender',
 'CashActive_YN',
 'L1DSBBetCount',
 'L7DSBBetCount',
 'L30DSBBetCount',
 'L90DSBBetCount',
 'L1DDepositCount',
 'L7DDepositCount',
 'L30DDepositCount',
 'L90DDepositCount',
 'L7DOtherSportsBetCount',
 'L30DOtherSportsBetCount',
 'L90DOtherSportsBetCount',
 'L90DUnsuccessfulDepositCount',
 'DaysSinceLastSBCashAPD',
 'DaysSinceLastSBAPD',
 'OlderMale40',
 'CustomerConcession90days',
 'Active_Next30Days_Cash_YN',
 'L7DSBTurnover',
 'US_SportsTurnover7D',
 'L30DSBTurnover',
 'US_SportsTurnover30D',
 'CustomerConcession30days',
 'L90DSBTurnover',
 'L1DSBTurnover',
 'US_SportsTurnover1D',
 'L30DSBFreeBetsHandle',
 'US_SportsTurnover90D',
 'L7DSBFreeBetsHandle',
 'features']

In [21]:
# layers 31 is the number of features, 16 is the only layer and 2 is the output layer

mlpc=MultilayerPerceptronClassifier(featuresCol='features',labelCol='Active_Next30Days_Cash_YN',layers = [31,16,2],maxIter=10,blockSize=128,seed=42,solver='gd')
ann = mlpc.fit(scaled_df)

pred = ann.transform(test_scaled)
evaluator = MulticlassClassificationEvaluator(labelCol='Active_Next30Days_Cash_YN',predictionCol='prediction',metricName='f1')
ann_f1 = evaluator.evaluate(pred)
print("The Accuracy of the MLPC is:",round(ann_f1,4))

The Accuracy of the MLPC is: 0.5274


<h3> Increasing the number of layers

In [9]:
mlpc=MultilayerPerceptronClassifier(featuresCol='features',labelCol='Active_Next30Days_Cash_YN',layers = [31,128,64,16,8,4,2],maxIter=20,blockSize=128,seed=42,solver='gd')
ann = mlpc.fit(scaled_df)

pred = ann.transform(test_scaled)
evaluator = MulticlassClassificationEvaluator(labelCol='Active_Next30Days_Cash_YN',predictionCol='prediction',metricName='f1')
ann_f1 = evaluator.evaluate(pred)
print("The Accuracy of the MLPC is:",round(ann_f1,4))

The Accuracy of the MLPC is: 0.5274


In [10]:
%%time

mlpc=MultilayerPerceptronClassifier(featuresCol='features',labelCol='Active_Next30Days_Cash_YN',layers = [31,128,64,16,8,4,2],maxIter=20,blockSize=128,seed=42,solver='l-bfgs')
ann = mlpc.fit(scaled_df)

pred = ann.transform(test_scaled)
evaluator = MulticlassClassificationEvaluator(labelCol='Active_Next30Days_Cash_YN',predictionCol='prediction',metricName='f1')
ann_f1 = evaluator.evaluate(pred)
print("The Accuracy of the MLPC is:",round(ann_f1,4))

The Accuracy of the MLPC is: 0.7506
CPU times: total: 15.6 ms
Wall time: 8min 44s


In [9]:
%%time

mlpc=MultilayerPerceptronClassifier(featuresCol='features',labelCol='Active_Next30Days_Cash_YN',layers = [31,128,64,16,8,4,2],maxIter=50,blockSize=128,seed=42,solver='l-bfgs')
ann = mlpc.fit(scaled_df)

pred = ann.transform(test_scaled)
evaluator = MulticlassClassificationEvaluator(labelCol='Active_Next30Days_Cash_YN',predictionCol='prediction',metricName='f1')
ann_f1 = evaluator.evaluate(pred)
print("The Accuracy of the MLPC is:",round(ann_f1,4))

The Accuracy of the MLPC is: 0.7924
CPU times: total: 62.5 ms
Wall time: 45min 31s


In [None]:
# layers 31 is the number of features, 16 is the only layer and 2 is the output layer

mlpc=MultilayerPerceptronClassifier(featuresCol='features',labelCol='Active_Next30Days_Cash_YN',layers = [31,128,64,16,8,4,2],maxIter=200,blockSize=128,seed=42,solver='l-bfgs')
ann = mlpc.fit(scaled_df)

pred = ann.transform(test_scaled)
evaluator = MulticlassClassificationEvaluator(labelCol='Active_Next30Days_Cash_YN',predictionCol='prediction',metricName='f1')
ann_f1 = evaluator.evaluate(pred)
print("The Accuracy of the MLPC is:",round(ann_f1,4))