In [1]:
sc

In [2]:
spark

## Read Dataset

In [3]:
churn_data = spark.read.csv('file:///home/hadoop/Downloads/Telco_Customer_Churn.csv', header=True, inferSchema=True)
churn_data.head()

Row(customerID='7590-VHVEG', gender='Female', SeniorCitizen=0, Partner='Yes', Dependents='No', tenure=1, PhoneService='No', MultipleLines='No phone service', InternetService='DSL', OnlineSecurity='No', OnlineBackup='Yes', DeviceProtection='No', TechSupport='No', StreamingTV='No', StreamingMovies='No', Contract='Month-to-month', PaperlessBilling='Yes', PaymentMethod='Electronic check', MonthlyCharges=29.85, TotalCharges='29.85', Churn='No')

In [4]:
churn_data.show()

+----------+------+-------------+-------+----------+------+------------+----------------+---------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------+----------------+--------------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|   MultipleLines|InternetService|     OnlineSecurity|       OnlineBackup|   DeviceProtection|        TechSupport|        StreamingTV|    StreamingMovies|      Contract|PaperlessBilling|       PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+----------------+---------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------+----------------+--------------------+--------------+------------+-----+
|7590-VHVEG|Female|            0|    Yes|        No|     1|  

## 2. EDA Exploration
#### a: How many customer records are present in dataset?

In [5]:
churn_data.count()

7043

In [6]:
len(churn_data.columns)

21

In [7]:
churn_data.columns

['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

In [8]:
churn_data.printSchema()

root
 |-- customerID: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- SeniorCitizen: integer (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- PhoneService: string (nullable = true)
 |-- MultipleLines: string (nullable = true)
 |-- InternetService: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtection: string (nullable = true)
 |-- TechSupport: string (nullable = true)
 |-- StreamingTV: string (nullable = true)
 |-- StreamingMovies: string (nullable = true)
 |-- Contract: string (nullable = true)
 |-- PaperlessBilling: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- MonthlyCharges: double (nullable = true)
 |-- TotalCharges: string (nullable = true)
 |-- Churn: string (nullable = true)



#### b. What is the distribution of gender among customers

In [9]:
churn_data.groupBy(['gender']).count().show()

+------+-----+
|gender|count|
+------+-----+
|Female| 3488|
|  Male| 3555|
+------+-----+



#### c. What is the distribution of contract types amon customers?

In [10]:
churn_data.groupBy(['contract']).count().show()

+--------------+-----+
|      contract|count|
+--------------+-----+
|Month-to-month| 3875|
|      One year| 1473|
|      Two year| 1695|
+--------------+-----+



#### d. What is the percentage of customers who got churned

In [11]:
churned_percent = churn_data.select(['Churn']).where("Churn='Yes'").count() / churn_data.count() * 100
print(f"Percentage of customers who got churned = {round(churned_percent, 5)}")

Percentage of customers who got churned = 26.53699


## 3. Data PreProcessing
#### Check for missing values and handle missing data

In [12]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [13]:
churn_data.select([count(when(isnull(col), col)).alias(col) for col in churn_data.columns]).show()

+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|         0|     0|            0|      0|         0|     0|           0|            0|              0|             0|           0|               0|          0|          0|              0|       0|               0| 

In [14]:
churn_data = churn_data.withColumn('TotalCharges', when(col('TotalCharges') == ' ', None).otherwise(col('TotalCharges')))

In [15]:
churn_data.select([count(when(isnull(col), col)).alias(col) for col in churn_data.columns]).show()

+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|         0|     0|            0|      0|         0|     0|           0|            0|              0|             0|           0|               0|          0|          0|              0|       0|               0| 

In [16]:
# above, we see 11 values in 'TotalCharges' are missing. we replaced them by None - and we have to drop them now
churn_data1 = churn_data.na.drop()

In [17]:
# converting TotalCharges to float from string
from pyspark.sql.types import FloatType
churn_data1 = churn_data1.withColumn('TotalCharges', col('TotalCharges').cast(FloatType()))
churn_data1

DataFrame[customerID: string, gender: string, SeniorCitizen: int, Partner: string, Dependents: string, tenure: int, PhoneService: string, MultipleLines: string, InternetService: string, OnlineSecurity: string, OnlineBackup: string, DeviceProtection: string, TechSupport: string, StreamingTV: string, StreamingMovies: string, Contract: string, PaperlessBilling: string, PaymentMethod: string, MonthlyCharges: double, TotalCharges: float, Churn: string]

## 4. Importing ML Libraries
###### convert categorical variables into numerical format using one-hot encoding or label encoding

In [18]:
churn_data1 = churn_data1.drop(col('customerID'))

In [19]:
import numpy as np
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml import Pipeline

In [20]:
print(churn_data1.columns)

['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']


In [21]:
# StringIndexer() - maps male to 1 and female to 0 [alphabetically] - each categorical value assigned an integer value
# OneHotEncoder - for each row, only one value will be True/1 others are False/0
#  https://www.google.com/url?sa=i&url=https%3A%2F%2Ftowardsdatascience.com%2Fbuilding-a-one-hot-encoding-layer-with-tensorflow-f907d686bf39&psig=AOvVaw1ep0LjD3FlmFo_qaJXpCww&ust=1725617545026000&source=images&cd=vfe&opi=89978449&ved=0CBIQjRxqFwoTCIj7-bfIq4gDFQAAAAAdAAAAABAJ

In [22]:
categorical_columns = [field.name for field in churn_data1.schema.fields if isinstance(field.dataType, StringType)]
print(categorical_columns)

['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']


In [31]:
stages = []

for catcols in categorical_columns[:-1]:
    stringIndexer = StringIndexer(inputCol=catcols, outputCol=catcols+'Index')
    onehotencoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()],
                                          outputCols=[catcols+'classVec'])
    stages += [stringIndexer, onehotencoder]

In [32]:
numerical_columns = [field.name for field in churn_data1.schema.fields if not isinstance(field.dataType, StringType)]
print(numerical_columns)

['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']


In [33]:
assemblerInputs = [c + 'classVec' for c in categorical_columns[:-1]] + numerical_columns
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')
stages += [assembler]

In [34]:
label_stringIdx = StringIndexer(inputCol='Churn', outputCol='label')
stages += [label_stringIdx]

In [35]:
pipeline = Pipeline(stages=stages)

In [36]:
preprocessing = pipeline.fit(churn_data1)

In [37]:
churn_df = preprocessing.transform(churn_data1)

In [39]:
churn_df.select(['Contract', 'ContractIndex', 'ContractclassVec']).show()

+--------------+-------------+----------------+
|      Contract|ContractIndex|ContractclassVec|
+--------------+-------------+----------------+
|Month-to-month|          0.0|   (2,[0],[1.0])|
|      One year|          2.0|       (2,[],[])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|      One year|          2.0|       (2,[],[])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|      One year|          2.0|       (2,[],[])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|      Two year|          1.0|   (2,[1],[1.0])|
|      One year|          2.0|       (2,[],[])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|Month-to-month|          0.0|   (2,[0],[1.0])|
|      Two year|          1.0|   (2,[1],[1.0])|
|      One year|          2.0|       (2,[],[])|
|      Two year|          1.0|   (2,[1],

In [40]:
churn_df1 = churn_df.select(['features', 'label'])
churn_df1.show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                                                        |label|
+------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|(30,[2,7,8,11,12,14,16,18,20,22,23,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,29.85,29.850000381469727])                        |0.0  |
|(30,[0,1,2,3,4,7,9,10,13,14,16,18,24,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,34.0,56.95,1889.5])                         |0.0  |
|(30,[0,1,2,3,4,7,9,11,12,14,16,18,20,22,24,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,53.85,108.1500015258789]) |1.0  |
|(30,[0,1,2,7,9,10,13,15,16,18,25,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1

## Split dataset into train and test

In [66]:
train, test = churn_df1.randomSplit([0.8, 0.2], seed = 27)

In [67]:
train.select(['features', 'label'])

DataFrame[features: vector, label: double]

## Build the Model - Decision Tree

In [68]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
tree = DecisionTreeClassifier(featuresCol='features', labelCol='label')
decision_model = tree.fit(train)
# fit means training
# transform means prediction

## Evaluate the Model

In [69]:
predictions = decision_model.transform(test)
predictions.select(['label', 'probability', 'prediction']).show(truncate=False)

+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|1.0  |[0.11602209944751381,0.8839779005524862]|1.0       |
|1.0  |[0.11602209944751381,0.8839779005524862]|1.0       |
|1.0  |[0.11602209944751381,0.8839779005524862]|1.0       |
|1.0  |[0.11602209944751381,0.8839779005524862]|1.0       |
|0.0  |[0.3746518105849582,0.6253481894150418] |1.0       |
|0.0  |[0.4553191489361702,0.5446808510638298] |1.0       |
|1.0  |[0.11602209944751381,0.8839779005524862]|1.0       |
|0.0  |[0.3746518105849582,0.6253481894150418] |1.0       |
|0.0  |[0.6925207756232687,0.3074792243767313] |0.0       |
|1.0  |[0.3746518105849582,0.6253481894150418] |1.0       |
|0.0  |[0.6925207756232687,0.3074792243767313] |0.0       |
|1.0  |[0.11602209944751381,0.8839779005524862]|1.0       |
|1.0  |[0.11602209944751381,0.8839779005524862]|1.0       |
|1.0  |[0.3746518105849582,0.62534818941

In [74]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
accuracy

0.8113207547169812

## Build RandomForest Model

In [76]:
from pyspark.ml.classification import RandomForestClassifier
tree = RandomForestClassifier(featuresCol='features', labelCol='label')
decision_model = tree.fit(train)

In [77]:
predictions = decision_model.transform(test)
predictions.select(['label', 'probability', 'prediction']).show(truncate=False)

+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|1.0  |[0.3236709487687401,0.6763290512312599] |1.0       |
|1.0  |[0.3236709487687401,0.6763290512312599] |1.0       |
|1.0  |[0.3236709487687401,0.6763290512312599] |1.0       |
|1.0  |[0.3236709487687401,0.6763290512312599] |1.0       |
|0.0  |[0.3236709487687401,0.6763290512312599] |1.0       |
|0.0  |[0.6322845008775212,0.36771549912247886]|0.0       |
|1.0  |[0.3969223245966429,0.6030776754033571] |1.0       |
|0.0  |[0.5023122737925212,0.4976877262074789] |0.0       |
|0.0  |[0.6557903446711741,0.34420965532882597]|0.0       |
|1.0  |[0.3799555007894674,0.6200444992105325] |1.0       |
|0.0  |[0.6247455448098591,0.37525445519014083]|0.0       |
|1.0  |[0.3287876313203134,0.6712123686796866] |1.0       |
|1.0  |[0.30417400884912893,0.6958259911508711]|1.0       |
|1.0  |[0.3384496043122155,0.66155039568

In [78]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
accuracy

0.806966618287373

## Logistic Regression ML Model

In [79]:
from pyspark.ml.classification import LogisticRegression
tree = LogisticRegression(featuresCol='features', labelCol='label')
decision_model = tree.fit(train)

In [80]:
predictions = decision_model.transform(test)
predictions.select(['label', 'probability', 'prediction']).show(truncate=False)

+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|1.0  |[0.3001442147747802,0.6998557852252199] |1.0       |
|1.0  |[0.3024907541351523,0.6975092458648476] |1.0       |
|1.0  |[0.30437544419078716,0.6956245558092129]|1.0       |
|1.0  |[0.30674056483603795,0.693259435163962] |1.0       |
|0.0  |[0.3119991216954253,0.6880008783045747] |1.0       |
|0.0  |[0.6021499450158613,0.3978500549841388] |0.0       |
|1.0  |[0.4864209445636462,0.5135790554363538] |1.0       |
|0.0  |[0.5636764422163408,0.4363235577836592] |0.0       |
|0.0  |[0.6410197543385553,0.3589802456614447] |0.0       |
|1.0  |[0.33258032666130666,0.6674196733386933]|1.0       |
|0.0  |[0.5322875907385917,0.46771240926140834]|0.0       |
|1.0  |[0.3334029119171606,0.6665970880828395] |1.0       |
|1.0  |[0.2709476479548886,0.7290523520451114] |1.0       |
|1.0  |[0.2756520477126979,0.72434795228

In [81]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
accuracy

0.8243831640058055