In [1]:
sc

In [2]:
spark

### 1. Read the Dataset

In [3]:
churn_data = spark.read.csv("file:///home/hadoop/Downloads/Telco_Customer_Churn.csv",
                            header = True, inferSchema= True)

churn_data.head()

Row(customerID='7590-VHVEG', gender='Female', SeniorCitizen=0, Partner='Yes', Dependents='No', tenure=1, PhoneService='No', MultipleLines='No phone service', InternetService='DSL', OnlineSecurity='No', OnlineBackup='Yes', DeviceProtection='No', TechSupport='No', StreamingTV='No', StreamingMovies='No', Contract='Month-to-month', PaperlessBilling='Yes', PaymentMethod='Electronic check', MonthlyCharges=29.85, TotalCharges='29.85', Churn='No')

In [10]:
churn_data.show()

+----------+------+-------------+-------+----------+------+------------+----------------+---------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------+----------------+--------------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|   MultipleLines|InternetService|     OnlineSecurity|       OnlineBackup|   DeviceProtection|        TechSupport|        StreamingTV|    StreamingMovies|      Contract|PaperlessBilling|       PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+----------------+---------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------+----------------+--------------------+--------------+------------+-----+
|7590-VHVEG|Female|            0|    Yes|        No|     1|  

### 2. Data Exploration
    a) How many customer records are in the dataset

In [4]:
churn_data.count()

7043

In [5]:
len(churn_data.columns)

21

In [6]:
churn_data.columns

['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

### b) What is the distribution of gender among customers?

In [8]:
churn_data.printSchema()

root
 |-- customerID: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- SeniorCitizen: integer (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- PhoneService: string (nullable = true)
 |-- MultipleLines: string (nullable = true)
 |-- InternetService: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtection: string (nullable = true)
 |-- TechSupport: string (nullable = true)
 |-- StreamingTV: string (nullable = true)
 |-- StreamingMovies: string (nullable = true)
 |-- Contract: string (nullable = true)
 |-- PaperlessBilling: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- MonthlyCharges: double (nullable = true)
 |-- TotalCharges: string (nullable = true)
 |-- Churn: string (nullable = true)



In [9]:
churn_data.groupBy(['gender']).count().show()

+------+-----+
|gender|count|
+------+-----+
|Female| 3488|
|  Male| 3555|
+------+-----+



### c) What is the distribution of contract type

In [11]:
churn_data.groupBy(['Contract']).count().show()

+--------------+-----+
|      Contract|count|
+--------------+-----+
|Month-to-month| 3875|
|      One year| 1473|
|      Two year| 1695|
+--------------+-----+



### d) Percentage of customers who churned?

In [13]:
churn_data.select(['Churn']).where("Churn = 'Yes'").count()

1869

In [14]:
(churn_data.select(['Churn']).where("Churn = 'Yes'").count()/ churn_data.count())*100

26.536987079369588

### 3. Data PreprocessIng
    * Check for missing values and handle missing data

In [16]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [18]:
churn_data.select([count(when(isnull(col),col)).alias(col) for col in churn_data.columns]).show()

+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|         0|     0|            0|      0|         0|     0|           0|            0|              0|             0|           0|               0|          0|          0|              0|       0|               0| 

In [20]:
churn_data = churn_data.withColumn('TotalCharges', when(col('TotalCharges')== " ",None)\
                     .otherwise(col("TotAlCharges")))

In [21]:
churn_data.select([count(when(isnull(col),col)).alias(col) for col in churn_data.columns]).show()

+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|         0|     0|            0|      0|         0|     0|           0|            0|              0|             0|           0|               0|          0|          0|              0|       0|               0| 

In [22]:
churn_data1 = churn_data.na.drop()

In [23]:
from pyspark.sql.types import FloatType
churn_data1 = churn_data1.withColumn('TotalCharges', col('TotalCharges').cast(FloatType()))

### 4. Import MLlib
    f) Convert categorical variables into numerical format using one-hot encoding or label encoding 

In [46]:
churn_data1 = churn_data1.drop(col('CustomerID'))

In [48]:
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml import Pipeline

In [49]:
churn_data1.columns

['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

In [50]:
categorical_cols = [field.name for field in churn_data1.schema.fields if isinstance(field.dataType, StringType)]

In [51]:
categorical_cols

['customerID',
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'Churn']

In [53]:
stages = []

for catcols in categorical_cols[:-1]:
    stringindexer = StringIndexer(inputCol = catcols, outputCol=catcols + "Index")
    onehotencoder = OneHotEncoderEstimator(inputCols=[stringindexer.getOutputCol()],
                                           outputCols = [catcols + "classVec"])
    stages += [stringindexer, onehotencoder]

In [54]:
numericalCols = [field.name for field in churn_data1.schema.fields 
                 if not isinstance(field.dataType, StringType)]

In [55]:
assemblerInputs = [c + "classVec" for c in categorical_cols[:-1]] + numericalCols
assembler = VectorAssembler(inputCols = assemblerInputs,outputCol="features")
stages += [assembler]

In [56]:
label_stringIdx = StringIndexer(inputCol='Churn', outputCol = 'label')

In [57]:
stages += [label_stringIdx]

In [58]:
pipeline = Pipeline(stages = stages)

In [59]:
preprocessing = pipeline.fit(churn_data1)

In [60]:
churn_df = preprocessing.transform(churn_data1)

In [61]:
churn_df.select(['gender','genderIndex','genderclassVec']).show()

+------+-----------+--------------+
|gender|genderIndex|genderclassVec|
+------+-----------+--------------+
|Female|        1.0|     (1,[],[])|
|  Male|        0.0| (1,[0],[1.0])|
|  Male|        0.0| (1,[0],[1.0])|
|  Male|        0.0| (1,[0],[1.0])|
|Female|        1.0|     (1,[],[])|
|Female|        1.0|     (1,[],[])|
|  Male|        0.0| (1,[0],[1.0])|
|Female|        1.0|     (1,[],[])|
|Female|        1.0|     (1,[],[])|
|  Male|        0.0| (1,[0],[1.0])|
|  Male|        0.0| (1,[0],[1.0])|
|  Male|        0.0| (1,[0],[1.0])|
|  Male|        0.0| (1,[0],[1.0])|
|  Male|        0.0| (1,[0],[1.0])|
|  Male|        0.0| (1,[0],[1.0])|
|Female|        1.0|     (1,[],[])|
|Female|        1.0|     (1,[],[])|
|  Male|        0.0| (1,[0],[1.0])|
|Female|        1.0|     (1,[],[])|
|Female|        1.0|     (1,[],[])|
+------+-----------+--------------+
only showing top 20 rows



In [62]:
churn_df1 = churn_df.select(["features","label"])
churn_df1.show(truncate = False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                                                                                                              |label|
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|(7061,[1950,7033,7038,7039,7042,7043,7045,7047,7049,7051,7053,7054,7058,7059,7060],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,29.85,29.850000381469727])                                    |0.0  |
|(7061,[776,7031,7032,7033,7034,7035,7038,7040,7041,7044,7045,7047,7049,7055,7058,7059,7060],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,34.0,56.95,1889.5])   

###  Split Dataset into Train and Test

In [90]:
train, test = churn_df1.randomSplit([0.8,0.2], seed = 42)

In [91]:
train.select(['features','label'])

DataFrame[features: vector, label: double]

### Build Decision Tree ML Model

In [92]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
tree = DecisionTreeClassifier(featuresCol='features',labelCol='label')
decision_model = tree.fit(train)

### Evaluate the model

In [93]:
predictions = decision_model.transform(test)

In [94]:
predictions.select(['label','probability','prediction']).show(truncate=False)

+-----+-----------------------------------------+----------+
|label|probability                              |prediction|
+-----+-----------------------------------------+----------+
|1.0  |[0.800650054171181,0.19934994582881907]  |0.0       |
|0.0  |[0.36984126984126986,0.6301587301587301] |1.0       |
|0.0  |[0.9163987138263665,0.08360128617363344] |0.0       |
|0.0  |[0.6940874035989717,0.3059125964010283]  |0.0       |
|0.0  |[0.9163987138263665,0.08360128617363344] |0.0       |
|0.0  |[0.8722627737226277,0.12773722627737227] |0.0       |
|0.0  |[0.9778672032193159,0.022132796780684104]|0.0       |
|0.0  |[0.9778672032193159,0.022132796780684104]|0.0       |
|0.0  |[0.800650054171181,0.19934994582881907]  |0.0       |
|0.0  |[0.7348993288590604,0.2651006711409396]  |0.0       |
|1.0  |[0.9163987138263665,0.08360128617363344] |0.0       |
|0.0  |[0.9778672032193159,0.022132796780684104]|0.0       |
|0.0  |[0.36984126984126986,0.6301587301587301] |1.0       |
|1.0  |[0.69408740359897

In [95]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol='label',predictionCol='prediction',
                                             metricName='accuracy')

accuracy = evaluator.evaluate(predictions)
accuracy

0.805575935436537

### Build Random Forest ML Model

In [96]:
from pyspark.ml.classification import RandomForestClassifier
randomForest = RandomForestClassifier(featuresCol='features',labelCol='label')
rf_model = randomForest.fit(train)

In [97]:
predictions = rf_model.transform(test)

In [98]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol='label',predictionCol='prediction',
                                             metricName='accuracy')

accuracy = evaluator.evaluate(predictions)
accuracy

0.7248716067498165

### Logistic Regression ML Model

* perform well with binary classification

In [103]:
from pyspark.ml.classification import LogisticRegression
logistic = LogisticRegression(featuresCol='features',labelCol='label')
logistic_model = logistic.fit(train)

In [104]:
predictions = logistic_model.transform(test)

In [105]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol='label',predictionCol='prediction',
                                             metricName='accuracy')

accuracy = evaluator.evaluate(predictions)
accuracy

0.7989728539985327