In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd
# At Intact, this session is usually already created for you in a notebook

spark = SparkSession.builder \
    .appName("KMeansFix") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()

26/01/04 14:53:20 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [6]:
sparkDf = spark.read.csv("telecom_churn_data.csv",header=True, inferSchema=True)
print(sparkDf.show(5))
print(f"Number of Rows:{sparkDf.count}")

                                                                                

+-----------+---+------------+--------------+--------------+-------------+------+-----+
|Customer_ID|Age|Monthly_Bill|Total_Usage_GB| Contract_Type|Support_Calls|Gender|Churn|
+-----------+---+------------+--------------+--------------+-------------+------+-----+
|          1| 56|       180.6|         540.9|Month-to-Month|            3|     0|    1|
|          2| 69|      105.41|         266.8|Month-to-Month|            0|     0|    0|
|          3| 46|       91.48|         412.1|      Two Year|            1|     1|    0|
|          4| 32|       45.06|         966.2|      One Year|            1|     0|    0|
|          5| 60|      139.03|         580.4|      Two Year|            2|     0|    1|
+-----------+---+------------+--------------+--------------+-------------+------+-----+
only showing top 5 rows
None
Number of Rows:<bound method DataFrame.count of DataFrame[Customer_ID: int, Age: int, Monthly_Bill: double, Total_Usage_GB: double, Contract_Type: string, Support_Calls: int, Gend

In [8]:
trainData, testData = sparkDf.randomSplit([0.8,0.2])
print(f"TrainData: {trainData.count()} & TestData: {testData.count()}")



TrainData: 3998394 & TestData: 1001606


                                                                                

## Pipeline Creation: 
### String2Vec -> OneHotEncoding-> FeatureVector -> Normalisation ) wrapping in pipeline

In [16]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
indexer = StringIndexer(inputCol='Contract_Type', outputCol ='Contract_Type_Index')
encoder = OneHotEncoder(inputCol='Contract_Type_Index',outputCol='ContractVector')
assembler = VectorAssembler(inputCols=['Age', 'Monthly_Bill', 'Total_Usage_GB', 'Support_Calls', 'Gender', 'Churn', 'ContractVector'],
                           outputCol='features')
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
rf = RandomForestClassifier(featuresCol='scaledFeatures',labelCol='Churn')
pipeline = Pipeline(stages=[indexer,encoder,assembler,scaler,rf])

In [17]:
pipelineModel = pipeline.fit(trainData)
predictions = pipelineModel.transform(testData)
predictions.show()

                                                                                

+-----------+---+------------+--------------+--------------+-------------+------+-----+-------------------+--------------+--------------------+--------------------+--------------------+--------------------+----------+
|Customer_ID|Age|Monthly_Bill|Total_Usage_GB| Contract_Type|Support_Calls|Gender|Churn|Contract_Type_Index|ContractVector|            features|      scaledFeatures|       rawPrediction|         probability|prediction|
+-----------+---+------------+--------------+--------------+-------------+------+-----+-------------------+--------------+--------------------+--------------------+--------------------+--------------------+----------+
|          3| 46|       91.48|         412.1|      Two Year|            1|     1|    0|                1.0| (2,[1],[1.0])|[46.0,91.48,412.1...|[2.57117676883307...|[19.8936638860249...|[0.99468319430124...|       0.0|
|          9| 56|       110.8|         755.3|Month-to-Month|            1|     0|    0|                2.0|     (2,[],[])|[56.0,

In [18]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol='Churn', metricName ='areaUnderROC')
accuracy = evaluator.evaluate(predictions)
print(f'Accuracy Of Pipeline: {accuracy}')



Accuracy Of Pipeline: 1.0


                                                                                

In [22]:
predictedActualDf = predictions.select("Churn","prediction")
predictedActualDf.show()

+-----+----------+
|Churn|prediction|
+-----+----------+
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       1.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       1.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       1.0|
+-----+----------+
only showing top 20 rows
