In [1]:
# Source file for the Telco customer-churn data and the reader format to use.
file_location = "/FileStore/tables/WA_Fn_UseC_Telco_Customer_Churn.csv"
file_type = "csv"

# Reader settings: infer column types, use the first row as header, split on commas.
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# A single-space field is declared both as the NaN marker and as the null marker.
# NOTE(review): confirm that both overrides are really needed.
df = (
    spark.read.format(file_type)
    .options(
        inferSchema=infer_schema,
        header=first_row_is_header,
        sep=delimiter,
        nanValue=' ',
        nullValue=' ',
    )
    .load(file_location)
)

# Render the loaded DataFrame as a notebook table.
display(df)

customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,Yes,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [2]:
# Print the column names and types of the loaded DataFrame (types come from inferSchema).
df.printSchema()

In [3]:
from pyspark.sql.functions import isnan, when, count, col

# For every column, count the rows whose value is NaN or null and print the
# per-column counts as one row.
df.select(
    [
        count(when(isnan(name) | col(name).isNull(), name)).alias(name)
        for name in df.columns
    ]
).show()

In [4]:
# Register the DataFrame as a temporary view so the %sql cells can query it by name.

temp_table_name = "churn_analysis"

df.createOrReplaceTempView(temp_table_name)

In [5]:
# Convert the Spark DataFrame to a pandas DataFrame for plotting with matplotlib.
pd_df=df.toPandas()

In [6]:
import matplotlib.pyplot as plt
# Clear the current figure before drawing.
plt.clf()
# Plot TotalCharges against tenure from the pandas copy, using '.' point markers.
plt.plot(pd_df['tenure'], pd_df['TotalCharges'], '.')
plt.xlabel('tenure')
plt.ylabel('totalcharges')
# NOTE(review): display() is called with no arguments; presumably the notebook
# environment renders the current matplotlib figure — confirm.
display()

In [7]:
%sql
-- Show every row and column of the churn_analysis view.
select
  *
from
  churn_analysis

customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,Yes,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [8]:
# Print the number of rows for each Churn value.
df.groupBy('Churn').count().show()

In [9]:
# Same Churn counts as the previous cell, rendered with the notebook's display().
display(df.groupBy('Churn').count())

Churn,count
No,5174
Yes,1869


In [10]:
# Print summary statistics for the three numeric columns.
df.select('tenure','TotalCharges','MonthlyCharges').describe().show()

In [11]:
%sql
-- Customer counts for every combination of gender and churn.
select
  gender,
  churn,
  count(*)
from
  churn_analysis
group by
  gender,
  churn

gender,churn,count(1)
Male,No,2625
Male,Yes,930
Female,No,2549
Female,Yes,939


In [12]:
%sql
-- Customer counts for every combination of SeniorCitizen and churn.
select
  SeniorCitizen,
  churn,
  count(*)
from
  churn_analysis
group by
  SeniorCitizen,
  churn

SeniorCitizen,churn,count(1)
1,No,666
0,No,4508
0,Yes,1393
1,Yes,476


In [13]:
%sql
-- Count the churn values for each (tenure, churn) pair, ordered by tenure.
select tenure, churn, count(churn)
from churn_analysis
group by tenure, churn
order by tenure

tenure,churn,count(churn)
0,No,11
1,No,233
1,Yes,380
2,No,115
2,Yes,123
3,Yes,94
3,No,106
4,No,93
4,Yes,83
5,Yes,64


In [14]:
%sql
-- Same (tenure, churn) counts, with tenure explicitly cast to int for output and ordering.
select
  cast(tenure as int),
  churn,
  count(churn)
from
  churn_analysis
group by
  tenure,
  churn
order by
  cast(tenure as int)

tenure,churn,count(churn)
0,No,11
1,No,233
1,Yes,380
2,No,115
2,Yes,123
3,Yes,94
3,No,106
4,No,93
4,Yes,83
5,Yes,64


In [15]:
# Print the pairwise frequency table of SeniorCitizen against InternetService.
df.stat.crosstab("SeniorCitizen", "InternetService").show()

In [16]:
# Same SeniorCitizen x InternetService frequency table, rendered with display().
display((df.stat.crosstab("SeniorCitizen", "InternetService")))

SeniorCitizen_InternetService,DSL,Fiber optic,No
1,259,831,52
0,2162,2265,1474


In [17]:
# NOTE(review): this only references the freqItems method without calling it, so the
# cell shows the method object rather than any frequent items.
df.stat.freqItems

In [18]:
# Show the first five rows of the DataFrame.
display(df.limit(5))

customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [19]:
# Show the number of rows for each PhoneService value.
display(df.groupBy('PhoneService').count())
# Disabled alternative: display(df.select("PhoneService"))

PhoneService,count
No,682
Yes,6361


In [20]:
# Frequent-item lookup over the service-related columns with a support of 0.6;
# collect() returns the result rows to the notebook.
df.stat.freqItems(
    [
        "PhoneService",
        "MultipleLines",
        "InternetService",
        "OnlineSecurity",
        "OnlineBackup",
        "DeviceProtection",
        "TechSupport",
        "StreamingTV",
        "StreamingMovies",
    ],
    support=0.6,
).collect()

In [21]:
# Frequent-item lookup (support 0.6) for the two charge columns, rendered with display().
display(df.stat.freqItems(["MonthlyCharges", "TotalCharges"], 0.6))

MonthlyCharges_freqItems,TotalCharges_freqItems
List(105.65),List(6844.5)


In [22]:
%sql
-- Customer counts for every combination of PaperlessBilling and churn.
select
  PaperlessBilling,
  churn,
  count(*)
from
  churn_analysis
group by
  PaperlessBilling,
  churn

PaperlessBilling,churn,count(1)
Yes,Yes,1400
No,No,2403
Yes,No,2771
No,Yes,469


In [23]:
# DataFrame-API version of the PaperlessBilling/churn count query.
display(
    df.groupby("PaperlessBilling", "churn").count()
)

PaperlessBilling,churn,count
Yes,Yes,1400
No,No,2403
Yes,No,2771
No,Yes,469


In [24]:
# Customer counts for every combination of PaymentMethod and churn.
display(
    df.groupby("PaymentMethod", "churn").count()
)

PaymentMethod,churn,count
Credit card (automatic),No,1290
Bank transfer (automatic),No,1286
Mailed check,Yes,308
Credit card (automatic),Yes,232
Electronic check,No,1294
Electronic check,Yes,1071
Bank transfer (automatic),Yes,258
Mailed check,No,1304


In [25]:
%sql
-- SQL version of the PaymentMethod/churn count query.
select
  PaymentMethod,
  churn,
  count(*)
from
  churn_analysis
group by
  PaymentMethod,
  churn

PaymentMethod,churn,count(1)
Credit card (automatic),No,1290
Bank transfer (automatic),No,1286
Mailed check,Yes,308
Credit card (automatic),Yes,232
Electronic check,No,1294
Electronic check,Yes,1071
Bank transfer (automatic),Yes,258
Mailed check,No,1304


ML starts here

In [27]:
# Work on the loaded data under a modelling-specific name.
churn_df = df

# Random 70/30 split into training and evaluation sets, using seed 24.
train_data, test_data = churn_df.randomSplit([0.7, 0.3], seed=24)

# Report how many rows landed in each split.
print("Records for training: " + str(train_data.count()))
print("Records for evaluation: " + str(test_data.count()))

In [28]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

# Categorical feature columns; each one is string-indexed and one-hot encoded
# when the pipeline stages are built.
catColumns = [
    "gender",
    "SeniorCitizen",
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
]

In [29]:
# Inspect the list of categorical column names.
catColumns

In [30]:
# Stand-alone indexer for the gender column (output column "genderIndex").
# NOTE(review): the stage-building loop in the next cell reassigns stringIndexer,
# so this instance is not the one added to the pipeline.
stringIndexer = StringIndexer(inputCol="gender", outputCol="gender" + "Index")

In [31]:
stages = []

# Each categorical column contributes two stages: a StringIndexer writing
# "<column>Index", then a one-hot encoder writing "<column>catVec".
for catCol in catColumns:
    stringIndexer = StringIndexer(inputCol=catCol, outputCol=catCol + "Index")
    encoder = OneHotEncoderEstimator(
        inputCols=[stringIndexer.getOutputCol()],
        outputCols=[catCol + "catVec"],
    )
    stages.extend([stringIndexer, encoder])

In [32]:
# Inspect the indexer/encoder stages collected so far.
stages

In [33]:
from pyspark.ml.feature import Imputer

# Imputer stage reading TotalCharges and writing Out_TotalCharges, queued after
# the categorical stages.
imputer = Imputer(inputCols=["TotalCharges"], outputCols=["Out_TotalCharges"])
stages.append(imputer)

In [34]:
# Inspect the imputer stage object.
imputer

In [35]:
# Inspect the stage list after the imputer was appended.
stages

In [36]:
# Index the Churn column into the "label" column and queue the stage.
label_Idx = StringIndexer(inputCol="Churn", outputCol="label")
stages.append(label_Idx)

In [37]:
# Preview the label indexing on its own: fit on the training split and apply it to that same split.
temp=label_Idx.fit(train_data).transform(train_data)

In [38]:
# Print the first row of the label-indexed preview.
temp.show(1)

In [39]:
# Correlation between TotalCharges and MonthlyCharges over the full DataFrame.
df.stat.corr('TotalCharges', 'MonthlyCharges')

In [40]:
# Churned customers ("Yes") counted per tenure value, sorted by tenure.
display(
    churn_df
    .groupby("tenure", "churn")
    .count()
    .sort("tenure")
    .filter(col("churn") == "Yes")
)

tenure,churn,count
1,Yes,380
2,Yes,123
3,Yes,94
4,Yes,83
5,Yes,64
6,Yes,40
7,Yes,51
8,Yes,42
9,Yes,46
10,Yes,45


In [41]:
%sql
-- Number of churned customers (churn = 'Yes') for each tenure value, ordered by tenure.
select
  cast(tenure as int),
  churn,
  count(*) as churned
from
  churn_analysis
where
  churn='Yes'
group by
  tenure,
  churn
order by
  cast(tenure as int)

tenure,churn,churned
1,Yes,380
2,Yes,123
3,Yes,94
4,Yes,83
5,Yes,64
6,Yes,40
7,Yes,51
8,Yes,42
9,Yes,46
10,Yes,45


In [42]:
from pyspark.ml.feature import QuantileDiscretizer

# Discretize tenure into three quantile-based buckets (column tenure_bin) and
# queue the stage.
tenure_bin = QuantileDiscretizer(numBuckets=3, inputCol="tenure", outputCol="tenure_bin")
stages.append(tenure_bin)

In [43]:
# Inspect the stage list after the tenure discretizer was appended.
stages

In [44]:
# Render churn_df (the same DataFrame object as df) as a notebook table.
display(churn_df)

customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,Yes,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [45]:
# Print the schema of churn_df.
churn_df.printSchema()

In [46]:
# The transformations generate new columns, but these are not added to the DataFrame's own columns.


In [47]:
# Inspect the categorical column names again before building the assembler inputs.
catColumns

In [48]:
# Names of the one-hot vector columns: "<column>catVec" for each categorical column.
ok = [column + "catVec" for column in catColumns]

In [49]:
# Assembler inputs: every one-hot vector column followed by the numeric columns.
# Both spellings of the name refer to the same list object.
numericCols = ["tenure_bin", "Out_TotalCharges", "MonthlyCharges"]
assemblerInputs = [column + "catVec" for column in catColumns] + numericCols
assembleInputs = assemblerInputs

In [50]:
# Inspect the list of columns that will feed the VectorAssembler.
assembleInputs

In [51]:
# Rebuild the assembler input list (one-hot vectors first, numeric columns last);
# both spellings of the name refer to the same list object.
numericCols = ["tenure_bin", "Out_TotalCharges", "MonthlyCharges"]
assemblerInputs = [column + "catVec" for column in catColumns] + numericCols
assembleInputs = assemblerInputs

# Combine all inputs into a single "features" vector column and queue the stage.
assembler = VectorAssembler(inputCols=assembleInputs, outputCol="features")
stages.append(assembler)

In [52]:
# Inspect the complete stage list, now including the assembler.
stages

In [53]:
# Import only the name this cell uses; a wildcard import would hide where the
# notebook's names come from.
from pyspark.ml import Pipeline

# Chain every queued stage into one pipeline and fit it on the training split.
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(train_data)

In [54]:
# Inspect the stages attribute of the (unfitted) pipeline object.
pipeline.stages

In [55]:
# Inspect the stages of the fitted pipeline model.
pipelineModel.stages

In [56]:
# Apply the pipeline fitted on the training split to both splits.
trainprepDF = pipelineModel.transform(train_data)
testprepDF = pipelineModel.transform(test_data)

In [57]:
# Inspect the first transformed training row.
trainprepDF.head(1)

In [58]:
# List the column names of the transformed training data.
trainprepDF.columns

In [59]:
# Render the first transformed training row with display().
display(trainprepDF.head(1))

customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,genderIndex,gendercatVec,SeniorCitizenIndex,SeniorCitizencatVec,PartnerIndex,PartnercatVec,DependentsIndex,DependentscatVec,PhoneServiceIndex,PhoneServicecatVec,MultipleLinesIndex,MultipleLinescatVec,InternetServiceIndex,InternetServicecatVec,OnlineSecurityIndex,OnlineSecuritycatVec,OnlineBackupIndex,OnlineBackupcatVec,DeviceProtectionIndex,DeviceProtectioncatVec,TechSupportIndex,TechSupportcatVec,StreamingTVIndex,StreamingTVcatVec,StreamingMoviesIndex,StreamingMoviescatVec,ContractIndex,ContractcatVec,PaperlessBillingIndex,PaperlessBillingcatVec,PaymentMethodIndex,PaymentMethodcatVec,Out_TotalCharges,label,tenure_bin,features
0004-TLHLJ,Male,0,No,No,4,Yes,No,Fiber optic,No,No,Yes,No,No,No,Month-to-month,Yes,Electronic check,73.9,280.85,Yes,1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",280.85,1.0,0.0,"List(0, 30, List(1, 2, 3, 4, 5, 7, 9, 11, 14, 15, 17, 19, 21, 23, 24, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 280.85, 73.9))"


In [60]:
# Print the tenure_bin column produced by the discretizer stage.
trainprepDF.select("tenure_bin").show()

In [61]:
from pyspark.ml.classification import LogisticRegression

# Logistic-regression estimator reading the assembled "features" vector and the
# indexed "label" column, limited to 10 iterations.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
)

# Fit the estimator on the transformed training split.
lrModel = lr.fit(trainprepDF)

In [62]:
# Inspect the fitted logistic-regression model object.
lrModel

In [63]:
# Inspect the model's featuresCol parameter.
lrModel.featuresCol

In [64]:
# Print the fitted coefficient vector and the intercept.
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

In [65]:
# Keep the model's summary object for the metric cells below.
summary=lrModel.summary


In [66]:
# Area under the ROC curve, read from the model summary.
summary.areaUnderROC

In [67]:
# Read the metrics from lrModel.summary; the weighted* values are the summary's
# weighted variants, and weightedFMeasure is a method rather than a property.
accuracy = summary.accuracy
precision = summary.weightedPrecision
recall = summary.weightedRecall
fMeasure = summary.weightedFMeasure()
truePositiveRate = summary.weightedTruePositiveRate
falsePositiveRate = summary.weightedFalsePositiveRate

# Print all metrics, one per line, followed by the area under the ROC curve.
print(
    "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s\nAreaUnderROC: %s"
    % (
        accuracy,
        falsePositiveRate,
        truePositiveRate,
        fMeasure,
        precision,
        recall,
        summary.areaUnderROC,
    )
)

In [68]:
# Ask the notebook's display() for the "ROC" view of the model on the transformed
# training data; the rendered table lists false/true positive rates per threshold.
display(lrModel, trainprepDF, "ROC")

False Positive Rate,True Positive Rate,Threshold
0.0,0.0,0.8331148512857078
0.0,0.0384615384615384,0.8331148512857078
0.0153846153846153,0.0384615384615384,0.7797057517363821
0.0153846153846153,0.0769230769230769,0.7586357194568448
0.0153846153846153,0.1153846153846153,0.6987165453214434
0.0307692307692307,0.1153846153846153,0.6915118402949202
0.0307692307692307,0.1538461538461538,0.6860604944548899
0.0461538461538461,0.1538461538461538,0.6450038522648787
0.0461538461538461,0.1923076923076923,0.6429123845502271
0.0461538461538461,0.2307692307692307,0.6295249580338602


In [69]:
# Ask the notebook's display() for the "fittedVsResiduals" view of the model on the
# transformed training data; the rendered table pairs fitted values with residuals.
display(lrModel, trainprepDF, "fittedVsResiduals")

fitted values,residuals
-0.2230676406365782,-0.4444631879006744
-2.553276223691306,-0.0722066938258735
-2.152370097862412,-0.1041099540390032
-3.5228682836249368,-0.0286685158166472
0.5880272814371096,0.3570876154497728
-0.759899964509167,0.6813320146072503
-2.557741016374676,-0.0719081551805569
-1.8696124131497969,-0.1335865759308833
-3.077465588498432,-0.0440464069087995
0.0397190258555967,0.4900715487630114


In [70]:
# Render the transformed test split, including the engineered columns.
display(testprepDF)

customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,genderIndex,gendercatVec,SeniorCitizenIndex,SeniorCitizencatVec,PartnerIndex,PartnercatVec,DependentsIndex,DependentscatVec,PhoneServiceIndex,PhoneServicecatVec,MultipleLinesIndex,MultipleLinescatVec,InternetServiceIndex,InternetServicecatVec,OnlineSecurityIndex,OnlineSecuritycatVec,OnlineBackupIndex,OnlineBackupcatVec,DeviceProtectionIndex,DeviceProtectioncatVec,TechSupportIndex,TechSupportcatVec,StreamingTVIndex,StreamingTVcatVec,StreamingMoviesIndex,StreamingMoviescatVec,ContractIndex,ContractcatVec,PaperlessBillingIndex,PaperlessBillingcatVec,PaymentMethodIndex,PaymentMethodcatVec,Out_TotalCharges,label,tenure_bin,features
0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,DSL,No,Yes,No,Yes,Yes,No,One year,Yes,Mailed check,65.6,593.3,No,0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",2.0,"List(0, 2, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 3, List(1), List(1.0))",593.3,0.0,0.0,"List(0, 30, List(0, 1, 4, 5, 8, 9, 12, 13, 16, 18, 19, 23, 25, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 593.3, 65.6))"
0003-MKNFE,Male,0,No,No,9,Yes,Yes,DSL,No,No,No,No,No,Yes,Month-to-month,No,Mailed check,59.9,542.4,No,1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",1.0,"List(0, 3, List(1), List(1.0))",542.4,0.0,0.0,"List(0, 30, List(1, 2, 3, 4, 6, 8, 9, 11, 13, 15, 17, 20, 21, 25, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 542.4, 59.9))"
0011-IGKFF,Male,1,Yes,No,13,Yes,No,Fiber optic,No,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,98.0,1237.85,Yes,1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",1237.85,1.0,0.0,"List(0, 30, List(3, 4, 5, 7, 9, 12, 14, 15, 18, 20, 21, 23, 24, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1237.85, 98.0))"
0013-MHZWF,Female,0,No,Yes,9,Yes,No,DSL,No,No,No,Yes,Yes,Yes,Month-to-month,Yes,Credit card (automatic),69.4,571.45,No,0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",2.0,"List(0, 3, List(2), List(1.0))",571.45,0.0,0.0,"List(0, 30, List(0, 1, 2, 4, 5, 8, 9, 11, 13, 16, 18, 20, 21, 23, 26, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 571.45, 69.4))"
0013-SMEOE,Female,1,Yes,No,71,Yes,No,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),109.7,7904.25,No,0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",3.0,"List(0, 3, List(), List())",7904.25,0.0,2.0,"List(0, 30, List(0, 3, 4, 5, 7, 10, 12, 14, 16, 18, 20, 22, 23, 27, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 7904.25, 109.7))"
0017-IUDMW,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),116.8,8456.75,No,0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",2.0,"List(0, 3, List(2), List(1.0))",8456.75,0.0,2.0,"List(0, 30, List(0, 1, 4, 6, 7, 10, 12, 14, 16, 18, 20, 22, 23, 26, 27, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 8456.75, 116.8))"
0019-EFAEP,Female,0,No,No,72,Yes,Yes,Fiber optic,Yes,Yes,Yes,No,Yes,No,Two year,Yes,Bank transfer (automatic),101.3,7261.25,No,0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",3.0,"List(0, 3, List(), List())",7261.25,0.0,2.0,"List(0, 30, List(0, 1, 2, 3, 4, 6, 7, 10, 12, 14, 15, 18, 19, 22, 23, 27, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 7261.25, 101.3))"
0019-GFNTW,Female,0,No,No,56,No,No phone service,DSL,Yes,Yes,Yes,Yes,No,No,Two year,No,Bank transfer (automatic),45.05,2560.1,No,0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",2.0,"List(0, 2, List(), List())",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 1, List(), List())",3.0,"List(0, 3, List(), List())",2560.1,0.0,2.0,"List(0, 30, List(0, 1, 2, 3, 8, 10, 12, 14, 16, 17, 19, 22, 27, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2560.1, 45.05))"
0020-JDNXP,Female,0,Yes,Yes,34,No,No phone service,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,No,Mailed check,61.25,1993.2,No,0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",2.0,"List(0, 2, List(), List())",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",2.0,"List(0, 2, List(), List())",1.0,"List(0, 1, List(), List())",1.0,"List(0, 3, List(1), List(1.0))",1993.2,0.0,1.0,"List(0, 30, List(0, 1, 8, 10, 11, 14, 16, 18, 20, 25, 27, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1993.2, 61.25))"
0023-HGHWL,Male,1,No,No,1,No,No phone service,DSL,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,25.1,25.1,Yes,1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",2.0,"List(0, 2, List(), List())",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",25.1,1.0,0.0,"List(0, 30, List(2, 3, 8, 9, 11, 13, 15, 17, 19, 21, 23, 24, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 25.1, 25.1))"


In [71]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Score the transformed test split with the fitted logistic-regression model.
predictions = lrModel.transform(testprepDF)

In [72]:
# Render the scored test rows (adds rawPrediction, probability and prediction columns).
display(predictions)

customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,genderIndex,gendercatVec,SeniorCitizenIndex,SeniorCitizencatVec,PartnerIndex,PartnercatVec,DependentsIndex,DependentscatVec,PhoneServiceIndex,PhoneServicecatVec,MultipleLinesIndex,MultipleLinescatVec,InternetServiceIndex,InternetServicecatVec,OnlineSecurityIndex,OnlineSecuritycatVec,OnlineBackupIndex,OnlineBackupcatVec,DeviceProtectionIndex,DeviceProtectioncatVec,TechSupportIndex,TechSupportcatVec,StreamingTVIndex,StreamingTVcatVec,StreamingMoviesIndex,StreamingMoviescatVec,ContractIndex,ContractcatVec,PaperlessBillingIndex,PaperlessBillingcatVec,PaymentMethodIndex,PaymentMethodcatVec,Out_TotalCharges,label,tenure_bin,features,rawPrediction,probability,prediction
0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,DSL,No,Yes,No,Yes,Yes,No,One year,Yes,Mailed check,65.6,593.3,No,0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",2.0,"List(0, 2, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 3, List(1), List(1.0))",593.3,0.0,0.0,"List(0, 30, List(0, 1, 4, 5, 8, 9, 12, 13, 16, 18, 19, 23, 25, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 593.3, 65.6))","List(1, 2, List(), List(1.6932851783330038, -1.6932851783330038))","List(1, 2, List(), List(0.844655704389517, 0.15534429561048296))",0.0
0003-MKNFE,Male,0,No,No,9,Yes,Yes,DSL,No,No,No,No,No,Yes,Month-to-month,No,Mailed check,59.9,542.4,No,1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",1.0,"List(0, 3, List(1), List(1.0))",542.4,0.0,0.0,"List(0, 30, List(1, 2, 3, 4, 6, 8, 9, 11, 13, 15, 17, 20, 21, 25, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 542.4, 59.9))","List(1, 2, List(), List(0.3044325845816356, -0.3044325845816356))","List(1, 2, List(), List(0.5755257397472797, 0.4244742602527202))",0.0
0011-IGKFF,Male,1,Yes,No,13,Yes,No,Fiber optic,No,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,98.0,1237.85,Yes,1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",1237.85,1.0,0.0,"List(0, 30, List(3, 4, 5, 7, 9, 12, 14, 15, 18, 20, 21, 23, 24, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1237.85, 98.0))","List(1, 2, List(), List(-1.3533123673917238, 1.3533123673917238))","List(1, 2, List(), List(0.20532936784584063, 0.7946706321541593))",1.0
0013-MHZWF,Female,0,No,Yes,9,Yes,No,DSL,No,No,No,Yes,Yes,Yes,Month-to-month,Yes,Credit card (automatic),69.4,571.45,No,0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",2.0,"List(0, 3, List(2), List(1.0))",571.45,0.0,0.0,"List(0, 30, List(0, 1, 2, 4, 5, 8, 9, 11, 13, 16, 18, 20, 21, 23, 26, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 571.45, 69.4))","List(1, 2, List(), List(0.7469047869361479, -0.7469047869361479))","List(1, 2, List(), List(0.6785038940384934, 0.3214961059615066))",0.0
0013-SMEOE,Female,1,Yes,No,71,Yes,No,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),109.7,7904.25,No,0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",3.0,"List(0, 3, List(), List())",7904.25,0.0,2.0,"List(0, 30, List(0, 3, 4, 5, 7, 10, 12, 14, 16, 18, 20, 22, 23, 27, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 7904.25, 109.7))","List(1, 2, List(), List(3.1566611662624267, -3.1566611662624267))","List(1, 2, List(), List(0.9591703894511711, 0.040829610548828915))",0.0
0017-IUDMW,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),116.8,8456.75,No,0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",2.0,"List(0, 3, List(2), List(1.0))",8456.75,0.0,2.0,"List(0, 30, List(0, 1, 4, 6, 7, 10, 12, 14, 16, 18, 20, 22, 23, 26, 27, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 8456.75, 116.8))","List(1, 2, List(), List(3.550132013670682, -3.550132013670682))","List(1, 2, List(), List(0.9720810092269708, 0.02791899077302911))",0.0
0019-EFAEP,Female,0,No,No,72,Yes,Yes,Fiber optic,Yes,Yes,Yes,No,Yes,No,Two year,Yes,Bank transfer (automatic),101.3,7261.25,No,0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",3.0,"List(0, 3, List(), List())",7261.25,0.0,2.0,"List(0, 30, List(0, 1, 2, 3, 4, 6, 7, 10, 12, 14, 15, 18, 19, 22, 23, 27, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 7261.25, 101.3))","List(1, 2, List(), List(3.0099434022312828, -3.0099434022312828))","List(1, 2, List(), List(0.9530213204093128, 0.04697867959068728))",0.0
0019-GFNTW,Female,0,No,No,56,No,No phone service,DSL,Yes,Yes,Yes,Yes,No,No,Two year,No,Bank transfer (automatic),45.05,2560.1,No,0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",2.0,"List(0, 2, List(), List())",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 1, List(), List())",3.0,"List(0, 3, List(), List())",2560.1,0.0,2.0,"List(0, 30, List(0, 1, 2, 3, 8, 10, 12, 14, 16, 17, 19, 22, 27, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2560.1, 45.05))","List(1, 2, List(), List(4.055832709738697, -4.055832709738697))","List(1, 2, List(), List(0.9829738598687286, 0.017026140131271336))",0.0
0020-JDNXP,Female,0,Yes,Yes,34,No,No phone service,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,No,Mailed check,61.25,1993.2,No,0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",2.0,"List(0, 2, List(), List())",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",1.0,"List(0, 2, List(1), List(1.0))",2.0,"List(0, 2, List(), List())",1.0,"List(0, 1, List(), List())",1.0,"List(0, 3, List(1), List(1.0))",1993.2,0.0,1.0,"List(0, 30, List(0, 1, 8, 10, 11, 14, 16, 18, 20, 25, 27, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1993.2, 61.25))","List(1, 2, List(), List(2.064872598850755, -2.064872598850755))","List(1, 2, List(), List(0.8874418078236265, 0.11255819217637354))",0.0
0023-HGHWL,Male,1,No,No,1,No,No phone service,DSL,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,25.1,25.1,Yes,1.0,"List(0, 1, List(), List())",1.0,"List(0, 1, List(), List())",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 1, List(), List())",2.0,"List(0, 2, List(), List())",1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 3, List(0), List(1.0))",25.1,1.0,0.0,"List(0, 30, List(2, 3, 8, 9, 11, 13, 15, 17, 19, 21, 23, 24, 28, 29), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 25.1, 25.1))","List(1, 2, List(), List(-0.742611714091292, 0.742611714091292))","List(1, 2, List(), List(0.32243329800347237, 0.6775667019965276))",1.0


In [73]:
# List the column names currently carried by `predictions`.
predictions.columns

In [74]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the held-out set with the fitted `lrModel`.
predictions = lrModel.transform(testprepDF)

# Evaluate on the continuous `rawPrediction` scores rather than the
# thresholded 0/1 `prediction` column: hard labels collapse the ROC curve to
# a single operating point, so the reported area no longer reflects how well
# the model ranks the rows.
evaluatorLR = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
area_under_curve = evaluatorLR.evaluate(predictions)

# areaUnderROC is also the evaluator's default metric; it is spelled out above for clarity
print("areaUnderROC = %g" % area_under_curve)

evaluatorLR.getMetricName()


In [75]:
# Project `predictions` down to its `prediction` and `label` columns.
results = predictions.select('prediction', 'label')

In [76]:
# Show the first ten (prediction, label) rows.
display(results.take(10))

prediction,label
0.0,0.0
0.0,0.0
1.0,1.0
0.0,0.0
0.0,0.0
0.0,0.0
0.0,0.0
0.0,0.0
0.0,0.0
1.0,1.0


In [77]:
# Pull every row of `results` to the driver as a Python list.
results_collect = results.collect()
# Preview only the first rows; echoing the whole list floods the notebook output.
results_collect[:10]

In [78]:
## build the (score, label) pairs as plain Python floats
results_collect = results.collect()
results_list = [(float(pred), float(lab)) for pred, lab in results_collect]

In [79]:
# Same pairs as plain tuples, without the float conversion.
results_list1 = [(pred, lab) for pred, lab in results_collect]

In [80]:
# Preview only the first pairs; echoing the whole list floods the notebook output.
results_list1[:10]

In [81]:
# Preview only the first pairs; echoing the whole list floods the notebook output.
results_list[:10]

In [82]:
# Distribute the driver-side list of pairs as an RDD.
predictionAndLabels = spark.sparkContext.parallelize(results_list)

In [83]:
# Collect the RDD built from `results_list` back to the driver (round-trip check).
ok=predictionAndLabels.collect()

In [84]:
# Preview only the first pairs; echoing the whole list floods the notebook output.
ok[:10]

In [85]:
# First collected row, `prediction` field (the first of the two selected columns).
results_collect[0]['prediction']

In [86]:
# Split the collected rows into two parallel lists of stringified values.
# `some_list += str(x)` extends the list one *character* at a time
# ("0.0" -> '0', '.', '0'), so each value is appended as a single element instead.
prediction = []
label = []
for row in results_collect:
    prediction.append(str(row[0]))
    label.append(str(row[1]))

In [87]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics

results = predictions.select(['prediction', 'label'])

## prepare score-label set
# Use the class-1 entry of the `probability` vector as the score: feeding the
# thresholded 0/1 `prediction` column yields a curve with a single operating
# point, which makes both areas below uninformative.
# The pairs are built with an RDD map instead of collect() + parallelize(),
# so the rows are not shipped to the driver and back.
predictionAndLabels = (
    predictions
    .select('probability', 'label')
    .rdd
    .map(lambda row: (float(row['probability'][1]), float(row['label'])))
)

metrics = BinaryClassificationMetrics(predictionAndLabels)

# Area under precision-recall curve
print("Area under PR = %s" % metrics.areaUnderPR)

# Area under ROC curve
print("Area under ROC = %s" % metrics.areaUnderROC)

predictions.show(1)



In [88]:
from pyspark.sql import functions as F

# Row-level conditions reused by the tallies below.
predicted_pos = F.col("prediction") == 1.0
predicted_neg = F.col("prediction") == 0.0
is_match = F.col("prediction") == F.col("label")
is_mismatch = F.col("prediction") != F.col("label")

# Gather every tally in one aggregation pass instead of one Spark job per
# count; count(when(cond, 1)) counts only the rows where `cond` holds.
# The row total is kept in `total` rather than `count`, so the `count`
# function imported from pyspark.sql.functions earlier in the notebook is
# not shadowed by an integer.
tally = results.agg(
    F.count(F.lit(1)).alias("total"),
    F.count(F.when(is_match, 1)).alias("correct"),
    F.count(F.when(is_mismatch, 1)).alias("wrong"),
    F.count(F.when(predicted_pos & is_match, 1)).alias("tp"),
    F.count(F.when(predicted_pos & is_mismatch, 1)).alias("fp"),
    F.count(F.when(predicted_neg & is_mismatch, 1)).alias("fn"),
    F.count(F.when(predicted_neg & is_match, 1)).alias("tn"),
).first()

total = tally["total"]
correct = tally["correct"]
wrong = tally["wrong"]
tp = tally["tp"]
fp = tally["fp"]
fn = tally["fn"]
tn = tally["tn"]

# A ratio with a zero denominator (e.g. no positive predictions) is reported
# as NaN instead of raising ZeroDivisionError.
undefined = float("nan")

accuracy = (tp + tn) / total if total else undefined

precision = tp / (tp + fp) if (tp + fp) else undefined

recall = tp / (tp + fn) if (tp + fn) else undefined

print("Correct: %s\nWrong: %s\ntp: %s\nfp: %s\nfn: %s\ntn: %s\nAccuracy: %s\nPrecision: %s\nRecall: %s"
      % (correct, wrong, tp, fp, fn, tn, accuracy, precision, recall))


In [89]:
# Count the rows of `test_data` per distinct `Churn` value and print the table.
test_data.groupBy('Churn').count().show()

In [90]:
# Rows whose `prediction` equals 1.0, and how many of them there are.
test1 = results.where(results["prediction"] == 1.0)
test1.count()

In [91]:
# Number of rows predicted 1.0 whose prediction also matches the label.
is_positive = results["prediction"] == 1.0
is_correct = results["prediction"] == results["label"]
test2 = results.where(is_positive & is_correct).count()
test2

In [92]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Hyper-parameter grid for cross validation: every combination of the three
# value lists below (3 x 3 x 3 candidate settings) for the `lr` estimator.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(lr.regParam, [0.01, 0.5, 2.0])
grid_builder.addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
grid_builder.addGrid(lr.maxIter, [5, 10, 20])
paramGrid = grid_builder.build()

In [93]:
# Echo the full list of parameter maps produced by the grid builder.
paramGrid

In [94]:
# Inspect the first parameter map of the grid.
paramGrid[0]

In [95]:
# Three nullable columns: Category (string), Count (integer), Description (string)
schema = (
    StructType()
    .add('Category', StringType(), True)
    .add('Count', IntegerType(), True)
    .add('Description', StringType(), True)
)


In [96]:
from pyspark.sql import Row

# Each entry of `paramGrid` maps Param objects to values, so none of its keys
# line up with the Category/Count/Description schema and a frame built with
# that schema cannot carry the grid values. Flatten every parameter map into
# a Row keyed by parameter name instead.
param_rows = [
    Row(**{param.name: value for param, value in param_map.items()})
    for param_map in paramGrid
]

# Convert list to RDD
rdd = spark.sparkContext.parallelize(param_rows)

# Create data frame (column types are inferred from the rows). It gets its own
# name so the raw `df` loaded at the top of the notebook is not overwritten.
param_grid_df = spark.createDataFrame(rdd)
print(param_grid_df.schema)
param_grid_df.show()

In [97]:
# NOTE(review): exploratory call on the RDD built from `paramGrid`; the result
# is not assigned or used by any visible cell - confirm it is still needed.
rdd.values()

In [98]:
# NOTE(review): `df1` is not defined in any visible cell and the recorded
# output (a WordList column) looks unrelated to the churn analysis - confirm
# where it comes from before relying on a clean Run All.
display(df1)

WordList
"List(Hello, world)"
"List(I, am, fine)"


In [99]:
# 5-fold cross validation of `lr` over every parameter map in `paramGrid`,
# scored with `evaluatorLR`. The seed pins the random fold assignment so the
# selected model is reproducible across runs (same value as the seed used for
# the random forest in this notebook).
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluatorLR,
    numFolds=5,
    seed=1010,
)

# Run cross validations
cvModel = cv.fit(trainprepDF)

In [100]:
# Inspect the `params` attribute of the fitted cross-validation model.
cvModel.params

In [101]:
# Coefficients of the best model selected by cross validation.
cvModel.bestModel.coefficients

In [102]:
# Score the held-out set with the model that won cross validation.
best_lr_model = cvModel.bestModel
predictions = best_lr_model.transform(testprepDF)

In [103]:
# Evaluate the current `predictions` with `evaluatorLR`.
evaluatorLR.evaluate(predictions)

In [104]:
from pyspark.sql import functions as F

results = predictions.select(['prediction', 'label'])

# Row-level conditions reused by the tallies below.
predicted_pos = F.col("prediction") == 1.0
predicted_neg = F.col("prediction") == 0.0
is_match = F.col("prediction") == F.col("label")
is_mismatch = F.col("prediction") != F.col("label")

# Gather every tally in one aggregation pass instead of one Spark job per
# count; count(when(cond, 1)) counts only the rows where `cond` holds.
# The row total is kept in `total` rather than `count`, so the `count`
# function imported from pyspark.sql.functions earlier in the notebook is
# not shadowed by an integer.
tally = results.agg(
    F.count(F.lit(1)).alias("total"),
    F.count(F.when(is_match, 1)).alias("correct"),
    F.count(F.when(is_mismatch, 1)).alias("wrong"),
    F.count(F.when(predicted_pos & is_match, 1)).alias("tp"),
    F.count(F.when(predicted_pos & is_mismatch, 1)).alias("fp"),
    F.count(F.when(predicted_neg & is_mismatch, 1)).alias("fn"),
    F.count(F.when(predicted_neg & is_match, 1)).alias("tn"),
).first()

total = tally["total"]
correct = tally["correct"]
wrong = tally["wrong"]
tp = tally["tp"]
fp = tally["fp"]
fn = tally["fn"]
tn = tally["tn"]

# A ratio with a zero denominator (e.g. no positive predictions) is reported
# as NaN instead of raising ZeroDivisionError.
undefined = float("nan")

accuracy = (tp + tn) / total if total else undefined

precision = tp / (tp + fp) if (tp + fp) else undefined

recall = tp / (tp + fn) if (tp + fn) else undefined

print("Correct: %s\nWrong: %s\ntp: %s\nfp: %s\nfn: %s\ntn: %s\nAccuracy: %s\nPrecision: %s\nRecall: %s"
      % (correct, wrong, tp, fp, fn, tn, accuracy, precision, recall))


In [105]:
# Return the textual parameter description produced by the model's explainParams().
cvModel.explainParams()

In [106]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest configured entirely through constructor keywords:
# gini impurity, depth limit 6, 50 trees, automatic feature subsetting, fixed seed.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    impurity="gini",
    maxDepth=6,
    numTrees=50,
    featureSubsetStrategy="auto",
    seed=1010,
)

# Fit on the prepared training frame.
rfModel = rf.fit(trainprepDF)

In [107]:
# Inspect the fitted forest's `getNumTrees` attribute.
# NOTE(review): accessed without parentheses - presumably a property in this
# PySpark version; confirm it shows a number rather than a bound method.
rfModel.getNumTrees

In [108]:
# Score `testprepDF` with the fitted random forest; this rebinds `predictions`.
predictions = rfModel.transform(testprepDF)

In [109]:
# Echo the `predictions` object (no action such as show() is called here).
predictions

In [110]:
from pyspark.sql import functions as F

results = predictions.select(['prediction', 'label'])

# Row-level conditions reused by the tallies below.
predicted_pos = F.col("prediction") == 1.0
predicted_neg = F.col("prediction") == 0.0
is_match = F.col("prediction") == F.col("label")
is_mismatch = F.col("prediction") != F.col("label")

# Gather every tally in one aggregation pass instead of one Spark job per
# count; count(when(cond, 1)) counts only the rows where `cond` holds.
# The row total is kept in `total` rather than `count`, so the `count`
# function imported from pyspark.sql.functions earlier in the notebook is
# not shadowed by an integer.
tally = results.agg(
    F.count(F.lit(1)).alias("total"),
    F.count(F.when(is_match, 1)).alias("correct"),
    F.count(F.when(is_mismatch, 1)).alias("wrong"),
    F.count(F.when(predicted_pos & is_match, 1)).alias("tp"),
    F.count(F.when(predicted_pos & is_mismatch, 1)).alias("fp"),
    F.count(F.when(predicted_neg & is_mismatch, 1)).alias("fn"),
    F.count(F.when(predicted_neg & is_match, 1)).alias("tn"),
).first()

total = tally["total"]
correct = tally["correct"]
wrong = tally["wrong"]
tp = tally["tp"]
fp = tally["fp"]
fn = tally["fn"]
tn = tally["tn"]

# A ratio with a zero denominator (e.g. no positive predictions) is reported
# as NaN instead of raising ZeroDivisionError.
undefined = float("nan")

accuracy = (tp + tn) / total if total else undefined

precision = tp / (tp + fp) if (tp + fp) else undefined

recall = tp / (tp + fn) if (tp + fn) else undefined

print("Correct: %s\nWrong: %s\ntp: %s\nfp: %s\nfn: %s\ntn: %s\nAccuracy: %s\nPrecision: %s\nRecall: %s"
      % (correct, wrong, tp, fp, fn, tn, accuracy, precision, recall))
