In [None]:
%sql USE CHURN;
-- Build the modelling view: every column of callsdata plus the contract
-- attributes, one row per matching (Area Code, Phone) pair.
CREATE OR REPLACE TEMPORARY VIEW CHURNDATA AS
SELECT
  A.*,
  B.`Account Length`,
  B.Churn,
  B.`Int'l Plan`,
  B.`VMail Plan`,
  B.State
FROM callsdata AS A
INNER JOIN contractdata AS B
  ON A.`Area Code` = B.`Area Code`
  AND A.PHONE = B.PHONE;

In [None]:
import pyspark
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler

from distutils.version import LooseVersion
# Load the CHURNDATA view and remember its original column list so those columns
# can be re-selected next to the engineered ones after the pipeline has run.
df = spark.table('CHURNDATA')
cols = df.columns

# Pipeline stages are only collected here; they are fitted later in one pass.
stages = []

# State is a string column, so it is indexed first and the index is one-hot encoded.
stringIndexer = StringIndexer(inputCol="State", outputCol="State" + "Index")

# "Area Code" is passed to the encoder directly (no StringIndexer), i.e. its raw
# value is used as the category index.
# Both branches must emit "<column>classVec" names, because the VectorAssembler
# stage looks the encoded columns up as "StateclassVec" and "Area CodeclassVec".
if LooseVersion(pyspark.__version__) < LooseVersion("3.0"):
  # Spark 2.x exposes the multi-column encoder as OneHotEncoderEstimator.
  from pyspark.ml.feature import OneHotEncoderEstimator
  encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=["State" + "classVec"])
  encoder1 = OneHotEncoderEstimator(inputCols=["Area Code"], outputCols=["Area Code" + "classVec"])
else:
  from pyspark.ml.feature import OneHotEncoder
  encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=["State" + "classVec"])
  encoder1 = OneHotEncoder(inputCols=["Area Code"], outputCols=["Area Code" + "classVec"])

# Add stages.  These are not run here, but will run all at once later on.
stages += [stringIndexer, encoder]
stages += [encoder1]

In [None]:
label_stringIdx = StringIndexer(inputCol="Churn", outputCol="label")
stages += [label_stringIdx]

numericCols = [col for col in df.columns if col not in ["State","Area Code","Churn","Phone"]]
assemblerInputs = [c + "classVec" for c in ["State","Area Code"]] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [None]:
# Fit all collected feature stages in one pass, then apply them to the same
# DataFrame to obtain the "label" and "features" columns.
partialPipeline = Pipeline(stages=stages)
pipelineModel = partialPipeline.fit(df)
preppedDataDF = pipelineModel.transform(df)

In [None]:
from pyspark.ml.classification import LogisticRegression
# Baseline: fit a LogisticRegression with default parameters on the whole prepared
# DataFrame (no train/test split and no class re-balancing at this point).
lrModel = LogisticRegression().fit(preppedDataDF)

# Plot the ROC curve on the same rows the model was fitted on, so this is a
# training-set curve rather than a held-out estimate.
# NOTE(review): the display(model, data, "ROC") form looks like the Databricks
# notebook helper - confirm it exists in the target runtime.
display(lrModel, preppedDataDF, "ROC")


False Positive Rate,True Positive Rate,Threshold
0.0,0.0,0.770966781993131
0.0,0.0625,0.770966781993131
0.0,0.125,0.7496690156313285
0.0119047619047619,0.125,0.7155013647917717
0.0119047619047619,0.1875,0.591277790680605
0.0119047619047619,0.25,0.5739870371183692
0.0119047619047619,0.3125,0.5540921841378263
0.0238095238095238,0.3125,0.5499895564128667
0.0238095238095238,0.375,0.5353428734355756
0.0238095238095238,0.4375,0.5202341000100895


In [None]:
from pyspark.sql.functions import col, explode, array, lit

# Keep the model inputs plus every original column (e.g. Phone) for later inspection.
selectedcols = ["label", "features"] + cols
dataset = preppedDataDF.select(selectedcols)

# StringIndexer's default ordering gives index 0 to the most frequent Churn value,
# so label 0 is the majority class and label 1 the minority class.
major_df = dataset.filter(col("label") == 0)
minor_df = dataset.filter(col("label") == 1)

minor_count = minor_df.count()
if minor_count == 0:
  # Fail with a clear message instead of a bare ZeroDivisionError.
  raise ValueError("No rows with label == 1; cannot oversample the minority class")

# Whole-number replication factor; never below 1 so minority rows are not dropped.
ratio = max(1, major_df.count() // minor_count)
print("ratio: {}".format(ratio))

# Duplicate the minority rows: attach an array of `ratio` literals to every row,
# explode it into `ratio` copies, then drop the helper column again.
oversampled_df = minor_df.withColumn("dummy", explode(array([lit(x) for x in range(ratio)]))).drop('dummy')

# Combine the oversampled minority rows with the untouched majority rows.
# union() replaces the deprecated unionAll(); both keep duplicate rows.
finaldataset = major_df.union(oversampled_df)
display(finaldataset)

label,features,VMail Message,Day Mins,Eve Mins,Night Mins,Intl Mins,CustServ Calls,Day Calls,Day Charge,Eve Calls,Eve Charge,Night Calls,Night Charge,Intl Calls,Intl Charge,Area Code,Phone,Account Length,Churn,Int'l Plan,VMail Plan,State
0.0,"List(0, 577, List(16, 465, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 576), List(1.0, 1.0, 25.0, 265.1, 197.4, 244.7, 10.0, 1.0, 110.0, 45.07, 99.0, 16.78, 91.0, 11.01, 3.0, 2.7, 128.0, 1.0))",25,265.1,197.4,244.7,10.0,1,110,45.07,99,16.78,91,11.01,3,2.7,415,382-4657,128,0,0,1,KS
0.0,"List(0, 577, List(4, 465, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 576), List(1.0, 1.0, 26.0, 161.6, 195.5, 254.4, 13.7, 1.0, 123.0, 27.47, 103.0, 16.62, 103.0, 11.45, 3.0, 3.7, 107.0, 1.0))",26,161.6,195.5,254.4,13.7,1,123,27.47,103,16.62,103,11.45,3,3.7,415,371-7191,107,0,0,1,OH
0.0,"List(0, 577, List(20, 465, 561, 562, 563, 564, 566, 567, 568, 569, 570, 571, 572, 573, 574), List(1.0, 1.0, 243.4, 121.2, 162.6, 12.2, 114.0, 41.38, 110.0, 10.3, 104.0, 7.32, 5.0, 3.29, 137.0))",0,243.4,121.2,162.6,12.2,0,114,41.38,110,10.3,104,7.32,5,3.29,415,358-1921,137,0,0,0,NJ
0.0,"List(0, 577, List(4, 458, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575), List(1.0, 1.0, 299.4, 61.9, 196.9, 6.6, 2.0, 71.0, 50.9, 88.0, 5.26, 89.0, 8.86, 7.0, 1.78, 84.0, 1.0))",0,299.4,61.9,196.9,6.6,2,71,50.9,88,5.26,89,8.86,7,1.78,408,375-9999,84,0,1,0,OH
0.0,"List(0, 577, List(35, 465, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575), List(1.0, 1.0, 166.7, 148.3, 186.9, 10.1, 3.0, 113.0, 28.34, 122.0, 12.61, 121.0, 8.41, 3.0, 2.73, 75.0, 1.0))",0,166.7,148.3,186.9,10.1,3,113,28.34,122,12.61,121,8.41,3,2.73,415,330-6626,75,0,1,0,OK
0.0,"List(0, 577, List(3, 561, 562, 563, 564, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575), List(1.0, 223.4, 220.6, 203.9, 6.3, 98.0, 37.98, 101.0, 18.75, 118.0, 9.18, 6.0, 1.7, 118.0, 1.0))",0,223.4,220.6,203.9,6.3,0,98,37.98,101,18.75,118,9.18,6,1.7,510,391-8027,118,0,1,0,AL
0.0,"List(0, 577, List(24, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 576), List(1.0, 24.0, 218.2, 348.5, 212.6, 7.5, 3.0, 88.0, 37.09, 108.0, 29.62, 118.0, 9.57, 7.0, 2.03, 121.0, 1.0))",24,218.2,348.5,212.6,7.5,3,88,37.09,108,29.62,118,9.57,7,2.03,510,355-9993,121,0,0,1,MA
0.0,"List(0, 577, List(29, 465, 561, 562, 563, 564, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575), List(1.0, 1.0, 157.0, 103.1, 211.8, 7.1, 79.0, 26.69, 94.0, 8.76, 96.0, 9.53, 6.0, 1.92, 147.0, 1.0))",0,157.0,103.1,211.8,7.1,0,79,26.69,94,8.76,96,9.53,6,1.92,415,329-9001,147,0,1,0,MO
0.0,"List(0, 577, List(47, 458, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574), List(1.0, 1.0, 184.5, 351.6, 215.8, 8.7, 1.0, 97.0, 31.37, 80.0, 29.89, 90.0, 9.71, 4.0, 2.35, 117.0))",0,184.5,351.6,215.8,8.7,1,97,31.37,80,29.89,90,9.71,4,2.35,408,335-4719,117,0,0,0,LA
0.0,"List(0, 577, List(0, 465, 560, 561, 562, 563, 564, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576), List(1.0, 1.0, 37.0, 258.6, 222.0, 326.4, 11.2, 84.0, 43.96, 111.0, 18.87, 97.0, 14.69, 5.0, 3.02, 141.0, 1.0, 1.0))",37,258.6,222.0,326.4,11.2,0,84,43.96,111,18.87,97,14.69,5,3.02,415,330-8173,141,0,1,1,WV


In [None]:
from pyspark.sql.functions import array, col, explode, lit

### Randomly split data into training and test sets. set seed for reproducibility
# Split the original, non-duplicated rows (`dataset`) rather than `finaldataset`.
# Splitting after oversampling lets copies of one minority row fall on both sides
# of the split, so test metrics would be measured on rows the model has seen.
(trainingBase, testData) = dataset.randomSplit([0.7, 0.3], seed=100)

# Re-balance the training portion only; the test set keeps its natural class mix.
train_major = trainingBase.filter(col("label") == 0)
train_minor = trainingBase.filter(col("label") == 1)
train_minor_count = train_minor.count()
if train_minor_count == 0:
  raise ValueError("Training split contains no rows with label == 1; cannot oversample")

# Whole-number replication factor; never below 1 so minority rows are not dropped.
train_ratio = max(1, train_major.count() // train_minor_count)
train_minor_oversampled = (
  train_minor
  .withColumn("dummy", explode(array([lit(i) for i in range(train_ratio)])))
  .drop("dummy")
)
trainingData = train_major.union(train_minor_oversampled)

print(trainingData.count())
print(testData.count())

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the assembled "features" vector, capped at 10 iterations.
# `lr` (the unfitted estimator) is reused later by the parameter grid and CrossValidator.
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)

# Fit on the training split only; this rebinds `lrModel`, replacing the baseline
# model that was fitted on the full prepared DataFrame.
lrModel = lr.fit(trainingData)

In [None]:
# Score the test split with the fitted model. transform() reads the "features"
# column and appends the rawPrediction, probability and prediction columns;
# all other columns (label, Phone, ...) are carried through unchanged.
predictions = lrModel.transform(testData)

In [None]:
# Show the true label next to the predicted class and class probabilities,
# keeping Phone so individual rows can be identified.
selected = predictions.select("label", "prediction", "probability","Phone")
display(selected)


label,prediction,probability,Phone
0.0,0.0,"List(1, 2, List(), List(0.7076649308158433, 0.29233506918415675))",391-6558
0.0,0.0,"List(1, 2, List(), List(0.751661730136469, 0.24833826986353094))",406-6708
0.0,0.0,"List(1, 2, List(), List(0.7702468206314637, 0.22975317936853643))",370-5001
0.0,1.0,"List(1, 2, List(), List(0.189807177394755, 0.8101928226052449))",365-8831
0.0,1.0,"List(1, 2, List(), List(0.4498979267695209, 0.5501020732304791))",408-3269
0.0,1.0,"List(1, 2, List(), List(0.1599229912261716, 0.8400770087738284))",419-6418
0.0,0.0,"List(1, 2, List(), List(0.8255269518349587, 0.1744730481650413))",357-2748
0.0,0.0,"List(1, 2, List(), List(0.7342455518355641, 0.26575444816443594))",367-2056
0.0,1.0,"List(1, 2, List(), List(0.24312938833767, 0.75687061166233))",330-8173
0.0,0.0,"List(1, 2, List(), List(0.7711828283558616, 0.22881717164413842))",383-3375


In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator

# Score the logistic-regression test predictions. No metricName is passed, so the
# evaluator reports its default metric (areaUnderROC per the Spark documentation).
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)


In [None]:
# Show which metric the evaluator is currently configured to report.
evaluator.getMetricName()

In [None]:
# Print every LogisticRegression parameter with its documentation and current
# value; handy when deciding which ones to tune.
print(lr.explainParams())

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Hyper-parameter grid for the logistic regression: 3 x 3 x 3 = 27 combinations of
# regularisation strength, elastic-net mix (0.0 = pure L2, 1.0 = pure L1) and
# maximum number of iterations.
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10])
             .build())

In [None]:
# 5-fold cross-validation over the logistic-regression grid; each candidate is
# scored with `evaluator` and the best one is kept in cvModel.bestModel.
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Fit on the training split only, so the test split stays unseen during tuning.
cvModel = cv.fit(trainingData)
# Slow step: one model is trained per grid combination per fold.

In [None]:
# Score the test split with the best model found by cross-validation.
# This rebinds `predictions`, replacing the un-tuned model's predictions.
predictions = cvModel.transform(testData)

After cross-validation, the area under the ROC curve (AUC) showed only a minor increase, from 82.4 to 82.9.

In [None]:
# cvModel.transform() applied the best model found by cross-validation, so this
# is the tuned model's score on the test split (same evaluator and metric as before).
evaluator.evaluate(predictions)

In [None]:
# Intercept (bias term) of the best logistic-regression model chosen by cross-validation.
print('Model Intercept: ', cvModel.bestModel.intercept)

In [None]:
# Show the tuned logistic-regression coefficients as a one-column table, one row
# per entry of the "features" vector (in vector order).
# Each value is converted to a plain Python float and wrapped in a 1-tuple so that
# createDataFrame treats it as a single-column row.
weights = [(float(w),) for w in cvModel.bestModel.coefficients]
# Use the SparkSession entry point (`spark`), as the rest of the notebook does,
# instead of the legacy `sqlContext` handle.
weightsDF = spark.createDataFrame(weights, ["Feature Weight"])
display(weightsDF)

Feature Weight
-0.0389366427363711
0.0158606259826252
0.0263469380955301
-0.0672799155961221
-0.0299912535601033
-0.0065003920388711
-0.0424004311127875
-0.1154423158952097
-0.0455095316108481
0.0084630155870267


In [None]:
# Tuned logistic regression: true label next to the predicted class and class
# probabilities, keeping Phone so individual rows can be identified.
selected = predictions.select("label", "prediction", "probability", "Phone")
display(selected)

label,prediction,probability,Phone
0.0,0.0,"List(1, 2, List(), List(0.5851007323631793, 0.4148992676368207))",391-6558
0.0,0.0,"List(1, 2, List(), List(0.6008806459583829, 0.3991193540416172))",406-6708
0.0,0.0,"List(1, 2, List(), List(0.565778924370969, 0.43422107562903106))",370-5001
0.0,0.0,"List(1, 2, List(), List(0.5124012788256049, 0.4875987211743951))",365-8831
0.0,0.0,"List(1, 2, List(), List(0.5448921792325845, 0.45510782076741557))",408-3269
0.0,0.0,"List(1, 2, List(), List(0.5098633030540797, 0.4901366969459203))",419-6418
0.0,0.0,"List(1, 2, List(), List(0.6022111387756152, 0.3977888612243849))",357-2748
0.0,0.0,"List(1, 2, List(), List(0.581767820021751, 0.41823217997824896))",367-2056
0.0,0.0,"List(1, 2, List(), List(0.5169389205294224, 0.48306107947057764))",330-8173
0.0,0.0,"List(1, 2, List(), List(0.5955119392125175, 0.40448806078748245))",383-3375


In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# Shallow decision tree (at most 3 levels of splits) as a first, easy-to-read model.
# `dt` (the unfitted estimator) is reused later by the parameter grid and CrossValidator.
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=3)

# Fit on the training split only.
dtModel = dt.fit(trainingData)

In [None]:
# Size of the fitted tree: total node count and actual depth reached.
print("numNodes = ", dtModel.numNodes)
print("depth = ", dtModel.depth)

In [None]:
# Render the fitted tree's nodes (split feature index, threshold, leaf prediction).
# NOTE(review): relies on the notebook's display() supporting tree models - confirm for the target runtime.
display(dtModel)

treeNode
"{""index"":5,""featureType"":""continuous"",""prediction"":null,""threshold"":248.14999999999998,""categories"":null,""feature"":561,""overflow"":false}"
"{""index"":3,""featureType"":""continuous"",""prediction"":null,""threshold"":3.5,""categories"":null,""feature"":565,""overflow"":false}"
"{""index"":1,""featureType"":""continuous"",""prediction"":null,""threshold"":0.5,""categories"":null,""feature"":575,""overflow"":false}"
"{""index"":0,""featureType"":null,""prediction"":0.0,""threshold"":null,""categories"":null,""feature"":null,""overflow"":false}"
"{""index"":2,""featureType"":null,""prediction"":1.0,""threshold"":null,""categories"":null,""feature"":null,""overflow"":false}"
"{""index"":4,""featureType"":null,""prediction"":1.0,""threshold"":null,""categories"":null,""feature"":null,""overflow"":false}"
"{""index"":7,""featureType"":""continuous"",""prediction"":null,""threshold"":2.0,""categories"":null,""feature"":560,""overflow"":false}"
"{""index"":6,""featureType"":null,""prediction"":1.0,""threshold"":null,""categories"":null,""feature"":null,""overflow"":false}"
"{""index"":9,""featureType"":""continuous"",""prediction"":null,""threshold"":0.5,""categories"":null,""feature"":575,""overflow"":false}"
"{""index"":8,""featureType"":null,""prediction"":0.0,""threshold"":null,""categories"":null,""feature"":null,""overflow"":false}"


In [None]:
# Score the test split with the depth-3 tree. This rebinds `predictions`,
# replacing the logistic-regression predictions.
predictions = dtModel.transform(testData)

In [None]:
# Inspect the columns of the scored DataFrame, including those added by transform().
predictions.printSchema()

In [None]:
# Depth-3 tree: true label next to the predicted class and class probabilities,
# keeping Phone so individual rows can be identified.
selected = predictions.select("label", "prediction", "probability", "Phone")
display(selected)

label,prediction,probability,Phone
0.0,0.0,"List(1, 2, List(), List(0.8250369640216856, 0.17496303597831445))",391-6558
0.0,0.0,"List(1, 2, List(), List(0.8250369640216856, 0.17496303597831445))",406-6708
0.0,0.0,"List(1, 2, List(), List(0.8250369640216856, 0.17496303597831445))",370-5001
0.0,1.0,"List(1, 2, List(), List(0.14671814671814673, 0.8532818532818532))",365-8831
0.0,1.0,"List(1, 2, List(), List(0.12520593080724876, 0.8747940691927513))",408-3269
0.0,1.0,"List(1, 2, List(), List(0.26291079812206575, 0.7370892018779343))",419-6418
0.0,0.0,"List(1, 2, List(), List(0.8250369640216856, 0.17496303597831445))",357-2748
0.0,0.0,"List(1, 2, List(), List(0.8250369640216856, 0.17496303597831445))",367-2056
0.0,1.0,"List(1, 2, List(), List(0.08333333333333333, 0.9166666666666666))",330-8173
0.0,0.0,"List(1, 2, List(), List(0.8250369640216856, 0.17496303597831445))",383-3375


In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Rebind `evaluator` to a fresh evaluator with all defaults (default metric is
# areaUnderROC per the Spark documentation) and score the decision-tree
# predictions on the test split.
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

In [None]:
# Show which impurity criterion the tree estimator uses to choose splits.
dt.getImpurity()

In [None]:
# Hyper-parameter grid for the decision tree: 4 x 3 = 12 combinations of maximum
# depth and number of bins. This rebinds `paramGrid`, replacing the
# logistic-regression grid.
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [1, 2, 6, 10])
             .addGrid(dt.maxBins, [20, 40, 80])
             .build())

In [None]:
# 5-fold cross-validation over the decision-tree grid, scored with `evaluator`.
# This rebinds `cv` and `cvModel`, replacing the logistic-regression ones.
cv = CrossValidator(estimator=dt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Fit on the training split only, so the test split stays unseen during tuning.
cvModel = cv.fit(trainingData)
# Takes ~5 minutes

In [None]:
# Size of the best tree chosen by cross-validation: total node count and depth.
print("numNodes = ", cvModel.bestModel.numNodes)
print("depth = ", cvModel.bestModel.depth)

In [None]:
# Score the test split with the best decision tree found by cross-validation.
# This rebinds `predictions`, replacing the depth-3 tree's predictions.
predictions = cvModel.transform(testData)

In [None]:
# Evaluator score of the cross-validated decision tree on the test split.
evaluator.evaluate(predictions)

In [None]:
# Tuned decision tree: true label next to the predicted class and class
# probabilities, keeping Phone so individual rows can be identified.
selected = predictions.select("label", "prediction", "probability", "Phone")
display(selected)

label,prediction,probability,Phone
0.0,0.0,"List(1, 2, List(), List(0.9782214156079855, 0.021778584392014518))",391-6558
0.0,0.0,"List(1, 2, List(), List(0.9782214156079855, 0.021778584392014518))",406-6708
0.0,0.0,"List(1, 2, List(), List(0.9782214156079855, 0.021778584392014518))",370-5001
0.0,0.0,"List(1, 2, List(), List(1.0, 0.0))",365-8831
0.0,1.0,"List(1, 2, List(), List(0.011235955056179775, 0.9887640449438202))",408-3269
0.0,0.0,"List(1, 2, List(), List(1.0, 0.0))",419-6418
0.0,0.0,"List(1, 2, List(), List(0.9782214156079855, 0.021778584392014518))",357-2748
0.0,0.0,"List(1, 2, List(), List(0.9782214156079855, 0.021778584392014518))",367-2056
0.0,0.0,"List(1, 2, List(), List(1.0, 0.0))",330-8173
0.0,0.0,"List(1, 2, List(), List(0.7272727272727273, 0.2727272727272727))",383-3375


In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest with library-default hyper-parameters as the starting point.
# `rf` (the unfitted estimator) is reused later by the parameter grid and CrossValidator.
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

# Fit on the training split only.
rfModel = rf.fit(trainingData)

In [None]:
# Score the test split with the default random forest. This rebinds `predictions`,
# replacing the decision-tree predictions.
predictions = rfModel.transform(testData)

In [None]:
# Inspect the columns of the scored DataFrame, including those added by transform().
predictions.printSchema()

In [None]:
# Default random forest: true label next to the predicted class and class
# probabilities, keeping Phone so individual rows can be identified.
selected = predictions.select("label", "prediction", "probability", "Phone")
display(selected)

label,prediction,probability,Phone
0.0,0.0,"List(1, 2, List(), List(0.6132798963479383, 0.3867201036520617))",391-6558
0.0,0.0,"List(1, 2, List(), List(0.6117720654830512, 0.3882279345169487))",406-6708
0.0,0.0,"List(1, 2, List(), List(0.5784089789704904, 0.4215910210295095))",370-5001
0.0,0.0,"List(1, 2, List(), List(0.5122026704917576, 0.4877973295082424))",365-8831
0.0,1.0,"List(1, 2, List(), List(0.4428703297905542, 0.5571296702094459))",408-3269
0.0,0.0,"List(1, 2, List(), List(0.6054854102736826, 0.39451458972631726))",419-6418
0.0,0.0,"List(1, 2, List(), List(0.6737362748731685, 0.3262637251268315))",357-2748
0.0,0.0,"List(1, 2, List(), List(0.6324982597332178, 0.3675017402667823))",367-2056
0.0,1.0,"List(1, 2, List(), List(0.4733306088475593, 0.5266693911524407))",330-8173
0.0,0.0,"List(1, 2, List(), List(0.5797474395598022, 0.42025256044019776))",383-3375


In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Rebind `evaluator` to a fresh evaluator with all defaults (default metric is
# areaUnderROC per the Spark documentation) and score the random-forest
# predictions on the test split.
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

In [None]:
# Hyper-parameter grid for the random forest: 3 x 2 x 2 = 12 combinations of
# maximum depth, number of bins and number of trees. This rebinds `paramGrid`,
# replacing the decision-tree grid.
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [2, 4, 6])
             .addGrid(rf.maxBins, [20, 60])
             .addGrid(rf.numTrees, [5, 20])
             .build())

In [None]:
# 5-fold cross-validation over the random-forest grid, scored with `evaluator`.
# This rebinds `cv` and `cvModel`, replacing the decision-tree ones.
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Slow step (about 6 minutes): one forest of up to 20 trees is trained per grid
# combination per fold. Fitted on the training split only.
cvModel = cv.fit(trainingData)

In [None]:
# Score the test split with the best random forest found by cross-validation.
# This rebinds `predictions`, replacing the default forest's predictions.
predictions = cvModel.transform(testData)

Applying cross-validation, the score improves from 82.80 to 87.5, which is still good but lower than what we obtained from the Decision Tree after applying cross-validation.

In [None]:
# cvModel.transform() applied the best random forest found by cross-validation,
# so this is the tuned model's evaluator score on the test split.
evaluator.evaluate(predictions)

In [None]:
# Tuned random forest: true label next to the predicted class and class
# probabilities, keeping Phone so individual rows can be identified.
selected = predictions.select("label", "prediction", "probability", "Phone")
display(selected)

label,prediction,probability,Phone
0.0,0.0,"List(1, 2, List(), List(0.6537949691098436, 0.34620503089015653))",391-6558
0.0,0.0,"List(1, 2, List(), List(0.6093569096971151, 0.3906430903028849))",406-6708
0.0,0.0,"List(1, 2, List(), List(0.5544071820385332, 0.4455928179614668))",370-5001
0.0,1.0,"List(1, 2, List(), List(0.481054489581204, 0.518945510418796))",365-8831
0.0,1.0,"List(1, 2, List(), List(0.44952558286924527, 0.5504744171307546))",408-3269
0.0,0.0,"List(1, 2, List(), List(0.6470016084092074, 0.3529983915907926))",419-6418
0.0,0.0,"List(1, 2, List(), List(0.6868063794176821, 0.3131936205823179))",357-2748
0.0,0.0,"List(1, 2, List(), List(0.6823556819983491, 0.3176443180016509))",367-2056
0.0,0.0,"List(1, 2, List(), List(0.507167255423221, 0.49283274457677884))",330-8173
0.0,0.0,"List(1, 2, List(), List(0.5914222825182887, 0.40857771748171123))",383-3375


In [None]:
# Keep a handle on the best random forest: `cvModel` is rebound again later by
# the GBT cross-validation.
bestModel = cvModel.bestModel

In [None]:
# Score every original row (`dataset`, before oversampling and splitting).
# This includes rows the model was trained on, so metrics computed from it are
# not a held-out estimate.
finalPredictions = bestModel.transform(dataset)

In [None]:
# Evaluator score of the best random forest over the full original dataset
# (training rows included).
evaluator.evaluate(finalPredictions)

In [None]:
# Expose the scored rows to the %sql cells under the name finalPredictions.
finalPredictions.createOrReplaceTempView("finalPredictions")

In [None]:
%sql
-- Confusion-matrix tally: number of rows for each (prediction, label) pair,
-- smallest group first.
SELECT
  prediction,
  label,
  COUNT(*) AS count
FROM finalPredictions
GROUP BY
  prediction,
  label
ORDER BY COUNT(*)

prediction,label,count
1.0,0.0,131
0.0,1.0,199
1.0,1.0,284
0.0,0.0,2719


In [None]:
%sql
-- Class balance of the scored rows: row count per original Churn value.
SELECT
  Count(*),
  Churn
FROM finalPredictions
GROUP BY Churn;

count(1),Churn
483,1
2850,0


In [None]:
from pyspark.ml.classification import GBTClassifier
# Gradient-boosted trees with 15 boosting iterations; label/feature column names
# are left at the estimator defaults.
gbt = GBTClassifier(maxIter=15)
# Fit on the training split, then score the test split.
GBT_Model = gbt.fit(trainingData)
gbt_predictions = GBT_Model.transform(testData)
# Fresh default evaluator; the metric is passed explicitly at evaluate() time.
evaluator = BinaryClassificationEvaluator()
print("Test_SET (Area Under ROC): " + str(evaluator.evaluate(gbt_predictions, {evaluator.metricName: "areaUnderROC"})))

In [None]:
# Score every original row (`dataset`, training rows included) with the un-tuned
# GBT model and expose the result to the %sql cells as GBTPredictions.
GBT_Model.transform(dataset).createOrReplaceTempView("GBTPredictions")

In [None]:
%sql
-- Confusion-matrix tally for the GBT model: row count per (prediction, label) pair.
SELECT
  Prediction,
  label,
  Count(*)
FROM GBTPredictions
GROUP BY
  Prediction,
  label

Prediction,label,count(1)
1.0,1.0,412
0.0,1.0,71
1.0,0.0,88
0.0,0.0,2762


In [None]:
%sql
-- Show every column of the GBT-scored view, including the model output columns.
SELECT *
FROM GBTPredictions;

label,features,VMail Message,Day Mins,Eve Mins,Night Mins,Intl Mins,CustServ Calls,Day Calls,Day Charge,Eve Calls,Eve Charge,Night Calls,Night Charge,Intl Calls,Intl Charge,Area Code,Phone,Account Length,Churn,Int'l Plan,VMail Plan,State,rawPrediction,probability,prediction
0.0,"List(0, 577, List(16, 465, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 576), List(1.0, 1.0, 25.0, 265.1, 197.4, 244.7, 10.0, 1.0, 110.0, 45.07, 99.0, 16.78, 91.0, 11.01, 3.0, 2.7, 128.0, 1.0))",25,265.1,197.4,244.7,10.0,1,110,45.07,99,16.78,91,11.01,3,2.7,415,382-4657,128,0,0,1,KS,"List(1, 2, List(), List(1.1518327997168585, -1.1518327997168585))","List(1, 2, List(), List(0.9091801676805301, 0.09081983231946988))",0.0
0.0,"List(0, 577, List(4, 465, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 576), List(1.0, 1.0, 26.0, 161.6, 195.5, 254.4, 13.7, 1.0, 123.0, 27.47, 103.0, 16.62, 103.0, 11.45, 3.0, 3.7, 107.0, 1.0))",26,161.6,195.5,254.4,13.7,1,123,27.47,103,16.62,103,11.45,3,3.7,415,371-7191,107,0,0,1,OH,"List(1, 2, List(), List(0.8351912450734761, -0.8351912450734761))","List(1, 2, List(), List(0.8416268103581362, 0.15837318964186375))",0.0
0.0,"List(0, 577, List(20, 465, 561, 562, 563, 564, 566, 567, 568, 569, 570, 571, 572, 573, 574), List(1.0, 1.0, 243.4, 121.2, 162.6, 12.2, 114.0, 41.38, 110.0, 10.3, 104.0, 7.32, 5.0, 3.29, 137.0))",0,243.4,121.2,162.6,12.2,0,114,41.38,110,10.3,104,7.32,5,3.29,415,358-1921,137,0,0,0,NJ,"List(1, 2, List(), List(0.9149658255900011, -0.9149658255900011))","List(1, 2, List(), List(0.8617535843369872, 0.13824641566301277))",0.0
0.0,"List(0, 577, List(4, 458, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575), List(1.0, 1.0, 299.4, 61.9, 196.9, 6.6, 2.0, 71.0, 50.9, 88.0, 5.26, 89.0, 8.86, 7.0, 1.78, 84.0, 1.0))",0,299.4,61.9,196.9,6.6,2,71,50.9,88,5.26,89,8.86,7,1.78,408,375-9999,84,0,1,0,OH,"List(1, 2, List(), List(0.8009084162835106, -0.8009084162835106))","List(1, 2, List(), List(0.8322721593908246, 0.16772784060917545))",0.0
0.0,"List(0, 577, List(35, 465, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575), List(1.0, 1.0, 166.7, 148.3, 186.9, 10.1, 3.0, 113.0, 28.34, 122.0, 12.61, 121.0, 8.41, 3.0, 2.73, 75.0, 1.0))",0,166.7,148.3,186.9,10.1,3,113,28.34,122,12.61,121,8.41,3,2.73,415,330-6626,75,0,1,0,OK,"List(1, 2, List(), List(1.046428291578282, -1.046428291578282))","List(1, 2, List(), List(0.8902069352942191, 0.10979306470578087))",0.0
0.0,"List(0, 577, List(3, 561, 562, 563, 564, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575), List(1.0, 223.4, 220.6, 203.9, 6.3, 98.0, 37.98, 101.0, 18.75, 118.0, 9.18, 6.0, 1.7, 118.0, 1.0))",0,223.4,220.6,203.9,6.3,0,98,37.98,101,18.75,118,9.18,6,1.7,510,391-8027,118,0,1,0,AL,"List(1, 2, List(), List(0.8659086914215557, -0.8659086914215557))","List(1, 2, List(), List(0.8496447358512501, 0.1503552641487499))",0.0
0.0,"List(0, 577, List(24, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 576), List(1.0, 24.0, 218.2, 348.5, 212.6, 7.5, 3.0, 88.0, 37.09, 108.0, 29.62, 118.0, 9.57, 7.0, 2.03, 121.0, 1.0))",24,218.2,348.5,212.6,7.5,3,88,37.09,108,29.62,118,9.57,7,2.03,510,355-9993,121,0,0,1,MA,"List(1, 2, List(), List(0.37459382801162255, -0.37459382801162255))","List(1, 2, List(), List(0.6790016677316155, 0.3209983322683845))",0.0
0.0,"List(0, 577, List(29, 465, 561, 562, 563, 564, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575), List(1.0, 1.0, 157.0, 103.1, 211.8, 7.1, 79.0, 26.69, 94.0, 8.76, 96.0, 9.53, 6.0, 1.92, 147.0, 1.0))",0,157.0,103.1,211.8,7.1,0,79,26.69,94,8.76,96,9.53,6,1.92,415,329-9001,147,0,1,0,MO,"List(1, 2, List(), List(1.0017784398617644, -1.0017784398617644))","List(1, 2, List(), List(0.8811700220058993, 0.11882997799410067))",0.0
0.0,"List(0, 577, List(47, 458, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574), List(1.0, 1.0, 184.5, 351.6, 215.8, 8.7, 1.0, 97.0, 31.37, 80.0, 29.89, 90.0, 9.71, 4.0, 2.35, 117.0))",0,184.5,351.6,215.8,8.7,1,97,31.37,80,29.89,90,9.71,4,2.35,408,335-4719,117,0,0,0,LA,"List(1, 2, List(), List(1.0738216129962235, -1.0738216129962235))","List(1, 2, List(), List(0.8954483398049948, 0.10455166019500517))",0.0
0.0,"List(0, 577, List(0, 465, 560, 561, 562, 563, 564, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576), List(1.0, 1.0, 37.0, 258.6, 222.0, 326.4, 11.2, 84.0, 43.96, 111.0, 18.87, 97.0, 14.69, 5.0, 3.02, 141.0, 1.0, 1.0))",37,258.6,222.0,326.4,11.2,0,84,43.96,111,18.87,97,14.69,5,3.02,415,330-8173,141,0,1,1,WV,"List(1, 2, List(), List(-0.9270506915044602, 0.9270506915044602))","List(1, 2, List(), List(0.13539206642953927, 0.8646079335704607))",1.0


Next we try cross-validation, but it yields only a slight increase, to 95.39, while the area under ROC remains the same, i.e. 95.48.

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Hyper-parameter grid for the GBT model: 3 x 2 x 2 = 12 combinations of maximum
# depth, number of bins and boosting iterations.
paramGrid = (ParamGridBuilder()
             .addGrid(gbt.maxDepth, [2, 4, 6])
             .addGrid(gbt.maxBins, [20, 30])
             .addGrid(gbt.maxIter, [10, 15])
             .build())
# 5-fold cross-validation scored with `evaluator`; this rebinds `cv` and `cvModel`,
# replacing the random-forest ones.
cv = CrossValidator(estimator=gbt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
# Fit on the training split only, then score the test split with the best model.
cvModel = cv.fit(trainingData)
gbt_cv_predictions = cvModel.transform(testData)
evaluator.evaluate(gbt_cv_predictions)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Test-set report for the cross-validated GBT model. All five metrics must be
# computed from `gbt_cv_predictions`, the output of the tuned model.
evalu = MulticlassClassificationEvaluator()
print("GBT_CV (Accuracy): " + str(evalu.evaluate(gbt_cv_predictions, {evalu.metricName: "accuracy"})))
print("GBT_CV (F1): " + str(evalu.evaluate(gbt_cv_predictions, {evalu.metricName: "f1"})))
# True-positive rate (recall) for the churn class, label 1.0.
print("GBT_CV (truePositiveRate): " + str(evalu.evaluate(gbt_cv_predictions, {evalu.metricName: "truePositiveRateByLabel", evalu.metricLabel: 1.0})))
# These two lines used to score `gbt_predictions` (the un-tuned GBT model), so the
# values printed under the "GBT_CV" label could never change with tuning.
print("GBT_CV (Area Under ROC): " + str(evaluator.evaluate(gbt_cv_predictions, {evaluator.metricName: "areaUnderROC"})))
print("GBT_CV (Area Under PR): " + str(evaluator.evaluate(gbt_cv_predictions, {evaluator.metricName: "areaUnderPR"})))

In [None]:
# Display the best GBT model selected by the cross-validation.
cvModel.bestModel