In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the SparkSession that every later cell reads data through.
spark = (
    SparkSession.builder
    .appName("Employee Attrition using GradientBoostedTree")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [3]:
# Read the training CSV: first row is the header, column types are inferred.
df = (
    spark.read
    .format('csv')
    .options(header=True, inferSchema=True)
    .load('churn_train.csv')
)
# Preview the leading rows of the training frame.
df.show()

+---+-------+-------+---------+--------+------+----------+------+------+-------+------+------+-------+------+-------+-------+------+------+--------+----+-------+
| st| acclen| arcode|    phnum| intplan| voice|nummailmes| tdmin| tdcal| tdchar| temin| tecal| tecahr| tnmin| tn cal| tnchar| timin| tical| tichar |ncsc|  label|
+---+-------+-------+---------+--------+------+----------+------+------+-------+------+------+-------+------+-------+-------+------+------+--------+----+-------+
| KS|    128|    415| 382-4657|      no|   yes|        25| 265.1|   110|  45.07| 197.4|    99|  16.78| 244.7|     91|  11.01|  10.0|     3|     2.7|   1| False.|
| OH|    107|    415| 371-7191|      no|   yes|        26| 161.6|   123|  27.47| 195.5|   103|  16.62| 254.4|    103|  11.45|  13.7|     3|     3.7|   1| False.|
| NJ|    137|    415| 358-1921|      no|    no|         0| 243.4|   114|  41.38| 121.2|   110|   10.3| 162.6|    104|   7.32|  12.2|     5|    3.29|   0| False.|
| OH|     84|    408| 375-99

In [4]:
# Print the inferred column names and types of the training frame.
# Several names carry leading/trailing spaces taken from the CSV header
# (e.g. " acclen", " tichar "), so later cells must quote them exactly.
df.printSchema()

root
 |-- st: string (nullable = true)
 |--  acclen: integer (nullable = true)
 |--  arcode: integer (nullable = true)
 |--  phnum: string (nullable = true)
 |--  intplan: string (nullable = true)
 |--  voice: string (nullable = true)
 |-- nummailmes: integer (nullable = true)
 |--  tdmin: double (nullable = true)
 |--  tdcal: integer (nullable = true)
 |--  tdchar: double (nullable = true)
 |--  temin: double (nullable = true)
 |--  tecal: integer (nullable = true)
 |--  tecahr: double (nullable = true)
 |--  tnmin: double (nullable = true)
 |--  tn cal: integer (nullable = true)
 |--  tnchar: double (nullable = true)
 |--  timin: double (nullable = true)
 |--  tical: integer (nullable = true)
 |--  tichar : double (nullable = true)
 |-- ncsc: integer (nullable = true)
 |--  label: string (nullable = true)



### Vectorize the columns

In [6]:
# Combine the numeric columns into a single "features" vector column.
# Column names keep the exact whitespace found in the CSV header.
# NOTE(review): " tdchar" appears in the schema but is not listed here while
# " tecahr", " tnchar" and " tichar " are — confirm the omission is intended.
assembler = VectorAssembler(
    inputCols=[
        " acclen",
        " arcode",
        "nummailmes",
        " tdmin",
        " tdcal",
        " temin",
        " tecal",
        " tecahr",
        " tnmin",
        " tn cal",
        " tnchar",
        " timin",
        " tical",
        " tichar ",
        "ncsc",
    ],
    outputCol="features",
)

In [7]:
# Append the assembled "features" vector column to the training frame.
output = assembler.transform(df)
# Display the feature vectors without truncating the column width.
output.select("features").show(truncate=False)

+-----------------------------------------------------------------------------------+
|features                                                                           |
+-----------------------------------------------------------------------------------+
|[128.0,415.0,25.0,265.1,110.0,197.4,99.0,16.78,244.7,91.0,11.01,10.0,3.0,2.7,1.0]  |
|[107.0,415.0,26.0,161.6,123.0,195.5,103.0,16.62,254.4,103.0,11.45,13.7,3.0,3.7,1.0]|
|[137.0,415.0,0.0,243.4,114.0,121.2,110.0,10.3,162.6,104.0,7.32,12.2,5.0,3.29,0.0]  |
|[84.0,408.0,0.0,299.4,71.0,61.9,88.0,5.26,196.9,89.0,8.86,6.6,7.0,1.78,2.0]        |
|[75.0,415.0,0.0,166.7,113.0,148.3,122.0,12.61,186.9,121.0,8.41,10.1,3.0,2.73,3.0]  |
|[118.0,510.0,0.0,223.4,98.0,220.6,101.0,18.75,203.9,118.0,9.18,6.3,6.0,1.7,0.0]    |
|[121.0,510.0,24.0,218.2,88.0,348.5,108.0,29.62,212.6,118.0,9.57,7.5,7.0,2.03,3.0]  |
|[147.0,415.0,0.0,157.0,79.0,103.1,94.0,8.76,211.8,96.0,9.53,7.1,6.0,1.92,0.0]      |
|[117.0,408.0,0.0,184.5,97.0,351.6,80.0,29.89,215.8,90

### Index labels, adding metadata to the label column
### Fit on the full training set so that every label present in it is included in the index

In [8]:
# Fit a string-to-index mapping for the " label" column on the training
# frame; the fitted model writes the numeric label to "newlabel".
labelIndexer = (
    StringIndexer(inputCol=" label", outputCol="newlabel")
    .fit(output)
)

### Automatically identify categorical features, and index them
### Set maxCategories so features with > 4 distinct values are treated as continuous

In [9]:
# Fit the feature indexer on the training frame. Components of "features"
# with at most 4 distinct values are indexed as categorical; the result is
# written to "indexedFeatures".
featureIndexer = (
    VectorIndexer(
        inputCol="features",
        outputCol="indexedFeatures",
        maxCategories=4,
    )
    .fit(output)
)

### Train a GBT model

In [11]:

# Gradient-boosted tree classifier reading the indexed label and feature
# columns produced by the two indexers, limited to 10 boosting iterations.
gbt = GBTClassifier(
    labelCol="newlabel",
    featuresCol="indexedFeatures",
    maxIter=10,
)

### Chain indexers and GBT in a Pipeline

In [12]:

# Chain the two already-fitted indexers and the GBT estimator, in the order
# in which their output columns are consumed.
pipeline = Pipeline(
    stages=[
        labelIndexer,
        featureIndexer,
        gbt,
    ]
)

###  Train model

In [13]:

# Fit the pipeline on the assembled training frame; the result is the model
# applied to the test frame further down.
model = pipeline.fit(output)

In [14]:
# Read the test CSV with the same reader settings as the training file:
# first row is the header, column types are inferred.
df1 = (
    spark.read
    .format('csv')
    .options(header=True, inferSchema=True)
    .load('churn_test.csv')
)
# Preview the leading rows of the test frame.
df1.show()

+---+-------+-------+---------+--------+------+----------+------+------+-------+------+------+-------+------+-------+-------+------+------+--------+----+-------+
| st| acclen| arcode|    phnum| intplan| voice|nummailmes| tdmin| tdcal| tdchar| temin| tecal| tecahr| tnmin| tn cal| tnchar| timin| tical| tichar |ncsc|  label|
+---+-------+-------+---------+--------+------+----------+------+------+-------+------+------+-------+------+-------+-------+------+------+--------+----+-------+
| HI|    101|    510| 354-8815|      no|    no|         0|  70.9|   123|  12.05| 211.9|    73|  18.01| 236.0|     73|  10.62|  10.6|     3|    2.86|   3| False.|
| MT|    137|    510| 381-7211|      no|    no|         0| 223.6|    86|  38.01| 244.8|   139|  20.81|  94.2|     81|   4.24|   9.5|     7|    2.57|   0| False.|
| OH|    103|    408| 411-9481|      no|   yes|        29| 294.7|    95|   50.1| 237.3|   105|  20.17| 300.3|    127|  13.51|  13.7|     6|     3.7|   1| False.|
| NM|     99|    415| 418-91

In [15]:
# Print the inferred schema of the test frame so it can be compared with the
# training schema (column names, including their whitespace, and types).
df1.printSchema()

root
 |-- st: string (nullable = true)
 |--  acclen: integer (nullable = true)
 |--  arcode: integer (nullable = true)
 |--  phnum: string (nullable = true)
 |--  intplan: string (nullable = true)
 |--  voice: string (nullable = true)
 |-- nummailmes: integer (nullable = true)
 |--  tdmin: double (nullable = true)
 |--  tdcal: integer (nullable = true)
 |--  tdchar: double (nullable = true)
 |--  temin: double (nullable = true)
 |--  tecal: integer (nullable = true)
 |--  tecahr: double (nullable = true)
 |--  tnmin: double (nullable = true)
 |--  tn cal: integer (nullable = true)
 |--  tnchar: double (nullable = true)
 |--  timin: double (nullable = true)
 |--  tical: integer (nullable = true)
 |--  tichar : double (nullable = true)
 |-- ncsc: integer (nullable = true)
 |--  label: string (nullable = true)



### Vectorize the columns of the test dataset

In [16]:
# Rebuild the assembler for the test frame. The column list and output name
# are the same as those used for the training frame, so the test vectors
# line up with what the fitted model expects.
# NOTE(review): " tdchar" is in the schema but not in this list — confirm
# the omission is intended.
assembler = VectorAssembler(
    inputCols=[
        " acclen",
        " arcode",
        "nummailmes",
        " tdmin",
        " tdcal",
        " temin",
        " tecal",
        " tecahr",
        " tnmin",
        " tn cal",
        " tnchar",
        " timin",
        " tical",
        " tichar ",
        "ncsc",
    ],
    outputCol="features",
)

In [17]:
# Append the assembled "features" vector column to the test frame.
output1 = assembler.transform(df1)
# Display the test feature vectors without truncating the column width.
output1.select("features").show(truncate=False)

+-----------------------------------------------------------------------------------+
|features                                                                           |
+-----------------------------------------------------------------------------------+
|[101.0,510.0,0.0,70.9,123.0,211.9,73.0,18.01,236.0,73.0,10.62,10.6,3.0,2.86,3.0]   |
|[137.0,510.0,0.0,223.6,86.0,244.8,139.0,20.81,94.2,81.0,4.24,9.5,7.0,2.57,0.0]     |
|[103.0,408.0,29.0,294.7,95.0,237.3,105.0,20.17,300.3,127.0,13.51,13.7,6.0,3.7,1.0] |
|[99.0,415.0,0.0,216.8,123.0,126.4,88.0,10.74,220.6,82.0,9.93,15.7,2.0,4.24,1.0]    |
|[108.0,415.0,0.0,197.4,78.0,124.0,101.0,10.54,204.5,107.0,9.2,7.7,4.0,2.08,2.0]    |
|[117.0,415.0,0.0,226.5,85.0,141.6,68.0,12.04,223.0,90.0,10.04,6.9,5.0,1.86,1.0]    |
|[63.0,415.0,32.0,218.9,124.0,214.3,125.0,18.22,260.3,120.0,11.71,12.9,3.0,3.48,1.0]|
|[94.0,408.0,0.0,157.5,97.0,224.5,112.0,19.08,310.8,106.0,13.99,11.1,6.0,3.0,0.0]   |
|[138.0,510.0,0.0,89.1,117.0,126.8,46.0,10.78,190.5,71

In [18]:
# Fit a separate feature indexer on the test frame.
# NOTE(review): no cell visible in this notebook reads featureIndexer1; the
# prediction step uses `model`, whose pipeline holds the indexer fitted on
# the training frame. Confirm whether this cell is still needed.
featureIndexer1 = (
    VectorIndexer(
        inputCol="features",
        outputCol="indexedFeatures",
        maxCategories=4,
    )
    .fit(output1)
)

In [19]:
# Run the fitted pipeline (label indexer, feature indexer, GBT) over the
# assembled test frame to obtain the predictions evaluated below.
predictions = model.transform(output1)

### Compare predictions with the true labels and compute the test error

In [21]:

# Score the predictions: accuracy of "prediction" against the indexed label.
evaluator = MulticlassClassificationEvaluator(
    labelCol="newlabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

# Report the error rate (1 - accuracy) followed by the accuracy itself.
print("Test Error = %g" % (1.0 - accuracy))
print("Accuracy=%g" % accuracy)

Test Error = 0.074985
Accuracy=0.925015


In [22]:
# The GBT model is the last of the three pipeline stages
# (label indexer, feature indexer, GBT).
gbtModel = model.stages[-1]
# Printing the model gives a one-line summary only, not the individual trees.
print(gbtModel)

GBTClassificationModel (uid=GBTClassifier_4ddf975623932297043e) with 10 trees
