### Create SPARK_HOME and PYLIB env vars and add the PySpark libraries to the Python module search path (sys.path)

In [2]:
!hdfs dfs -ls /user/thomasj/bank.csv


ls: `/user/thomasj/bank.csv': No such file or directory


In [3]:
import os
import sys
# Point this kernel at the cluster's Spark 2 client and its bundled Python libraries.
os.environ["SPARK_HOME"] = "/usr/hdp/current/spark2-client"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"

# Prepend the Py4J and PySpark archives to the module search path.
# pyspark.zip is inserted last, so it ends up ahead of the Py4J archive.
for lib_archive in ("/py4j-0.10.4-src.zip", "/pyspark.zip"):
    sys.path.insert(0, os.environ["PYLIB"] + lib_archive)

### Initializing Spark

Build __SparkConf__ object 

    Contains information about your application.  


Create __SparkContext__ object 
    
    Tells Spark how to access a cluster. 
    

Create __SparkSession__ object

    The entry point to programming Spark with the Dataset and DataFrame API.

    Used to create DataFrame, register DataFrame as tables and execute SQL over tables etc.

In [4]:
# from pyspark.conf import SparkConf
# from pyspark import SparkContext
# from pyspark.sql import SparkSession

# conf = SparkConf().setAppName("Universal Bank Data Set").setMaster('local')
# sc = SparkContext(conf=conf)
# spark = SparkSession(sc)

In [5]:
# from pyspark.conf import SparkConf
# from pyspark import SparkContext
from pyspark.sql import SparkSession

# Build (or reuse) a SparkSession running in local mode on all available cores.
spark = (
    SparkSession.builder
    .appName("SparkML")
    .master('local[*]')
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext as well.
sc = spark.sparkContext

### Loading the dependent libraries

In [6]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

from pyspark.sql.functions import isnan, when, count, col, countDistinct


#### Problem Statement
The dataset comes from a bank and relates to the direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact with the same client was required to assess whether the product (bank term deposit) would be subscribed to or not. The data and attribute descriptions are in the folder. 


#### Data Dictionary
 The dataset has the following attributes:

1 - age (numeric)

2 - job : type of job (categorical: "admin.","unknown","unemployed","management","housemaid","entrepreneur","student",
                                    "blue-collar","self-employed","retired","technician","services") 

3 - marital : marital status (categorical: "married","divorced","single"; note: "divorced" means divorced or widowed)

4 - education (categorical: "unknown","secondary","primary","tertiary")

5 - default: has credit in default? (binary: "yes","no")

6 - balance: average yearly balance, in euros (numeric) 

7 - housing: has housing loan? (binary: "yes","no")

8 - loan: has personal loan? (binary: "yes","no")

9 - contact: contact communication type (categorical: "unknown","telephone","cellular") 

10 - day: last contact day of the month (numeric)

11 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")

12 - duration: last contact duration, in seconds (numeric)

13 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)

14 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
  
15 - previous: number of contacts performed before this campaign and for this client (numeric)

16 - poutcome: outcome of the previous marketing campaign (categorical: "unknown","other","failure","success")

17 - Approved_no_yes - has the client subscribed to a __term deposit?__ (binary: "yes","no")

### Defining the schema to the data

In [7]:
## Define Schema
# (column name, Spark type) pairs, listed in the order the columns are declared.
# Every field is created as nullable.
bank_column_types = [
    ("age", IntegerType()),
    ("job", StringType()),
    ("marital_status", StringType()),
    ("education", StringType()),
    ("default", StringType()),
    ("balance", DoubleType()),
    ("housing", StringType()),
    ("loan", StringType()),
    ("contact", StringType()),
    ("day", IntegerType()),
    ("month", StringType()),
    ("duration", DoubleType()),
    ("campaign", DoubleType()),
    ("pdays", DoubleType()),
    ("previous", DoubleType()),
    ("poutcome", StringType()),
    ("Approved_no_yes", StringType()),
]

bankDataSchema = StructType(
    [StructField(field_name, field_type, True)
     for field_name, field_type in bank_column_types])

### Reading the data and creating a dataframe

In [8]:
## Read data and create a dataframe
#data = spark.read.format("csv")\
#       .option("header", "false")\
#       .option("inferSchema", "false")\
#       .load("/user/chaithanyas/bank.csv", schema = bankDataSchema)

In [9]:
!hdfs dfs -cat /user/thomasj/bank.csv | head -10

cat: `/user/thomasj/bank.csv': No such file or directory


In [13]:
# Read the header-less CSV from the local filesystem (file:// URI) and apply
# the explicit schema rather than inferring column types.
bank_csv_path = 'file:///home/thomasj/Batch52/bank.csv'
data = spark.read.csv(path=bank_csv_path, header=False, schema=bankDataSchema)

In [14]:
type(data)

pyspark.sql.dataframe.DataFrame

### Understanding Data

#### Print Schema

Hint: Use printSchema()

In [15]:
data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: double (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- campaign: double (nullable = true)
 |-- pdays: double (nullable = true)
 |-- previous: double (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- Approved_no_yes: string (nullable = true)



#### Another way to check the data type of each attribute

Hint: Use dtypes

In [16]:
data.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital_status', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'double'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'double'),
 ('campaign', 'double'),
 ('pdays', 'double'),
 ('previous', 'double'),
 ('poutcome', 'string'),
 ('Approved_no_yes', 'string')]

#### Total number of Columns and Records

Hint: Use columns and count() function

In [17]:
# Column count comes from the schema; the record count triggers a Spark job.
n_columns = len(data.columns)
print("No. of Columns = {}".format(n_columns))

n_records = data.count()
print('No. of Records = {}'.format(n_records))

No. of Columns = 17
No. of Records = 4521


#### Look at the first 3 rows of the dataframe

In [18]:
data.head(3)

[Row(age=30, job=u'unemployed', marital_status=u'married', education=u'primary', default=u'no', balance=1787.0, housing=u'no', loan=u'no', contact=u'cellular', day=19, month=u'oct', duration=79.0, campaign=1.0, pdays=-1.0, previous=0.0, poutcome=u'unknown', Approved_no_yes=u'no'),
 Row(age=33, job=u'services', marital_status=u'married', education=u'secondary', default=u'no', balance=4789.0, housing=u'yes', loan=u'yes', contact=u'cellular', day=11, month=u'may', duration=220.0, campaign=1.0, pdays=339.0, previous=4.0, poutcome=u'failure', Approved_no_yes=u'no'),
 Row(age=35, job=u'management', marital_status=u'single', education=u'tertiary', default=u'no', balance=1350.0, housing=u'yes', loan=u'no', contact=u'cellular', day=16, month=u'apr', duration=185.0, campaign=1.0, pdays=330.0, previous=1.0, poutcome=u'failure', Approved_no_yes=u'no')]

Hint: Use show() function

In [19]:
data.show(3)

+---+----------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---------------+
|age|       job|marital_status|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|Approved_no_yes|
+---+----------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---------------+
| 30|unemployed|       married|  primary|     no| 1787.0|     no|  no|cellular| 19|  oct|    79.0|     1.0| -1.0|     0.0| unknown|             no|
| 33|  services|       married|secondary|     no| 4789.0|    yes| yes|cellular| 11|  may|   220.0|     1.0|339.0|     4.0| failure|             no|
| 35|management|        single| tertiary|     no| 1350.0|    yes|  no|cellular| 16|  apr|   185.0|     1.0|330.0|     1.0| failure|             no|
+---+----------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----

#### Summary statistics

Hint: Use describe() function

In [20]:
data.describe().show()

+-------+------------------+-------+--------------+---------+-------+------------------+-------+----+--------+------------------+-----+------------------+------------------+------------------+------------------+--------+---------------+
|summary|               age|    job|marital_status|education|default|           balance|housing|loan| contact|               day|month|          duration|          campaign|             pdays|          previous|poutcome|Approved_no_yes|
+-------+------------------+-------+--------------+---------+-------+------------------+-------+----+--------+------------------+-----+------------------+------------------+------------------+------------------+--------+---------------+
|  count|              4521|   4521|          4521|     4521|   4521|              4521|   4521|4521|    4521|              4521| 4521|              4521|              4521|              4521|              4521|    4521|           4521|
|   mean| 41.17009511170095|   null|          null| 

Show only a fixed set of columns

In [21]:
data.describe().select('summary', 'age', 'loan', 'balance', 'pdays').show()

+-------+------------------+----+------------------+------------------+
|summary|               age|loan|           balance|             pdays|
+-------+------------------+----+------------------+------------------+
|  count|              4521|4521|              4521|              4521|
|   mean| 41.17009511170095|null|1422.6578190665782|39.766644547666445|
| stddev|10.576210958711263|null|3009.6381424673395|100.12112444301656|
|    min|                19|  no|           -3313.0|              -1.0|
|    max|                87| yes|           71188.0|             871.0|
+-------+------------------+----+------------------+------------------+



Observation

    Balance has -ve values

Count the number of records that have balance < 0

Hint: Use where() and count()

In [22]:
data.where(data.balance < 0).count()

366

### Data Preprocessing

#### Replace negative balances with zeroes

Hint: Use withColumn() and when() functions

In [23]:
# Keep strictly positive balances as they are; every other row gets 0.
balance_col = col('balance')
data = data.withColumn(
    'balance',
    when(balance_col > 0, balance_col).otherwise(lit(0)))

In [24]:
data.where(data.balance < 0).count()

0

#### Handling missing values

Checking for null values at each column

In [25]:
# For each column, count the rows whose value is NaN or null
# (count() only counts rows where the when() expression is non-null).
missing_value_counts = [
    count(when(isnan(column_name) | col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in data.columns
]
data.select(missing_value_counts).show()

+---+---+--------------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---------------+
|age|job|marital_status|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|Approved_no_yes|
+---+---+--------------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---------------+
|  0|  0|             0|        0|      0|      0|      0|   0|      0|  0|    0|       0|       0|    0|       0|       0|              0|
+---+---+--------------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---------------+



#### Split the data into training and test sets (30% held out for testing)

Hint: randomSplit()

In [26]:
# Split 70% / 30% into training and test sets. A fixed seed is passed so that
# the split, and therefore every metric reported below, can be reproduced when
# the notebook is re-run; without it each run draws a different random split.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

In [27]:
testData.describe()

DataFrame[summary: string, age: string, job: string, marital_status: string, education: string, default: string, balance: string, housing: string, loan: string, contact: string, day: string, month: string, duration: string, campaign: string, pdays: string, previous: string, poutcome: string, Approved_no_yes: string]

In [28]:
# testData = testData.drop("loan")

In [32]:
trainingData.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital_status', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'double'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'double'),
 ('campaign', 'double'),
 ('pdays', 'double'),
 ('previous', 'double'),
 ('poutcome', 'string'),
 ('Approved_no_yes', 'string')]

In [33]:
# (column name, dtype) pairs of the training data whose dtype is 'int' or 'double'.
num_col = [(col_name, col_type)
           for (col_name, col_type) in trainingData.dtypes
           if col_type in ('int', 'double')]

In [34]:
trainingData.show()

+---+------------+--------------+---------+-------+-------+-------+----+---------+---+-----+--------+--------+-----+--------+--------+---------------+
|age|         job|marital_status|education|default|balance|housing|loan|  contact|day|month|duration|campaign|pdays|previous|poutcome|Approved_no_yes|
+---+------------+--------------+---------+-------+-------+-------+----+---------+---+-----+--------+--------+-----+--------+--------+---------------+
| 19|     student|        single|  primary|     no|  103.0|     no|  no| cellular| 10|  jul|   104.0|     2.0| -1.0|     0.0| unknown|            yes|
| 19|     student|        single|secondary|     no|  302.0|     no|  no| cellular| 16|  jul|   205.0|     1.0| -1.0|     0.0| unknown|            yes|
| 19|     student|        single|  unknown|     no| 1169.0|     no|  no| cellular|  6|  feb|   463.0|    18.0| -1.0|     0.0| unknown|             no|
| 20|     student|        single|secondary|     no|  291.0|     no|  no|telephone| 11|  may|  

#### Creating a list of categorical and numerical features

In [30]:
# Columns treated as categorical (indexed and one-hot encoded further below).
cat_Var_Names = [
    'job',
    'marital_status',
    'education',
    'default',
    'housing',
    'day',
    'contact',
    'month',
    'poutcome',
    'Approved_no_yes',
]

# Columns treated as numeric (assembled into a vector and scaled further below).
num_Var_Names = [
    'age',
    'balance',
    'duration',
    'previous',
    'pdays',
    'campaign',
]

In [31]:
def generate_layout_bar(col_name):
    """Build the Plotly layout used for the bar chart of one column.

    Parameters
    ----------
    col_name : str
        Column name; it is inserted into the figure title.

    Returns
    -------
    go.Layout
        A fixed-size (800 x 600 px) layout with a 'Percentage' y axis.
    """
    # Font shared by the figure title and the tick labels of both axes.
    label_font = dict(
        family='Courier New, monospace',
        size=14,
        color='black',
    )

    x_axis = dict(tickfont=dict(label_font))
    y_axis = dict(
        title='Percentage',
        titlefont=dict(size=14, color='black'),
        tickfont=dict(label_font),
    )

    # Font of the text drawn on the bars themselves.
    bar_text_font = dict(
        family='Courier New, monospace',
        color="white",
        size=12,
    )

    return go.Layout(
        autosize=False,  # explicit width/height are given, so disable auto sizing
        width=800,       # width of the figure in pixels
        height=600,      # height of the figure in pixels
        title="Distribution of {} column".format(col_name),
        titlefont=dict(label_font),
        xaxis=x_axis,
        yaxis=y_axis,
        font=bar_text_font,
    )

In [27]:
import plotly.offline as pyoff
import plotly.graph_objs as go

from plotly.offline import init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [28]:
import numpy as np
from pyspark.sql import functions as F


In [29]:
def plot_bar(dataframe, col_name):
    """Plot the percentage share of each distinct value of a column as a bar chart.

    Parameters
    ----------
    dataframe : Spark DataFrame
        Data to summarise. The value counts are taken from this argument;
        the previous implementation ignored it and always used the global
        ``trainingData``.
    col_name : str
        Name of the column whose value frequencies are plotted.

    Returns
    -------
    The return value of ``iplot(fig)``; the figure is rendered in the notebook.
    """
    # Aggregate in Spark, then bring the value-count table to pandas,
    # ordered from the most to the least frequent value.
    value_counts = dataframe.groupBy(col_name).count().toPandas()
    value_counts = value_counts.sort_values('count', ascending=False)

    # Labels are cast to str before being handed to Plotly.
    labels = value_counts[col_name].astype(str)

    # Share of each value in percent, computed once and reused for both the
    # bar heights and the text shown on the bars.
    counts = value_counts['count'].astype(float)
    percentages = np.round(counts / counts.sum(), 6) * 100

    traces = [go.Bar(
        x=labels,                                   # x axis values
        y=percentages,                              # y axis values
        text=['{}%'.format(p) for p in percentages],  # show the '%' symbol on each bar
        textposition='auto',                        # let Plotly place the text on the bar
        marker=dict(color='#0047AB'),               # Cobalt Blue bars
    )]

    layout_bar = generate_layout_bar(col_name=col_name)

    fig = go.Figure(data=traces, layout=layout_bar)
    return iplot(fig)
    

In [30]:
# Draw one bar chart per categorical column.
for cat_column in cat_Var_Names:
    plot_bar(trainingData, cat_column)

#### Use VectorAssembler to combine a given list of numeric columns into a single vector column.

In [31]:
from pyspark.ml.feature import VectorAssembler

assembler_Num = VectorAssembler(inputCols=num_Var_Names, outputCol="num_features")

#### Scale all the numeric attributes using MinMaxScaler

    MinMaxScaler transforms a dataset of Vector rows, rescaling each feature to a specific range (often [0, 1]). 

    MinMaxScaler computes summary statistics on a data set and produces a MinMaxScalerModel. The model can then transform each feature individually such that it is in the given range.

In [32]:
from pyspark.ml.feature import MinMaxScaler

min_Max_Scalar = MinMaxScaler(inputCol="num_features", outputCol="scaled_num_features")

#### Convert categorical to numeric: 

    OneHotEncoder, StringIndexer, VectorAssembler

In [33]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# For every categorical column build a StringIndexer (column -> "<col>_index")
# and a OneHotEncoder (that index -> "<col>_vec").
indexers_Cat = []
encoders_Cat = []
for cat_Var_Name in cat_Var_Names:
    indexer = StringIndexer(inputCol=cat_Var_Name,
                            outputCol="{0}_index".format(cat_Var_Name))
    encoder = OneHotEncoder(inputCol=indexer.getOutputCol(),
                            outputCol="{0}_vec".format(indexer.getInputCol()))
    indexers_Cat.append(indexer)
    encoders_Cat.append(encoder)

# Concatenate all one-hot vectors into a single "cat_features" vector.
assembler_Cat = VectorAssembler(
    inputCols=[cat_encoder.getOutputCol() for cat_encoder in encoders_Cat],
    outputCol="cat_features")

# Final feature vector: scaled numeric features followed by the categorical ones.
assembler = VectorAssembler(
    inputCols=["scaled_num_features", "cat_features"],
    outputCol="features")

In [34]:
indexer_Label = StringIndexer(inputCol="loan", outputCol="label")

In [35]:
# Ordered preprocessing stages shared by every model pipeline:
# numeric assembly -> scaling -> indexing -> one-hot encoding ->
# categorical assembly -> final feature assembly -> label indexing.
preprocessiong_Stages = (
    [assembler_Num, min_Max_Scalar]
    + indexers_Cat
    + encoders_Cat
    + [assembler_Cat, assembler, indexer_Label]
)

### Model Building and Evaluation

#### Logistic Regression

In [36]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10, labelCol="label", featuresCol="features")

In [37]:
from pyspark.ml import Pipeline

lr_Pipeline = Pipeline(stages=preprocessiong_Stages+[lr]) 

lr_Pipeline_model = lr_Pipeline.fit(trainingData)

In [38]:
# The fitted logistic regression is the last stage of the fitted pipeline.
fitted_lr = lr_Pipeline_model.stages[-1]
print("Coefficients: " + str(fitted_lr.coefficients))
print("Intercept: " + str(fitted_lr.intercept))

Coefficients: [0.03327095103017062,-5.0799526719189645,0.5742351030678011,0.7524819861240976,-0.30849325418407897,0.19967107636224737,0.030964878987691506,0.18912866354078453,0.08452588336995794,0.21709437323453723,-0.023068707142850607,0.06137490624275288,0.2740859690531981,0.7386322479440705,-0.750762564743165,0.01830434313386811,-1.9694584244475852,-0.15714990935732034,-0.23963287056416394,0.6164190212070381,0.1080662608379649,0.23075994878946404,-1.3002494407805416,-0.03468388316525945,-0.7809695356817624,-0.04501529810387833,0.06372135262431104,-0.1913064987333461,0.08496384808258028,0.20238449815684767,0.4173072932434156,0.23456541549128834,0.01462433279830929,0.09901155401931762,0.28948838498484436,0.21186794928022823,0.8270908398716561,-0.5539639146771789,0.3017269200731973,-0.48769544130087966,0.04127247481264134,-0.2420651266320611,-0.5694343124288457,0.452093286580783,-0.039240294492228565,-0.07138981816966658,0.1607499467388126,0.2614013621200678,0.05836112428645387,0.09816

In [39]:
# Training summary of the fitted logistic regression (last pipeline stage).
lr_Summary = lr_Pipeline_model.stages[-1].summary
objectiveHistory = lr_Summary.objectiveHistory

# Print the recorded objective values, one per line.
print("objectiveHistory:")
for objective_value in objectiveHistory:
    print(objective_value)

objectiveHistory:
0.434916958777
0.426780506508
0.420165453969
0.402160794543
0.399061116392
0.396860556452
0.395585554228
0.394016440922
0.393249950504
0.393030300232
0.392559010787


In [40]:
train_predictions_lr = lr_Pipeline_model.transform(trainingData)
test_predictions_lr = lr_Pipeline_model.transform(testData)

In [41]:
test_predictions_lr.show(2,truncate=False)

+---+-------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---------------+---------------------------------+--------------------------------------------------------------------------------------------------------+---------+--------------------+---------------+-------------+-------------+---------+-------------+-----------+--------------+---------------------+---------------+------------------+-------------+-------------+-----------+---------------+-------------+--------------+-------------+-------------------+-----------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+----------------------------------------+-----------------------------------------+----------+
|age|job    |marital_status|education|default|balance|housing|loan|co

#### Evaluation : LR Model

In [42]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy evaluator comparing the "prediction" column with the "label" column.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")

# Only the two columns the evaluator reads are kept.
predictionAndLabels_train_lr = train_predictions_lr.select("prediction", "label")
predictionAndLabels_test_lr = test_predictions_lr.select("prediction", "label")

train_accuracy_lr = evaluator.evaluate(predictionAndLabels_train_lr)
print("Train accuracy  = " + str(train_accuracy_lr))

test_accuracy_lr = evaluator.evaluate(predictionAndLabels_test_lr)
print("Test accuracy = " + str(test_accuracy_lr))

Train accuracy  = 0.84445157152
Test accuracy = 0.857448325018


#### Tuning LR Model

In [43]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [44]:
# Candidate settings for the logistic regression: two regularisation
# strengths combined with a single elastic-net mixing value.
lr_grid_builder = ParamGridBuilder()
lr_grid_builder = lr_grid_builder.addGrid(lr.regParam, [0.1, 0.2])
lr_grid_builder = lr_grid_builder.addGrid(lr.elasticNetParam, [0.5])
paramGrid = lr_grid_builder.build()

# 2-fold cross-validation of the whole LR pipeline over that grid,
# scored with `evaluator`.
lr_crossval = CrossValidator(
    estimator=lr_Pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=2)

In [45]:
# Run cross-validation, and choose the best set of parameters.
lr_crossval_Model = lr_crossval.fit(trainingData)

In [46]:
train_predictions_lrcv = lr_crossval_Model.transform(trainingData)
test_predictions_lrcv = lr_crossval_Model.transform(testData)

In [47]:
# Score the cross-validated LR model on both splits with `evaluator`.
predictionAndLabels_train_lrcv = train_predictions_lrcv.select("prediction", "label")
predictionAndLabels_test_lrcv = test_predictions_lrcv.select("prediction", "label")

train_accuracycv = evaluator.evaluate(predictionAndLabels_train_lrcv)
print("Train set accuracy  = " + str(train_accuracycv))

test_accuracycv = evaluator.evaluate(predictionAndLabels_test_lrcv)
print("Test set accuracy = " + str(test_accuracycv))

Train set accuracy  = 0.842847979474
Test set accuracy = 0.856735566643


#### Decision Tree Classifier

In [48]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

In [49]:
dt_Pipeline = Pipeline(stages=preprocessiong_Stages+[dt]) 

dt_Pipeline_model = dt_Pipeline.fit(trainingData)

In [50]:
train_predictions_dt = dt_Pipeline_model.transform(trainingData)
test_predictions_dt = dt_Pipeline_model.transform(testData)

In [51]:
test_predictions_dt.show(2)

+---+-------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---------------+--------------------+--------------------+---------+--------------------+---------------+-------------+-------------+---------+-------------+-----------+--------------+---------------------+---------------+------------------+-------------+-------------+-----------+---------------+-------------+--------------+-------------+-------------------+--------------------+--------------------+-----+-------------+--------------------+----------+
|age|    job|marital_status|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|Approved_no_yes|        num_features| scaled_num_features|job_index|marital_status_index|education_index|default_index|housing_index|day_index|contact_index|month_index|poutcome_index|Approved_no_yes_index|        job_vec|marital_status_vec|education_vec|  default_vec|housing_vec|        

#### Evaluation : DT Model

In [52]:
# Score the decision tree pipeline on both splits with `evaluator`.
predictionAndLabels_train_dt = train_predictions_dt.select("prediction", "label")
predictionAndLabels_test_dt = test_predictions_dt.select("prediction", "label")

train_accuracy_dt = evaluator.evaluate(predictionAndLabels_train_dt)
print("Train accuracy  = " + str(train_accuracy_dt))

test_accuracy_dt = evaluator.evaluate(predictionAndLabels_test_dt)
print("Test accuracy = " + str(test_accuracy_dt))

Train accuracy  = 0.851828094933
Test accuracy = 0.847469707769


#### Tuning DT Model

In [53]:
# Candidate tree depths for the decision tree.
paramGridDT = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, [1, 6, 10])
    .build()
)

# 2-fold cross-validation of the whole DT pipeline over that grid,
# scored with `evaluator`.
dt_crossval = CrossValidator(
    estimator=dt_Pipeline,
    estimatorParamMaps=paramGridDT,
    evaluator=evaluator,
    numFolds=2)

In [54]:
# Run cross-validation, and choose the best set of parameters.
dt_crossval_Model = dt_crossval.fit(trainingData)

In [55]:
train_predictions_dtcv = dt_crossval_Model.transform(trainingData)
test_predictions_dtcv = dt_crossval_Model.transform(testData)

In [56]:
# Score the cross-validated decision tree on both splits with `evaluator`.
predictionAndLabels_train_dtcv = train_predictions_dtcv.select("prediction", "label")
predictionAndLabels_test_dtcv = test_predictions_dtcv.select("prediction", "label")

train_accuracydtcv = evaluator.evaluate(predictionAndLabels_train_dtcv)
print("Train set accuracy  = " + str(train_accuracydtcv))

test_accuracydtcv = evaluator.evaluate(predictionAndLabels_test_dtcv)
print("Test set accuracy = " + str(test_accuracydtcv))

Train set accuracy  = 0.842847979474
Test set accuracy = 0.856735566643


#### Random Forest Classifier

In [57]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="label", featuresCol="features")

In [58]:
rf_Pipeline = Pipeline(stages=preprocessiong_Stages+[rf]) 

rf_Pipeline_model = rf_Pipeline.fit(trainingData)

In [59]:
train_predictions_rf = rf_Pipeline_model.transform(trainingData)
test_predictions_rf = rf_Pipeline_model.transform(testData)

In [60]:
test_predictions_rf.show(100)

+---+-------------+--------------+---------+-------+-------+-------+----+---------+---+-----+--------+--------+-----+--------+--------+---------------+--------------------+--------------------+---------+--------------------+---------------+-------------+-------------+---------+-------------+-----------+--------------+---------------------+---------------+------------------+-------------+-------------+-------------+---------------+-------------+--------------+-------------+-------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|age|          job|marital_status|education|default|balance|housing|loan|  contact|day|month|duration|campaign|pdays|previous|poutcome|Approved_no_yes|        num_features| scaled_num_features|job_index|marital_status_index|education_index|default_index|housing_index|day_index|contact_index|month_index|poutcome_index|Approved_no_yes_index|        job_vec|marital_status_vec|education_vec|  default_v

#### Evaluation : RF Model

In [61]:
# Score the random forest pipeline on both splits with `evaluator`.
predictionAndLabels_train_rf = train_predictions_rf.select("prediction", "label")
predictionAndLabels_test_rf = test_predictions_rf.select("prediction", "label")

train_accuracy_rf = evaluator.evaluate(predictionAndLabels_train_rf)
print("Train accuracy  = " + str(train_accuracy_rf))

test_accuracy_rf = evaluator.evaluate(predictionAndLabels_test_rf)
print("Test accuracy = " + str(test_accuracy_rf))

Train accuracy  = 0.842847979474
Test accuracy = 0.856735566643


#### Tuning RF Model

In [62]:
# Parameter grid for the random forest; each parameter lists a single
# candidate, so the grid contains exactly one combination.
paramGridRF = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [5])
    .addGrid(rf.numTrees, [20])
    .build()
)

# 2-fold cross-validation of the whole RF pipeline, scored with `evaluator`.
rf_crossval = CrossValidator(
    estimator=rf_Pipeline,
    estimatorParamMaps=paramGridRF,
    evaluator=evaluator,
    numFolds=2)

In [63]:
# Run cross-validation, and choose the best set of parameters.
rf_crossval_Model = rf_crossval.fit(trainingData)

In [64]:
train_predictions_rfcv = rf_crossval_Model.transform(trainingData)
test_predictions_rfcv = rf_crossval_Model.transform(testData)

In [65]:
# Score the cross-validated random forest on both splits with `evaluator`.
predictionAndLabels_train_rfcv = train_predictions_rfcv.select("prediction", "label")
predictionAndLabels_test_rfcv = test_predictions_rfcv.select("prediction", "label")

train_accuracyrfcv = evaluator.evaluate(predictionAndLabels_train_rfcv)
print("Train set accuracy  = " + str(train_accuracyrfcv))

test_accuracyrfcv = evaluator.evaluate(predictionAndLabels_test_rfcv)
print("Test set accuracy = " + str(test_accuracyrfcv))

Train set accuracy  = 0.842847979474
Test set accuracy = 0.856735566643


#### Gradient Boosted Tree Classifier

In [66]:
from pyspark.ml.classification import GBTClassifier

gbt = GBTClassifier(labelCol="label", featuresCol="features")

In [67]:
gbt_Pipeline = Pipeline(stages=preprocessiong_Stages+[gbt]) 

gbt_Pipeline_model = gbt_Pipeline.fit(trainingData)

In [68]:
train_predictions_gbt = gbt_Pipeline_model.transform(trainingData)
test_predictions_gbt = gbt_Pipeline_model.transform(testData)

In [69]:
test_predictions_gbt.show(100, truncate= False)

+---+-------------+--------------+---------+-------+-------+-------+----+---------+---+-----+--------+--------+-----+--------+--------+---------------+----------------------------------+------------------------------------------------------------------------------------------------------------+---------+--------------------+---------------+-------------+-------------+---------+-------------+-----------+--------------+---------------------+---------------+------------------+-------------+-------------+-------------+---------------+-------------+--------------+-------------+-------------------+------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------------------------------------------+----------------------------------------+----------+
|age|job          |mari

In [70]:
test_predictions_gbt.select("loan","prediction", "probability").show(100)

+----+----------+--------------------+
|loan|prediction|         probability|
+----+----------+--------------------+
|  no|       0.0|[0.91275698169128...|
|  no|       0.0|[0.94604909234919...|
|  no|       0.0|[0.93760696157140...|
|  no|       0.0|[0.74890391714901...|
|  no|       0.0|[0.84891030381347...|
|  no|       1.0|[0.11941640752372...|
|  no|       0.0|[0.86259136082361...|
|  no|       0.0|[0.81758175010247...|
|  no|       0.0|[0.90851491876144...|
|  no|       0.0|[0.91362164356214...|
|  no|       0.0|[0.64740458842233...|
|  no|       0.0|[0.89221794376096...|
| yes|       0.0|[0.79782195943293...|
|  no|       0.0|[0.83419637529648...|
|  no|       0.0|[0.80388041039185...|
|  no|       0.0|[0.91473882189735...|
|  no|       0.0|[0.90974316174094...|
|  no|       0.0|[0.79527218567175...|
|  no|       0.0|[0.78647893036068...|
|  no|       0.0|[0.90837893858120...|
|  no|       0.0|[0.93120340987562...|
|  no|       0.0|[0.82567092093417...|
|  no|       0.0|[0.93020

#### Evaluation : gbt Model

In [71]:
# Score the gradient-boosted tree pipeline on both splits with `evaluator`.
predictionAndLabels_train_gbt = train_predictions_gbt.select("prediction", "label")
predictionAndLabels_test_gbt = test_predictions_gbt.select("prediction", "label")

train_accuracy_gbt = evaluator.evaluate(predictionAndLabels_train_gbt)
print("Train accuracy  = " + str(train_accuracy_gbt))

test_accuracy_gbt = evaluator.evaluate(predictionAndLabels_test_gbt)
print("Test accuracy = " + str(test_accuracy_gbt))

Train accuracy  = 0.866581141758
Test accuracy = 0.846756949394


#### Tuning GBT Model

In [72]:
# Parameter grid for the gradient-boosted trees; each parameter lists a
# single candidate, so the grid contains exactly one combination.
paramGridGBT = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [5])
    .addGrid(gbt.maxIter, [20])
    .addGrid(gbt.stepSize, [0.1])
    .build()
)

# 2-fold cross-validation of the whole GBT pipeline, scored with `evaluator`.
gbt_crossval = CrossValidator(
    estimator=gbt_Pipeline,
    estimatorParamMaps=paramGridGBT,
    evaluator=evaluator,
    numFolds=2)

In [73]:
# Run cross-validation, and choose the best set of parameters.
gbt_crossval_Model = gbt_crossval.fit(trainingData)