In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


#### Configure Spark Environment
Configure environment variables, Make sure you provide the correct Spark installation path/location.

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
!tar xf spark-2.3.1-bin-hadoop2.7.tgz

!pip install -q findspark

In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.3.1-bin-hadoop2.7"

In [0]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession \
      .builder \
      .appName("Universal Bank Data Set") \
      .master('local[*]') \
      .getOrCreate()

In [5]:
spark

### Loading the dependent libraries

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import isnan, when, count, col, countDistinct


#### Problem Statement
The dataset is from a bank, data related to direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, to access if the product (bank term deposit) would be (or not) subscribed. The data and attribute description are in the folder. 


#### Data Dictionary
 The dataset has the following attributes:

1 - age (numeric)

2 - job : type of job (categorical: "admin.","unknown","unemployed","management","housemaid","entrepreneur","student",
                                    "blue-collar","self-employed","retired","technician","services") 

3 - marital : marital status (categorical: "married","divorced","single"; note: "divorced" means divorced or widowed)

4 - education (categorical: "unknown","secondary","primary","tertiary")

5 - default: has credit in default? (binary: "yes","no")

#### 6 - balance: average yearly balance, in euros (numeric) 

7 - housing: has housing loan? (binary: "yes","no")

8 - loan: has personal loan? (binary: "yes","no")

9 - contact: contact communication type (categorical: "unknown","telephone","cellular") 

10 - day: last contact day of the month (numeric)

11 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")

12 - duration: last contact duration, in seconds (numeric)

13 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)

14 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
  
15 - previous: number of contacts performed before this campaign and for this client (numeric)

16 - poutcome: outcome of the previous marketing campaign (categorical: "unknown","other","failure","success")

17 - Approved_no_yes - has the client subscribed to a __term deposit?__ (binary: "yes","no")

### Defining the schema to data

In [0]:
## Define Schema
bankDataSchema = StructType([
    StructField("age", IntegerType(), True),
    StructField("job", StringType(), True),
    StructField("marital_status", StringType(), True),
    StructField("education", StringType(), True),
    StructField("default", StringType(), True),
    StructField("balance", DoubleType(), True),
    StructField("housing", StringType(), True),
    StructField("loan", StringType(), True),        
    StructField("contact", StringType(), True),
    StructField("day", IntegerType(), True),
    StructField("month", StringType(), True),
    StructField("duration", DoubleType(), True),
    StructField("campaign", DoubleType(), True),
    StructField("pdays", DoubleType(), True),
    StructField("previous", DoubleType(), True),
    StructField("poutcome", StringType(), True),
    StructField("Approved_no_yes", StringType(), True)])

### Reading the data and creating a dataframe

In [0]:
## Read data and create a dataframe
data = spark.read.format("csv")\
       .option("header", "false")\
       .option("inferSchema", "true")\
       .load("drive/My Drive/20191221_CSE7312c_Batch75_SparkML/SparkML/data/bank.csv", schema = bankDataSchema)

### Understanding Data

#### Print Schema

In [10]:
data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: double (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- campaign: double (nullable = true)
 |-- pdays: double (nullable = true)
 |-- previous: double (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- Approved_no_yes: string (nullable = true)



#### Another way to check the data type of each attribute

In [11]:
data.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital_status', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'double'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'double'),
 ('campaign', 'double'),
 ('pdays', 'double'),
 ('previous', 'double'),
 ('poutcome', 'string'),
 ('Approved_no_yes', 'string')]

#### Total number of Columns and Records

In [12]:
print("No. of Columns = {}".format(len(data.columns)))

print('No. of Records = {}'.format(data.count()))

No. of Columns = 17
No. of Records = 4521


#### Look at first 3 row of the dataframe

In [13]:
data.head(3)

[Row(age=30, job=u'unemployed', marital_status=u'married', education=u'primary', default=u'no', balance=1787.0, housing=u'no', loan=u'no', contact=u'cellular', day=19, month=u'oct', duration=79.0, campaign=1.0, pdays=-1.0, previous=0.0, poutcome=u'unknown', Approved_no_yes=u'no'),
 Row(age=33, job=u'services', marital_status=u'married', education=u'secondary', default=u'no', balance=4789.0, housing=u'yes', loan=u'yes', contact=u'cellular', day=11, month=u'may', duration=220.0, campaign=1.0, pdays=339.0, previous=4.0, poutcome=u'failure', Approved_no_yes=u'no'),
 Row(age=35, job=u'management', marital_status=u'single', education=u'tertiary', default=u'no', balance=1350.0, housing=u'yes', loan=u'no', contact=u'cellular', day=16, month=u'apr', duration=185.0, campaign=1.0, pdays=330.0, previous=1.0, poutcome=u'failure', Approved_no_yes=u'no')]

In [14]:
data.show(3)

+---+----------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---------------+
|age|       job|marital_status|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|Approved_no_yes|
+---+----------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---------------+
| 30|unemployed|       married|  primary|     no| 1787.0|     no|  no|cellular| 19|  oct|    79.0|     1.0| -1.0|     0.0| unknown|             no|
| 33|  services|       married|secondary|     no| 4789.0|    yes| yes|cellular| 11|  may|   220.0|     1.0|339.0|     4.0| failure|             no|
| 35|management|        single| tertiary|     no| 1350.0|    yes|  no|cellular| 16|  apr|   185.0|     1.0|330.0|     1.0| failure|             no|
+---+----------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----

#### Summary statistics

In [15]:
data.describe().show()

+-------+------------------+-------+--------------+---------+-------+------------------+-------+----+--------+------------------+-----+------------------+------------------+------------------+------------------+--------+---------------+
|summary|               age|    job|marital_status|education|default|           balance|housing|loan| contact|               day|month|          duration|          campaign|             pdays|          previous|poutcome|Approved_no_yes|
+-------+------------------+-------+--------------+---------+-------+------------------+-------+----+--------+------------------+-----+------------------+------------------+------------------+------------------+--------+---------------+
|  count|              4521|   4521|          4521|     4521|   4521|              4521|   4521|4521|    4521|              4521| 4521|              4521|              4521|              4521|              4521|    4521|           4521|
|   mean| 41.17009511170095|   null|          null| 

Show only fixed set of colums 

In [16]:
data.describe().select('summary', 'age', 'loan', 'balance', 'pdays').show()

+-------+------------------+----+------------------+------------------+
|summary|               age|loan|           balance|             pdays|
+-------+------------------+----+------------------+------------------+
|  count|              4521|4521|              4521|              4521|
|   mean| 41.17009511170095|null|1422.6578190665782|39.766644547666445|
| stddev|10.576210958711263|null|3009.6381424673395|100.12112444301656|
|    min|                19|  no|           -3313.0|              -1.0|
|    max|                87| yes|           71188.0|             871.0|
+-------+------------------+----+------------------+------------------+



Observation

    Balance has -ve values

In [17]:
data.where(data.balance < 0).count()

366

### Data Preprocessing

#### Replace negative balances with zeroes

In [0]:
data = data.withColumn('balance', when(data.balance > 0, data.balance).otherwise(0))

#### Handling missing values

Checking for null values at each column

In [19]:
data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data.columns]).show()

+---+---+--------------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---------------+
|age|job|marital_status|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|Approved_no_yes|
+---+---+--------------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---------------+
|  0|  0|             0|        0|      0|      0|      0|   0|      0|  0|    0|       0|       0|    0|       0|       0|              0|
+---+---+--------------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---------------+



#### Split the data into training and test sets (30% held out for testing)

In [0]:
(trainingData, testData) = data.randomSplit([0.7, 0.3])

#### Creating a list of categorical and numerical features

In [0]:
cat_Var_Names = ['job', 'marital_status', 'education', 'default', 'housing', 
                 'day', 'contact', 'month', 'poutcome', 'Approved_no_yes']

num_Var_Names = ['age', 'duration', 'previous', 'pdays', 'campaign']

#### Use VectorAssembler to combine a given list of numcolumns into a single vector column.

In [0]:
from pyspark.ml.feature import VectorAssembler

assembler_Num = VectorAssembler(inputCols=num_Var_Names, outputCol="num_features")

#### Scale all the numeric attributes using MinMaxScaler

    MinMaxScaler transforms a dataset of Vector rows, rescaling each feature to a specific range (often [0, 1]). 

    MinMaxScaler computes summary statistics on a data set and produces a MinMaxScalerModel. The model can then transform each feature individually such that it is in the given range.

In [0]:
from pyspark.ml.feature import MinMaxScaler

min_Max_Scalar = MinMaxScaler(inputCol="num_features", outputCol="scaled_num_features")

#### Covert categorical to numeric : 

    OneHotEncoder, StringIndexer, VectorAssembler,  VectorIndexer

In [0]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

indexers_Cat = [StringIndexer(inputCol=cat_Var_Name, outputCol="{0}_index".format(cat_Var_Name)) for cat_Var_Name in cat_Var_Names ]
encoders_Cat = [OneHotEncoder(inputCol=indexer.getOutputCol(), outputCol="{0}_vec".format(indexer.getInputCol())) for indexer in indexers_Cat]
assembler_Cat = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders_Cat], outputCol="cat_features")

assembler = VectorAssembler(inputCols=["scaled_num_features","cat_features"], outputCol="features")


In [0]:
preprocessiong_Stages = [assembler_Num]+[min_Max_Scalar]+indexers_Cat+encoders_Cat+[assembler_Cat]+[assembler]

### Model Building and Evaluation

#### Linear Regression

In [0]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(maxIter=100,labelCol="balance", featuresCol="features")

In [0]:
from pyspark.ml import Pipeline

lr_Pipeline = Pipeline(stages=preprocessiong_Stages+[lr]) 

lr_Pipeline_model = lr_Pipeline.fit(trainingData)

In [28]:
print("Coefficients: " + str(lr_Pipeline_model.stages[-1].coefficients))
print("Intercept: " + str(lr_Pipeline_model.stages[-1].intercept))

Coefficients: [1248.2294876957467,-184.11210257274044,789.6718488083674,-1320.5523561482878,103.35596308702335,-150.26990737699023,-419.5582351003783,-302.7640536990444,-390.21062991481847,-346.0401572890811,560.3898916697083,-443.10267658553516,147.8458502616132,-403.8540220827975,280.8667349081335,-87.5880054212444,423.23567303163105,566.0981120944193,-78.06915864141939,133.19141991770348,-106.88779544856492,1043.6550933860137,-38.51431053504276,524.6429425304407,622.4303902044569,249.22147436960455,1018.7021695244213,485.20850758848354,446.88108336888655,-300.28165846130713,507.9606002873643,552.0944204354311,839.8333153409166,768.9890321081105,201.3739040778598,518.3875823581851,292.0766338701231,222.7904149278321,765.3449954597302,54.597267314606675,-255.3139831386184,754.2705585032998,552.6271492348114,330.68817763548924,611.0471439490675,-143.03846169213745,89.24234395670919,267.45090508913853,-62.8254620970972,1136.4026352328706,424.2607349799009,31.738236716626787,415.04165517

In [0]:
train_predictions_lr = lr_Pipeline_model.transform(trainingData)
test_predictions_lr = lr_Pipeline_model.transform(testData)

In [30]:
test_predictions_lr.show(2)

+---+-------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---------------+--------------------+--------------------+---------+--------------------+---------------+-------------+-------------+---------+-------------+-----------+--------------+---------------------+---------------+------------------+-------------+-------------+-----------+---------------+-------------+--------------+-------------+-------------------+--------------------+--------------------+------------------+
|age|    job|marital_status|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|Approved_no_yes|        num_features| scaled_num_features|job_index|marital_status_index|education_index|default_index|housing_index|day_index|contact_index|month_index|poutcome_index|Approved_no_yes_index|        job_vec|marital_status_vec|education_vec|  default_vec|housing_vec|        day_vec|  contact_vec|     month_

#### Evaluation : LR Model

In [0]:
# Find the error metric - RMSE
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="balance",
                            predictionCol="prediction",
                            metricName="rmse" )


In [32]:
lmRegTrain_rmse = evaluator.evaluate(train_predictions_lr)
print('RMSE value on Train data is', lmRegTrain_rmse)

lmRegTest_rmse = evaluator.evaluate(test_predictions_lr)
print('RMSE value on Test data is', lmRegTest_rmse)

('RMSE value on Train data is', 2911.154550416582)
('RMSE value on Test data is', 2907.2608293150506)


#### Tuning LR Model

In [0]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [0]:
paramGridLR = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.1]) \
    .addGrid(lr.elasticNetParam, [0.5])\
    .addGrid(lr.maxIter, [100])\
    .build()
    
lr_crossval = CrossValidator(estimator=lr_Pipeline,
                             estimatorParamMaps=paramGridLR,
                             evaluator=RegressionEvaluator(labelCol="balance"),
                             numFolds=2)     

In [0]:
# Run cross-validation, and choose the best set of parameters.
lr_crossval_Model = lr_crossval.fit(trainingData)

In [0]:
train_predictions_lrcv = lr_crossval_Model.transform(trainingData)
test_predictions_lrcv = lr_crossval_Model.transform(testData)

In [37]:
lmRegTrain_rmsecv = evaluator.evaluate(train_predictions_lrcv)
print('RMSE value on Train data is', lmRegTrain_rmsecv)

lmRegTest_rmsecv = evaluator.evaluate(test_predictions_lrcv)
print('RMSE value on Test data is', lmRegTest_rmsecv)

('RMSE value on Train data is', 2911.217832832366)
('RMSE value on Test data is', 2906.5024027407926)


#### Decision Tree Regressor

In [0]:
from pyspark.ml.regression import DecisionTreeRegressor

dt = DecisionTreeRegressor(labelCol="balance", featuresCol="features")

In [0]:
dt_Pipeline = Pipeline(stages=preprocessiong_Stages+[dt]) 

dt_Pipeline_model = dt_Pipeline.fit(trainingData)

In [0]:
train_predictions_dt = dt_Pipeline_model.transform(trainingData)
test_predictions_dt = dt_Pipeline_model.transform(testData)

In [41]:
test_predictions_dt.show(2)

+---+-------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---------------+--------------------+--------------------+---------+--------------------+---------------+-------------+-------------+---------+-------------+-----------+--------------+---------------------+---------------+------------------+-------------+-------------+-----------+---------------+-------------+--------------+-------------+-------------------+--------------------+--------------------+------------------+
|age|    job|marital_status|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|Approved_no_yes|        num_features| scaled_num_features|job_index|marital_status_index|education_index|default_index|housing_index|day_index|contact_index|month_index|poutcome_index|Approved_no_yes_index|        job_vec|marital_status_vec|education_vec|  default_vec|housing_vec|        day_vec|  contact_vec|     month_

#### Evaluation : DT Model

In [42]:
dtTrain_rmse = evaluator.evaluate(train_predictions_dt)
print('RMSE value on Train data is', dtTrain_rmse)

dtTest_rmse = evaluator.evaluate(test_predictions_dt)
print('RMSE value on Test data is', dtTest_rmse)

('RMSE value on Train data is', 2589.6428287095523)
('RMSE value on Test data is', 2942.928448761656)


#### Tuning DT Model

In [0]:
paramGridDT = ParamGridBuilder()\
    .addGrid(dt.maxDepth, [1,6,10]) \
    .build()
    
dt_crossval = CrossValidator(estimator=dt_Pipeline,
                             estimatorParamMaps=paramGridDT,
                             evaluator=RegressionEvaluator(labelCol="balance"),
                             numFolds=2)     

In [0]:
# Run cross-validation, and choose the best set of parameters.
dt_crossval_Model = dt_crossval.fit(trainingData)

In [0]:
train_predictions_dtcv = dt_crossval_Model.transform(trainingData)
test_predictions_dtcv = dt_crossval_Model.transform(testData)

In [46]:
dtTrain_rmsecv = evaluator.evaluate(train_predictions_dtcv)
print('RMSE value on Train data is', dtTrain_rmsecv)

dtTest_rmsecv = evaluator.evaluate(test_predictions_dtcv)
print('RMSE value on Test data is', dtTest_rmsecv)

('RMSE value on Train data is', 2991.469247691134)
('RMSE value on Test data is', 2928.0217408692665)


#### Random Forest Regressor

In [0]:
from pyspark.ml.regression import RandomForestRegressor

rf = RandomForestRegressor(labelCol="balance", featuresCol="features")

In [0]:
rf_Pipeline = Pipeline(stages=preprocessiong_Stages+[rf]) 

rf_Pipeline_model = rf_Pipeline.fit(trainingData)

In [0]:
train_predictions_rf = rf_Pipeline_model.transform(trainingData)
test_predictions_rf = rf_Pipeline_model.transform(testData)

In [50]:
test_predictions_rf.show(2)

+---+-------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---------------+--------------------+--------------------+---------+--------------------+---------------+-------------+-------------+---------+-------------+-----------+--------------+---------------------+---------------+------------------+-------------+-------------+-----------+---------------+-------------+--------------+-------------+-------------------+--------------------+--------------------+-----------------+
|age|    job|marital_status|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|Approved_no_yes|        num_features| scaled_num_features|job_index|marital_status_index|education_index|default_index|housing_index|day_index|contact_index|month_index|poutcome_index|Approved_no_yes_index|        job_vec|marital_status_vec|education_vec|  default_vec|housing_vec|        day_vec|  contact_vec|     month_v

#### Evaluation : RF Model

In [51]:
rfTrain_rmse = evaluator.evaluate(train_predictions_rf)
print('RMSE value on Train data is', rfTrain_rmse)

rfTest_rmse = evaluator.evaluate(test_predictions_rf)
print('RMSE value on Test data is', rfTest_rmse)

('RMSE value on Train data is', 2621.0154453397718)
('RMSE value on Test data is', 2948.62275606305)


#### Tuning RF Model

In [0]:
paramGridRF = ParamGridBuilder()\
            .addGrid(rf.maxDepth, [5])\
            .addGrid(rf.numTrees, [20])\
            .build()
    
rf_crossval = CrossValidator(estimator=rf_Pipeline,
                             estimatorParamMaps=paramGridRF,
                             evaluator=RegressionEvaluator(labelCol="balance"),
                             numFolds=2)     

In [0]:
# Run cross-validation, and choose the best set of parameters.
rf_crossval_Model = rf_crossval.fit(trainingData)

In [0]:
train_predictions_rfcv = rf_crossval_Model.transform(trainingData)
test_predictions_rfcv = rf_crossval_Model.transform(testData)

In [55]:
rfTrain_rmsecv = evaluator.evaluate(train_predictions_rfcv)
print('RMSE value on Train data is', rfTrain_rmsecv)

rfTest_rmsecv = evaluator.evaluate(test_predictions_rfcv)
print('RMSE value on Test data is', rfTest_rmsecv)

('RMSE value on Train data is', 2621.0154453397718)
('RMSE value on Test data is', 2948.62275606305)


#### Gradient Boosted Tree Regressor

In [0]:
from pyspark.ml.regression import GBTRegressor

gbt = GBTRegressor(labelCol="balance", featuresCol="features")

In [0]:
gbt_Pipeline = Pipeline(stages=preprocessiong_Stages+[gbt]) 

gbt_Pipeline_model = gbt_Pipeline.fit(trainingData)

In [0]:
train_predictions_gbt = gbt_Pipeline_model.transform(trainingData)
test_predictions_gbt = gbt_Pipeline_model.transform(testData)

In [59]:
test_predictions_gbt.show(2)

+---+-------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---------------+--------------------+--------------------+---------+--------------------+---------------+-------------+-------------+---------+-------------+-----------+--------------+---------------------+---------------+------------------+-------------+-------------+-----------+---------------+-------------+--------------+-------------+-------------------+--------------------+--------------------+------------------+
|age|    job|marital_status|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|Approved_no_yes|        num_features| scaled_num_features|job_index|marital_status_index|education_index|default_index|housing_index|day_index|contact_index|month_index|poutcome_index|Approved_no_yes_index|        job_vec|marital_status_vec|education_vec|  default_vec|housing_vec|        day_vec|  contact_vec|     month_

#### Evaluation : gbt Model

In [60]:
gbtTrain_rmse = evaluator.evaluate(train_predictions_gbt)
print('RMSE value on Train data is', gbtTrain_rmse)

gbtTest_rmse = evaluator.evaluate(test_predictions_gbt)
print('RMSE value on Test data is', gbtTest_rmse)

('RMSE value on Train data is', 2143.5316176258943)
('RMSE value on Test data is', 3007.6400177150217)


#### Tuning GBT Model

In [0]:
paramGridGBT = ParamGridBuilder()\
            .addGrid(gbt.maxDepth, [5])\
            .addGrid(gbt.maxIter, [15])\
            .addGrid(gbt.stepSize, [0.1])\
            .build()
    
gbt_crossval = CrossValidator(estimator=gbt_Pipeline,
                             estimatorParamMaps=paramGridGBT,
                             evaluator=RegressionEvaluator(labelCol="balance"),
                             numFolds=2)     

In [0]:
# Run cross-validation, and choose the best set of parameters.
gbt_crossval_Model = gbt_crossval.fit(trainingData)

In [0]:
train_predictions_gbtcv = gbt_crossval_Model.transform(trainingData)
test_predictions_gbtcv = gbt_crossval_Model.transform(testData)

In [64]:
gbtTrain_rmsecv = evaluator.evaluate(train_predictions_gbtcv)
print('RMSE value on Train data is', gbtTrain_rmsecv)

gbtTest_rmsecv = evaluator.evaluate(test_predictions_gbtcv)
print('RMSE value on Test data is', gbtTest_rmsecv)

('RMSE value on Train data is', 2229.1568735597784)
('RMSE value on Test data is', 2972.165344298056)
