A. Basic

i)   Start hadoop
ii) Transfer 'train.csv' file from local file system to hadoop:  hdfs://localhost:9000/user/ashok/data_files/
iii) Read Data and perform basic data exploration. Especially see which all variables are 'string' data types and which not. This is our 'original' data

B. Objects creation:

i)  List all categorical columns and create objects to StringIndex all these categorical columns
ii) Prepare (one) object to OneHotEncode categorical columns (received from above)
iii)  Prepare a (one) list of all numerical and OneHotEncoded columns. Exclude 'loss' column from this list.
vi) Create a VectorAssembler object to assemble all the columns as in the (iii) above
v) Create an object to perform modeling using GBTRegressor
vi) Create a pipeline object

C. Fitting and Modeling:

i) Split your 'original' data into train and test sets
ii) Fit the pipeline object on the 'train' data
iii)  Make predictions for 'test' data
iv) Evaluate results using RegressionEvaluator as used in bikesharing dataset problem.

In [1]:
# 1.0 Call libraries
# 1.0 Encode 'string' column to index-column. 
#     Indexing begins from 0.
from pyspark.ml.feature import StringIndexer
# 1.1 OHE an indexed column after StringIndexing
#     and create one another column
from pyspark.ml.feature import OneHotEncoderEstimator
# 1.2 Assemble numerical and OHE data in one column
from pyspark.ml.feature import VectorAssembler
# 1.3 Scale Vector-Assmbled data. Individual columns
#     will have to be scaled 'manually' using withColumn()
#     statement and writing your own scaling formula
from pyspark.ml.feature import StandardScaler
# 1.4 Vector Index one column assembled+scaled as above
from pyspark.ml.feature import VectorIndexer

# 1.5 Import ML estimator. It is a classification problem
from pyspark.ml.classification import RandomForestClassifier

# 1.6 Create a pipeline model for all stages
from pyspark.ml import Pipeline

# 1.7 Misc functions
# 1.7.1 Call an important group of sql functions
from pyspark.sql.functions import col,sum
# 1.7.2 Unlike in other languages, in spark
#       type-classes are to be separateky imported
#       They are not part of core classes or modules
from pyspark.sql.types import DoubleType

# 1.8 Binary classification results evaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# 1.9
import os, time
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
import time

In [2]:
# 2. Increase the width of notebook 
#    to display all columns of data
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
# 2.1 Show multiple outputs of a single cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [5]:
df = spark.read.csv(
                   path = "hdfs://localhost:9000/user/ashok/data_files/train.csv",   # path to hadoop
                   header = True,
                   inferSchema= True,           # Infer datatypes automatically
                   sep = ","                   # Can be any character (check \t)
                   )


In [7]:
# 3.1 Data shape
df.count()           
cols = df.columns
len(cols)            
print(cols)

188318

132

['id', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat19', 'cat20', 'cat21', 'cat22', 'cat23', 'cat24', 'cat25', 'cat26', 'cat27', 'cat28', 'cat29', 'cat30', 'cat31', 'cat32', 'cat33', 'cat34', 'cat35', 'cat36', 'cat37', 'cat38', 'cat39', 'cat40', 'cat41', 'cat42', 'cat43', 'cat44', 'cat45', 'cat46', 'cat47', 'cat48', 'cat49', 'cat50', 'cat51', 'cat52', 'cat53', 'cat54', 'cat55', 'cat56', 'cat57', 'cat58', 'cat59', 'cat60', 'cat61', 'cat62', 'cat63', 'cat64', 'cat65', 'cat66', 'cat67', 'cat68', 'cat69', 'cat70', 'cat71', 'cat72', 'cat73', 'cat74', 'cat75', 'cat76', 'cat77', 'cat78', 'cat79', 'cat80', 'cat81', 'cat82', 'cat83', 'cat84', 'cat85', 'cat86', 'cat87', 'cat88', 'cat89', 'cat90', 'cat91', 'cat92', 'cat93', 'cat94', 'cat95', 'cat96', 'cat97', 'cat98', 'cat99', 'cat100', 'cat101', 'cat102', 'cat103', 'cat104', 'cat105', 'cat106', 'cat107', 'cat108', 'cat109', 'cat110', '

In [6]:
df.show(3)

+---+----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+--------+--------+--------+--------+--------+--------+--------+-------+-------+-------+--------+--------+--------+--------+-------+
| id|cat1|cat2|cat3|cat4|cat5|cat6|cat7|cat8|cat9|cat10|cat11|cat12|cat13|cat14|cat15|cat16|cat17|cat18|cat19|cat20|cat21|cat22|cat23|cat24|cat25|cat26|cat27|c

In [8]:
df.cache()
df.is_cached 

DataFrame[id: int, cat1: string, cat2: string, cat3: string, cat4: string, cat5: string, cat6: string, cat7: string, cat8: string, cat9: string, cat10: string, cat11: string, cat12: string, cat13: string, cat14: string, cat15: string, cat16: string, cat17: string, cat18: string, cat19: string, cat20: string, cat21: string, cat22: string, cat23: string, cat24: string, cat25: string, cat26: string, cat27: string, cat28: string, cat29: string, cat30: string, cat31: string, cat32: string, cat33: string, cat34: string, cat35: string, cat36: string, cat37: string, cat38: string, cat39: string, cat40: string, cat41: string, cat42: string, cat43: string, cat44: string, cat45: string, cat46: string, cat47: string, cat48: string, cat49: string, cat50: string, cat51: string, cat52: string, cat53: string, cat54: string, cat55: string, cat56: string, cat57: string, cat58: string, cat59: string, cat60: string, cat61: string, cat62: string, cat63: string, cat64: string, cat65: string, cat66: string, 

True

In [9]:
df.select(cols[:30]).show(3)

+---+----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| id|cat1|cat2|cat3|cat4|cat5|cat6|cat7|cat8|cat9|cat10|cat11|cat12|cat13|cat14|cat15|cat16|cat17|cat18|cat19|cat20|cat21|cat22|cat23|cat24|cat25|cat26|cat27|cat28|cat29|
+---+----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|  1|   A|   B|   A|   B|   A|   A|   A|   A|   B|    A|    B|    A|    A|    A|    A|    A|    A|    A|    A|    A|    A|    A|    B|    A|    A|    A|    A|    A|    A|
|  2|   A|   B|   A|   A|   A|   A|   A|   A|   B|    B|    A|    A|    A|    A|    A|    A|    A|    A|    A|    A|    A|    A|    A|    A|    A|    A|    A|    A|    A|
|  5|   A|   B|   A|   A|   B|   A|   A|   A|   B|    B|    B|    B|    B|    A|    A|    A|    A|    A|    A|    A|    A|    A|    A|    A|    A

In [10]:
df.select(*(sum(df[c].isNull().cast("int")).alias(c) for c in cols[::])).show() 

+---+----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+------+------+------+----+
| id|cat1|cat2|cat3|cat4|cat5|cat6|cat7|cat8|cat9|cat10|cat11|cat12|cat13|cat14|cat15|cat16|cat17|cat18|cat19|cat20|cat21|cat22|cat23|cat24|cat25|cat26|cat27|cat28|cat29|cat30|cat31|cat32|cat33|ca

In [11]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- cat1: string (nullable = true)
 |-- cat2: string (nullable = true)
 |-- cat3: string (nullable = true)
 |-- cat4: string (nullable = true)
 |-- cat5: string (nullable = true)
 |-- cat6: string (nullable = true)
 |-- cat7: string (nullable = true)
 |-- cat8: string (nullable = true)
 |-- cat9: string (nullable = true)
 |-- cat10: string (nullable = true)
 |-- cat11: string (nullable = true)
 |-- cat12: string (nullable = true)
 |-- cat13: string (nullable = true)
 |-- cat14: string (nullable = true)
 |-- cat15: string (nullable = true)
 |-- cat16: string (nullable = true)
 |-- cat17: string (nullable = true)
 |-- cat18: string (nullable = true)
 |-- cat19: string (nullable = true)
 |-- cat20: string (nullable = true)
 |-- cat21: string (nullable = true)
 |-- cat22: string (nullable = true)
 |-- cat23: string (nullable = true)
 |-- cat24: string (nullable = true)
 |-- cat25: string (nullable = true)
 |-- cat26: string (nullable = true)
 |-- ca

In [12]:
df=df.drop('id')

In [13]:
df.dtypes

[('cat1', 'string'),
 ('cat2', 'string'),
 ('cat3', 'string'),
 ('cat4', 'string'),
 ('cat5', 'string'),
 ('cat6', 'string'),
 ('cat7', 'string'),
 ('cat8', 'string'),
 ('cat9', 'string'),
 ('cat10', 'string'),
 ('cat11', 'string'),
 ('cat12', 'string'),
 ('cat13', 'string'),
 ('cat14', 'string'),
 ('cat15', 'string'),
 ('cat16', 'string'),
 ('cat17', 'string'),
 ('cat18', 'string'),
 ('cat19', 'string'),
 ('cat20', 'string'),
 ('cat21', 'string'),
 ('cat22', 'string'),
 ('cat23', 'string'),
 ('cat24', 'string'),
 ('cat25', 'string'),
 ('cat26', 'string'),
 ('cat27', 'string'),
 ('cat28', 'string'),
 ('cat29', 'string'),
 ('cat30', 'string'),
 ('cat31', 'string'),
 ('cat32', 'string'),
 ('cat33', 'string'),
 ('cat34', 'string'),
 ('cat35', 'string'),
 ('cat36', 'string'),
 ('cat37', 'string'),
 ('cat38', 'string'),
 ('cat39', 'string'),
 ('cat40', 'string'),
 ('cat41', 'string'),
 ('cat42', 'string'),
 ('cat43', 'string'),
 ('cat44', 'string'),
 ('cat45', 'string'),
 ('cat46', 'string'

In [14]:
int_columns=[i[0] for i in df.dtypes if i[1]=='double']
int_columns

['cont1',
 'cont2',
 'cont3',
 'cont4',
 'cont5',
 'cont6',
 'cont7',
 'cont8',
 'cont9',
 'cont10',
 'cont11',
 'cont12',
 'cont13',
 'cont14',
 'loss']

In [15]:
str_columns=[i[0] for i in df.dtypes if i[1]=='string']
str_columns

['cat1',
 'cat2',
 'cat3',
 'cat4',
 'cat5',
 'cat6',
 'cat7',
 'cat8',
 'cat9',
 'cat10',
 'cat11',
 'cat12',
 'cat13',
 'cat14',
 'cat15',
 'cat16',
 'cat17',
 'cat18',
 'cat19',
 'cat20',
 'cat21',
 'cat22',
 'cat23',
 'cat24',
 'cat25',
 'cat26',
 'cat27',
 'cat28',
 'cat29',
 'cat30',
 'cat31',
 'cat32',
 'cat33',
 'cat34',
 'cat35',
 'cat36',
 'cat37',
 'cat38',
 'cat39',
 'cat40',
 'cat41',
 'cat42',
 'cat43',
 'cat44',
 'cat45',
 'cat46',
 'cat47',
 'cat48',
 'cat49',
 'cat50',
 'cat51',
 'cat52',
 'cat53',
 'cat54',
 'cat55',
 'cat56',
 'cat57',
 'cat58',
 'cat59',
 'cat60',
 'cat61',
 'cat62',
 'cat63',
 'cat64',
 'cat65',
 'cat66',
 'cat67',
 'cat68',
 'cat69',
 'cat70',
 'cat71',
 'cat72',
 'cat73',
 'cat74',
 'cat75',
 'cat76',
 'cat77',
 'cat78',
 'cat79',
 'cat80',
 'cat81',
 'cat82',
 'cat83',
 'cat84',
 'cat85',
 'cat86',
 'cat87',
 'cat88',
 'cat89',
 'cat90',
 'cat91',
 'cat92',
 'cat93',
 'cat94',
 'cat95',
 'cat96',
 'cat97',
 'cat98',
 'cat99',
 'cat100',
 'cat101

In [16]:
df.dtypes

[('cat1', 'string'),
 ('cat2', 'string'),
 ('cat3', 'string'),
 ('cat4', 'string'),
 ('cat5', 'string'),
 ('cat6', 'string'),
 ('cat7', 'string'),
 ('cat8', 'string'),
 ('cat9', 'string'),
 ('cat10', 'string'),
 ('cat11', 'string'),
 ('cat12', 'string'),
 ('cat13', 'string'),
 ('cat14', 'string'),
 ('cat15', 'string'),
 ('cat16', 'string'),
 ('cat17', 'string'),
 ('cat18', 'string'),
 ('cat19', 'string'),
 ('cat20', 'string'),
 ('cat21', 'string'),
 ('cat22', 'string'),
 ('cat23', 'string'),
 ('cat24', 'string'),
 ('cat25', 'string'),
 ('cat26', 'string'),
 ('cat27', 'string'),
 ('cat28', 'string'),
 ('cat29', 'string'),
 ('cat30', 'string'),
 ('cat31', 'string'),
 ('cat32', 'string'),
 ('cat33', 'string'),
 ('cat34', 'string'),
 ('cat35', 'string'),
 ('cat36', 'string'),
 ('cat37', 'string'),
 ('cat38', 'string'),
 ('cat39', 'string'),
 ('cat40', 'string'),
 ('cat41', 'string'),
 ('cat42', 'string'),
 ('cat43', 'string'),
 ('cat44', 'string'),
 ('cat45', 'string'),
 ('cat46', 'string'

In [17]:
len(str_columns), len(int_columns)

(116, 15)

In [18]:
stringindexer_stages = [ StringIndexer(inputCol=c, outputCol='stringindexed_' + c) for c in str_columns]
stringindexer_stages

[StringIndexer_c009d93b09b0,
 StringIndexer_f5785f889254,
 StringIndexer_f62542965c38,
 StringIndexer_f7f4ef28f582,
 StringIndexer_0861c120a28f,
 StringIndexer_7bc68788411f,
 StringIndexer_e2f202c6448f,
 StringIndexer_8047ce6c4c97,
 StringIndexer_be0025a91c43,
 StringIndexer_3ae27d37c736,
 StringIndexer_77fcb2042432,
 StringIndexer_c734bb5688f7,
 StringIndexer_2acd84c40117,
 StringIndexer_e84161b3c64c,
 StringIndexer_c8d002409de9,
 StringIndexer_e506e80fd811,
 StringIndexer_d2edb679949d,
 StringIndexer_ca4cf0ce687f,
 StringIndexer_36f365166576,
 StringIndexer_f3b6ad07ca32,
 StringIndexer_b81ce204b318,
 StringIndexer_a2f1abde9b66,
 StringIndexer_c3846ade045d,
 StringIndexer_bc2be7aaefde,
 StringIndexer_7762f20275f0,
 StringIndexer_2170ce7cc8e8,
 StringIndexer_9dde68dc6017,
 StringIndexer_08ec56474dce,
 StringIndexer_ee6448e8a5bc,
 StringIndexer_b9c5b7a444ad,
 StringIndexer_041b731a7442,
 StringIndexer_410af5e00b4d,
 StringIndexer_ae5d487aa0b7,
 StringIndexer_a1cd9aa0b2a6,
 StringIndexer

In [19]:
in_cols = ['stringindexed_' + c for c in str_columns]
ohe_cols = ['onehotencoded_' + c  for c in str_columns]
onehotencoder_stages = [OneHotEncoderEstimator(inputCols=in_cols, outputCols=ohe_cols)]
onehotencoder_stages

[OneHotEncoderEstimator_d6637caf4f52]

In [20]:
ohe_cols, len(ohe_cols)

(['onehotencoded_cat1',
  'onehotencoded_cat2',
  'onehotencoded_cat3',
  'onehotencoded_cat4',
  'onehotencoded_cat5',
  'onehotencoded_cat6',
  'onehotencoded_cat7',
  'onehotencoded_cat8',
  'onehotencoded_cat9',
  'onehotencoded_cat10',
  'onehotencoded_cat11',
  'onehotencoded_cat12',
  'onehotencoded_cat13',
  'onehotencoded_cat14',
  'onehotencoded_cat15',
  'onehotencoded_cat16',
  'onehotencoded_cat17',
  'onehotencoded_cat18',
  'onehotencoded_cat19',
  'onehotencoded_cat20',
  'onehotencoded_cat21',
  'onehotencoded_cat22',
  'onehotencoded_cat23',
  'onehotencoded_cat24',
  'onehotencoded_cat25',
  'onehotencoded_cat26',
  'onehotencoded_cat27',
  'onehotencoded_cat28',
  'onehotencoded_cat29',
  'onehotencoded_cat30',
  'onehotencoded_cat31',
  'onehotencoded_cat32',
  'onehotencoded_cat33',
  'onehotencoded_cat34',
  'onehotencoded_cat35',
  'onehotencoded_cat36',
  'onehotencoded_cat37',
  'onehotencoded_cat38',
  'onehotencoded_cat39',
  'onehotencoded_cat40',
  'onehot

In [21]:
int_columns

['cont1',
 'cont2',
 'cont3',
 'cont4',
 'cont5',
 'cont6',
 'cont7',
 'cont8',
 'cont9',
 'cont10',
 'cont11',
 'cont12',
 'cont13',
 'cont14',
 'loss']

In [24]:
int_columns.remove('loss')

ValueError: list.remove(x): x not in list

In [23]:
featuresCols = int_columns + ohe_cols
print(featuresCols)
len(featuresCols)

['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14', 'onehotencoded_cat1', 'onehotencoded_cat2', 'onehotencoded_cat3', 'onehotencoded_cat4', 'onehotencoded_cat5', 'onehotencoded_cat6', 'onehotencoded_cat7', 'onehotencoded_cat8', 'onehotencoded_cat9', 'onehotencoded_cat10', 'onehotencoded_cat11', 'onehotencoded_cat12', 'onehotencoded_cat13', 'onehotencoded_cat14', 'onehotencoded_cat15', 'onehotencoded_cat16', 'onehotencoded_cat17', 'onehotencoded_cat18', 'onehotencoded_cat19', 'onehotencoded_cat20', 'onehotencoded_cat21', 'onehotencoded_cat22', 'onehotencoded_cat23', 'onehotencoded_cat24', 'onehotencoded_cat25', 'onehotencoded_cat26', 'onehotencoded_cat27', 'onehotencoded_cat28', 'onehotencoded_cat29', 'onehotencoded_cat30', 'onehotencoded_cat31', 'onehotencoded_cat32', 'onehotencoded_cat33', 'onehotencoded_cat34', 'onehotencoded_cat35', 'onehotencoded_cat36', 'onehotencoded_cat37', 'onehotencoded_cat38', 'on

130

In [25]:
vectorassembler = VectorAssembler(
                                  inputCols=featuresCols,
                                  outputCol="rawFeatures"
                                 )

In [26]:
vectorassembler

VectorAssembler_a51e0edbeb8e

# Data splitting and modeling 

In [27]:
train, test = df.randomSplit([0.7, 0.3])

In [28]:
train.count(), test.count()

(131567, 56751)

In [29]:
##scaler = StandardScaler(inputCol="rawFeatures",
##                        outputCol="scaledFeatures",
##                        withStd=True,
##                        withMean=True
                       )

IndentationError: unexpected indent (<ipython-input-29-5da0aab24951>, line 5)

In [30]:
vectorindexer = VectorIndexer(inputCol="rawFeatures",
                              outputCol="features",
                              maxCategories=5       # Reindex, if no of distinct values less than 5
                            )

# Build Cross Validation Object on GBTRegressor Algorithm with Params and Metrics for Evaluator

In [32]:
gbt = GBTRegressor(labelCol="loss")

In [33]:
paramGrid = ParamGridBuilder()\
                         .addGrid(gbt.maxDepth, [2, 5]) \
                         .addGrid(gbt.maxIter, [10, 100])\
                         .build()


In [34]:
evaluator = RegressionEvaluator(
                               metricName="rmse",                   # This is default
                               labelCol="loss",          # Actual col value
                               predictionCol=gbt.getPredictionCol() # predicted col value
                               )

In [35]:
cv = CrossValidator(estimator=gbt,
                    evaluator=evaluator,
                    estimatorParamMaps=paramGrid
                    )

# 9.2 Create pipeline model

In [36]:

pipeline = Pipeline(stages=[*stringindexer_stages,*onehotencoder_stages,vectorassembler,vectorindexer, gbt])

# Run the Pipeline

In [37]:
start = time.time()
pipelineModel = pipeline.fit(train)
end = time.time()
(end - start)/60

21.596311223506927

# Test Predictions & evaluation

In [38]:
TestPred = pipelineModel.transform(test)
type(TestPred)               # Spark Dataframe
TestPred.columns

pyspark.sql.dataframe.DataFrame

['cat1',
 'cat2',
 'cat3',
 'cat4',
 'cat5',
 'cat6',
 'cat7',
 'cat8',
 'cat9',
 'cat10',
 'cat11',
 'cat12',
 'cat13',
 'cat14',
 'cat15',
 'cat16',
 'cat17',
 'cat18',
 'cat19',
 'cat20',
 'cat21',
 'cat22',
 'cat23',
 'cat24',
 'cat25',
 'cat26',
 'cat27',
 'cat28',
 'cat29',
 'cat30',
 'cat31',
 'cat32',
 'cat33',
 'cat34',
 'cat35',
 'cat36',
 'cat37',
 'cat38',
 'cat39',
 'cat40',
 'cat41',
 'cat42',
 'cat43',
 'cat44',
 'cat45',
 'cat46',
 'cat47',
 'cat48',
 'cat49',
 'cat50',
 'cat51',
 'cat52',
 'cat53',
 'cat54',
 'cat55',
 'cat56',
 'cat57',
 'cat58',
 'cat59',
 'cat60',
 'cat61',
 'cat62',
 'cat63',
 'cat64',
 'cat65',
 'cat66',
 'cat67',
 'cat68',
 'cat69',
 'cat70',
 'cat71',
 'cat72',
 'cat73',
 'cat74',
 'cat75',
 'cat76',
 'cat77',
 'cat78',
 'cat79',
 'cat80',
 'cat81',
 'cat82',
 'cat83',
 'cat84',
 'cat85',
 'cat86',
 'cat87',
 'cat88',
 'cat89',
 'cat90',
 'cat91',
 'cat92',
 'cat93',
 'cat94',
 'cat95',
 'cat96',
 'cat97',
 'cat98',
 'cat99',
 'cat100',
 'cat101

In [40]:

TestPred.select("loss", "prediction","rawFeatures","features").show(4)

+--------+------------------+--------------------+--------------------+
|    loss|        prediction|         rawFeatures|            features|
+--------+------------------+--------------------+--------------------+
|13060.49|3852.4314130148123|(1002,[0,1,2,3,4,...|(1002,[0,1,2,3,4,...|
|  1736.8|3695.1504840652365|(1002,[0,1,2,3,4,...|(1002,[0,1,2,3,4,...|
| 2377.72| 2438.393046529025|(1002,[0,1,2,3,4,...|(1002,[0,1,2,3,4,...|
| 1347.16|3415.5908951531414|(1002,[0,1,2,3,4,...|(1002,[0,1,2,3,4,...|
+--------+------------------+--------------------+--------------------+
only showing top 4 rows



# Evaluate results

In [41]:
# Ref: http://spark.apache.org/docs/2.2.0/api/python/pyspark.ml.html#pyspark.ml.evaluation.RegressionEvaluator
# Create evaluator object.  class is, as:
#  RegressionEvaluator(self, predictionCol="prediction", labelCol="label", metricName="rmse")
# 10.4
eval = RegressionEvaluator(predictionCol="prediction",
                           labelCol = 'loss',
                            metricName="rmse"
                          )

In [42]:
eval

RegressionEvaluator_f2719243033a

In [43]:
RMSE = eval.evaluate(TestPred)
# 10.6
print ("RMSE on our test set: %g" % RMSE)

Py4JJavaError: An error occurred while calling o7429.evaluate.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 529.0 failed 1 times, most recent failure: Lost task 1.0 in stage 529.0 (TID 2027, localhost, executor driver): org.apache.spark.SparkException: Failed to execute user defined function($anonfun$9: (string) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_12$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:157)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.aggregate(TraversableOnce.scala:214)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1334)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1145)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1145)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$25.apply(RDD.scala:1146)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$25.apply(RDD.scala:1146)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Unseen label: AS.  To handle unseen labels, set Param handleInvalid to keep.
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$9.apply(StringIndexer.scala:260)
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$9.apply(StringIndexer.scala:246)
	... 35 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
	at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1098)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1092)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1161)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1137)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.summary$lzycompute(RegressionMetrics.scala:57)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.summary(RegressionMetrics.scala:54)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.SSerr$lzycompute(RegressionMetrics.scala:65)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.SSerr(RegressionMetrics.scala:65)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.meanSquaredError(RegressionMetrics.scala:100)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.rootMeanSquaredError(RegressionMetrics.scala:109)
	at org.apache.spark.ml.evaluation.RegressionEvaluator.evaluate(RegressionEvaluator.scala:86)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function($anonfun$9: (string) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_12$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:157)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.aggregate(TraversableOnce.scala:214)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1334)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1145)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1145)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$25.apply(RDD.scala:1146)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$25.apply(RDD.scala:1146)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: org.apache.spark.SparkException: Unseen label: AS.  To handle unseen labels, set Param handleInvalid to keep.
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$9.apply(StringIndexer.scala:260)
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$9.apply(StringIndexer.scala:246)
	... 35 more
