# CEBD 1261 Winter 2020
## Final Project: Mushroom classification (Poisonous (p) vs. Edible (e))
### Data source: https://www.kaggle.com/uciml/mushroom-classification 
### By: Pawel Kaluski


Searching for data to use for my project I found this one. It is a classification problem. The challenges with this dataset are that it only has characters and no numbers. It requires alot of encoding. The column names use a '-' to separate words. This creates issues.

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql as sparksql
spark = SparkSession.builder.appName('mushrooms').getOrCreate()
train = spark.read.csv('mushrooms.csv', inferSchema=True,header=True)
import pandas as pd

### Used Python to make sure there were no nan in any columns

In [None]:
# testing data for nan
df = pd.read_csv('mushrooms.csv')

In [None]:
# get info of missing data for each col by creading data frame that contains col's name and its NaN value counts
nan_info = pd.DataFrame(df.isnull().sum()).reset_index()
nan_info.columns = ['col','nan_cnt']
nan_info.sort_values(by = 'nan_cnt',ascending=False,inplace=True)
nan_info

### We see there are no nan values in any columns

In [None]:
train.printSchema()

## Next we will look at te different features to determine what's in them

In [None]:
# Our Target
train.groupBy('class').count().show()

In [None]:
train.groupBy('cap-shape').count().show()

In [None]:
train.groupBy('cap-surface').count().show()

In [None]:
train.groupBy('cap-color').count().show()

In [None]:
train.groupBy('bruises').count().show()

In [None]:
train.groupBy('odor').count().show()

In [None]:
train.groupBy('gill-attachment').count().show()

In [None]:
train.groupBy('gill-spacing').count().show()

In [None]:
train.groupBy('gill-size').count().show()

In [None]:
train.groupBy('gill-color').count().show()

In [None]:
train.groupBy('stalk-shape').count().show()

In [None]:
train.groupBy('stalk-root').count().show()

#### We can see we have 2480 missing values we can exclude this column in the MVP

In [None]:
train.groupBy('stalk-surface-above-ring').count().show()

In [None]:
train.groupBy('stalk-surface-below-ring').count().show()

In [None]:
train.groupBy('stalk-color-above-ring').count().show()

In [None]:
train.groupBy('stalk-color-below-ring').count().show()

In [None]:
train.groupBy('veil-color').count().show()

In [None]:
train.groupBy('veil-type').count().show()

#### Since this feature adds no value it will not be used in our model

In [None]:
train.groupBy('ring-number').count().show()

In [None]:
train.groupBy('ring-type').count().show()

In [None]:
train.groupBy('spore-print-color').count().show()

In [None]:
train.groupBy('population').count().show()

In [None]:
train.groupBy('habitat').count().show()

### We will remove 'veil-type' and 'stalk-root'

In [2]:
train = train.select('class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat')
cols = train.columns
train.printSchema()

root
 |-- class: string (nullable = true)
 |-- cap-shape: string (nullable = true)
 |-- cap-surface: string (nullable = true)
 |-- cap-color: string (nullable = true)
 |-- bruises: string (nullable = true)
 |-- odor: string (nullable = true)
 |-- gill-attachment: string (nullable = true)
 |-- gill-spacing: string (nullable = true)
 |-- gill-size: string (nullable = true)
 |-- gill-color: string (nullable = true)
 |-- stalk-shape: string (nullable = true)
 |-- stalk-surface-above-ring: string (nullable = true)
 |-- stalk-surface-below-ring: string (nullable = true)
 |-- stalk-color-above-ring: string (nullable = true)
 |-- stalk-color-below-ring: string (nullable = true)
 |-- veil-color: string (nullable = true)
 |-- ring-number: string (nullable = true)
 |-- ring-type: string (nullable = true)
 |-- spore-print-color: string (nullable = true)
 |-- population: string (nullable = true)
 |-- habitat: string (nullable = true)



In [None]:
# we will look at the first 5 rows to see if data is still ok and confirm the columns were removed
import pandas as pd
pd.DataFrame(train.take(5), columns=train.columns).transpose()

### This part is where the encoding takes place. (Converting labels to numbers)

In [None]:
from pyspark.ml.feature import (VectorAssembler,OneHotEncoder,StringIndexer)

In [None]:
cap_shape_indexer = StringIndexer(inputCol='cap-shape',outputCol='cap_shapeIndex')
cap_shape_encoder = OneHotEncoder(inputCol='cap_shapeIndex',outputCol='cap_shapeVec')

In [None]:
cap_surface_indexer = StringIndexer(inputCol='cap-surface',outputCol='cap_surfaceIndex')
cap_surface_encoder = OneHotEncoder(inputCol='cap_surfaceIndex',outputCol='cap_surfaceVec')

In [None]:
cap_color_indexer = StringIndexer(inputCol='cap-color',outputCol='cap_colorIndex')
cap_color_encoder = OneHotEncoder(inputCol='cap_colorIndex',outputCol='cap_colorVec')

In [None]:
bruises_indexer = StringIndexer(inputCol='bruises',outputCol='bruisesIndex')
bruises_encoder = OneHotEncoder(inputCol='bruisesIndex',outputCol='bruisesVec')

In [None]:
odor_indexer = StringIndexer(inputCol='odor',outputCol='odorIndex')
odor_encoder = OneHotEncoder(inputCol='odorIndex',outputCol='odorVec')

In [None]:
gill_attachment_indexer = StringIndexer(inputCol='gill-attachment',outputCol='gill_attachmentIndex')
gill_attachment_encoder = OneHotEncoder(inputCol='gill_attachmentIndex',outputCol='gill_attachmentVec')

In [None]:
gill_spacing_indexer = StringIndexer(inputCol='gill-spacing',outputCol='gill_spacingIndex')
gill_spacing_encoder = OneHotEncoder(inputCol='gill_spacingIndex',outputCol='gill_spacingVec')

In [None]:
gill_size_indexer = StringIndexer(inputCol='gill-size',outputCol='gill_sizeIndex')
gill_size_encoder = OneHotEncoder(inputCol='gill_sizeIndex',outputCol='gill_sizeVec')

In [None]:
gill_color_indexer = StringIndexer(inputCol='gill-color',outputCol='gill_colorIndex')
gill_color_encoder = OneHotEncoder(inputCol='gill_colorIndex',outputCol='gill_colorVec')

In [None]:
stalk_shape_indexer = StringIndexer(inputCol='stalk-shape',outputCol='stalk_shapeIndex')
stalk_shape_encoder = OneHotEncoder(inputCol='stalk_shapeIndex',outputCol='stalk_shapeVec')

In [None]:
# Excluded due to 2480 missing values 
#stalk_root_indexer = StringIndexer(inputCol='stalk-root',outputCol='stalk_rootIndex')
#stalk_root_encoder = OneHotEncoder(inputCol='stalk_rootIndex',outputCol='stalk_rootVec')

In [None]:
stalk_surface_above_ring_indexer = StringIndexer(inputCol='stalk-surface-above-ring',outputCol='stalk_surface_above_ringIndex')
stalk_surface_above_ring_encoder = OneHotEncoder(inputCol='stalk_surface_above_ringIndex',outputCol='stalk_surface_above_ringVec')

In [None]:
stalk_surface_below_ring_indexer = StringIndexer(inputCol='stalk-surface-below-ring',outputCol='stalk_surface_below_ringIndex')
stalk_surface_below_ring_encoder = OneHotEncoder(inputCol='stalk_surface_below_ringIndex',outputCol='stalk_surface_below_ringVec')

In [None]:
stalk_color_above_ring_indexer = StringIndexer(inputCol='stalk-color-above-ring',outputCol='stalk_color_above_ringIndex')
stalk_color_above_ring_encoder = OneHotEncoder(inputCol='stalk_color_above_ringIndex',outputCol='stalk_color_above_ringVec')

In [None]:
stalk_color_below_ring_indexer = StringIndexer(inputCol='stalk-color-below-ring',outputCol='stalk_color_below_ringIndex')
stalk_color_below_ring_encoder = OneHotEncoder(inputCol='stalk_color_below_ringIndex',outputCol='stalk_color_below_ringVec')

In [None]:
# Commented out since we will not be using it
#veil_type_indexer = StringIndexer(inputCol='veil-type',outputCol='veil_typeIndex')
#veil_type_encoder = OneHotEncoder(inputCol='veil_typeIndex',outputCol='veil_typeVec')

In [None]:
veil_color_indexer = StringIndexer(inputCol='veil-color',outputCol='veil_colorIndex')
veil_color_encoder = OneHotEncoder(inputCol='veil_colorIndex',outputCol='veil_colorVec')

In [None]:
ring_number_indexer = StringIndexer(inputCol='ring-number',outputCol='ring_numberIndex')
ring_number_encoder = OneHotEncoder(inputCol='ring_numberIndex',outputCol='ring_numberVec')

In [None]:
ring_type_indexer = StringIndexer(inputCol='ring-type',outputCol='ring_typeIndex')
ring_type_encoder = OneHotEncoder(inputCol='ring_typeIndex',outputCol='ring_typeVec')

In [None]:
spore_print_color_indexer = StringIndexer(inputCol='spore-print-color',outputCol='spore_print_colorIndex')
spore_print_color_encoder = OneHotEncoder(inputCol='spore_print_colorIndex',outputCol='spore_print_colorVec')

In [None]:
population_indexer = StringIndexer(inputCol='population',outputCol='populationIndex')
population_encoder = OneHotEncoder(inputCol='populationIndex',outputCol='populationVec')

In [None]:
habitat_indexer = StringIndexer(inputCol='habitat',outputCol='habitatIndex')
habitat_encoder = OneHotEncoder(inputCol='habitatIndex',outputCol='habitatVec')

In [None]:
#veil_type and stalk_root exluded
assembler = VectorAssembler(inputCols=['cap_shapeVec', 'cap_surfaceVec', 'cap_colorVec',
 'bruisesVec', 'odorVec', 'gill_attachmentVec', 'gill_spacingVec', 'gill_sizeVec',
 'gill_colorVec', 'stalk_shapeVec', 'stalk_surface_above_ringVec', 'stalk_surface_below_ringVec',
 'stalk_color_above_ringVec', 'stalk_color_below_ringVec', 'veil_colorVec', 'ring_numberVec',
 'ring_typeVec', 'spore_print_colorVec', 'populationVec', 'habitatVec'], outputCol= 'features')

### this was a different way I tried to make it work without success

In [3]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

categoricalColumns = ['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']
stages = []
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index')
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]
    label_stringIdx = StringIndexer(inputCol = 'class', outputCol = 'label')
    assemblerInputs = [c + "classVec" for c in categoricalColumns]
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [4]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages = stages)
pipelineModel = pipeline.fit(train)
train = pipelineModel.transform(train)
selectedCols = ['label','features'] + cols
train = train.select(selectedCols)
train.printSchema()

AnalysisException: "cannot resolve '`label`' given input columns: [ring-typeclassVec, stalk-color-above-ringclassVec, gill-colorclassVec, ring-typeIndex, cap-shapeIndex, stalk-color-below-ring, veil-colorIndex, stalk-color-below-ringIndex, gill-spacingclassVec, spore-print-colorclassVec, bruises, odorIndex, cap-surfaceIndex, cap-surfaceclassVec, stalk-surface-below-ring, ring-numberIndex, habitat, gill-sizeclassVec, cap-colorclassVec, gill-size, gill-attachmentclassVec, stalk-surface-below-ringclassVec, gill-colorIndex, habitatIndex, features, populationclassVec, gill-color, ring-numberclassVec, stalk-shapeclassVec, cap-surface, cap-colorIndex, stalk-color-above-ringIndex, stalk-surface-above-ringIndex, gill-spacingIndex, gill-spacing, ring-number, veil-color, stalk-surface-above-ringclassVec, population, stalk-color-below-ringclassVec, gill-sizeIndex, gill-attachmentIndex, spore-print-color, class, stalk-surface-above-ring, cap-shape, gill-attachment, stalk-shape, spore-print-colorIndex, cap-shapeclassVec, odorclassVec, cap-color, stalk-shapeIndex, stalk-color-above-ring, bruisesIndex, bruisesclassVec, populationIndex, veil-colorclassVec, odor, stalk-surface-below-ringIndex, ring-type, habitatclassVec];;\n'Project ['label, features#4334, class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32]\n+- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 38 more fields]\n   +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 37 more fields]\n      +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 36 more fields]\n         +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 35 more fields]\n            +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 34 more fields]\n               +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 33 more fields]\n                  +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 32 more fields]\n                     +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 31 more fields]\n                        +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 30 more fields]\n                           +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 29 more fields]\n                              +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 28 more fields]\n                                 +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 27 more fields]\n                                    +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 26 more fields]\n                                       +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 25 more fields]\n                                          +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 24 more fields]\n                                             +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 23 more fields]\n                                                +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 22 more fields]\n                                                   +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 21 more fields]\n                                                      +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 20 more fields]\n                                                         +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 19 more fields]\n                                                            +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 18 more fields]\n                                                               +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 17 more fields]\n                                                                  +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 16 more fields]\n                                                                     +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 15 more fields]\n                                                                        +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 14 more fields]\n                                                                           +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 13 more fields]\n                                                                              +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 12 more fields]\n                                                                                 +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 11 more fields]\n                                                                                    +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 10 more fields]\n                                                                                       +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 9 more fields]\n                                                                                          +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 8 more fields]\n                                                                                             +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 7 more fields]\n                                                                                                +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 6 more fields]\n                                                                                                   +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 5 more fields]\n                                                                                                      +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 4 more fields]\n                                                                                                         +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 3 more fields]\n                                                                                                            +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, ... 2 more fields]\n                                                                                                               +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, cap-surfaceIndex#2662, if ((isnull(cast(cap-surfaceIndex#2662 as double)) || isnull(0))) null else UDF(cast(cap-surfaceIndex#2662 as double), 0) AS cap-surfaceclassVec#2688]\n                                                                                                                  +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, cap-shapeclassVec#2638, UDF(cast(cap-surface#12 as string)) AS cap-surfaceIndex#2662]\n                                                                                                                     +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, cap-shapeIndex#2614, if ((isnull(cast(cap-shapeIndex#2614 as double)) || isnull(0))) null else UDF(cast(cap-shapeIndex#2614 as double), 0) AS cap-shapeclassVec#2638]\n                                                                                                                        +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32, UDF(cast(cap-shape#11 as string)) AS cap-shapeIndex#2614]\n                                                                                                                           +- Project [class#10, cap-shape#11, cap-surface#12, cap-color#13, bruises#14, odor#15, gill-attachment#16, gill-spacing#17, gill-size#18, gill-color#19, stalk-shape#20, stalk-surface-above-ring#22, stalk-surface-below-ring#23, stalk-color-above-ring#24, stalk-color-below-ring#25, veil-color#27, ring-number#28, ring-type#29, spore-print-color#30, population#31, habitat#32]\n                                                                                                                              +- Relation[class#10,cap-shape#11,cap-surface#12,cap-color#13,bruises#14,odor#15,gill-attachment#16,gill-spacing#17,gill-size#18,gill-color#19,stalk-shape#20,stalk-root#21,stalk-surface-above-ring#22,stalk-surface-below-ring#23,stalk-color-above-ring#24,stalk-color-below-ring#25,veil-type#26,veil-color#27,ring-number#28,ring-type#29,spore-print-color#30,population#31,habitat#32] csv\n"

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
dtc = DecisionTreeClassifier(labelCol='class',featuresCol='features')

In [None]:
# Creating the Pipline, I will remove 'veil-type' in this stage as it only has one character

from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[cap_shape_indexer, cap_surface_indexer, cap_color_indexer, bruises_indexer, 
                            odor_indexer, gill_attachment_indexer, gill_spacing_indexer, gill_size_indexer,
                            gill_color_indexer, stalk_shape_indexer, stalk_surface_above_ring_indexer,
                            stalk_surface_below_ring_indexer, stalk_color_above_ring_indexer,stalk_color_below_ring_indexer,
                            veil_color_indexer, ring_number_indexer, ring_type_indexer, 
                            spore_print_color_indexer, population_indexer, habitat_indexer,
                            cap_shape_encoder, cap_surface_encoder, cap_color_encoder,bruises_encoder, 
                            odor_encoder, gill_attachment_encoder, gill_spacing_encoder, gill_size_encoder, 
                            gill_color_encoder, stalk_shape_encoder, stalk_surface_above_ring_encoder,
                            stalk_surface_below_ring_encoder, stalk_color_above_ring_encoder, stalk_color_below_ring_encoder,
                            veil_color_encoder, ring_number_encoder, ring_type_encoder, 
                            spore_print_color_encoder, population_encoder, habitat_encoder, assembler, dtc])

In [None]:
# Splitnig the data train/test (73/23) using random split
train_data,test_data = train.randomSplit([0.77,0.23])

In [None]:
# Fitting the model by using the pipline and train_data
model = pipeline.fit(train_data)

In [None]:
pd.DataFrame(test_data.take(5), columns=test_data.columns).transpose()

In [None]:
pd.DataFrame(train_d.take(5), columns=train_d.columns).transpose()