# CEBD 1261 Winter 2020
## Final Project: Mushroom classification (Poisonous (p) vs. Edible (e))
### Data source: https://www.kaggle.com/uciml/mushroom-classification 
### By: Pawel Kaluski


While searching for data to use for my project, I found this dataset. It is a classification problem. The challenge with this dataset is that it contains only characters and no numbers, so it requires a lot of encoding. The second issue was getting it to fit the model with the pipeline.

In [8]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.sql as sparksql
# Get the Spark session for this notebook and load the mushroom CSV; the first
# row is read as the header and column types are inferred.
spark = (
    SparkSession.builder
    .appName('mushrooms')
    .getOrCreate()
)
train = spark.read.csv(
    'mushrooms.csv',
    header=True,
    inferSchema=True,
)
import pandas as pd

### Used pandas to make sure there are no NaN values in any column

In [None]:
# Load the same CSV with pandas so the columns can be checked for NaN values
df = pd.read_csv('mushrooms.csv')

In [None]:
# Build a two-column table (column name, number of NaN entries) and list it
# with the highest NaN count first
nan_info = (
    df.isnull()
    .sum()
    .rename_axis('col')
    .reset_index(name='nan_cnt')
    .sort_values(by='nan_cnt', ascending=False)
)
nan_info

### We see there are no NaN values in any column

In [3]:
# Print every column's name and the type Spark inferred for it
train.printSchema()

root
 |-- class: string (nullable = true)
 |-- cap-shape: string (nullable = true)
 |-- cap-surface: string (nullable = true)
 |-- cap-color: string (nullable = true)
 |-- bruises: string (nullable = true)
 |-- odor: string (nullable = true)
 |-- gill-attachment: string (nullable = true)
 |-- gill-spacing: string (nullable = true)
 |-- gill-size: string (nullable = true)
 |-- gill-color: string (nullable = true)
 |-- stalk-shape: string (nullable = true)
 |-- stalk-root: string (nullable = true)
 |-- stalk-surface-above-ring: string (nullable = true)
 |-- stalk-surface-below-ring: string (nullable = true)
 |-- stalk-color-above-ring: string (nullable = true)
 |-- stalk-color-below-ring: string (nullable = true)
 |-- veil-type: string (nullable = true)
 |-- veil-color: string (nullable = true)
 |-- ring-number: string (nullable = true)
 |-- ring-type: string (nullable = true)
 |-- spore-print-color: string (nullable = true)
 |-- population: string (nullable = true)
 |-- habitat: string 

## Next we will look at the different features to determine what's in them

In [None]:
# Our Target: number of rows for each value of 'class'.
# The cells that follow print the same per-value row counts, one feature column per cell.
train.groupBy('class').count().show()

In [None]:
train.groupBy('cap-shape').count().show()

In [None]:
train.groupBy('cap-surface').count().show()

In [None]:
train.groupBy('cap-color').count().show()

In [None]:
train.groupBy('bruises').count().show()

In [None]:
train.groupBy('odor').count().show()

In [None]:
train.groupBy('gill-attachment').count().show()

In [None]:
train.groupBy('gill-spacing').count().show()

In [None]:
train.groupBy('gill-size').count().show()

In [None]:
train.groupBy('gill-color').count().show()

In [None]:
train.groupBy('stalk-shape').count().show()

In [None]:
train.groupBy('stalk-root').count().show()

#### We can see that 2480 values are missing in 'stalk-root', so we can exclude this column in the MVP

In [None]:
# Per-value row counts for the stalk surface/color and veil columns, one column per cell
train.groupBy('stalk-surface-above-ring').count().show()

In [None]:
train.groupBy('stalk-surface-below-ring').count().show()

In [None]:
train.groupBy('stalk-color-above-ring').count().show()

In [None]:
train.groupBy('stalk-color-below-ring').count().show()

In [None]:
train.groupBy('veil-color').count().show()

In [None]:
train.groupBy('veil-type').count().show()

#### Since 'veil-type' adds no value, it will not be used in our model

In [None]:
# Per-value row counts for the ring, spore-print, population and habitat columns, one column per cell
train.groupBy('ring-number').count().show()

In [None]:
train.groupBy('ring-type').count().show()

In [None]:
train.groupBy('spore-print-color').count().show()

In [None]:
train.groupBy('population').count().show()

In [None]:
train.groupBy('habitat').count().show()

### We will remove 'veil-type' and 'stalk-root'

In [9]:
# Drop the two columns ruled out during exploration instead of re-listing the
# 21 columns that are kept: 'veil-type' adds no value and 'stalk-root' has
# 2480 missing values. drop() leaves the remaining columns in their original
# order, so the resulting schema is the same as with the explicit select().
train = train.drop('veil-type', 'stalk-root')
cols = train.columns
train.printSchema()

root
 |-- class: string (nullable = true)
 |-- cap-shape: string (nullable = true)
 |-- cap-surface: string (nullable = true)
 |-- cap-color: string (nullable = true)
 |-- bruises: string (nullable = true)
 |-- odor: string (nullable = true)
 |-- gill-attachment: string (nullable = true)
 |-- gill-spacing: string (nullable = true)
 |-- gill-size: string (nullable = true)
 |-- gill-color: string (nullable = true)
 |-- stalk-shape: string (nullable = true)
 |-- stalk-surface-above-ring: string (nullable = true)
 |-- stalk-surface-below-ring: string (nullable = true)
 |-- stalk-color-above-ring: string (nullable = true)
 |-- stalk-color-below-ring: string (nullable = true)
 |-- veil-color: string (nullable = true)
 |-- ring-number: string (nullable = true)
 |-- ring-type: string (nullable = true)
 |-- spore-print-color: string (nullable = true)
 |-- population: string (nullable = true)
 |-- habitat: string (nullable = true)



In [None]:
# Preview the first 5 rows (transposed, so each record is shown as a column) to
# check that the data still looks right and that the two columns were removed
import pandas as pd
pd.DataFrame(
    data=train.take(5),
    columns=train.columns,
).T

### This part is where the encoding takes place (converting labels to numbers). The Databricks example was used

In [10]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
try:
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    # Spark 3.0 renamed OneHotEncoderEstimator to OneHotEncoder; alias it so
    # the rest of the notebook runs unchanged on either version.
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

In [11]:
# Databricks example: build the pipeline stages that index and one-hot encode
# every categorical feature, index the target column, and assemble all encoded
# features into a single 'features' vector.
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
try:
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    # Spark 3.0 renamed OneHotEncoderEstimator to OneHotEncoder; alias it so
    # the code below runs unchanged on either version.
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

categoricalColumns = ["cap-shape", "cap-surface", "cap-color", "bruises", "odor", "gill-attachment", "gill-spacing",
                      "gill-size", "gill-color", "stalk-shape", "stalk-surface-above-ring", "stalk-surface-below-ring",
                      "stalk-color-above-ring", "stalk-color-below-ring", "veil-color", "ring-number", "ring-type",
                      "spore-print-color", "population", "habitat"]

stages = []  # stages in our Pipeline
for categoricalCol in categoricalColumns:
    # Category indexing with StringIndexer: writes a numeric "<column>Index" column
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    # One-hot encode the index into a binary SparseVector column "<column>classVec"
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    # Add stages.  These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

# Convert the 'class' target into numeric label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="class", outputCol="label")
stages += [label_stringIdx]

# Combine every one-hot vector into the single 'features' column
assemblerInputs = [c + "classVec" for c in categoricalColumns]
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [12]:
from pyspark.ml.classification import LogisticRegression

# Fit the indexing/encoding stages on the full table, then use the fitted
# pipeline to add the 'label' and 'features' columns to it.
partialPipeline = Pipeline().setStages(stages)
pipelineModel = partialPipeline.fit(train)
preppedDataDF = pipelineModel.transform(train)

# No classifier is fitted here: the logistic regression is trained further
# down on the training split only. Fitting one on all of preppedDataDF would
# train on the rows later held out for testing, and its result was never used.

In [13]:
# Show the original columns together with the columns added by the pipeline
preppedDataDF.printSchema()

root
 |-- class: string (nullable = true)
 |-- cap-shape: string (nullable = true)
 |-- cap-surface: string (nullable = true)
 |-- cap-color: string (nullable = true)
 |-- bruises: string (nullable = true)
 |-- odor: string (nullable = true)
 |-- gill-attachment: string (nullable = true)
 |-- gill-spacing: string (nullable = true)
 |-- gill-size: string (nullable = true)
 |-- gill-color: string (nullable = true)
 |-- stalk-shape: string (nullable = true)
 |-- stalk-surface-above-ring: string (nullable = true)
 |-- stalk-surface-below-ring: string (nullable = true)
 |-- stalk-color-above-ring: string (nullable = true)
 |-- stalk-color-below-ring: string (nullable = true)
 |-- veil-color: string (nullable = true)
 |-- ring-number: string (nullable = true)
 |-- ring-type: string (nullable = true)
 |-- spore-print-color: string (nullable = true)
 |-- population: string (nullable = true)
 |-- habitat: string (nullable = true)
 |-- cap-shapeIndex: double (nullable = false)
 |-- cap-shapeclas

In [14]:
# Keep only the target ('label') and the assembled feature vector ('features')
selectedcols = ["label", "features"]
dataset = preppedDataDF.select(*selectedcols)
display(dataset)

DataFrame[label: double, features: vector]

In [21]:
# Confirm that only the 'label' and 'features' columns remain
dataset.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)



In [23]:
# Number of rows for each numeric label value
dataset.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 4208|
|  1.0| 3916|
+-----+-----+



In [24]:
# Number of rows sharing each distinct feature vector
dataset.groupBy('features').count().show()

+--------------------+-----+
|            features|count|
+--------------------+-----+
|(91,[3,5,12,23,26...|    1|
|(91,[0,6,11,23,26...|    1|
|(91,[0,5,12,23,26...|    1|
|(91,[1,6,11,22,26...|    1|
|(91,[1,5,12,24,26...|    1|
|(91,[0,7,9,18,26,...|    1|
|(91,[1,7,9,17,18,...|    1|
|(91,[1,7,8,18,26,...|    1|
|(91,[0,7,9,17,19,...|    1|
|(91,[0,7,9,18,26,...|    1|
|(91,[1,5,9,18,26,...|    1|
|(91,[1,7,10,18,26...|    1|
|(91,[1,7,9,17,19,...|    1|
|(91,[1,7,9,17,19,...|    1|
|(91,[1,7,11,17,19...|    1|
|(91,[1,7,9,18,26,...|    1|
|(91,[1,5,11,17,19...|    1|
|(91,[0,7,11,17,19...|    1|
|(91,[0,5,12,17,18...|    1|
|(91,[12,18,26,31,...|    1|
+--------------------+-----+
only showing top 20 rows



In [15]:
# Random 70/30 split into training and test sets; the fixed seed makes the
# split reproducible. The size of each part is printed afterwards.
trainingData, testData = dataset.randomSplit(weights=[0.7, 0.3], seed=100)
print(trainingData.count())
print(testData.count())

5739
2385


In [26]:
from pyspark.ml.classification import LogisticRegression

# Train a logistic regression (maxIter=10) on the training split only
lr = LogisticRegression(
    labelCol='label',
    featuresCol='features',
    maxIter=10,
)
lrModel = lr.fit(trainingData)

In [27]:
# Make predictions on test data using the fitted model's transform() method.
# The model reads its input from the 'features' column configured above.
predictions = lrModel.transform(testData)