In [1]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import pandas as pd
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col

from pyspark.ml.regression import LinearRegression
from pyspark.mllib.evaluation import RegressionMetrics

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt


In [4]:
# Visualization
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 400)

from matplotlib import rcParams
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (18,4)})
rcParams['figure.figsize'] = 18,4

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [5]:
# Set the random seed for notebook reproducibility.
rnd_seed = 23

# Seed NumPy's global generator by *calling* np.random.seed. Assigning to
# np.random.seed / np.random.set_state would overwrite those functions with an
# int, leaving the generator unseeded and the NumPy API broken for later cells.
np.random.seed(rnd_seed)


In [6]:
# Create (or reuse) the Spark session for this notebook: master "local[2]", app name "mushrooms".
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("mushrooms")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/15 13:44:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
# Echo the session object as the cell output.
spark


In [8]:
# Keep a handle on the SparkContext behind the session, then echo it.
sc = spark.sparkContext
sc

In [9]:
# Build a SQLContext on the session's SparkContext and echo it.
# NOTE(review): the PySpark 3.x docs mark SQLContext as deprecated in favour of
# SparkSession, and `sqlContext` is not referenced again in the visible cells —
# confirm whether this cell is still needed.
sqlContext = SQLContext(spark.sparkContext)
sqlContext



<pyspark.sql.context.SQLContext at 0x15a68eb10>

In [10]:
# Relative path of the mushrooms CSV file.
MUSHROOMS_DATA = "mushrooms.csv"


In [12]:
# Load the dataset from the path held in MUSHROOMS_DATA (instead of repeating the
# literal), treating the first row as the header and letting Spark infer column types.
df_pyspark = spark.read.csv(MUSHROOMS_DATA, inferSchema=True, header=True)

                                                                                

In [13]:
# Preview the first five rows of the raw data.
df_pyspark.show(5)


+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+------------------------+------------------------+----------------------+----------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|class|cap-shape|cap-surface|cap-color|bruises|odor|gill-attachment|gill-spacing|gill-size|gill-color|stalk-shape|stalk-root|stalk-surface-above-ring|stalk-surface-below-ring|stalk-color-above-ring|stalk-color-below-ring|veil-type|veil-color|ring-number|ring-type|spore-print-color|population|habitat|
+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+------------------------+------------------------+----------------------+----------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|    p|        x|          s|        n|      t|   p|              f|           c|        n|   

In [14]:
# Print each column's name, type and nullability.
df_pyspark.printSchema()

root
 |-- class: string (nullable = true)
 |-- cap-shape: string (nullable = true)
 |-- cap-surface: string (nullable = true)
 |-- cap-color: string (nullable = true)
 |-- bruises: string (nullable = true)
 |-- odor: string (nullable = true)
 |-- gill-attachment: string (nullable = true)
 |-- gill-spacing: string (nullable = true)
 |-- gill-size: string (nullable = true)
 |-- gill-color: string (nullable = true)
 |-- stalk-shape: string (nullable = true)
 |-- stalk-root: string (nullable = true)
 |-- stalk-surface-above-ring: string (nullable = true)
 |-- stalk-surface-below-ring: string (nullable = true)
 |-- stalk-color-above-ring: string (nullable = true)
 |-- stalk-color-below-ring: string (nullable = true)
 |-- veil-type: string (nullable = true)
 |-- veil-color: string (nullable = true)
 |-- ring-number: string (nullable = true)
 |-- ring-type: string (nullable = true)
 |-- spore-print-color: string (nullable = true)
 |-- population: string (nullable = true)
 |-- habitat: string 

In [15]:
from pyspark.ml.feature import StringIndexer


In [16]:
# Snapshot the column names now; later cells add "_encoded" columns to df_pyspark
# and rely on this list still holding only the original names.
columns = df_pyspark.columns

In [17]:
# Index every categorical column with ONE multi-column StringIndexer instead of a
# separate fit/transform per column. Starting from only the raw `columns` also makes
# this cell safe to re-run: the "_encoded" outputs are rebuilt rather than colliding
# with the ones a previous run already added to df_pyspark.
encoded_names = [name + "_encoded" for name in columns]
raw_df = df_pyspark.select(*columns)

stringIndexer = StringIndexer(inputCols=columns, outputCols=encoded_names).fit(raw_df)
indexed_df = stringIndexer.transform(raw_df)

# Keep the raw columns followed by the indexed ones, cast to int exactly as the
# per-column version did.
df_pyspark = indexed_df.select(
    *columns,
    *[indexed_df[name].cast('int').alias(name) for name in encoded_names],
)

                                                                                

In [18]:
# Names of the indexed counterpart of every original column (the label included).
col_encoded = [f"{name}_encoded" for name in columns]

# Keep only the indexed columns for modelling.
encoded_df = df_pyspark.select(*col_encoded)

In [19]:
# Preview the first five rows of the indexed data.
encoded_df.show(5)

+-------------+-----------------+-------------------+-----------------+---------------+------------+-----------------------+--------------------+-----------------+------------------+-------------------+------------------+--------------------------------+--------------------------------+------------------------------+------------------------------+-----------------+------------------+-------------------+-----------------+-------------------------+------------------+---------------+
|class_encoded|cap-shape_encoded|cap-surface_encoded|cap-color_encoded|bruises_encoded|odor_encoded|gill-attachment_encoded|gill-spacing_encoded|gill-size_encoded|gill-color_encoded|stalk-shape_encoded|stalk-root_encoded|stalk-surface-above-ring_encoded|stalk-surface-below-ring_encoded|stalk-color-above-ring_encoded|stalk-color-below-ring_encoded|veil-type_encoded|veil-color_encoded|ring-number_encoded|ring-type_encoded|spore-print-color_encoded|population_encoded|habitat_encoded|
+-------------+-------------

In [20]:
from pyspark.ml.feature import VectorAssembler


In [21]:
# Exclude the label from the feature list. Rebuilding the list (rather than calling
# list.remove) keeps this cell re-runnable: a second run is a no-op instead of a
# ValueError because 'class_encoded' is already gone.
col_encoded = [name for name in col_encoded if name != 'class_encoded']

In [25]:
# Echo the feature column list as a sanity check (the label should no longer be in it).
col_encoded

['cap-shape_encoded',
 'cap-surface_encoded',
 'cap-color_encoded',
 'bruises_encoded',
 'odor_encoded',
 'gill-attachment_encoded',
 'gill-spacing_encoded',
 'gill-size_encoded',
 'gill-color_encoded',
 'stalk-shape_encoded',
 'stalk-root_encoded',
 'stalk-surface-above-ring_encoded',
 'stalk-surface-below-ring_encoded',
 'stalk-color-above-ring_encoded',
 'stalk-color-below-ring_encoded',
 'veil-type_encoded',
 'veil-color_encoded',
 'ring-number_encoded',
 'ring-type_encoded',
 'spore-print-color_encoded',
 'population_encoded',
 'habitat_encoded']

In [22]:
# Pack the feature columns listed in col_encoded into a single vector column named "features".
featureAssembler = VectorAssembler(
    inputCols=col_encoded,
    outputCol="features",
)

In [23]:
# Run the assembler over the indexed data to add the "features" column.
output = featureAssembler.transform(encoded_df)


In [24]:
# Preview the assembled feature vector next to the label for the first five rows.
output.select("features","class_encoded").show(5)


23/09/15 14:19:32 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+--------------------+-------------+
|            features|class_encoded|
+--------------------+-------------+
|(22,[1,3,4,7,8,9,...|            1|
|(22,[1,2,3,4,8,9,...|            0|
|(22,[0,1,2,3,4,8,...|            0|
|(22,[2,3,4,7,8,9,...|            1|
|(22,[1,2,6,8,10,1...|            0|
+--------------------+-------------+
only showing top 5 rows



In [26]:
# Split into train/test with weights 0.8/0.2. The seed is fixed at 17 here
# (independent of the notebook-level rnd_seed) so the split is repeatable.
train, test = output.randomSplit(weights=[0.8, 0.2], seed=17)


In [28]:
# Report the row count of each split (each count() is evaluated by Spark).
print(f"size of training data: {train.count()}")
print(f"size of testing data: {test.count()}")

size of training data: 6471
size of testing data: 1653


In [29]:
from pyspark.ml.classification import LogisticRegression


In [30]:
# Logistic regression on the assembled features, predicting the indexed class label,
# capped at 10 optimisation iterations.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='class_encoded',
    maxIter=10,
)

In [31]:
# Fit the logistic regression on the training split.
lrModel = lr.fit(train)


23/09/15 14:56:57 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

In [32]:
# Apply the fitted model to the held-out test split.
predictions = lrModel.transform(test)


In [33]:
# Preview the first five scored rows.
predictions.show(5)


                                                                                

+-------------+-----------------+-------------------+-----------------+---------------+------------+-----------------------+--------------------+-----------------+------------------+-------------------+------------------+--------------------------------+--------------------------------+------------------------------+------------------------------+-----------------+------------------+-------------------+-----------------+-------------------------+------------------+---------------+--------------------+--------------------+--------------------+----------+
|class_encoded|cap-shape_encoded|cap-surface_encoded|cap-color_encoded|bruises_encoded|odor_encoded|gill-attachment_encoded|gill-spacing_encoded|gill-size_encoded|gill-color_encoded|stalk-shape_encoded|stalk-root_encoded|stalk-surface-above-ring_encoded|stalk-surface-below-ring_encoded|stalk-color-above-ring_encoded|stalk-color-below-ring_encoded|veil-type_encoded|veil-color_encoded|ring-number_encoded|ring-type_encoded|spore-print-colo

In [34]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [35]:
# Configure the evaluator completely at construction time so it is valid as soon as
# this cell has run, rather than depending on later setter cells. metricName="f1"
# spells out PySpark's documented default, so the reported score is unchanged.
evaluator = MulticlassClassificationEvaluator(
    labelCol="class_encoded",
    predictionCol="prediction",
    metricName="f1",
)


In [39]:
# Point the evaluator at the indexed class label column.
evaluator.setLabelCol("class_encoded")


MulticlassClassificationEvaluator_f53a3fe73754

In [40]:
# Point the evaluator at the model's "prediction" column.
evaluator.setPredictionCol("prediction")


MulticlassClassificationEvaluator_f53a3fe73754

In [41]:
# Score the test-set predictions.
# NOTE(review): the evaluator reports f1 here (PySpark's documented default
# metricName), not plain accuracy — confirm against the installed version and
# label the number accordingly when quoting it.
evaluator.evaluate(predictions)


                                                                                

0.9915305505142167