<span style="color:green;font-size:xx-large">Set up PySpark</span>

In [1]:
import pyspark
from pyspark.sql import SparkSession

# Get (or start) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("PySpark ML Basics")
    .getOrCreate()
)
# Handle to the session's underlying SparkContext.
sc = spark.sparkContext



Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/07 09:16:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/12/07 09:16:30 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


<span style="color:green;font-size:xx-large">Vectors</span>

In [2]:
from pyspark.ml.linalg import Vectors
import numpy as np

# The same 14-element vector stored two ways.
x = np.array([3.2, 0, 0, 0, 4.7, 1.6, 0, 0, 0, 0, 10.2, 0, 0, 11.1])
data_dense = Vectors.dense(x)

# Sparse form: the vector size, then the positions and the values of the
# non-zero entries (both supplied here as numpy arrays).
data_sparse = Vectors.sparse(
    14,
    np.array([0, 4, 5, 10, 13]),
    np.array([3.2, 4.7, 1.6, 10.2, 11.1]),
)

In [3]:
# Display the dense vector: every element, zeros included, is stored.
data_dense

DenseVector([3.2, 0.0, 0.0, 0.0, 4.7, 1.6, 0.0, 0.0, 0.0, 0.0, 10.2, 0.0, 0.0, 11.1])

In [4]:
data_sparse #repr lists the size and only the non-zero entries as {index: value}

SparseVector(14, {0: 3.2, 4: 4.7, 5: 1.6, 10: 10.2, 13: 11.1})

In [5]:
print(Vectors.norm(data_dense,1)) #returns the p=1 norm (taxicab)
print(Vectors.norm(data_dense,2)) #returns the euclidean norm
print(Vectors.squared_distance(data_dense,data_sparse)) #squared distance between the two vectors; 0.0 because both hold the same values

30.799999999999997
16.190738093119784
0.0


<span style="color:green;font-size:xx-large">Matrices</span>

In [6]:

from pyspark.ml.linalg import Matrices

# Nine values for a 3x3 matrix; Matrices.dense lays them out column by column.
data = np.array([1.0, 0.0, 4.0, 0.0, 3.0, 5.0, 2.0, 0.0, 6.0])
dense_m = Matrices.dense(3, 3, data)

# The same matrix, keeping only the non-zero entries.
sparse_m = dense_m.toSparse()

In [7]:
# Display the dense matrix: rows, columns, then all nine stored values.
dense_m


DenseMatrix(3, 3, [1.0, 0.0, 4.0, 0.0, 3.0, 5.0, 2.0, 0.0, 6.0], False)

In [8]:
# Display the sparse matrix: column pointers, row indices and the non-zero values.
sparse_m

SparseMatrix(3, 3, [0, 2, 4, 6], [0, 2, 1, 2, 0, 2], [1.0, 4.0, 3.0, 5.0, 2.0, 6.0], False)

<br><br><br>
<h2 style="color:red;font-size:50px">feature transformers</h2>
<br>

<span style="color:green;font-size:xx-large">Vector Assembler</span>

<li>Combines a set of columns into a single vector column (stored as a dense or a sparse vector, whichever is more compact)</li>
<li>In supervised learning, the independent features are combined into a single vector</li>
<li>As a result, each case is represented by a pair (dv, iv-vector): the dependent variable and a vector of the independent variables</li>
<li><a href="https://spark.apache.org/docs/latest/ml-features#vectorassembler">https://spark.apache.org/docs/latest/ml-features#vectorassembler</a></li>

In [9]:
from pyspark.ml.feature import VectorAssembler

# Toy data: two numeric features and a dependent variable.
df = spark.createDataFrame([
    (22.0, 23.1, 3),
    (12.2, 13.0, 2),
    (43.7, 16.2, 4),
    (36.4, 34.8, 3),
    (6.1, 71.0, 3),
    (28.2, 22.1, 7),
]).toDF("feature1", "feature2", "dv")

# The assembler names the columns to pack and the vector column to create.
assembler = VectorAssembler(
    inputCols=["feature1", "feature2"],
    outputCol="features",
)

# transform() returns a new DataFrame with the vector column added. Only the
# dv and the vector are kept, and dv is renamed because Spark ML models look
# for the dependent variable in "label" and the inputs in "features" by default.
df_lr = (
    assembler.transform(df)
    .select("dv", "features")
    .withColumnRenamed("dv", "label")
)

df_lr.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-----+-----------+
|label|   features|
+-----+-----------+
|    3|[22.0,23.1]|
|    2|[12.2,13.0]|
|    4|[43.7,16.2]|
|    3|[36.4,34.8]|
|    3| [6.1,71.0]|
|    7|[28.2,22.1]|
+-----+-----------+



                                                                                


<span style="color:green;font-size:xx-large">StringIndexer</span>

<li>ML algorithms need numbers!</li>
<li>Any string variables need to be converted into numbers before they can be used</li>
<li><a href="https://spark.apache.org/docs/latest/api/scala/org/apache/spark/ml/feature/StringIndexer.html">StringIndexer</a> is a Spark feature transformer that does this</li>
<li>The most frequent category is given the index 0, the second most frequent 1, etc. (categories with equal frequency are ordered alphabetically)</li>

In [10]:
from pyspark.ml.feature import StringIndexer

# Toy data with one string-valued feature.
df = spark.createDataFrame([
    ("MIA", 17.2, 2),
    ("NYC", 23.1, 3),
    ("SFO", 13.0, 2),
    ("NYC", 16.2, 4),
    ("CHI", 34.8, 3),
    ("SFO", 71.0, 3),
    ("SFO", 22.12, 6),
    ("LAX", 22.1, 7),
]).toDF("feature1", "feature2", "dv")

# Indexer configured with the string column to read and the numeric column to write.
indexer = StringIndexer(inputCol="feature1", outputCol="feature1Index")

# fit() works out which number each category gets;
# transform() then writes those numbers into the new column.
indexed = indexer.fit(df).transform(df)
indexed.show()

+--------+--------+---+-------------+
|feature1|feature2| dv|feature1Index|
+--------+--------+---+-------------+
|     MIA|    17.2|  2|          4.0|
|     NYC|    23.1|  3|          1.0|
|     SFO|    13.0|  2|          0.0|
|     NYC|    16.2|  4|          1.0|
|     CHI|    34.8|  3|          2.0|
|     SFO|    71.0|  3|          0.0|
|     SFO|   22.12|  6|          0.0|
|     LAX|    22.1|  7|          3.0|
+--------+--------+---+-------------+



<br><br><br>
<span style="color:green;font-size:xx-large">One Hot Encoding</span>
<br>

In [11]:
from pyspark.ml.feature import OneHotEncoder

# Student records with two categorical columns (grade, department) to encode.
df = spark.createDataFrame([
    ("Jack", "A", "IEOR"),
    ("Jill", "B", "IEOR"),
    ("Jiahuo", "A", "CS"),
    ("Pierre", "C", "APAM"),
    ("Clemence", "B", "APAM"),
    ("Savitri", "A", "CS"),
    ("Bjorn", "A", "QMSS"),
]).toDF("student", "grade", "department")

In [None]:
from pyspark.ml.feature import StringIndexer

# One indexer handles both string columns; each gets its own index column.
indexer = StringIndexer(
    inputCols=["grade", "department"],
    outputCols=["gradeIndex", "departmentIndex"],
)

# fit() works out the category-to-number mapping for each column;
# transform() then writes the numbers into the new columns.
indexedDf = indexer.fit(df).transform(df)
indexedDf.show()

In [None]:
from pyspark.ml.feature import OneHotEncoder

# The encoder reads the numeric index columns and writes one vector column for each.
encoder = OneHotEncoder(
    inputCols=["gradeIndex", "departmentIndex"],
    outputCols=["gradeVec", "departmentVec"],
)

In [None]:
# fit() returns the fitted encoder model that transform() is called on next.
# NOTE: the name `model` is reused later in the notebook for the fitted pipeline.
model = encoder.fit(indexedDf)

In [None]:
# Add the one-hot vector columns (gradeVec, departmentVec) to the indexed data.
encoded = model.transform(indexedDf)
encoded.show()

<span style="color:blue;font-size:large">toPandas</span>

<li>Handy function that converts a Spark DF into a Pandas DF</li>
<li>But beware: a pandas dataframe is not lazy, so toPandas() pulls the entire dataframe into the driver's memory. Only use this when you're sure that the dataframe is small enough</li>
<li>toArray() converts a vector into a numpy array</li>

In [None]:
# Convert the (small) encoded Spark DataFrame to a pandas DataFrame for display.
encoded.toPandas()

In [None]:
# Toy data: one string feature, two numeric features and a dependent variable.
rows = (
    ("MIA", 17.2, 2, 4),
    ("NYC", 23.1, 3, 4),
    ("SFO", 13.0, 2, 2),
    ("NYC", 16.2, 1, 1),
    ("CHI", 34.8, 3, 7),
    ("SFO", 71.0, 3, 6),
    ("SFO", 22.12, 3, 3),
    ("LAX", 22.1, 2, 4),
)
df = spark.createDataFrame(rows).toDF("feature1", "feature2", "feature3", "dv")

df.show()

<br><br><br><br>
<h2 style="color:red;font-size:50px">Example: California home values</h2>
<br><br>

In [12]:
from pyspark.sql.types import *

# The file is read with header=false, so the column names are assigned by hand;
# the column types are inferred from the data.
df = (
    spark.read.format("csv")
    .option("header", "false")
    .option("inferschema", "true")
    .load("cal_housing.data")
    .toDF(
        "Longitude", "Latitude", "MedianAge",
        "TotalRooms", "TotalBedrooms", "Population", "Households",
        "MedianIncome", "MedianHomeValue",
    )
)


In [13]:
# Check the column names and the types produced by schema inference.
df.printSchema()

root
 |-- Longitude: double (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- MedianAge: double (nullable = true)
 |-- TotalRooms: double (nullable = true)
 |-- TotalBedrooms: double (nullable = true)
 |-- Population: double (nullable = true)
 |-- Households: double (nullable = true)
 |-- MedianIncome: double (nullable = true)
 |-- MedianHomeValue: double (nullable = true)



<span style="color:blue;font-size:large">Setting up the dependent variable</span>
<li>We'll simplify the median home value by dividing it by 100,000</li>
<li>With PySpark, we can't use the dollar sign to represent a column and need to explicitly use col</li>

In [14]:
from pyspark.sql.types import *
from pyspark.sql.functions import col

# Reload the raw file, then rescale the dependent variable to units of 100,000.
df = (
    spark.read.format("csv")
    .option("header", "false")
    .option("inferschema", "true")
    .load("cal_housing.data")
    .toDF(
        "Longitude", "Latitude", "MedianAge",
        "TotalRooms", "TotalBedrooms", "Population", "Households",
        "MedianIncome", "MedianHomeValue",
    )
    .withColumn("MedianHomeValue", col("MedianHomeValue") / 100000)
)

<span style="color:blue;font-size:large">Setting up independent variables</span>
<li>We'll divide total rooms, total bedrooms, and population by the number of households to get per household data</li>


In [15]:
from pyspark.sql.types import *
from pyspark.sql.functions import col

# Reload the raw file, rescale the dependent variable, then add the
# per-household versions of rooms, population and bedrooms.
df = (
    spark.read.format("csv")
    .option("header", "false")
    .option("inferschema", "true")
    .load("cal_housing.data")
    .toDF(
        "Longitude", "Latitude", "MedianAge",
        "TotalRooms", "TotalBedrooms", "Population", "Households",
        "MedianIncome", "MedianHomeValue",
    )
    .withColumn("MedianHomeValue", col("MedianHomeValue") / 100000)
    .withColumn("RoomsPerHouse", col("TotalRooms") / col("Households"))
    .withColumn("PeoplePerHouse", col("Population") / col("Households"))
    .withColumn("BedroomsPerHouse", col("TotalBedrooms") / col("Households"))
)

<span style="color:blue;font-size:large">Select the features we need</span>


In [16]:
from pyspark.sql.types import *
from pyspark.sql.functions import col

# Full preparation in one chain: load, rescale the dependent variable, add the
# per-household features, then keep only the columns used for modelling
# (the raw TotalRooms and TotalBedrooms columns are dropped).
df = (
    spark.read.format("csv")
    .option("header", "false")
    .option("inferschema", "true")
    .load("cal_housing.data")
    .toDF(
        "Longitude", "Latitude", "MedianAge",
        "TotalRooms", "TotalBedrooms", "Population", "Households",
        "MedianIncome", "MedianHomeValue",
    )
    .withColumn("MedianHomeValue", col("MedianHomeValue") / 100000)
    .withColumn("RoomsPerHouse", col("TotalRooms") / col("Households"))
    .withColumn("PeoplePerHouse", col("Population") / col("Households"))
    .withColumn("BedroomsPerHouse", col("TotalBedrooms") / col("Households"))
    .select(
        "MedianHomeValue",
        "MedianAge",
        "Population",
        "Households",
        "MedianIncome",
        "RoomsPerHouse",
        "PeoplePerHouse",
        "BedroomsPerHouse",
        "Latitude",
        "Longitude",
    )
)


<h2 style="color:green;font-size:xx-large">Machine Learning Pipelines</h2>

<span style="color:blue;font-size:large">Read the data from a file and split into train and test</span>

In [17]:
from pyspark.sql import DataFrame


def readData(path="cal_housing.data", weights=(0.8,0.2), seed=1234):
    """Load the California housing CSV and split it into train and test sets.

    Calling ``readData()`` with no arguments behaves exactly as before: it
    reads ``cal_housing.data`` and returns an 80/20 split with seed 1234.

    Args:
        path: Location of the CSV file. It is read without a header row and
            with schema inference; the nine column names are assigned here.
        weights: Two relative weights for the train and test splits.
        seed: Seed handed to ``randomSplit`` so the split is repeatable.

    Returns:
        A ``(train, test)`` tuple of DataFrames.
    """
    column_names = (
        "Longitude", "Latitude", "MedianAge",
        "TotalRooms", "TotalBedrooms", "Population", "Households",
        "MedianIncome", "MedianHomeValue",
    )
    df = (
        spark.read.format("csv")
        .option("header", "false")
        .option("inferschema", "true")
        .load(path)
        .toDF(*column_names)
    )
    train, test = df.randomSplit(weights, seed=seed)
    return train, test


In [18]:
# Create the train and test DataFrames (80/20 split with a fixed seed).
train,test = readData()

<span style="color:blue;font-size:large">Do the preprocessing steps</span>
<li>train and test can be separately passed through this function</li>

In [19]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col


def prepareData(df):
    """Turn a raw housing DataFrame into the modelling table.

    Rescales MedianHomeValue by 1/100000, adds the per-household ratios
    (rooms, people, bedrooms), keeps only the modelling columns and renames
    MedianHomeValue to "label". Train and test data can each be passed
    through separately.
    """
    households = col("Households")
    kept_columns = [
        "MedianHomeValue",
        "MedianAge",
        "Population",
        "Households",
        "MedianIncome",
        "RoomsPerHouse",
        "PeoplePerHouse",
        "BedroomsPerHouse",
        "Latitude",
        "Longitude",
    ]
    with_ratios = (
        df.withColumn("MedianHomeValue", col("MedianHomeValue") / 100000)
        .withColumn("RoomsPerHouse", col("TotalRooms") / households)
        .withColumn("PeoplePerHouse", col("Population") / households)
        .withColumn("BedroomsPerHouse", col("TotalBedrooms") / households)
    )
    return with_ratios.select(*kept_columns).withColumnRenamed("MedianHomeValue", "label")
    


In [20]:
# Displays only the column names and types of the prepared training data.
prepareData(train)

DataFrame[label: double, MedianAge: double, Population: double, Households: double, MedianIncome: double, RoomsPerHouse: double, PeoplePerHouse: double, BedroomsPerHouse: double, Latitude: double, Longitude: double]

<span style="color:blue;font-size:large">Vector Assembler</span>


In [21]:
# Every prepared column except the dependent variable ("label") is a feature.
# A comprehension with an explicit condition replaces filter() with a lambda that
# returned the name itself, which would silently drop any falsy (empty) name.
cols = [name for name in prepareData(train).columns if name != "label"]

cols

['MedianAge',
 'Population',
 'Households',
 'MedianIncome',
 'RoomsPerHouse',
 'PeoplePerHouse',
 'BedroomsPerHouse',
 'Latitude',
 'Longitude']

In [22]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Matrix
from pyspark.ml.linalg import Vectors

# Names of all prepared columns except the dependent variable ("label").
# An explicit condition is used rather than a filter() lambda returning the name,
# which would silently drop any falsy (empty) column name.
cols = [name for name in prepareData(train).columns if name != "label"]

# Create a VectorAssembler from the list of columns and name the column of vectors.
assembler = VectorAssembler(inputCols=cols, outputCol="features")

# Apply the transform to the prepared training data and keep the label and the
# assembled features. prepareData() has already renamed the dv column to "label".
vector_df = (
    assembler.transform(prepareData(train))
    .select("label", "features")
)

vector_df.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|0.946|[52.0,806.0,270.0...|
|1.036|[17.0,1244.0,456....|
| 0.79|[36.0,1194.0,465....|
|0.761|[32.0,434.0,187.0...|
|1.067|[52.0,1152.0,435....|
|0.508|[52.0,544.0,172.0...|
|0.732|[11.0,1343.0,479....|
|0.783|[28.0,1530.0,653....|
|0.581|[32.0,620.0,268.0...|
|0.669|[20.0,1993.0,721....|
|0.684|[17.0,1947.0,647....|
|0.901|[21.0,2907.0,972....|
| 0.69|[30.0,1367.0,583....|
|  0.7|[37.0,640.0,260.0...|
|0.746|[15.0,1645.0,640....|
| 1.07|[35.0,480.0,179.0...|
|0.722|[33.0,656.0,236.0...|
| 0.67|[34.0,950.0,317.0...|
|0.702|[37.0,867.0,310.0...|
|0.646|[40.0,788.0,279.0...|
+-----+--------------------+
only showing top 20 rows



<span style="color:blue;font-size:large">Scaling</span>


In [23]:
from pyspark.ml.feature import StandardScaler

# Standardise each feature: centre on the mean and scale by the standard deviation.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withStd=True,
    withMean=True,
)

# Fit the scaler to the training vectors to obtain its parameters.
fitted_scaler = scaler.fit(vector_df)

# Add the scaled vectors as a new column next to the originals.
scaled_df = fitted_scaler.transform(vector_df)
scaled_df.show()

+-----+--------------------+--------------------+
|label|            features|      scaledFeatures|
+-----+--------------------+--------------------+
|0.946|[52.0,806.0,270.0...|[1.85293575512378...|
|1.036|[17.0,1244.0,456....|[-0.9237482035396...|
| 0.79|[36.0,1194.0,465....|[0.58359451687762...|
|0.761|[32.0,434.0,187.0...|[0.26625920731607...|
|1.067|[52.0,1152.0,435....|[1.85293575512378...|
|0.508|[52.0,544.0,172.0...|[1.85293575512378...|
|0.732|[11.0,1343.0,479....|[-1.3997511678820...|
|0.783|[28.0,1530.0,653....|[-0.0510761022454...|
|0.581|[32.0,620.0,268.0...|[0.26625920731607...|
|0.669|[20.0,1993.0,721....|[-0.6857467213685...|
|0.684|[17.0,1947.0,647....|[-0.9237482035396...|
|0.901|[21.0,2907.0,972....|[-0.6064128939781...|
| 0.69|[30.0,1367.0,583....|[0.10759155253530...|
|  0.7|[37.0,640.0,260.0...|[0.66292834426800...|
|0.746|[15.0,1645.0,640....|[-1.0824158583204...|
| 1.07|[35.0,480.0,179.0...|[0.50426068948723...|
|0.722|[33.0,656.0,236.0...|[0.34559303470646...|


<h3>Regression model</h3>

In [24]:
from pyspark.ml.regression import LinearRegression

# Regularised linear regression that reads the scaled features.
lr = LinearRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
    featuresCol="scaledFeatures",
    labelCol="label",
)

# Fit on the scaled training data.
lrModel = lr.fit(scaled_df)



22/12/07 10:02:06 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/12/07 10:02:06 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


In [25]:
# Show the model fitted in the previous cell rather than training a second,
# identical model only to display its repr.
lrModel

LinearRegressionModel: uid=LinearRegression_a4d23de06dcc, numFeatures=9

<h2>The pipeline</h2>

In [26]:
from pyspark.ml import Pipeline, PipelineModel

# Chain the three steps built above: assemble features -> scale -> regress.
pipeline = Pipeline(stages=(assembler, scaler, lr))

# Fitting the pipeline fits each stage in order on the prepared training data.
model = pipeline.fit(prepareData(train))

<br><br><br>
<span style="color:green;font-size:xx-large">Model evaluation</span>

In [27]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator comparing the "label" column with the model's "prediction" column.
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")
  

In [28]:
# Run the prepared test data through the fitted pipeline to get predictions.
predictions = model.transform(prepareData(test))

In [29]:
# The metric is switched on the shared evaluator object before each evaluation,
# so the evaluator is left configured for "r2" after this cell.
rmse_test = evaluator.setMetricName("rmse").evaluate(predictions)
r2_test = evaluator.setMetricName("r2").evaluate(predictions)
print("Test  RMSE: ",rmse_test," Test  r2: ",r2_test)

Test  RMSE:  0.8771558134520177  Test  r2:  0.41378266806495867


In [30]:
from pyspark.ml.regression import LinearRegressionModel

# The fitted regression is the third pipeline stage (assembler, scaler, lr).
# NOTE: this rebinds lrModel, which earlier held the standalone fit on scaled_df.
lrModel = model.stages[2]
print("Coefficients: ",lrModel.coefficients) #one coefficient per assembled feature, in the order of `cols`
print("Intercept: ",lrModel.intercept)

Coefficients:  [0.0,0.0,0.0,0.5287039916961691,0.0,0.0,0.0,0.0,0.0]
Intercept:  2.074400715885009


<br><br><br>
<span style="color:green;font-size:xx-large">Hyperparameter tuning</span>
<br><br>

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Candidate settings for the regression stage: 2 regParam values x 3
# elasticNetParam values = 6 combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, (0.1, 0.01))
    .addGrid(lr.elasticNetParam, (0.7, 0.8, 0.9))
    .build()
)

In [None]:
# 3-fold cross-validation of the whole pipeline over the parameter grid, scored
# with a default RegressionEvaluator and fitting up to 3 candidates in parallel.
# The seed fixes the fold assignment so repeated runs pick the same best model;
# 1234 matches the seed used for the train/test split.
cv = CrossValidator(
    estimator=pipeline,
    evaluator=RegressionEvaluator(),
    estimatorParamMaps=paramGrid,
    numFolds=3,
    parallelism=3,
    seed=1234,
)

In [None]:
# Run the cross-validated grid search on the prepared training data.
cvModel = cv.fit(prepareData(train))

In [None]:
# Generate predictions for the prepared test data from the cross-validated model.
test_r = cvModel.transform(prepareData(test))

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator comparing the "label" column with the "prediction" column.
evaluator = RegressionEvaluator()\
  .setLabelCol("label")\
  .setPredictionCol("prediction")

# Score the tuned model on the test predictions.
rmse = evaluator.setMetricName("rmse").evaluate(test_r)
r2 = evaluator.setMetricName("r2").evaluate(test_r)

# Report the metrics in the same format as the untuned pipeline so the two
# can be compared; previously they were computed but never shown.
print("Test  RMSE: ",rmse," Test  r2: ",r2)

In [None]:
# The best pipeline's third stage is the fitted regression; show what it learned.
print(cvModel.bestModel.stages[2].coefficients)

print(cvModel.bestModel.stages[2].intercept)