<span style="color:green;font-size:xx-large">Set up PySpark</span>

In [None]:
import pyspark
from pyspark.sql import SparkSession
# Start the notebook's Spark session (or reuse one that is already running)
# and keep a handle on its underlying SparkContext.
spark = (
    SparkSession.builder
    .appName("PySpark ML Basics")
    .getOrCreate()
)
sc = spark.sparkContext



<span style="color:green;font-size:xx-large">Vectors</span>

In [None]:
from pyspark.ml.linalg import Vectors
import numpy as np
# The same 14-element vector stored two ways: dense lists every entry, sparse
# lists only the size, the non-zero positions and the values at those positions.
x = np.array([3.2, 0, 0, 0, 4.7, 1.6, 0, 0, 0, 0, 10.2, 0, 0, 11.1])
data_dense = Vectors.dense(x)
data_sparse = Vectors.sparse(
    14,
    np.array([0, 4, 5, 10, 13]),
    np.array([3.2, 4.7, 1.6, 10.2, 11.1]),
)

In [None]:
# Display the dense representation (every element is stored, zeros included).
data_dense

In [None]:
# Display the sparse representation (size, non-zero indices, non-zero values).
data_sparse

In [None]:
# Norms of the dense vector, followed by a comparison of the two representations.
print(Vectors.norm(data_dense,1)) #returns the p=1 norm (taxicab)
print(Vectors.norm(data_dense,2)) #returns the euclidean norm
print(Vectors.squared_distance(data_dense,data_sparse)) #SQUARED euclidean distance (not the distance itself); both vectors hold the same values, so 0.0 is expected

<span style="color:green;font-size:xx-large">Matrices</span>

In [None]:

from pyspark.ml.linalg import Matrices

# Matrices.dense reads the values in column-major order, so the three columns
# of this 3x3 matrix are (1, 0, 4), (0, 3, 5) and (2, 0, 6).
data = np.array([1.0, 0.0, 4.0, 0.0, 3.0, 5.0, 2.0, 0.0, 6.0])
dense_m = Matrices.dense(
    3,
    3,
    data,
)
# Same matrix, keeping only the non-zero entries.
sparse_m = dense_m.toSparse()

In [None]:
# Display the dense matrix built from the 9 values above.
dense_m


In [None]:
# Display the sparse form obtained from dense_m.toSparse().
sparse_m

<br><br><br>
<h2 style="color:red;font-size:50px">feature transformers</h2>
<br>

<span style="color:green;font-size:xx-large">Vector Assembler</span>

In [None]:
from pyspark.ml.feature import VectorAssembler

# Toy data: two numeric predictors and a dependent variable ("dv").
df = spark.createDataFrame(
    [
        (22.0, 23.1, 3),
        (12.2, 13.0, 2),
        (43.7, 16.2, 4),
        (36.4, 34.8, 3),
        (6.1, 71.0, 3),
        (28.2, 22.1, 7),
    ]
).toDF("feature1", "feature2", "dv")

# The assembler names the columns to pack and the vector column that receives them.
assembler = VectorAssembler(
    inputCols=["feature1", "feature2"],
    outputCol="features",
)

# transform() yields a new DataFrame with the vector column added. Only the dv
# and the vector are kept, and dv is renamed to "label": by default Spark ML
# models read the dependent variable from "label" and the predictors from "features".
df_lr = (
    assembler.transform(df)
    .select("dv", "features")
    .withColumnRenamed("dv", "label")
)

df_lr.show()


<span style="color:green;font-size:xx-large">StringIndexer</span>

In [None]:
from pyspark.ml.feature import StringIndexer

# Toy data with a categorical (string) first column.
df = spark.createDataFrame(
    [
        ("MIA", 17.2, 2),
        ("NYC", 23.1, 3),
        ("SFO", 13.0, 2),
        ("NYC", 16.2, 4),
        ("CHI", 34.8, 3),
        ("SFO", 71.0, 3),
        ("SFO", 22.12, 6),
        ("LAX", 22.1, 7),
    ]
).toDF("feature1", "feature2", "dv")

# Indexer configured with the string column to read and the numeric column to write.
indexer = StringIndexer(
    inputCol="feature1",
    outputCol="feature1Index",
)

# fit() learns the category -> number mapping from the data;
# transform() then applies that mapping to every row.
indexer_model = indexer.fit(df)
indexed = indexer_model.transform(df)
indexed.show()

<br><br><br>
<span style="color:green;font-size:xx-large">One Hot Encoding</span>
<br>

In [None]:
from pyspark.ml.feature import OneHotEncoder

# Student records with two categorical columns (grade, department) to encode.
df = spark.createDataFrame(
    [
        ("Jack", "A", "IEOR"),
        ("Jill", "B", "IEOR"),
        ("Jiahuo", "A", "CS"),
        ("Pierre", "C", "APAM"),
        ("Clemence", "B", "APAM"),
        ("Savitri", "A", "CS"),
        ("Bjorn", "A", "QMSS"),
    ]
).toDF("student", "grade", "department")

In [None]:
from pyspark.ml.feature import StringIndexer


# One indexer handles both categorical columns; outputs line up with inputs by position.
indexer = (
    StringIndexer()
    .setInputCols(["grade", "department"])
    .setOutputCols(["gradeIndex", "departmentIndex"])
)

# fit() learns each column's category -> number mapping;
# transform() writes the numeric index columns.
indexedDf = indexer.fit(df).transform(df)
indexedDf.show()

In [None]:
from pyspark.ml.feature import OneHotEncoder

# Encoder that turns each numeric index column into a one-hot vector column.
encoder = OneHotEncoder(
    inputCols=["gradeIndex", "departmentIndex"],
    outputCols=["gradeVec", "departmentVec"],
)

In [None]:
# Fit the one-hot encoder on the indexed columns.
# NOTE: the name `model` is reused further down for the fitted ML pipeline, so
# after that cell runs this encoder model is no longer reachable via `model`.
model = encoder.fit(indexedDf)

In [None]:
# Apply the fitted encoder; this adds the gradeVec / departmentVec columns
# that were configured as the encoder's output columns.
encoded = model.transform(indexedDf)
encoded.show()

<span style="color:blue;font-size:large">toPandas</span>

<ul>
<li>Handy function that converts a Spark DataFrame into a pandas DataFrame</li>
<li>But beware: unlike a Spark DataFrame, a pandas DataFrame is not lazy, so the data is materialized as soon as you call it. Only use this when you're sure that the DataFrame is small enough</li>
<li>toArray() converts a vector into a numpy array</li>
</ul>

In [None]:
# Convert the (small) encoded Spark DataFrame to pandas for display.
encoded.toPandas()

In [None]:
# Toy data: one categorical column, two numeric predictors and the dependent variable.
df = spark.createDataFrame(
    [
        ("MIA", 17.2, 2, 4),
        ("NYC", 23.1, 3, 4),
        ("SFO", 13.0, 2, 2),
        ("NYC", 16.2, 1, 1),
        ("CHI", 34.8, 3, 7),
        ("SFO", 71.0, 3, 6),
        ("SFO", 22.12, 3, 3),
        ("LAX", 22.1, 2, 4),
    ]
).toDF("feature1", "feature2", "feature3", "dv")

df.show()

<br><br><br><br>
<h2 style="color:red;font-size:50px">Example: California home values</h2>
<br><br>

In [None]:
from pyspark.sql.types import *
# The data file has no header row, so column names are assigned with toDF()
# and the column types are inferred while reading.
df = (
    spark.read.format("csv")
    .option("header", "false")
    .option("inferschema", "true")
    .load("../Module06-sparkstreaming/cal_housing.data")
    .toDF(
        "Longitude", "Latitude", "MedianAge",
        "TotalRooms", "TotalBedrooms", "Population", "Households",
        "MedianIncome", "MedianHomeValue",
    )
)


In [None]:
# Print the assigned column names together with the types inferred while reading the CSV.
df.printSchema()

<span style="color:blue;font-size:large">Setting up the dependent variable</span>
<ul>
<li>We'll simplify the median home value by dividing it by 100,000</li>
<li>With PySpark, we can't use the dollar sign to refer to a column (as Scala's <code>$"name"</code> shorthand does) and need to explicitly use <code>col</code></li>
</ul>

In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import col
# Re-read the raw file, then rescale the dependent variable to units of 100,000.
df = (
    spark.read.format("csv")
    .option("header", "false")
    .option("inferschema", "true")
    .load("../Module06-sparkstreaming/cal_housing.data")
    .toDF(
        "Longitude", "Latitude", "MedianAge",
        "TotalRooms", "TotalBedrooms", "Population", "Households",
        "MedianIncome", "MedianHomeValue",
    )
    .withColumn("MedianHomeValue", col("MedianHomeValue") / 100000)
)

<span style="color:blue;font-size:large">Setting up independent variables</span>
<li>We'll divide total rooms, total bedrooms, and population by the number of households to get per household data</li>


In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import col
# Re-read the raw file, rescale the dependent variable, then add the
# per-household versions of rooms, population and bedrooms.
df = (
    spark.read.format("csv")
    .option("header", "false")
    .option("inferschema", "true")
    .load("../Module06-sparkstreaming/cal_housing.data")
    .toDF(
        "Longitude", "Latitude", "MedianAge",
        "TotalRooms", "TotalBedrooms", "Population", "Households",
        "MedianIncome", "MedianHomeValue",
    )
    .withColumn("MedianHomeValue", col("MedianHomeValue") / 100000)
    .withColumn("RoomsPerHouse", col("TotalRooms") / col("Households"))
    .withColumn("PeoplePerHouse", col("Population") / col("Households"))
    .withColumn("BedroomsPerHouse", col("TotalBedrooms") / col("Households"))
)

<span style="color:blue;font-size:large">Select the features we need</span>


In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import col
# Full preparation in one chain: read, rescale the dependent variable, derive
# the per-household features, and keep only the columns used for modelling.
df = (
    spark.read.format("csv")
    .option("header", "false")
    .option("inferschema", "true")
    .load("../Module06-sparkstreaming/cal_housing.data")
    .toDF(
        "Longitude", "Latitude", "MedianAge",
        "TotalRooms", "TotalBedrooms", "Population", "Households",
        "MedianIncome", "MedianHomeValue",
    )
    .withColumn("MedianHomeValue", col("MedianHomeValue") / 100000)
    .withColumn("RoomsPerHouse", col("TotalRooms") / col("Households"))
    .withColumn("PeoplePerHouse", col("Population") / col("Households"))
    .withColumn("BedroomsPerHouse", col("TotalBedrooms") / col("Households"))
    .select(
        "MedianHomeValue",
        "MedianAge",
        "Population",
        "Households",
        "MedianIncome",
        "RoomsPerHouse",
        "PeoplePerHouse",
        "BedroomsPerHouse",
        "Latitude",
        "Longitude",
    )
)


<h2 style="color:green;font-size:xx-large">Machine Learning Pipelines</h2>

<span style="color:blue;font-size:large">Read the data from a file and split into train and test</span>

In [None]:
from pyspark.sql import DataFrame


def readData(path="../Module06-sparkstreaming/cal_housing.data",
             weights=(0.8, 0.2),
             seed=1234):
    """Load the California housing file and split it into train and test sets.

    Uses the notebook-level ``spark`` session. The file is read as a
    header-less CSV with inferred column types, and the nine columns are
    named explicitly.

    Args:
        path: Location of the housing data file. Defaults to the path used
            throughout this notebook.
        weights: Two relative weights for the (train, test) split.
        seed: Seed passed to ``randomSplit`` so the split is repeatable.

    Returns:
        A ``(train, test)`` pair of Spark DataFrames.
    """
    df = (
        spark.read.format("csv")
        .option("header", "false")
        .option("inferschema", "true")
        .load(path)
        .toDF(
            "Longitude", "Latitude", "MedianAge",
            "TotalRooms", "TotalBedrooms", "Population", "Households",
            "MedianIncome", "MedianHomeValue",
        )
    )
    train, test = df.randomSplit(list(weights), seed=seed)
    return train, test


In [None]:
# Load the housing data and split it into training and test DataFrames.
train,test = readData()

<span style="color:blue;font-size:large">Do the preprocessing steps</span>
<li>train and test can be separately passed through this function</li>

In [None]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col


def prepareData(df):
    """Turn a raw housing DataFrame into the columns used for modelling.

    The median home value is divided by 100,000, three per-household ratios
    (rooms, people, bedrooms) are added, the modelling columns are selected,
    and the rescaled home value is renamed to ``label``.

    Args:
        df: DataFrame carrying the raw housing columns (as named by readData).

    Returns:
        A new DataFrame with ``label`` followed by the nine predictor columns.
    """
    households = col("Households")

    with_ratios = (
        df.withColumn("MedianHomeValue", col("MedianHomeValue") / 100000)
        .withColumn("RoomsPerHouse", col("TotalRooms") / households)
        .withColumn("PeoplePerHouse", col("Population") / households)
        .withColumn("BedroomsPerHouse", col("TotalBedrooms") / households)
    )

    model_columns = with_ratios.select(
        "MedianHomeValue",
        "MedianAge",
        "Population",
        "Households",
        "MedianIncome",
        "RoomsPerHouse",
        "PeoplePerHouse",
        "BedroomsPerHouse",
        "Latitude",
        "Longitude",
    )

    return model_columns.withColumnRenamed("MedianHomeValue", "label")
    


In [None]:
# Sanity check: run the preprocessing on the training split and display the resulting DataFrame object.
prepareData(train)

<span style="color:blue;font-size:large">Vector Assembler</span>


In [None]:
# Every prepared column except the dependent variable ("label") is a predictor.
cols = [name for name in prepareData(train).columns if name != "label"]

cols

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Matrix
from pyspark.ml.linalg import Vectors

#Get the names of all columns except MedianHomeValue (label)
# Run the preprocessing once and reuse the result below.
prepared_train = prepareData(train)

# Names of all predictor columns, i.e. everything except the dependent variable ("label").
cols = [name for name in prepared_train.columns if name != "label"]

# The assembler packs the predictor columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=cols, outputCol="features")

# Apply the assembler and keep only the label and the feature vector.
# No rename is needed here: prepareData has already renamed the dv column to "label".
vector_df = (
    assembler.transform(prepared_train)
    .select("label", "features")
)

vector_df.show()

<span style="color:blue;font-size:large">Scaling</span>


In [None]:
from pyspark.ml.feature import StandardScaler
# Scaler that both centres (withMean) and rescales (withStd) the feature vector.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withStd=True,
    withMean=True,
)

# fit() computes the scaling parameters from the data ...
fitted_scaler = scaler.fit(vector_df)

# ... and transform() applies them, adding the "scaledFeatures" column.
scaled_df = fitted_scaler.transform(vector_df)
scaled_df.show()

<h3>Regression model</h3>

In [None]:
from pyspark.ml.regression import LinearRegression

# Elastic-net linear regression trained on the scaled feature vector.
lr = LinearRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
    featuresCol="scaledFeatures",
    labelCol="label",
)

lrModel = lr.fit(scaled_df)



In [None]:
# Show the model fitted in the previous cell instead of training a second one just to display it.
lrModel

<h2>The pipeline</h2>

In [None]:
from pyspark.ml import Pipeline, PipelineModel

# Chain the three steps built above so one fit() runs assembling, scaling and
# regression in order. The pipeline starts from the prepared (un-assembled) columns.
pipeline = Pipeline(stages=(assembler, scaler, lr))
model = pipeline.fit(
    prepareData(train)
)

<br><br><br>
<span style="color:green;font-size:xx-large">Model evaluation</span>

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator comparing the "label" column against the "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
)
  

In [None]:
# Preprocess the held-out split the same way as the training split, then run it through the fitted pipeline.
predictions = model.transform(prepareData(test))

In [None]:
# setMetricName changes the evaluator's metric on the evaluator object itself,
# so after this cell the evaluator is left configured for "r2".
rmse_test = evaluator.setMetricName("rmse").evaluate(predictions)
r2_test = evaluator.setMetricName("r2").evaluate(predictions)
print("Test  RMSE: ",rmse_test," Test  r2: ",r2_test)

In [None]:
from pyspark.ml.regression import LinearRegressionModel

# stages[2] is the regression stage: the pipeline was built as (assembler, scaler, lr).
# This rebinds lrModel, replacing the standalone model fitted earlier.
lrModel = model.stages[2]
print("Coefficients: ",lrModel.coefficients)
print("Intercept: ",lrModel.intercept)

<br><br><br>
<span style="color:green;font-size:xx-large">Hyperparameter tuning</span>
<br><br>

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# 2 regParam values x 3 elasticNetParam values = 6 candidate settings for lr.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, (0.1, 0.01))
    .addGrid(lr.elasticNetParam, (0.7, 0.8, 0.9))
    .build()
)

In [None]:
# Cross-validate the whole pipeline over the grid: 3 folds, up to 3 models
# trained in parallel, scored with a default-configured RegressionEvaluator.
cv = CrossValidator(
    estimator=pipeline,
    evaluator=RegressionEvaluator(),
    estimatorParamMaps=paramGrid,
    numFolds=3,
    parallelism=3,
)

In [None]:
# Run the cross-validated grid search on the prepared training split.
cvModel = cv.fit(prepareData(train))

In [None]:
# Score the prepared test split with the cross-validated model.
test_r = cvModel.transform(prepareData(test))

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator comparing the "label" column against the "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
)

# setMetricName reconfigures the same evaluator object, so each metric is
# evaluated immediately after it is selected.
rmse = evaluator.setMetricName("rmse").evaluate(test_r)
r2 = evaluator.setMetricName("r2").evaluate(test_r)

# Report the tuned model's test metrics (previously computed but never shown).
print("Test  RMSE: ",rmse," Test  r2: ",r2)

In [None]:
# stages[2] of the best pipeline is its regression stage; print its fitted parameters.
print(
    cvModel.bestModel.stages[2].coefficients
)

print(
    cvModel.bestModel.stages[2].intercept
)