# Supervised Machine Learning in PySpark

Intro

This notebook covers the basic structure of a supervised learning workflow in PySpark: training and validation splits, model selection, pipelines, and cross-validation.

In [1]:
# First let's create our PySpark instance
import findspark
findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
# Starting the session may take a while when running locally.
spark = SparkSession.builder.appName("Review2").getOrCreate()

# The previous expression went through the private `_jsc` handle and counted the
# entries of getExecutorMemoryStatus(), i.e. executors/block managers rather than
# CPU cores. defaultParallelism is the public API; in local mode it reflects the
# number of cores Spark was given (e.g. N for local[N]).
cores = spark.sparkContext.defaultParallelism
print("You are working with", cores, "core(s)")
# Leave the session as the last expression so the notebook renders its summary.
spark

You are working with 1 core(s)


## Format Data 

MLlib requires all input columns of your dataframe to be vectorized, i.e. assembled into a single vector column. You will see that we rename our dependent variable to `label`, as that is the column name MLlib expects. If we rename it once here, we never have to do it again!

In [12]:
# Data Prep function

from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import * 
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer

def MLClassifierDFPrep(df,input_columns,dependent_var):
    """Turn a raw DataFrame into the two-column (features, label) layout used for classification.

    Parameters
    ----------
    df : DataFrame holding the input columns and the dependent variable.
    input_columns : list of column names to assemble into the 'features' vector.
    dependent_var : name of the column to predict.

    Returns
    -------
    DataFrame with exactly two columns: 'features' and 'label'.

    Side effects
    ------------
    Prints a header and shows the first 5 rows of the intermediate DataFrame
    (original columns plus 'features', 'label_str' and 'label') as a QA check.
    """
    # Pack every input column into a single vector column.
    vectorized = VectorAssembler(inputCols=input_columns, outputCol='features').transform(df)

    # The label is cast to a string and then re-indexed with StringIndexer, so the
    # resulting 'label' column is zero-based whatever the original encoding was.
    with_label_str = vectorized.withColumn("label_str", df[dependent_var].cast(StringType()))
    label_indexer = StringIndexer(inputCol="label_str", outputCol="label")
    indexed = label_indexer.fit(with_label_str).transform(with_label_str)

    # QA: let the reader see how the original labels map onto the new index.
    print("New DataFrame Format (note new indexing):")
    print(" ")
    indexed.show(5)

    return indexed.select('features','label')

In [13]:
# Small hand-built sample: three flower types, five rows each.
df = spark.createDataFrame(
    [
        (3,69,57,56,678,345),
        (3,67,56,58,678,345),
        (3,67,54,57,678,345),
        (3,68,55,58,678,345),
        (3,68,53,52,678,345),
        (2,11,10,907,16,458),
        (2,12,14,909,12,456),
        (2,11,13,910,10,459),
        (2,12,11,905,16,459),
        (2,10,13,902,10,459),
        (1,30,11,123,568,891),
        (1,32,12,124,567,890),
        (1,34,10,123,566,895),
        (1,35,15,121,564,894),
        (1,30,12,124,560,896),
    ],
    ['flower_type', 'sepal_len','sepal_width','R','G','B'])

input_columns = ['sepal_len','sepal_width','R','G','B']
dependent_var = 'flower_type'

final_data = MLClassifierDFPrep(df,input_columns,dependent_var)
final_data.show(5)
# Fix the seed so the train/test split (and every metric computed from it) is
# reproducible when the notebook is re-run.
train,test = final_data.randomSplit([0.7,0.3], seed=42)

New DataFrame Format (note new indexing):
 
+-----------+---------+-----------+---+---+---+--------------------+---------+-----+
|flower_type|sepal_len|sepal_width|  R|  G|  B|            features|label_str|label|
+-----------+---------+-----------+---+---+---+--------------------+---------+-----+
|          3|       69|         57| 56|678|345|[69.0,57.0,56.0,6...|        3|  2.0|
|          3|       67|         56| 58|678|345|[67.0,56.0,58.0,6...|        3|  2.0|
|          3|       67|         54| 57|678|345|[67.0,54.0,57.0,6...|        3|  2.0|
|          3|       68|         55| 58|678|345|[68.0,55.0,58.0,6...|        3|  2.0|
|          3|       68|         53| 52|678|345|[68.0,53.0,52.0,6...|        3|  2.0|
+-----------+---------+-----------+---+---+---+--------------------+---------+-----+
only showing top 5 rows

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[69.0,57.0,56.0,6...|  2.0|
|[67.0,56.0,58.0,6...|  2.0|
|[67.0,54.0,57.0,6...

## Multi-Class Classification

Use this function if your dependent variable (the variable you want to predict) is a discrete value (i.e. whole numbers like 1, 2, 3, 4). Typically the dependent variable will represent some class or group, like types of flowers or colors, as opposed to a continuous variable — for example, in a sales database where every row of data is a transaction, the dollar amount of the sale.

In [None]:
# Read in sample dataset
# final_data = spark.read.format("libsvm").load("C:/spark-2.3.3-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt")
# train,test = df.randomSplit([0.7,0.3])

In [None]:
# train,test = df.randomSplit([0.7,0.3])

# models = [LinearRegression(),DecisionTreeRegressor(),RandomForestRegressor(),GBTRegressor()]

# for model in models:

#     # Fit our model
#     M = model
#     fitModel = M.fit(train)

#     # Load the Summary
#     trainingSummary = fitModel.summary

# #     trainingSummary.residuals.show()
#     print("Training RMSE: %f" % trainingSummary.rootMeanSquaredError)
#     print("Training r2: %f" % trainingSummary.r2)

In [None]:
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.sql import functions as F

def ClassTrainEval(classifier,final_data,features,classes):
    """Fit one classifier on the training split and print a test-set metric.

    Parameters
    ----------
    classifier : an instantiated estimator from pyspark.ml.classification.
        Its class name decides how it is fitted and evaluated.
    final_data : accepted for call compatibility; this function does not read it.
        Fitting and scoring use the module-level ``train`` and ``test``
        DataFrames, which must exist before the call.
    features : indexable collection of rows whose first field is the feature
        vector. Only ``len(features[0][0])`` is used, to size the input layer
        of the multilayer perceptron.
    classes : int, number of distinct label values.

    Returns
    -------
    None. The classifier name, the fitted model and either an AUC (two classes,
    LogisticRegression / GBTClassifier / LinearSVC) or an accuracy percentage
    are printed.
    """
    # The class name drives all dispatching below.
    Mtype = type(classifier).__name__
    print(Mtype)

    # LinearSVC and GBTClassifier only accept binary labels.
    binary_only = ("LinearSVC","GBTClassifier")
    # Estimators that can be fitted exactly as they were passed in.
    fit_as_passed = ("LogisticRegression","NaiveBayes","RandomForestClassifier",
                     "GBTClassifier","LinearSVC","DecisionTreeClassifier")

    def IntanceFitModel():
        """Return a fitted model, the string "Not Acceptable", or None if Mtype is unknown."""
        if Mtype == "OneVsRest":
            # The OneVsRest instance that was passed in is only used as a type marker:
            # a fresh one is built around a LogisticRegression base classifier.
            OVRclassifier = OneVsRest(classifier=LogisticRegression())
            return OVRclassifier.fit(train)
        if Mtype == "MultilayerPerceptronClassifier":
            # Layers: input of size features, two hidden layers of features+1 and
            # features, and an output layer with one node per class.
            features_count = len(features[0][0])
            layers = [features_count, features_count+1, features_count, classes]
            MPC_classifier = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
            return MPC_classifier.fit(train)
        if Mtype in binary_only and classes != 2:
            print("Classifier could not be used because it currectly only accepts binary classification data")
            return "Not Acceptable"
        if Mtype in fit_as_passed:
            return classifier.fit(train)
        # Unknown estimator type: nothing was fitted.
        return None

    fitModel = IntanceFitModel()
    print(fitModel)

    if fitModel is None:
        # Previously this fell through and failed with an AttributeError on None.
        print("Classifier type is not supported by ClassTrainEval:", Mtype)
        return

    if isinstance(fitModel, str):
        # "Not Acceptable" sentinel: the explanation was already printed by the helper.
        print(" ")
        return

    if classes == 0 or classes == 1:
        print("Not Enough Classes")
    elif classes == 2 and Mtype in("LogisticRegression","GBTClassifier","LinearSVC"):
        # transform() is available on every fitted model, whereas evaluate() is not
        # (e.g. GBTClassificationModel does not provide it), so score the test split
        # with transform() for all three binary model types.
        predictionAndLabels = fitModel.transform(test).select('label','prediction')
        # NOTE(review): the AUC is computed from the hard 0/1 'prediction' column, not
        # from 'rawPrediction' - confirm this is the intended metric.
        Bin_evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction')
        auc = Bin_evaluator.evaluate(predictionAndLabels)
        print(Mtype," AUC:",auc)
    else:
        predictions = fitModel.transform(test)
        MC_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
        accuracy = (MC_evaluator.evaluate(predictions))*100
        print(Mtype," Accuracy: {0:.2f}".format(accuracy),"%")

In [None]:
# Run!
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.sql import functions as F

# One default-configured instance of every classifier we want to compare.
classifiers = [LogisticRegression()
               ,NaiveBayes(),OneVsRest(),LinearSVC()
              ,RandomForestClassifier(),GBTClassifier(),DecisionTreeClassifier()
              ,MultilayerPerceptronClassifier()] 

# ClassTrainEval only reads features[0][0] (to learn the feature-vector length), so
# fetch a single row instead of collecting the whole dataset onto the driver.
features = final_data.select('features').take(1)
# Learn how many classes there are in order to specify evaluation type based on binary or multi and turn the df into an object
class_count = final_data.select(F.countDistinct("label")).collect()
classes = class_count[0][0]
    
for classifier in classifiers:
    ClassTrainEval(classifier,final_data,features,classes)

### Classification Diagnostics

In [18]:
from pyspark.ml.evaluation import *
from pyspark.ml.classification import *

def ClassDiag(classifier):
    """Fit `classifier` on the module-level `train` split and print diagnostics.

    The fitted model must expose `summary`, `evaluate`, `coefficientMatrix` and
    `interceptVector`, which is why the notebook calls this with LogisticRegression.

    Printed, in order: a describe() of the training predictions, the test-set
    predictions, coefficients and intercepts, the objective history, per-label
    rates (FPR, TPR, precision, recall, F-measure) and the weighted overall
    metrics. Returns None.
    """
    fitted = classifier.fit(train)
    summary = fitted.summary

    # Summary statistics of the predictions on the training data.
    summary.predictions.describe().show()

    # Predictions on the held-out (module-level) test split.
    fitted.evaluate(test).predictions.show()

    # Coefficients and intercepts of the multinomial logistic regression.
    print("Coefficients: \n" + str(fitted.coefficientMatrix))
    print(" ")
    print("Intercept: " + str(fitted.interceptVector))
    print(" ")

    # Objective value recorded at each training iteration.
    history = summary.objectiveHistory
    print(" ")
    print("objectiveHistory:")
    for step_value in history:
        print(step_value)

    # Per-label metrics. Each getter is wrapped in a lambda so the metric is only
    # read after its heading has been printed.
    per_label_sections = (
        ("False positive rate by label:", lambda: summary.falsePositiveRateByLabel),
        ("True positive rate by label:", lambda: summary.truePositiveRateByLabel),
        ("Precision by label:", lambda: summary.precisionByLabel),
        ("Recall by label:", lambda: summary.recallByLabel),
        ("F-measure by label:", lambda: summary.fMeasureByLabel()),
    )
    for heading, fetch in per_label_sections:
        print(" ")
        print(heading)
        for label_index, value in enumerate(fetch()):
            print("label %d: %s" % (label_index, value))

    # Weighted metrics across all labels.
    overall = (
        summary.accuracy,
        summary.weightedFalsePositiveRate,
        summary.weightedTruePositiveRate,
        summary.weightedFMeasure(),
        summary.weightedPrecision,
        summary.weightedRecall,
    )
    print(" ")
    print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
          % overall)

In [19]:
# classifier = LogisticRegression()
# Run the diagnostics with a default LogisticRegression. ClassDiag reads
# coefficientMatrix, interceptVector and the training summary from the fitted
# model, and it fits/evaluates on the module-level `train` / `test` splits.
ClassDiag(LogisticRegression())

+-------+------------------+------------------+
|summary|             label|        prediction|
+-------+------------------+------------------+
|  count|                11|                11|
|   mean|               1.0|               1.0|
| stddev|0.8944271909999159|0.8944271909999159|
|    min|               0.0|               0.0|
|    max|               2.0|               2.0|
+-------+------------------+------------------+

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[68.0,55.0,58.0,6...|  2.0|[-5.5957328177030...|[5.31986839129448...|       2.0|
|[11.0,13.0,910.0,...|  1.0|[-5.9167187452156...|[9.79042480321198...|       1.0|
|[12.0,14.0,909.0,...|  1.0|[-6.2156086590671...|[7.90920813357862...|       1.0|
|[30.0,11.0,123.0,...|  0.0|[14.9548482822278...|[0.99999999957734...|     

## Regression

**Note:**
We did not include Generalized Linear Regression here since it requires a much different implementation method and evaluation strategy than most regressions. 

In [20]:
# # Load the data
# df = spark.read.format("libsvm").load("C:/spark-2.3.3-bin-hadoop2.7/data/mllib/sample_linear_regression_data.txt")
# train,test = df.randomSplit([0.7,0.3])

In [41]:
# Declare data prep function
from pyspark.ml.feature import VectorAssembler

def MLRegressDFPrep(df,input_columns,dependent_var):
    """Turn a raw DataFrame into the two-column (features, label) layout used for regression.

    Parameters
    ----------
    df : DataFrame holding the input columns and the dependent variable.
    input_columns : list of column names to assemble into the 'features' vector.
    dependent_var : name of the column to predict; it is renamed to 'label'.

    Returns
    -------
    DataFrame with exactly two columns: 'features' and 'label'.
    """
    feature_assembler = VectorAssembler(inputCols=input_columns, outputCol='features')
    return (
        feature_assembler.transform(df)
        .withColumnRenamed(dependent_var, 'label')
        .select('features', 'label')
    )

In [42]:
# Small hand-built sample of company figures; sales_m is the value to predict.
# NOTE(review): advertising_k=72.2 in the first row is far from the roughly
# 0.15 * sales_m pattern of the other rows - confirm it is not a typo.
df = spark.createDataFrame([(96,72.2,144,14.4,19),
                            (80,12.5,120,12.2,68),
                            (72,10.8,108,10.8,36),
                            (52,7.8,78,7.8,49),
                            (97,14.55,145,14.55,63),
                            (42,6.3,63,6.3,61),
                            (20,3.7,30,3.4,22),
                            (5,0.75,7,0.75,24),
                            (89,13.35,133,13.35,63),
                            (19,2.85,28,2.85,26)], 
                           ['sales_m', 'advertising_k','employees','locations','compage'])

input_columns = ['advertising_k','employees','locations','compage']
dependent_var = 'sales_m'

final_data = MLRegressDFPrep(df,input_columns,dependent_var)
final_data.show(5)
# Fix the seed so the train/test split (and every metric computed from it) is
# reproducible when the notebook is re-run.
train,test = final_data.randomSplit([0.7,0.3], seed=42)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[72.2,144.0,14.4,...|   96|
|[12.5,120.0,12.2,...|   80|
|[10.8,108.0,10.8,...|   72|
| [7.8,78.0,7.8,49.0]|   52|
|[14.55,145.0,14.5...|   97|
+--------------------+-----+
only showing top 5 rows



In [33]:
from pyspark.ml.regression import *
from pyspark.ml.evaluation import *

def RegressTrainEval(regressor):
    """Fit `regressor` on the module-level `train` split and print its error metrics.

    For LinearRegression the coefficients, intercept, training summary
    (iterations, objective history, RMSE, r2) and test RMSE / r2 are printed.
    For every other regressor only the test-set RMSE is printed.

    Parameters
    ----------
    regressor : an instantiated estimator from pyspark.ml.regression.

    Returns
    -------
    None. Reads the module-level `train` and `test` DataFrames.
    """
    # The class name decides which report is produced.
    model_name = type(regressor).__name__
    print(model_name)

    # Both report styles start from the same fitted model.
    fitted = regressor.fit(train)

    if model_name != "LinearRegression":
        # Generic path: score the test split and report RMSE only.
        test_predictions = fitted.transform(test)
        rmse = RegressionEvaluator(metricName="rmse").evaluate(test_predictions)
        print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
        print("")
        return

    # LinearRegression path: the fitted model carries a training summary.
    summary = fitted.summary

    # Model parameters.
    print("Coefficients: %s" % str(fitted.coefficients))
    print("Intercept: %s" % str(fitted.intercept))
    print("")

    # Optimiser behaviour on the training set.
    print("numIterations: %d" % summary.totalIterations)
    print("objectiveHistory: %s" % str(summary.objectiveHistory))
    print("")

    # Training errors.
    print("Training RMSE: %f" % summary.rootMeanSquaredError)
    print("Training r2: %f" % summary.r2)
    print("")

    # Test errors.
    held_out = fitted.evaluate(test)
    print("Test RMSE: {}".format(held_out.rootMeanSquaredError))
    print("Test r2: {}".format(held_out.r2))
    print("")

In [34]:
# Run!
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.sql import functions as F
# The estimators used below live in pyspark.ml.regression, not
# pyspark.ml.classification. Import them explicitly so this cell does not rely
# on a wildcard import from an earlier cell.
from pyspark.ml.regression import (LinearRegression, RandomForestRegressor,
                                   GBTRegressor, DecisionTreeRegressor)

# One default-configured instance of every regressor we want to compare.
regressors = [LinearRegression()
              ,RandomForestRegressor(),GBTRegressor(),DecisionTreeRegressor()] 
    
for regressor in regressors:
    RegressTrainEval(regressor)

LinearRegression
Coefficients: [-0.4903672321785257,1.2264876672553044,-0.9743544462783574,1.8766639577498148,0.6671339513374913,1.7730018647921157,-0.10889904866592723,-0.030261305723280115,-0.7796826355203386,0.20381413983516675]
Intercept: -0.021291243629179005

numIterations: 1
objectiveHistory: [0.0]

Training RMSE: 9.849372
Training r2: 0.032913

Test RMSE: 10.933152836069855
Test r2: -0.0007608531921694528

RandomForestRegressor
Root Mean Squared Error (RMSE) on test data = 11.0362

GBTRegressor
Root Mean Squared Error (RMSE) on test data = 13.2638

DecisionTreeRegressor
Root Mean Squared Error (RMSE) on test data = 12.2536

