# Supervised Machine Learning in PySpark

Intro

This notebook covers the basic structure of a supervised learning workflow in PySpark: the training and validation split, model selection, pipelines, and cross-validation.

## Regression

**Note:**
Generalized Linear Regression is not included here because it requires a substantially different implementation and evaluation strategy from most other regression models.

## Import necessary libraries

In [None]:
from pyspark.ml.feature import VectorAssembler


## Format Data 

MLlib requires all input columns of your dataframe to be assembled into a single vector column. You will also see that we rename our dependent variable to `label`, since that is the default label column name that MLlib estimators expect. If we rename it once here, we never have to do it again!

In [41]:
# Declare data prep function

def MLRegressDFPrep(df, input_columns, dependent_var):
    """Reshape ``df`` into the two-column layout used for regression below.

    Args:
        df: Source DataFrame containing the input columns and the
            dependent variable.
        input_columns: Names of the columns to pack into the ``features``
            vector column.
        dependent_var: Name of the target column; it is renamed to ``label``.

    Returns:
        A DataFrame with exactly two columns, ``features`` and ``label``.
    """
    vectorizer = VectorAssembler(inputCols=input_columns, outputCol='features')

    # Assemble the feature vector, rename the target, and drop everything else.
    return (
        vectorizer.transform(df)
        .withColumnRenamed(dependent_var, 'label')
        .select('features', 'label')
    )

In [42]:
# Small in-memory example dataset (10 rows); the first column is the target.
df = spark.createDataFrame(
    [
        (96, 72.2, 144, 14.4, 19),
        (80, 12.5, 120, 12.2, 68),
        (72, 10.8, 108, 10.8, 36),
        (52, 7.8, 78, 7.8, 49),
        (97, 14.55, 145, 14.55, 63),
        (42, 6.3, 63, 6.3, 61),
        (20, 3.7, 30, 3.4, 22),
        (5, 0.75, 7, 0.75, 24),
        (89, 13.35, 133, 13.35, 63),
        (19, 2.85, 28, 2.85, 26),
    ],
    ['sales_m', 'advertising_k', 'employees', 'locations', 'compage'],
)

# Predictors and the target column to model.
input_columns = ['advertising_k', 'employees', 'locations', 'compage']
dependent_var = 'sales_m'

# Vectorize the predictors and rename the target to ``label``.
final_data = MLRegressDFPrep(df, input_columns, dependent_var)
final_data.show(5)

# Roughly 70/30 train/test split. The seed is fixed so that re-running the
# notebook yields the same split (given the same partitioning) and the
# metrics reported below stay comparable between runs.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[72.2,144.0,14.4,...|   96|
|[12.5,120.0,12.2,...|   80|
|[10.8,108.0,10.8,...|   72|
| [7.8,78.0,7.8,49.0]|   52|
|[14.55,145.0,14.5...|   97|
+--------------------+-----+
only showing top 5 rows



In [33]:
from pyspark.ml.regression import *
from pyspark.ml.evaluation import *

def RegressTrainEval(regressor, train_data=None, test_data=None):
    """Fit ``regressor`` on the training split and print evaluation metrics.

    For a ``LinearRegression`` estimator the fitted model's coefficients,
    intercept, training summary (iterations, objective history, RMSE, r2)
    and test-set RMSE/r2 are printed. For any other estimator only the
    test-set RMSE, computed with ``RegressionEvaluator``, is printed.

    Args:
        regressor: An unfitted estimator exposing ``fit``; the fitted model
            must expose ``transform`` (or, for ``LinearRegression``,
            ``summary`` and ``evaluate``).
        train_data: DataFrame to fit on. Defaults to the notebook-level
            ``train`` split when omitted.
        test_data: DataFrame to evaluate on. Defaults to the notebook-level
            ``test`` split when omitted.

    Returns:
        None. All results are written to standard output.
    """
    # Fall back to the notebook-level splits so that existing one-argument
    # calls keep working unchanged.
    if train_data is None:
        train_data = train
    if test_data is None:
        test_data = test

    # Announce which estimator is being evaluated.
    print(type(regressor).__name__)

    # Fitting is identical for every estimator, so do it once up front.
    fitModel = regressor.fit(train_data)

    if isinstance(regressor, LinearRegression):
        # Load the training summary attached to the fitted linear model.
        trainingSummary = fitModel.summary

        # Print the coefficients and intercept for linear regression
        print("Coefficients: %s" % str(fitModel.coefficients))
        print("Intercept: %s" % str(fitModel.intercept))
        print("")

        # Summarize the model over the training set and print out some metrics
        print("numIterations: %d" % trainingSummary.totalIterations)
        print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
        print("")

        # Print the training errors
        print("Training RMSE: %f" % trainingSummary.rootMeanSquaredError)
        print("Training r2: %f" % trainingSummary.r2)
        print("")

        # Evaluate on the held-out data and print the same metrics
        test_results = fitModel.evaluate(test_data)
        print("Test RMSE: {}".format(test_results.rootMeanSquaredError))
        print("Test r2: {}".format(test_results.r2))
        print("")
        return

    # Other estimators: score the held-out data and compute RMSE from the
    # resulting predictions.
    predictions = fitModel.transform(test_data)
    evaluator = RegressionEvaluator(metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
    print("")

In [34]:
# Run!
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.sql import functions as F

# Candidate estimators, each constructed with its default settings.
regressors = [
    LinearRegression(),
    RandomForestRegressor(),
    GBTRegressor(),
    DecisionTreeRegressor(),
]

# Train and report on every candidate in turn.
for regressor in regressors:
    RegressTrainEval(regressor)

LinearRegression
Coefficients: [-0.4903672321785257,1.2264876672553044,-0.9743544462783574,1.8766639577498148,0.6671339513374913,1.7730018647921157,-0.10889904866592723,-0.030261305723280115,-0.7796826355203386,0.20381413983516675]
Intercept: -0.021291243629179005

numIterations: 1
objectiveHistory: [0.0]

Training RMSE: 9.849372
Training r2: 0.032913

Test RMSE: 10.933152836069855
Test r2: -0.0007608531921694528

RandomForestRegressor
Root Mean Squared Error (RMSE) on test data = 11.0362

GBTRegressor
Root Mean Squared Error (RMSE) on test data = 13.2638

DecisionTreeRegressor
Root Mean Squared Error (RMSE) on test data = 12.2536

