In [163]:
# load standard packages
from pyspark.sql import SparkSession
from pyspark.ml import feature, regression, evaluation, Pipeline
from pyspark.sql import functions as fn, Row
import matplotlib.pyplot as plt
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [164]:
# Object for creating sequences of transformations
from pyspark.ml import Pipeline
# Functionality for computing features
from pyspark.ml import feature
# Functionality for regression
from pyspark.ml import regression
# Funcionality for classification
from pyspark.ml import classification
# Hyperparameter optimization
from pyspark.ml import tuning
# Estimation of errors
from pyspark.ml import evaluation
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [165]:
# we will use pandas to display the data nicely
import pandas as pd
datasource = pd.read_csv('http://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt', delimiter='\t')
datasource.columns = ['age', 'sex', 'bmi', 'map', 'tc', 'ldl', 'hdl', 'tch', 'ltg', 'glu', 'disease_progression']
diabetes_df = spark.createDataFrame(datasource)

In [166]:
# we will split the data
training_df, validation_df, test_df = diabetes_df.randomSplit([0.6, 0.3, 0.1])

# Part 1

## Question 1

We will use all the data to examine the relationship between `bmi`, `sex`, and `age` to predict `disease_progression`.

## Question 1.1 (10 pts):

Estimate the model using part of the code below. You need to traing on the training data. Save the trained model in variable `model1`

In [167]:
model1 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=['bmi','sex','age'], outputCol='features'),
    regression.LinearRegression(featuresCol='features', labelCol='disease_progression')
]).fit(training_df)

## Question 1.2 (5 pts)

Use the evaluator below to estimate the RMSE of the model on validation data

In [168]:
evaluator = evaluation.RegressionEvaluator(predictionCol='prediction', 
                                           labelCol='disease_progression', 
                                           metricName='rmse')

In [169]:
evaluator.evaluate(model1.transform(validation_df))

64.16642511816536

## Question 1.3 (5 pts)

Investigate the definition of $R^2$. Create a new evaluator with the metric $R^2$ and estimate it for `model1` on validation data.

In [170]:
# code here
evaluator2 = evaluation.RegressionEvaluator(predictionCol='prediction', 
                                           labelCol='disease_progression', 
                                           metricName='r2')

In [171]:
evaluator2.evaluate(model1.transform(validation_df))

0.276048948590072

## Question 1.4 (5 pts)

Use the code below to interpret the coefficients. Comment on how you can use these coefficients to gain insights about the relationship between bmi, sex, and age on disease progression. In particular, do you think you can tell which of the coefficients is more important?

In [172]:
pd.DataFrame({'features': model1.stages[0].getInputCols(), 'weights': model1.stages[1].coefficients})

Unnamed: 0,features,weights
0,bmi,10.392855
1,sex,0.108281
2,age,0.383465


## Question  1.5 (10 pts)

Add a standardization (mean subtraction and division by standard deviation) step to the pipeline in another model `model2`. Train on the training data.

In [173]:
model2 = Pipeline(stages=[feature.VectorAssembler(inputCols=['bmi','sex','age'], outputCol='features'),
    feature.StandardScaler(withMean=True, inputCol='features', outputCol='features_std'),
    regression.LinearRegression(featuresCol='features', labelCol='disease_progression')
]).fit(training_df)

## Question 1.6 (5 pts)

Reinterpret the parameters of `model2`. Can you compare them against each other?

In [174]:
model2.stages

[VectorAssembler_cfe80c891343,
 StandardScaler_cac87408914f,
 LinearRegression_62f06fcafe22]

In [175]:
pd.DataFrame({'features': model2.stages[0].getInputCols(), 'weights': model2.stages[2].coefficients})

Unnamed: 0,features,weights
0,bmi,10.392855
1,sex,0.108281
2,age,0.383465


## Question 2 (20 pts)

Sometimes, you want to systematically evaluate several models by sweeping through hyperparameters while keeping the same features.

Define a pipeline (don't fit it) with all features in the diabetes dataset with the goal of predicting disease progression. Store this pipeline in `pipe_all`.

Investigate how to use `tuning.ParamGridBuilder` and `tuning.CrossValidator` to sweep through the regularization parameter of linear regression (0, 0.1, 0.2). Finally, estimate the RMSE validation performance of your method.

In [189]:
pipe_all = Pipeline(stages=[feature.VectorAssembler(inputCols=['age', 'sex', 'bmi', 'map', 'tc', 'ldl', 'hdl', 'tch', 'ltg', 'glu'],\
                                                    outputCol='features'),regression.LinearRegression(featuresCol='features', \
                                                                                                      labelCol='disease_progression')
])

In [183]:
pipe_all.stages  #Two stages

[VectorAssembler_52f8e7e5f2b1, LinearRegression_e3fd066cea35]

In [190]:
rf0=pipe_all.getStages()[1].regParam

In [192]:
paramGrid = tuning.ParamGridBuilder() \
    .addGrid(pipe_all.getStages()[1].regParam, [0,0.1,0.2]) \
    .build()

In [193]:
cv = tuning.CrossValidator(estimator=pipe_all,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=2)

In [194]:
cv_model = cv.fit(training_df)

In [195]:

evaluator.evaluate(cv_model.transform(validation_df))

57.36380767271839