In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the active Spark session if one exists; otherwise create one named "ML".
spark = (
    SparkSession.builder
    .appName("ML")
    .getOrCreate()
)

## Attributes on original data


- AT = Atmospheric Temperature in C
- V = Exhaust Vacuum Speed
- AP = Atmospheric Pressure
- RH = Relative Humidity
- PE = Power Output

## Getting the data

In [5]:
# Load the semicolon-separated power-plant CSV; the first line holds the column
# names and column types are inferred from the data.
originData = spark.read.csv(
    "/FileStore/tables/powerDataForTP.csv",
    sep=";",
    header=True,
    inferSchema=True,
)
originData.show()

There are 9570 lines in the whole dataset, but only 4519 rows have no null values. We will see which model is best: one trained only on these 4519 rows, or one trained after filling the null values with the mean of each column.

In [7]:
from pyspark.sql.functions import mean
# --- Variant 1: impute missing values with the per-column mean -------------
# All five means are computed in ONE aggregation (a single Spark job) rather
# than one select/collect round-trip per column. Spark's mean skips nulls.
numericCols = ['AT', 'V', 'AP', 'RH', 'PE']
columnMeans = originData.select(
    [mean(originData[c]).alias(c) for c in numericCols]
).first()
avgAT, avgV, avgAP, avgRH, avgPE = [columnMeans[c] for c in numericCols]

# A single dict-based fill replaces the five chained na.fill calls: nulls in
# each column are replaced by that column's own mean.
rowData_avg = originData.na.fill({
    'AT': avgAT,
    'V': avgV,
    'AP': avgAP,
    'RH': avgRH,
    'PE': avgPE,
})

rowData_avg.show()

# --- Variant 2: keep only complete rows ------------------------------------
rowData = originData.na.drop()  # drops every row containing at least one null

# Compare the size and summary statistics of the two variants and the raw data.
print("There are ", rowData.count(), " lines with non null values among " ,originData.count(), " lines on the whole dataset")
rowData.describe().show()
rowData_avg.describe().show()
originData.describe().show()


In [8]:
# Inspect the complete-rows dataset: print its schema, then list the column
# names (the bare last expression is what the cell displays).
rowData.printSchema()
rowData.columns

In [9]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
# Pack the four predictor columns into a single vector column named 'features'.
# PE is deliberately left out: it is selected as the label further down.
vec = VectorAssembler(inputCols=['AT', 'V', 'AP', 'RH'], outputCol='features')

In [10]:
# Add the 'features' vector column to the complete-rows dataset and preview it.
data = vec.transform(rowData)
data.show(truncate=False)

# Same for the mean-imputed dataset.
data_avg = vec.transform(rowData_avg)
data_avg.show(truncate=False)

In [11]:
# Keep only what the regressor needs: the feature vector, and PE renamed to 'label'.
modelData = data.select('features', data['PE'].alias('label'))
modelData_avg = data_avg.select('features', data_avg['PE'].alias('label'))

In [12]:
# Preview the first five (features, label) rows of both modelling datasets.
for preview in (modelData, modelData_avg):
    preview.show(5)

Split the data: 70% for the training set, 30% for the test set. We do that for both modelData and modelData_avg.

In [14]:
# 70/30 train/test split of both datasets.
# A fixed seed makes the split, and therefore every metric computed below,
# reproducible across runs; without it each run compares different rows.
SPLIT_SEED = 42
trainData, testData = modelData.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
trainData_avg, testData_avg = modelData_avg.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

## Linear regression

In [16]:
# Linear regression estimator; the same instance is fitted on both training sets below.
lr = LinearRegression(
    labelCol='label',
    featuresCol='features',
)

In [17]:
# Model trained on the rows that had no null values, with its training summary.
lrModel = lr.fit(trainData)
summary = lrModel.summary

# Model trained on the mean-imputed rows, with its training summary.
lrModel_avg = lr.fit(trainData_avg)
summary_avg = lrModel_avg.summary

# First three training-set predictions of each model.
summary.predictions.show(n=3, truncate=False)
summary_avg.predictions.show(n=3, truncate=False)

As we can see above, both models seem good. We will now see how good they are on both the training set and the test set.

In [19]:
from pyspark.sql.functions import abs
# Training-set metrics read directly from each model's summary.
print("explained Variance for non null values={}".format(summary.explainedVariance))
print("meanAbsoluteError for non null values=%g" % summary.meanAbsoluteError)

print("explainedVariance for values replaced={}".format(summary_avg.explainedVariance))
print("meanAbsoluteError for values replaced=%g" % summary_avg.meanAbsoluteError)

# The mean absolute error can also be obtained by hand: evaluate each model on
# its own training set, then average the absolute value of the residuals
# (the difference between label and prediction).
trainResults = lrModel.evaluate(trainData)
df1 = trainResults.residuals

trainResults_avg = lrModel_avg.evaluate(trainData_avg)
df1_avg = trainResults_avg.residuals

df1.select(abs(df1.residuals)).groupBy().avg().show()
df1_avg.select(abs(df1_avg.residuals)).groupBy().avg().show()

The dataset with null values dropped seems better. Let's now apply our models to the test set.

In [21]:
# Evaluate each model on its own held-out test set.
testResults = lrModel.evaluate(testData)
testResults_avg = lrModel_avg.evaluate(testData_avg)

# First ten test-set residuals (difference between label and prediction) per model.
testResults.residuals.show(n=10)
testResults_avg.residuals.show(n=10)

In [22]:
# R2: share of the label's variance explained by the model (closer to 1 is better).
print("r2 for non null values=%g" % testResults.r2)
print("r2 for values replaced=%g" % testResults_avg.r2)

# Root mean squared error (lower is better).
print("rootMeanSquaredError for non null values=%g" % testResults.rootMeanSquaredError)
print("rootMeanSquaredError for values replaced=%g" % testResults_avg.rootMeanSquaredError)

# Mean absolute error (lower is better).
print("meanAbsoluteError for non null values=%g" % testResults.meanAbsoluteError)
print("meanAbsoluteError for values replaced=%g" % testResults_avg.meanAbsoluteError)

Our R2 score is closer to 1 on the data with all null values dropped, and both the RMSE and the mean absolute error are lower than with null values replaced by the average. So from here on we will use the "rowData" dataset and no longer the rowData_avg dataset.

## CrossValidation and ParamGridBuilder

In [25]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


# RMSE evaluator used to score each candidate model during cross-validation.
evaluator = RegressionEvaluator(metricName='rmse')
# Printed explicitly: a bare expression in the middle of a cell is never displayed.
print(evaluator.explainParam('metricName'))

# Single-stage pipeline wrapping the linear regression estimator.
pipeLine = Pipeline(stages=[lr])

# Candidate regularisation strengths to compare.
paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01]).build()

# 2-fold cross-validation over the grid. The explicit seed pins how rows are
# assigned to folds, so the selected model is reproducible across runs.
crossval = CrossValidator(estimator=pipeLine,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=2,
                          seed=42)
cvModel = crossval.fit(trainData)

In [26]:
# Apply the model selected by cross-validation to the held-out test set
# and preview the resulting rows.
cross_prediction = cvModel.transform(testData)
cross_prediction.show()


In [27]:
# Per-row absolute error of the cross-validated model on the test set.
# NOTE: `abs` here is pyspark.sql.functions.abs (imported in an earlier cell),
# which shadows Python's builtin abs.
cross_prediction = cross_prediction.withColumn(
    'Abs_diff',
    abs(cross_prediction.label - cross_prediction.prediction),
)
cross_prediction.printSchema()
cross_prediction.select('Abs_diff').groupBy().avg().show()  # mean absolute error

# Score the already-fitted cross-validated model on the test set. The previous
# version called crossval.fit(trainData) a second time, repeating the whole
# grid search just to rebuild a model that cvModel already holds.
cvm = cvModel
cross_prediction_test = cvm.transform(testData)
evaluator.evaluate(cross_prediction_test)  # test-set RMSE, displayed as the cell output