
***
# Regression - ML Section

***


## Importing Packages 

In [None]:
 pip install pyspark 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 41 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 49.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=f16d81012740b855645abed0a27c233ea46050661b91eeaf1b9f4a7c14f77e6b
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [None]:
import warnings


import pyspark
from pyspark.sql import *
from pyspark.sql import functions as F
from pyspark.sql import types
from pyspark.sql.functions import col, isnan, when, count
from pyspark.sql.functions import regexp_extract, regexp_replace
from pyspark.sql.functions import substring_index
from pyspark.ml import Pipeline

#for EDA 
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np 


# for ML (regression models, evaluators, tuning, feature transformers)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.regression import RandomForestRegressor, DecisionTreeRegressor, GBTRegressor, LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import Imputer,StandardScaler,StringIndexer, VectorAssembler, VectorIndexer, OneHotEncoder
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType
from pyspark.sql.functions import col, trim, lower


warnings.filterwarnings('ignore')

## Loading Dataset 

First, connect to Spark: a `SparkSession` is the entry point we use to access the Spark cluster.

In [None]:
# Create (or reuse) the SparkSession with the executor setting supplied up front.
# getOrCreate() hands back an already-running session when one exists, and launch-time
# settings such as spark.executor.instances are not re-read once the underlying context
# is up, so the setting must be on the first builder call to have any effect.
spark = (
    pyspark.sql.SparkSession.builder
    .config('spark.executor.instances', 8)
    .getOrCreate()
)

In [None]:
# Ask the builder for eight executor instances, then reuse the running session
# or start a new one if none exists yet.
spark = (
    SparkSession.builder
    .config('spark.executor.instances', 8)
    .getOrCreate()
)

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset CSV is read from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the weather CSV: the first row supplies the column names, column types are
# inferred from the data, and fields are comma-separated.
df = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .option('sep', ',')
    .csv('/content/drive/MyDrive/SaudiWeather.csv')
)

In [None]:
# Print the first rows of the loaded DataFrame (show() defaults to 20 rows)
df.show()

+----+--------------------+-----------+-------------------+----------+----------+---------+--------------------+---------+---------------+---------+-------------------+---------------+-------------------------+--------------------+-----+---+----+------+------+-----------+--------------+--------------------------+
|YEAR|        station_name| station_id|   observation_date|  latitude| longitude|elevation|wind_direction_angle|wind_type|wind_speed_rate|sky_cavok|visibility_distance|air_temperature|air_temperature_dew_point|            GEOPOINT|month|day|hour|minute|season|Season_name|humidity_level|air_temperature_categories|
+----+--------------------+-----------+-------------------+----------+----------+---------+--------------------+---------+---------------+---------+-------------------+---------------+-------------------------+--------------------+-----+---+----+------+------+-----------+--------------+--------------------------+
|2022|             AL BAHA|41055099999|2022-12-10 00:00

In [None]:
# Print each column's name, inferred type and nullability
df.printSchema()

root
 |-- YEAR: integer (nullable = true)
 |-- station_name: string (nullable = true)
 |-- station_id: long (nullable = true)
 |-- observation_date: timestamp (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- elevation: double (nullable = true)
 |-- wind_direction_angle: integer (nullable = true)
 |-- wind_type: string (nullable = true)
 |-- wind_speed_rate: double (nullable = true)
 |-- sky_cavok: string (nullable = true)
 |-- visibility_distance: integer (nullable = true)
 |-- air_temperature: double (nullable = true)
 |-- air_temperature_dew_point: integer (nullable = true)
 |-- GEOPOINT: string (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)
 |-- season: integer (nullable = true)
 |-- Season_name: string (nullable = true)
 |-- humidity_level: string (nullable = true)
 |-- air_temperature_categories: string (nul

***
# ML Regression Section 

***

## Feature Engineering and Feature Selection

Here we encode all the categorical columns chosen as features using `StringIndexer`, then drop the original string columns.

In [None]:
# Names of the categorical (string) columns that get label-encoded below
cat_cols = [
    'station_name',
    'wind_type',
    'sky_cavok',
    'Season_name',
    'humidity_level',
    'air_temperature_categories',
]

In [None]:
# Label-encode every categorical column into a new column named <original> + '_'.
# A single multi-column StringIndexer handles all of them in one fit/transform pass.
# It also avoids the previous `for col in cat_cols` loop, whose loop variable rebound
# the imported pyspark.sql.functions.col to a plain string for the rest of the notebook.
indexed_cols = [name + '_' for name in cat_cols]
indexer = StringIndexer(inputCols=cat_cols, outputCols=indexed_cols)
df = indexer.fit(df).transform(df)

# The encoded copies replace the original string columns.
df = df.drop(*cat_cols)

In [None]:
# Remove the columns that are not used as model inputs
drop_cols = [
    'GEOPOINT',
    'station_id',
    'observation_date',
]
df = df.drop(*drop_cols)


>> First Combining Feature Columns

In [None]:
# Feature columns: every column still in df except the target
cols = df.columns
cols.remove('air_temperature')  # kept out of the inputs because it is the label
# NOTE(review): 'air_temperature_categories_' stays in the feature list; if those
# categories were derived from air_temperature this leaks the label - confirm.

# VectorAssembler packs all feature columns into a single vector column named 'features'
assembler = VectorAssembler(
    inputCols=cols,
    outputCol='features',
)

# Append the 'features' column to df
df = assembler.transform(df)

In [None]:
# Scaler that reads the assembled 'features' vector and writes 'features_scaled'
stdScaler = StandardScaler(
    inputCol="features",
    outputCol="features_scaled",
)

# Learn the scaling statistics from df.
# NOTE(review): this is fit on the full dataset, before the train/test split made in
# a later cell, so test rows contribute to the statistics - confirm this is intended.
scaler = stdScaler.fit(df)

# Add the 'features_scaled' column
df_scaled = scaler.transform(df)

In [None]:
# Preview the first 5 rows, now including the assembled and scaled feature vectors
df_scaled.show(5)

+----+----------+----------+---------+--------------------+---------------+-------------------+---------------+-------------------------+-----+---+----+------+------+-------------+----------+----------+------------+---------------+---------------------------+--------------------+--------------------+
|YEAR|  latitude| longitude|elevation|wind_direction_angle|wind_speed_rate|visibility_distance|air_temperature|air_temperature_dew_point|month|day|hour|minute|season|station_name_|wind_type_|sky_cavok_|Season_name_|humidity_level_|air_temperature_categories_|            features|     features_scaled|
+----+----------+----------+---------+--------------------+---------------+-------------------+---------------+-------------------------+-----+---+----+------+------+-------------+----------+----------+------------+---------------+---------------------------+--------------------+--------------------+
|2022| 20.296139| 41.634277|  1672.13|                 350|            4.6|               9900

## Split Data

>> Just like always, before building a model we shall split our scaled dataset into training & test sets. Training Dataset = 80% Test Dataset = 20%

In [None]:
# Keep only what the models need: the scaled feature vector and the target,
# with air_temperature renamed to 'label'.
df_data = df_scaled.select(
    F.col('features_scaled'),
    F.col('air_temperature').alias('label'),
)

# 80% / 20% random split. The seed is fixed so the same rows land in each split on
# every run; without it the split, and every metric below, changes between runs.
df_train, df_test = df_data.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Preview the (features_scaled, label) pairs the models are trained and tested on
df_data.show()

+--------------------+-----+
|     features_scaled|label|
+--------------------+-----+
|[1288.69145462652...| 17.0|
|[1288.69145462652...| 16.1|
|[1288.69145462652...| 17.0|
|[1288.69145462652...| 14.0|
|[1288.69145462652...| 13.0|
|[1288.69145462652...| 11.0|
|[1288.69145462652...| 20.0|
|[1288.69145462652...| 24.8|
|[1288.69145462652...| 21.0|
|[1288.69145462652...| 14.0|
|[1288.69145462652...| 20.3|
|[1288.69145462652...| 14.0|
|[1288.69145462652...| 13.0|
|[1288.69145462652...| 21.0|
|[1288.69145462652...| 20.0|
|[1288.69145462652...| 24.8|
|[1288.69145462652...| 23.0|
|[1288.69145462652...| 17.1|
|[1288.69145462652...| 19.0|
|[1288.69145462652...| 16.0|
+--------------------+-----+
only showing top 20 rows



>> Define the evaluators (**RegressionEvaluator**) used to score every model

In [None]:
# One RegressionEvaluator per metric (R-squared, RMSE, MAE); each compares the
# 'prediction' column against the 'label' column.
evaluator_R, evaluator_RMSE, evaluator_MAE = (
    RegressionEvaluator(predictionCol='prediction', labelCol='label', metricName=metric_name)
    for metric_name in ('r2', 'rmse', 'mae')
)

## Train and Evaluate Models



### 1st Regression Model: **Linear Regression**


In [None]:

# -- Linear Regression --

# Linear model with elastic-net regularisation, reading the scaled feature vector
lr = LinearRegression(
    featuresCol='features_scaled',
    labelCol='label',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Fit the regression model on the training split
r_lr = lr.fit(df_train)

# Generate predictions for the held-out test split
model_predictions = r_lr.transform(df_test)

# Print 7 rows without truncating the column contents
model_predictions.show(7, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------------------+
|features_scaled                                                                                                                                                                                                                             |label|prediction        |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------------------+
|(19,[0,1,2,3,4,5,6,7,8,9,12],[1287.4167845428162,5.970844001351544,11.320917795754413,1.1466549813034426,0.18535327339703406,2.6844414965790655,5.75685742699869,0.8282601033716479,2.67385906985167,2.49933189

In [None]:
# Score the Linear Regression predictions on the test split
lr_r = evaluator_R.evaluate(model_predictions)
lr_rms = evaluator_RMSE.evaluate(model_predictions)
lr_mae = evaluator_MAE.evaluate(model_predictions)

# R2 is a unit-free ratio, so it is shown as a percentage. RMSE and MAE are in the
# units of the label (air_temperature), so they are printed as plain numbers; the
# previous '%' format multiplied them by 100 and appended a percent sign.
print("Linear Regression r2 =", '{:.2%}'.format(lr_r))
print("Linear Regression RMSE =", '{:.2f}'.format(lr_rms))
print("Linear Regression MAE =", '{:.2f}'.format(lr_mae))



Linear Regression r2 = 65.61%
Linear Regression RMSE = 517.83%
Linear Regression MAE = 410.00%


### 2nd Regression Model: **Random Forest Regressor**


In [None]:
# -- Random Forest Regressor --

# Random forests are built with random sampling, so the seed is fixed to make the
# fitted model (and the metrics reported below) repeatable between runs.
rf = RandomForestRegressor(
    featuresCol='features_scaled',
    labelCol='label',
    seed=42,
)

# Fit the regression model on the training split
r_rf = rf.fit(df_train)

# Generate predictions for the held-out test split
rf_pred = r_rf.transform(df_test)

# Print 7 rows without truncating the column contents
rf_pred.show(7, truncate=False)


+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------------------+
|features_scaled                                                                                                                                                                                                                             |label|prediction        |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------------------+
|(19,[0,1,2,3,4,5,6,7,8,9,12],[1287.4167845428162,5.970844001351544,11.320917795754413,1.1466549813034426,0.18535327339703406,2.6844414965790655,5.75685742699869,0.8282601033716479,2.67385906985167,2.49933189

In [None]:
# Score the Random Forest predictions on the test split
rf_r = evaluator_R.evaluate(rf_pred)
rf_rms = evaluator_RMSE.evaluate(rf_pred)
rf_mae = evaluator_MAE.evaluate(rf_pred)

# R2 is a unit-free ratio, so it is shown as a percentage. RMSE and MAE are in the
# units of the label (air_temperature), so they are printed as plain numbers; the
# previous '%' format multiplied them by 100 and appended a percent sign.
print("Random Forest Regressor r2 =", '{:.2%}'.format(rf_r))
print("Random Forest Regressor RMSE =", '{:.2f}'.format(rf_rms))
print("Random Forest Regressor MAE =", '{:.2f}'.format(rf_mae))

Random Forest Regressor r2 = 91.63%
Random Forest Regressor RMSE = 255.51%
Random Forest Regressor MAE = 201.99%


### 3rd Regression Model: **Gradient Boosted Tree Regressor**

In [None]:
# -- Gradient Boosted Tree Regressor --

# Boosted-tree ensemble limited to 10 boosting iterations
gbt = GBTRegressor(
    featuresCol="features_scaled",
    labelCol='label',
    maxIter=10,
)

# Fit the regression model on the training split
r_gbt = gbt.fit(df_train)

# Generate predictions for the held-out test split
gbt_pred = r_gbt.transform(df_test)

# Print 7 rows without truncating the column contents
gbt_pred.show(7, truncate=False)


+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------------------+
|features_scaled                                                                                                                                                                                                                             |label|prediction        |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------------------+
|(19,[0,1,2,3,4,5,6,7,8,9,12],[1287.4167845428162,5.970844001351544,11.320917795754413,1.1466549813034426,0.18535327339703406,2.6844414965790655,5.75685742699869,0.8282601033716479,2.67385906985167,2.49933189

In [None]:
# Score the Gradient Boosted Tree predictions on the test split
gb_r = evaluator_R.evaluate(gbt_pred)
gb_rms = evaluator_RMSE.evaluate(gbt_pred)
gb_mae = evaluator_MAE.evaluate(gbt_pred)

# R2 is a unit-free ratio, so it is shown as a percentage. RMSE and MAE are in the
# units of the label (air_temperature), so they are printed as plain numbers; the
# previous '%' format multiplied them by 100 and appended a percent sign.
# The model name in the output was also misspelled ("Regressorr").
print("Gradient Boosted Tree Regressor r2 =", '{:.2%}'.format(gb_r))
print("Gradient Boosted Tree Regressor RMSE =", '{:.2f}'.format(gb_rms))
print("Gradient Boosted Tree Regressor MAE =", '{:.2f}'.format(gb_mae))

Gradient Boosted Tree Regressorr r2 = 93.60%
Gradient Boosted Tree Regressorr RMSE = 223.42%
Gradient Boosted Tree Regressorr MAE = 183.48%


### 4th Regression Model: **Decision Tree Regressor**  

In [None]:
# -- Decision Tree Regressor --

# Single regression tree with the library's default settings
dt = DecisionTreeRegressor(
    featuresCol="features_scaled",
    labelCol='label',
)

# Fit the regression model on the training split
r_dt = dt.fit(df_train)

# Generate predictions for the held-out test split
dt_pred = r_dt.transform(df_test)

# Print 7 rows without truncating the column contents
dt_pred.show(7, truncate=False)


+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------------------+
|features_scaled                                                                                                                                                                                                                             |label|prediction        |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------------------+
|(19,[0,1,2,3,4,5,6,7,8,9,12],[1287.4167845428162,5.970844001351544,11.320917795754413,1.1466549813034426,0.18535327339703406,2.6844414965790655,5.75685742699869,0.8282601033716479,2.67385906985167,2.49933189

In [None]:
# Score the Decision Tree predictions on the test split
dt_r = evaluator_R.evaluate(dt_pred)
dt_rms = evaluator_RMSE.evaluate(dt_pred)
dt_mae = evaluator_MAE.evaluate(dt_pred)

# R2 is a unit-free ratio, so it is shown as a percentage. RMSE and MAE are in the
# units of the label (air_temperature), so they are printed as plain numbers; the
# previous '%' format multiplied them by 100 and appended a percent sign.
print("Decision Tree Regressor r2 =", '{:.2%}'.format(dt_r))
print("Decision Tree Regressor RMSE =", '{:.2f}'.format(dt_rms))
print("Decision Tree Regressor MAE =", '{:.2f}'.format(dt_mae))

Decision Tree Regressor r2 = 91.53%
Decision Tree Regressor RMSE = 256.93%
Decision Tree Regressor MAE = 209.52%


## Show ML Evaluation as Dataframe


In [None]:
# Test-set prediction DataFrames, one per fitted model, in the same order as the
# row labels of the results table built in the next cell.
models = [model_predictions, rf_pred, gbt_pred, dt_pred]

# One evaluator per metric; each compares 'prediction' with 'label'
evaluator_R = RegressionEvaluator(metricName='r2', labelCol='label', predictionCol='prediction')
evaluator_RMSE = RegressionEvaluator(metricName='rmse', labelCol='label', predictionCol='prediction')
evaluator_MAE = RegressionEvaluator(metricName='mae', labelCol='label', predictionCol='prediction')

# Score every prediction set with every metric (one list per metric, ordered like `models`)
R2 = [evaluator_R.evaluate(predictions) for predictions in models]
RMSE = [evaluator_RMSE.evaluate(predictions) for predictions in models]
MAE = [evaluator_MAE.evaluate(predictions) for predictions in models]

In [None]:
# Results table: one row per model, one column per metric
df_ev = pd.DataFrame(
    {
        'R-squared': R2,
        'Root Mean Squared Error': RMSE,
        'Mean Absolute Error': MAE,
    },
    index=[
        'Linear Regression',
        'Random Forest Regressor',
        'Gradient Boosted Trees Regressor',
        'Decision Tree Regressor',
    ],
)

In [None]:
# Display the final results table

df_ev

Unnamed: 0,R-squared,Root Mean Squared Error,Mean Absolute Error
Linear Regression,0.656077,5.178277,4.099968
Random Forest Regressor,0.916265,2.555108,2.019896
Gradient Boosted Trees Regressor,0.935978,2.234195,1.834844
Decision Tree Regressor,0.915333,2.569284,2.095159


## Model Optimization - Hyperparameter Tuning 



### Grid Search-- **1: Linear Regression** 

In [None]:
# Candidate settings: two regParam values (0.1 and 0.01), i.e. 2 parameter
# settings for CrossValidator to choose from.
paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01]).build()

# 5-fold cross-validation scored with R-squared. The seed fixes how rows are assigned
# to folds, so the selected setting is repeatable between runs.
cv_lr = CrossValidator(estimator=lr,
                       estimatorParamMaps=paramGrid,
                       evaluator=evaluator_R,
                       numFolds=5,
                       seed=42)

# Run cross-validation on the training split and keep the best-scoring setting
cv_lrModel = cv_lr.fit(df_train)

In [None]:
# R-squared of the best cross-validated Linear Regression model on the test split.
# evaluator_R computes r2, so the output is labelled R-squared rather than accuracy.

print("R-squared of Linear Regression on the testing set: ",evaluator_R.evaluate(cv_lrModel.bestModel.transform(df_test)))

Accuracy of Linear Regression on the testing set:  0.6701259930162194



### Grid Search-- **2: Random Forest Regressor** 

In [None]:
# Candidate settings: 3 maxDepth values x 3 numTrees values = 9 combinations
rfparamGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [2, 5, 10])
    .addGrid(rf.numTrees, [5, 20, 50])
    .build()
)

# 5-fold cross-validation scored with R-squared. The seed fixes how rows are assigned
# to folds, so the selected combination is repeatable between runs.
cv_rf = CrossValidator(estimator=rf,
                       estimatorParamMaps=rfparamGrid,
                       evaluator=evaluator_R,
                       numFolds=5,
                       seed=42)

# Run cross-validation on the training split and keep the best-scoring combination
cv_rfModel = cv_rf.fit(df_train)


In [None]:
# R-squared of the best cross-validated Random Forest model on the test split.
# evaluator_R computes r2, so the output is labelled R-squared rather than accuracy.

print("R-squared of Random Forest Regressor on the testing set: ", evaluator_R.evaluate(cv_rfModel.bestModel.transform(df_test)))

Accuracy of Random Forest Regressor on the testing set:  0.9473550872966533


In [None]:
# Grid search for the Gradient Boosted Tree Regressor: four candidate maxIter values
grid_gbt = ParamGridBuilder().addGrid(gbt.maxIter, [25, 30, 35, 40]).build()

# 5-fold cross-validation scored with R-squared, fitting up to two candidate models
# in parallel. The seed fixes how rows are assigned to folds, so the selected
# setting is repeatable between runs.
cv_gbt = CrossValidator(estimator=gbt,
                        estimatorParamMaps=grid_gbt,
                        evaluator=evaluator_R,
                        parallelism=2,
                        numFolds=5,
                        seed=42)

# Run cross-validation on the training split and keep the best-scoring setting
cvModel2 = cv_gbt.fit(df_train)


### Model Optimization - Hyperparameter Tuning - **Result**

###  

In [None]:
# R-squared of every model before hyperparameter tuning, from the baseline test predictions.

# ANSI escape sequences for bold black text. They replace the `black(...)` helper, which
# is never imported or defined in this notebook and raised NameError on a fresh kernel.
BOLD_BLACK = '\033[1;30m'
RESET = '\033[0m'

# (output label, baseline prediction DataFrame) for each model
baseline_predictions = [
    ("Linear Regression :", model_predictions),
    ("Random Forest Regressor : ", rf_pred),
    ("Gradient Boosted Tree Regressor : ", gbt_pred),
    ("Decision Tree Regressor : ", dt_pred),
]

# evaluator_R computes r2, so the heading says R-squared rather than accuracy
print(BOLD_BLACK + 'R-squared of Models -- Before optimization -- :\n_____________________________________' + RESET)

for label, predictions in baseline_predictions:
    print(BOLD_BLACK + label + RESET, '{:.2%}'.format(evaluator_R.evaluate(predictions)))

[1;30mAccuracy of Models -- Before optimazation -- :
_____________________________________[0m
[1;30mLinear Regression :[0m 65.61%
[1;30mRandom Forest Regressor : [0m 91.63%
[1;30mGradient Boosted Tree Regressor : [0m 93.60%
[1;30mDecision Tree Regressor : [0m 91.53%


In [None]:
# R-squared of every model on the test split after hyperparameter tuning.

# ANSI escape sequences for bold black text. They replace the `black(...)` helper, which
# is never imported or defined in this notebook and raised NameError on a fresh kernel.
BOLD_BLACK = '\033[1;30m'
RESET = '\033[0m'

# (output label, best model chosen by cross-validation) for each tuned estimator
tuned_models = [
    ("Linear Regression :", cv_lrModel.bestModel),
    ("Random Forest Regressor : ", cv_rfModel.bestModel),
    ("Gradient Boosted Tree Regressor : ", cvModel2.bestModel),
]

# evaluator_R computes r2, so the heading says R-squared rather than accuracy
print(BOLD_BLACK + 'R-squared of Models -- After optimization --:\n_____________________________________' + RESET)

for label, best_model in tuned_models:
    print(BOLD_BLACK + label + RESET, '{:.2%}'.format(evaluator_R.evaluate(best_model.transform(df_test))))

# No grid search was run for the Decision Tree, so its row previously printed no value.
# Report the baseline score and mark it as untuned.
print(BOLD_BLACK + "Decision Tree Regressor : " + RESET, '{:.2%}'.format(evaluator_R.evaluate(dt_pred)), '(not tuned)')

[1;30mAccuracy of Models -- After optimazation --:
_____________________________________[0m
[1;30mLinear Regression :[0m 67.01%
[1;30mRandom Forest Regressor : [0m 94.74%
[1;30mGradient Boosted Tree Regressor : [0m 95.10%
[1;30mDecision Tree Regressor : [0m
