In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
pd.set_option('display.max_rows', 20000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 500)
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import sys
from pyspark.sql import SparkSession


spark = SparkSession.builder.master("local").appName("Exercise3").getOrCreate()
url = "https://raw.githubusercontent.com/Xiru1024/BigDataExercise/refs/heads/main/exampleData.csv"
local_path = "/home/jovyan/BigDataExercise/exercise3/example.csv"
with open(local_path, "wb") as f:
    f.write(requests.get(url).content)
df = spark.read.csv(local_path, header=True, inferSchema=True) 

17430727

In [5]:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="dateTime" ,outputCol="indexedTime")
indexed = indexer.fit(df).transform(df)
# indexed.show(vertical=True)

In [7]:
parsed_data = indexed.selectExpr("cast(dateTime as string)dateTime",
 "cast(relative_humidity as float) relative_humidity","cast(wind_speed as float) wind_speed",
 "cast(indexedTime as double) indexedTime","cast(air_temperature as float) label","cast(msl as float) msl")

In [8]:
from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(inputCols = ['relative_humidity',
'wind_speed', 'indexedTime', 'msl'], handleInvalid="skip",outputCol =
'features')
vectorized_df = vectorAssembler.transform(parsed_data)
dataset = vectorized_df.select("features", "label")

In [9]:
(trainingData, testData) = dataset.randomSplit([0.7, 0.3])


In [10]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol="features", maxIter=30, maxDepth = 11)
# Train model. This also runs the indexer.
model = gbt.fit(trainingData)
# Make predictions.
prediction = model.transform(testData)

In [11]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
# Split the data into training and test sets (30% held out for
testing)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3])
# Create a VectorIndexer (do not pre-fit; it will be fit as part of
the pipeline)
featureIndexer = VectorIndexer(inputCol="features",
outputCol="indexedFeatures", handleInvalid="skip")
# Train a GBT model.
gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=30,
maxDepth=11)
# Chain indexer and GBT in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, gbt])
# Train model. This also fits the indexer on the training data.
model_gbt = pipeline.fit(trainingData)
# Make predictions.
preds = model_gbt.transform(testData)
# Evaluate the model by computing the R Squared on test data.
evaluator = RegressionEvaluator(labelCol="label",
predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(preds)
print("R Squared on test data = %g" % r2)


SyntaxError: unmatched ')' (2091508349.py, line 6)

In [None]:
evaluator = RegressionEvaluator(
 labelCol="label", predictionCol="prediction", metricName="mae")
mae= evaluator.evaluate(preds)
print("MAE on test data = %g" % mae)


In [None]:
evaluator = RegressionEvaluator(
 labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(preds)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

In [None]:
pred_pd = preds.toPandas()
import seaborn
from scipy.stats import *
seaborn.set(style="whitegrid", font_scale = 1.8)
fig, ax = plt.subplots()
seaborn.set(color_codes=True)
seaborn.set(rc={'figure.figsize':(20, 10)})
seaborn.regplot(x="label", y="prediction", fit_reg=False, ax=ax,data
=pred_pd,scatter_kws={"color": "b"});
seaborn.regplot(x="label", y="prediction",scatter=False, ax=ax, data
=pred_pd, line_kws={"color": "red"});

In [21]:
from pyspark.sql.functions import col
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.ml.classification import MultilayerPerceptronClassifier
# Cast relevant columns to double
df_numeric = indexed.withColumn("air_temperature",
col("air_temperature").cast("double")) \
.withColumn("precipitation",
col("precipitation").cast("double")) \
.withColumn("wind_from_direction",
col("wind_from_direction").cast("double")) \
.withColumn("relative_humidity",
col("relative_humidity").cast("double")) \
.withColumn("wind_speed",
col("wind_speed").cast("double")) \
.withColumn("msl", col("msl").cast("double"))
# Bin air_temperature into 3 classes (for classification using MLP)
discretizer = QuantileDiscretizer(numBuckets=3,
inputCol="air_temperature", outputCol="binnedLabel")
binned_df = discretizer.fit(df_numeric).transform(df_numeric)
# Rename binnedLabel to 'label' for use as the target
binned_df = binned_df.withColumnRenamed("binnedLabel", "label")
# Prepare features by assembling selected numeric columns
vectorAssembler = VectorAssembler(
inputCols=['indexedTime', 'precipitation', 'wind_from_direction',
'relative_humidity', 'wind_speed', 'msl'],
handleInvalid="skip",
outputCol='features'
)
vectorized_df = vectorAssembler.transform(binned_df)
# Select only the features and label for training
dataset_class = vectorized_df.select("features", "label")
dataset_class.show(5)
# Split the data
(trainingData, testData) = dataset_class.randomSplit([0.7, 0.3])
# Define the MLP classifier; adjust the input layer size as per your features
layers = [6, 12,3]
mlp = MultilayerPerceptronClassifier(maxIter=800, layers=layers,
blockSize=128, seed=1234)
# Train the model
model_mlp = mlp.fit(trainingData)
# Make predictions
predictions_mlp = model_mlp.transform(testData)
predictions_mlp.select("features", "label", "prediction").show(5)
# Evaluate using a suitable classification evaluator, e.g., accuracy
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy = evaluator.evaluate(predictions_mlp)
print("Test set accuracy = %g" % accuracy)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[2667.0,0.3,190.0...|  1.0|
|[2668.0,0.2,220.0...|  0.0|
|[2669.0,0.0,190.0...|  0.0|
|[2670.0,0.0,190.0...|  0.0|
|[2671.0,0.0,220.0...|  0.0|
+--------------------+-----+
only showing top 5 rows

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(6,[0,5],[235846....|  2.0|       1.0|
|(6,[0,5],[235851....|  2.0|       1.0|
|(6,[0,5],[235857....|  2.0|       1.0|
|(6,[0,5],[235858....|  2.0|       1.0|
|(6,[0,5],[235859....|  2.0|       1.0|
+--------------------+-----+----------+
only showing top 5 rows

Test set accuracy = 0.366047


200 iterations - 0.36806

In [None]:
300 iterations -  0.368668

In [None]:
500 iterations  0.370278

In [None]:
block size=64 - 0.362132

block size 64 and iteration 200 and add additional layer 24 - 0.366763


In [None]:
base - 0.364516