# PySpark ML

In [15]:
from pyspark.sql import SparkSession

# Reuse an already-running session if there is one; otherwise start a new
# session registered under the application name 'SparkML'.
session_builder = SparkSession.builder.appName('SparkML')
spark = session_builder.getOrCreate()

23/12/12 14:13:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [16]:
# Load the weather CSV export: the first row supplies the column names and
# the column types are inferred from the data instead of defaulting to string.
csv_reader = spark.read.options(header=True, inferSchema=True)
training = csv_reader.csv('open-meteo-53.44N20.25E193m.csv')
training.show()

+-------------------+----------------+-------+-----------+------------+------------+------------+-------------------+-------------------+
|               time|temperature_2m_c|rain_mm|snowfall_cm|snow_depth_m|weather_code|visibility_m|wind_speed_10m_km_h|wind_speed_80m_km_h|
+-------------------+----------------+-------+-----------+------------+------------+------------+-------------------+-------------------+
|2023-12-12 00:00:00|             1.8|    0.0|        0.0|        0.06|          45|        80.0|                4.2|               10.8|
|2023-12-12 01:00:00|             1.5|    0.0|        0.0|        0.06|           3|       100.0|                3.6|                9.2|
|2023-12-12 02:00:00|             1.1|    0.0|        0.0|        0.06|           3|       100.0|                2.9|                8.4|
|2023-12-12 03:00:00|             0.9|    0.0|        0.0|        0.06|           3|       100.0|                3.4|                8.6|
|2023-12-12 04:00:00|             

In [17]:
# Print each column's name and inferred type to confirm that the
# `inferSchema=True` load produced numeric columns rather than strings.
training.printSchema()

root
 |-- time: timestamp (nullable = true)
 |-- temperature_2m_c: double (nullable = true)
 |-- rain_mm: double (nullable = true)
 |-- snowfall_cm: double (nullable = true)
 |-- snow_depth_m: double (nullable = true)
 |-- weather_code: integer (nullable = true)
 |-- visibility_m: double (nullable = true)
 |-- wind_speed_10m_km_h: double (nullable = true)
 |-- wind_speed_80m_km_h: double (nullable = true)


In [18]:
# List the column names of the loaded frame; the feature and label columns
# used further down are picked from this list.
training.columns

['time',
 'temperature_2m_c',
 'rain_mm',
 'snowfall_cm',
 'snow_depth_m',
 'weather_code',
 'visibility_m',
 'wind_speed_10m_km_h',
 'wind_speed_80m_km_h']

In [19]:
from pyspark.ml.feature import VectorAssembler

# Predictor columns that are packed into a single vector column, which is the
# input format the regression stage reads its features from.
feature_columns = ['rain_mm', 'snowfall_cm', 'wind_speed_10m_km_h']
featureassembler = VectorAssembler(
    inputCols=feature_columns, outputCol='Independent Features'
)

In [20]:
# Append the assembled feature-vector column to the loaded frame; all of the
# original columns are kept alongside it.
output = featureassembler.transform(dataset=training)
output.show()

+-------------------+----------------+-------+-----------+------------+------------+------------+-------------------+-------------------+--------------------+
|               time|temperature_2m_c|rain_mm|snowfall_cm|snow_depth_m|weather_code|visibility_m|wind_speed_10m_km_h|wind_speed_80m_km_h|Independent Features|
+-------------------+----------------+-------+-----------+------------+------------+------------+-------------------+-------------------+--------------------+
|2023-12-12 00:00:00|             1.8|    0.0|        0.0|        0.06|          45|        80.0|                4.2|               10.8|       [0.0,0.0,4.2]|
|2023-12-12 01:00:00|             1.5|    0.0|        0.0|        0.06|           3|       100.0|                3.6|                9.2|       [0.0,0.0,3.6]|
|2023-12-12 02:00:00|             1.1|    0.0|        0.0|        0.06|           3|       100.0|                2.9|                8.4|       [0.0,0.0,2.9]|
|2023-12-12 03:00:00|             0.9|    0.0|

In [21]:
# Keep only what the model needs: the assembled feature vector and the
# temperature column used as the regression label.
model_columns = ['Independent Features', 'temperature_2m_c']
finalized_data = output.select(*model_columns)
finalized_data.show()

+--------------------+----------------+
|Independent Features|temperature_2m_c|
+--------------------+----------------+
|       [0.0,0.0,4.2]|             1.8|
|       [0.0,0.0,3.6]|             1.5|
|       [0.0,0.0,2.9]|             1.1|
|       [0.0,0.0,3.4]|             0.9|
|       [0.0,0.0,3.6]|             0.9|
|       [0.0,0.0,3.6]|             0.7|
|       [0.0,0.0,4.7]|             0.6|
|       [0.0,0.0,5.4]|             0.6|
|       [0.0,0.0,3.3]|             0.6|
|       [0.0,0.0,5.5]|             0.7|
|       [0.0,0.0,7.4]|             1.0|
|       [0.0,0.0,8.6]|             1.1|
|       [0.0,0.0,8.4]|             1.1|
|       [0.0,0.0,9.0]|             1.1|
|       [0.1,0.0,7.9]|             1.2|
|       [0.0,0.0,7.8]|             1.2|
|       [0.0,0.0,7.7]|             1.2|
|       [0.0,0.0,6.6]|             1.3|
|       [0.0,0.0,6.9]|             1.4|
|       [0.0,0.0,6.6]|             1.4|
+--------------------+----------------+


In [22]:
from pyspark.ml.regression import LinearRegression

# Fixed seed for the 70/30 split. Without it every run draws a different
# train/test partition, so the fitted coefficients and the error metrics
# change from run to run and cannot be reproduced on a fresh kernel.
SPLIT_SEED = 42
train_data, test_data = finalized_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# The unfitted estimator gets its own name so that `regressor` only ever
# refers to the fitted model, which is what the following cells read
# (`coefficients`, `intercept`, `evaluate`).
lr_estimator = LinearRegression(
    featuresCol='Independent Features',
    labelCol='temperature_2m_c',
)
regressor = lr_estimator.fit(train_data)

23/12/12 14:13:14 WARN Instrumentation: [0c6d5b26] regParam is zero, which might cause numerical instability and overfitting.


In [23]:
# Show the coefficients of the fitted linear model (one value per entry of
# the 'Independent Features' vector).
regressor.coefficients

DenseVector([1.0074, -11.1562, 0.1291])

In [24]:
# Show the intercept term of the fitted linear model.
regressor.intercept

0.1042591780879742

In [25]:
# Score the fitted model on the held-out split, then print up to 100 rows of
# feature vector, actual temperature and prediction without truncating cells.
pred_results = regressor.evaluate(test_data)
pred_results.predictions.show(n=100, truncate=False)

+--------------------+----------------+---------------------+
|Independent Features|temperature_2m_c|prediction           |
+--------------------+----------------+---------------------+
|[0.0,0.0,2.7]       |0.7             |0.4529160423408001   |
|[0.0,0.0,3.4]       |0.9             |0.5433085627026438   |
|[0.0,0.0,3.6]       |1.5             |0.5691349970917421   |
|[0.0,0.0,4.0]       |0.6             |0.6207878658699385   |
|[0.0,0.0,4.7]       |0.6             |0.7111803862317823   |
|[0.0,0.0,4.7]       |1.0             |0.7111803862317823   |
|[0.0,0.0,5.2]       |1.1             |0.7757464722045279   |
|[0.0,0.0,5.8]       |0.2             |0.8532257753718224   |
|[0.0,0.0,6.6]       |1.4             |0.9565315129282153   |
|[0.0,0.0,7.1]       |-2.0            |1.0210975989009607   |
|[0.0,0.0,7.7]       |-3.2            |1.0985769020682554   |
|[0.0,0.0,7.7]       |1.2             |1.0985769020682554   |
|[0.0,0.0,7.8]       |1.2             |1.1114901192628046   |
|[0.0,0.

In [26]:
# Display (mean absolute error, mean squared error) from the evaluation on
# the held-out test split.
pred_results.meanAbsoluteError, pred_results.meanSquaredError

(1.8088343481781473, 5.132008237908722)

In [27]:
# Stop the Spark session created at the top of the notebook.
spark.stop()