In [1]:
# 2_model_training.ipynb
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [2]:
# Initialiser Spark
spark = SparkSession.builder.appName("Rain Prediction Training").getOrCreate()

In [3]:
# Charger les données nettoyées
data = spark.read.csv('../data/cleaned_weatherAUS.csv', header=True, inferSchema=True)

In [4]:
# Sélectionner les colonnes nécessaires
feature_columns = ["MinTemp", "MaxTemp", "Rainfall", "WindGustSpeed", "Humidity3pm", "Pressure3pm", "Cloud9am", "Temp3pm"]
data = data.select(*feature_columns, "RainTomorrow")

In [5]:
# Convertir la cible en index numérique
indexer = StringIndexer(inputCol="RainTomorrow", outputCol="label")

In [6]:
# Assembler les features
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [7]:
# Définir le modèle
lr = LogisticRegression(featuresCol="features", labelCol="label")

In [8]:
# Créer le pipeline
pipeline = Pipeline(stages=[indexer, assembler, lr])

In [9]:
# Entraîner le modèle
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)
model = pipeline.fit(train_data)

In [10]:
# Évaluer le modèle
predictions = model.transform(test_data)
predictions.select("label", "prediction").show(10)

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
+-----+----------+
only showing top 10 rows



In [11]:
# Sauvegarder le modèle
model.save("models/rain_prediction_model")
print("Modèle entraîné et sauvegardé.")

Modèle entraîné et sauvegardé.
