# 3. Model training

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
df=pd.read_csv('train_test_dataset_TropicalStorms.csv')

In [50]:
from sklearn.preprocessing import StandardScaler
# Séparer les features (X) et la cible (y)
X = df.drop(columns=["TD9636_STAGE"])  # Supprime la colonne cible
y = df["TD9636_STAGE"]  # Colonne cible

# Diviser les données en train (80%) et test (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalisation des features (fortement recommandé pour la régression logistique)
scaler = StandardScaler()
X_train_scaler = scaler.fit_transform(X_train)
X_test_scaler = scaler.transform(X_test)

In [51]:
from sklearn.linear_model import LogisticRegression

In [52]:
# here we initialize the model
lr_model = LogisticRegression(random_state=42,max_iter=1000)

In [53]:
# here we train the model on the training data
lr_model.fit(X=X_train_scaler, y=y_train)

In [54]:
y_test_predicted = lr_model.predict(X_test_scaler)

In [55]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Matrice de confusion
conf_mat=pd.DataFrame(
    columns=["y_test_0","y_test_1","y_test_2","y_test_3","y_test_4","y_test_5","y_test_6"],index=["y_pred_0","y_pred_1","y_pred_2","y_pred_3","y_pred_4","y_pred_5","y_pred_6"]
)
conf_mat.loc[:,:]=confusion_matrix(y_true=y_test,y_pred= y_test_predicted)
print("Matrice de confusion:")
print(conf_mat)

# Rapport de classification
report_lr=classification_report(y_test, y_test_predicted)

Matrice de confusion:
         y_test_0 y_test_1 y_test_2 y_test_3 y_test_4 y_test_5 y_test_6
y_pred_0      598       23        6        0        2        2        0
y_pred_1       56     2414      696        0       17        5        2
y_pred_2        8      703     2580        0      130       11        0
y_pred_3        0        0       21        0      127        0        0
y_pred_4        0        6      245        0     1873        0        0
y_pred_5        7        4        8        0        0       43        4
y_pred_6       27        5       12        0        0        3        5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [56]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()
dt_model.fit(X=X_train,y=y_train)

In [57]:
y_test_predicted_dt = dt_model.predict(X_test)

In [58]:
report_dt=classification_report(y_pred=y_test_predicted_dt,y_true=y_test)

In [59]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier()
rf_model.fit(X=X_train,y=y_train)

In [60]:
y_test_predicted_rf = rf_model.predict(X_test)

In [61]:
report_rf = classification_report(y_pred=y_test_predicted_rf,y_true=y_test)

In [62]:
from sklearn.neural_network import MLPClassifier
# try a new classifier: Multi-Layer Perceptron classifier
mlp_model = MLPClassifier(hidden_layer_sizes=(20,10),max_iter=1000)
mlp_model.fit(X=X_train,y=y_train)

In [63]:
y_test_predicted_mlp = mlp_model.predict(X_test)

In [64]:
report_mlp = classification_report(y_pred=y_test_predicted_mlp,y_true=y_test)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Comparing results:

In [65]:
print("Report of logistic regression")
print(report_lr)

Report of logistic regression
              precision    recall  f1-score   support

         0.0       0.86      0.95      0.90       631
         1.0       0.77      0.76      0.76      3190
         2.0       0.72      0.75      0.74      3432
         3.0       0.00      0.00      0.00       148
         4.0       0.87      0.88      0.88      2124
         5.0       0.67      0.65      0.66        66
         6.0       0.45      0.10      0.16        52

    accuracy                           0.78      9643
   macro avg       0.62      0.58      0.59      9643
weighted avg       0.77      0.78      0.77      9643



In [66]:
print("Report of MLP model")
print(report_mlp)

Report of MLP model
              precision    recall  f1-score   support

         0.0       0.99      0.70      0.82       631
         1.0       0.73      0.77      0.75      3190
         2.0       0.71      0.76      0.73      3432
         3.0       0.00      0.00      0.00       148
         4.0       0.89      0.85      0.87      2124
         5.0       0.38      0.39      0.39        66
         6.0       0.63      0.23      0.34        52

    accuracy                           0.76      9643
   macro avg       0.62      0.53      0.56      9643
weighted avg       0.76      0.76      0.76      9643



In [67]:
print("Report of Decision Tree classifier model")
print(report_dt)

Report of Decision Tree classifier model
              precision    recall  f1-score   support

         0.0       0.97      0.95      0.96       631
         1.0       0.89      0.91      0.90      3190
         2.0       0.88      0.87      0.88      3432
         3.0       0.61      0.65      0.63       148
         4.0       0.92      0.92      0.92      2124
         5.0       0.95      0.95      0.95        66
         6.0       0.77      0.71      0.74        52

    accuracy                           0.90      9643
   macro avg       0.86      0.85      0.85      9643
weighted avg       0.90      0.90      0.90      9643



In [68]:
print("Report of Random Forest model")
print(report_rf)

Report of Random Forest model
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98       631
         1.0       0.95      0.94      0.94      3190
         2.0       0.91      0.94      0.92      3432
         3.0       0.86      0.52      0.65       148
         4.0       0.95      0.94      0.95      2124
         5.0       0.91      0.92      0.92        66
         6.0       1.00      0.67      0.80        52

    accuracy                           0.94      9643
   macro avg       0.93      0.85      0.88      9643
weighted avg       0.94      0.94      0.93      9643



 We didn't noticed particular outliers to remove.
## We choose the Random Forest Model as our final Model to predict `TD9636_STAGE`

In [69]:
#Saving our model with pickle
import pickle

with open("random_forest_model.pkl", "wb") as file:
    pickle.dump(rf_model, file)
