In [14]:
# -*- coding: utf-8 -*-
"""
script_lstm_inondation.py

Pipeline complet : prétraitement, entraînement, évaluation et sauvegarde
d’un modèle LSTM pour prédiction binaire d’inondation à horizon 3 jours.
"""
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, classification_report
)
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# 1. Chargement des données
df = pd.read_csv('./dataset_prepared.csv', parse_dates=['date'], dayfirst=True)

# 2. Sélection des colonnes pertinentes
features = [
    'latitude_centroid','longitude_centroid','tempmax','tempmin','temp',
    'feelslikemax','feelslikemin','feelslike','dew','humidity','precipprob',
    'precipcover','windspeed','winddir','pressure','cloudcover','visibility',
    'elevation','soil_type'
]
target = 'label'
df = df[['chemin_directory','date'] + features + [target]]



In [15]:
# 3. Split régions train / test (80% régions pour le train)
regions = df['chemin_directory'].unique()
n_train = int(0.8 * len(regions))
train_regions = set(regions[:n_train])
df_train = df[df['chemin_directory'].isin(train_regions)].copy()
df_test  = df[~df['chemin_directory'].isin(train_regions)].copy()

# 3bis. Sauvegarde du dataset de test au format CSV (même structure que le dataset original)
# Cette sauvegarde doit être faite AVANT le prétraitement, donc juste après le split train/test
print("\nSauvegarde du dataset de test original (avant tout prétraitement)...")

# Générer un nom de fichier incluant la date
from datetime import datetime
date_str = datetime.now().strftime("%Y%m%d")
test_data_filename = f'test_dataset.csv'

# Créer une copie du dataframe avant prétraitement pour s'assurer qu'il conserve la structure originale
# Notamment avec la colonne 'soil_type' non encodée en one-hot
df_test_original = df_test.copy()

# Sauvegarder le dataframe de test tel quel, avec la même structure que le dataset d'origine
df_test_original.to_csv(test_data_filename, index=False)

print(f"Dataset de test sauvegardé dans {test_data_filename}")
print(f"Nombre de lignes: {len(df_test_original)}")
print(f"Nombre de régions: {df_test_original['chemin_directory'].nunique()}")
print(f"Distribution des classes: {df_test_original[target].value_counts().to_dict()}")
print(f"Colonnes: {df_test_original.columns.tolist()}")

# 4. Imputation des NaN par la médiane calculée sur le train
for col in features:
    if df_train[col].isna().any():
        med = df_train[col].median()
        df_train[col].fillna(med, inplace=True)
        df_test[col].fillna(med, inplace=True)

# 5. Encodage one-hot de soil_type
df_train = pd.get_dummies(df_train, columns=['soil_type'], prefix='soil')
df_test  = pd.get_dummies(df_test,  columns=['soil_type'], prefix='soil')
# Harmonisation des colonnes
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)

# 6. Normalisation des variables continues
#    On exclut chemin_directory, date, target et les colonnes soil_*
exclude_cols = ['chemin_directory','date',target]
numeric_cols = [
    c for c in df_train.columns
    if c not in exclude_cols and not c.startswith('soil_')
]
scaler = StandardScaler().fit(df_train[numeric_cols])
df_train[numeric_cols] = scaler.transform(df_train[numeric_cols])
df_test[numeric_cols]  = scaler.transform(df_test[numeric_cols])

# 7. Tri par région et date
df_train.sort_values(['chemin_directory','date'], inplace=True)
df_test.sort_values(['chemin_directory','date'], inplace=True)




Sauvegarde du dataset de test original (avant tout prétraitement)...
Dataset de test sauvegardé dans test_dataset.csv
Nombre de lignes: 2955
Nombre de régions: 68
Distribution des classes: {0: 2160, 1: 795}
Colonnes: ['chemin_directory', 'date', 'latitude_centroid', 'longitude_centroid', 'tempmax', 'tempmin', 'temp', 'feelslikemax', 'feelslikemin', 'feelslike', 'dew', 'humidity', 'precipprob', 'precipcover', 'windspeed', 'winddir', 'pressure', 'cloudcover', 'visibility', 'elevation', 'soil_type', 'label']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[col].fillna(med, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[col].fillna(med, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves

In [16]:
# 8. Création des séquences temporelles
L = 7  # longueur de la séquence d’entrée (jours)
H = 3  # horizon de prédiction (jours futurs)

def create_sequences(df, L, H, feature_cols, target_col):
    Xs, ys = [], []
    for region, grp in df.groupby('chemin_directory'):
        data = grp.reset_index(drop=True)
        for i in range(len(data) - L - H + 1):
            Xs.append(data.iloc[i : i+L][feature_cols].values)
            ys.append(data.iloc[i+L-1 + H][target_col])
    return np.array(Xs), np.array(ys)

feature_cols = [c for c in df_train.columns if c not in exclude_cols]
X_train, y_train = create_sequences(df_train, L, H, feature_cols, target)
X_test,  y_test  = create_sequences(df_test,  L, H, feature_cols, target)

print("Avant cast → dtype X_train:", X_train.dtype, 
      "  dtype y_train:", y_train.dtype)

# 9. Conversion en float32
X_train = np.array(X_train, dtype=np.float32)
y_train = np.array(y_train, dtype=np.float32)
X_test  = np.array(X_test,  dtype=np.float32)
y_test  = np.array(y_test,  dtype=np.float32)

print("Après cast → dtype X_train:", X_train.dtype, 
      "  dtype y_train:", y_train.dtype)
print("Shapes: X_train", X_train.shape, "y_train", y_train.shape)


Avant cast → dtype X_train: object   dtype y_train: int64
Après cast → dtype X_train: float32   dtype y_train: float32
Shapes: X_train (9786, 7, 33) y_train (9786,)


In [17]:

# 10. Définition du modèle LSTM
n_features = X_train.shape[2]
model = Sequential([
    LSTM(64, input_shape=(L, n_features), return_sequences=False),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)
model.summary()


  super().__init__(**kwargs)


In [18]:
# 11. Callbacks pour entraînement
callbacks = [
    EarlyStopping(patience=5, restore_best_weights=True),
    ModelCheckpoint('best_model.keras', save_best_only=True)
]

# 12. Entraînement
history = model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=50,
    batch_size=32,
    callbacks=callbacks,
    verbose=2
)


Epoch 1/50
276/276 - 3s - 10ms/step - accuracy: 0.9077 - loss: 0.2136 - val_accuracy: 0.9265 - val_loss: 0.2037
Epoch 2/50
276/276 - 3s - 10ms/step - accuracy: 0.9077 - loss: 0.2136 - val_accuracy: 0.9265 - val_loss: 0.2037
Epoch 2/50
276/276 - 1s - 3ms/step - accuracy: 0.9668 - loss: 0.0861 - val_accuracy: 0.9265 - val_loss: 0.1637
Epoch 3/50
276/276 - 1s - 3ms/step - accuracy: 0.9668 - loss: 0.0861 - val_accuracy: 0.9265 - val_loss: 0.1637
Epoch 3/50
276/276 - 1s - 3ms/step - accuracy: 0.9754 - loss: 0.0612 - val_accuracy: 0.9265 - val_loss: 0.1755
Epoch 4/50
276/276 - 1s - 3ms/step - accuracy: 0.9754 - loss: 0.0612 - val_accuracy: 0.9265 - val_loss: 0.1755
Epoch 4/50
276/276 - 1s - 3ms/step - accuracy: 0.9777 - loss: 0.0523 - val_accuracy: 0.9265 - val_loss: 0.1456
Epoch 5/50
276/276 - 1s - 3ms/step - accuracy: 0.9777 - loss: 0.0523 - val_accuracy: 0.9265 - val_loss: 0.1456
Epoch 5/50
276/276 - 1s - 3ms/step - accuracy: 0.9788 - loss: 0.0487 - val_accuracy: 0.9265 - val_loss: 0.1368

In [19]:

# 13. Évaluation sur le jeu de test
y_prob = model.predict(X_test).ravel()
y_pred = (y_prob >= 0.5).astype(int)

acc  = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec  = recall_score(y_test, y_pred)
f1   = f1_score(y_test, y_pred)
auc  = roc_auc_score(y_test, y_prob)

print(f"\n--- Évaluation finale sur TEST ---")
print(f"Accuracy : {acc:.3f}")
print(f"Précision: {prec:.3f}")
print(f"Rappel   : {rec:.3f}")
print(f"F1-score : {f1:.3f}")
print(f"AUC      : {auc:.3f}\n")
print("Détail par classe :\n", classification_report(y_test, y_pred, digits=3))

# 14. Sauvegarde finale du modèle (optionnel si déjà fait)
model.save('modele_inondation_complete.keras')


[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step

--- Évaluation finale sur TEST ---
Accuracy : 0.966
Précision: 0.988
Rappel   : 0.911
F1-score : 0.948
AUC      : 0.993

Détail par classe :
               precision    recall  f1-score   support

         0.0      0.956     0.994     0.975      1556
         1.0      0.988     0.911     0.948       795

    accuracy                          0.966      2351
   macro avg      0.972     0.952     0.961      2351
weighted avg      0.967     0.966     0.966      2351


--- Évaluation finale sur TEST ---
Accuracy : 0.966
Précision: 0.988
Rappel   : 0.911
F1-score : 0.948
AUC      : 0.993

Détail par classe :
               precision    recall  f1-score   support

         0.0      0.956     0.994     0.975      1556
         1.0      0.988     0.911     0.948       795

    accuracy                          0.966      2351
   macro avg      0.972

In [23]:

# 15. Exemple d’inférence sur une séquence de test dont une transition de label existe
example_index = 0  # Index de l'exemple à tester
example_sequence = X_test[example_index]
print("\n--- Exemple d'inférence sur une séquence de test ---")
print("Séquence d'entrée (premiers 5 jours) :")
print(example_sequence[:5])
print("Label réel :", y_test[example_index])
y_prob_example = model.predict(example_sequence[np.newaxis, ...]).ravel()
print("Probabilité prédite :", y_prob_example[0])
print("Label prédit :", int(y_prob_example[0] >= 0.5))


--- Exemple d'inférence sur une séquence de test ---
Séquence d'entrée (premiers 5 jours) :
[[-0.61024135 -0.39653173 -3.9273639  -3.378598    0.3116215  -3.5822558
  -3.3017292   0.17289694  0.24271904  0.21280074 -0.5382856  -0.48560694
   0.09629185  0.07175133  0.2233267   0.01728057 -0.33851472  0.73481107
   0.          0.          0.          0.          0.          0.
   0.          0.          1.          0.          0.          0.
   0.          0.          0.        ]
 [-0.61024135 -0.39653173 -3.9273639  -3.378598    0.3116215  -3.5822558
  -3.3017292   0.17289694  0.24271904  0.21280074 -0.5382856  -0.48560694
   0.09629185  0.07175133  0.2233267   0.01728057 -0.33851472  0.73481107
   0.          0.          0.          0.          0.          0.
   0.          0.          1.          0.          0.          0.
   0.          0.          0.        ]
 [-0.61024135 -0.39653173 -3.9273639  -3.378598    0.3116215  -3.5822558
  -3.3017292   0.17289694  0.24271904  0.21280074 