In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the dataset. The file is ';'-separated and '?' is read as a missing value.
df = pd.read_csv('household_power_consumption.txt',
                 sep=';',
                 na_values=['?'])

# Build the timestamp column explicitly instead of relying on
# `parse_dates={...}` + `infer_datetime_format`, both of which are deprecated
# in pandas 2.x. An explicit format is also deterministic: a row that does not
# match raises instead of being silently parsed month-first.
# NOTE(review): assumes Date is dd/mm/yyyy and Time is HH:MM:SS -- confirm
# against the raw file.
timestamps = pd.to_datetime(df['Date'] + ' ' + df['Time'],
                            format='%d/%m/%Y %H:%M:%S')

# Same resulting layout as before: 'Datetime' is the first column and the
# original 'Date' / 'Time' columns are dropped.
df = df.drop(columns=['Date', 'Time'])
df.insert(0, 'Datetime', timestamps)


In [None]:
# Preview the first rows, followed by the dtype of every column
print(df.head(), df.dtypes, sep="\n")

# Report the (rows, columns) dimensions of the dataset
print("Shape du dataset :", df.shape)


 Exercise 2

In [None]:
# Number of missing entries in each column (isna is the canonical name of isnull)
missing_counts = df.isna().sum()
print("Valeurs manquantes par colonne :\n", missing_counts)


In [None]:
# Impute missing values in the numeric columns with each column's own mean
numeric_cols = df.select_dtypes(include='number').columns
column_means = df[numeric_cols].mean()
df[numeric_cols] = df[numeric_cols].fillna(column_means)


In [None]:
# Re-count missing values per column after the imputation step
remaining_missing = df.isna().sum()
print("Valeurs manquantes après remplissage :\n", remaining_missing)


Exercise 3

In [None]:
# Use the timestamp as the index so the frame can be resampled by time.
# Guarded so the cell is idempotent: once 'Datetime' has become the index it
# is no longer a column, and an unguarded second run would raise KeyError.
if 'Datetime' in df.columns:
    df = df.set_index('Datetime')


In [None]:
# Group Global_active_power into calendar-day bins once, then aggregate twice
active_power_by_day = df['Global_active_power'].resample('D')
daily_sum = active_power_by_day.sum()
daily_mean = active_power_by_day.mean()


In [None]:
# Side-by-side panels: daily total (left) and daily average (right)
fig, (ax_sum, ax_mean) = plt.subplots(1, 2, figsize=(14, 5))

ax_sum.plot(daily_sum, color='blue')
ax_sum.set_title("Somme journalière de Global_active_power")
ax_sum.set_xlabel("Date")
ax_sum.set_ylabel("Somme")

ax_mean.plot(daily_mean, color='green')
ax_mean.set_title("Moyenne journalière de Global_active_power")
ax_mean.set_xlabel("Date")
ax_mean.set_ylabel("Moyenne")

fig.tight_layout()
plt.show()


In [None]:
# Daily mean and standard deviation of Global_intensity from one shared resampler
intensity_by_day = df['Global_intensity'].resample('D')
daily_mean_intensity = intensity_by_day.mean()
daily_std_intensity = intensity_by_day.std()


In [None]:
# Daily mean of Global_intensity with a shaded band of +/- one standard deviation
fig, ax = plt.subplots(figsize=(10, 5))

lower_band = daily_mean_intensity - daily_std_intensity
upper_band = daily_mean_intensity + daily_std_intensity

ax.plot(daily_mean_intensity, label="Moyenne", color='orange')
ax.fill_between(daily_mean_intensity.index, lower_band, upper_band,
                color='orange', alpha=0.3, label="Écart-type")
ax.set_title("Moyenne et écart-type de Global_intensity (par jour)")
ax.set_xlabel("Date")
ax.set_ylabel("Global_intensity")
ax.legend()
plt.show()


Exercise 4

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns fed to the model; all of them are scaled with MinMaxScaler.
numeric_cols = ['Global_active_power', 'Global_reactive_power', 'Voltage', 'Global_intensity',
                'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']

# Fit the scaler on the training portion only. Fitting on the full frame would
# let the min/max of the held-out rows influence the training features
# (data leakage) and make the test loss optimistic.
# This fraction must stay equal to the 0.8 ratio used by the train/test split.
train_fraction = 0.8
n_train_rows = int(len(df) * train_fraction)

scaler = MinMaxScaler()
scaler.fit(df[numeric_cols].iloc[:n_train_rows])

# Apply the training-set scaling to every row. Values in the held-out part may
# fall slightly outside [0, 1], which is expected.
df[numeric_cols] = scaler.transform(df[numeric_cols])


In [None]:
# Positional 80/20 split without shuffling: the first 80% of rows go to
# training, the remaining rows to testing
split_index = int(0.8 * len(df))
train_data, test_data = df.iloc[:split_index], df.iloc[split_index:]


In [None]:
def create_sequences(data, time_steps=10, feature_cols=None, target_index=0):
    """Build sliding-window samples for next-step prediction.

    Sample ``i`` consists of rows ``i .. i + time_steps - 1`` of the selected
    feature columns; its target is the value of one feature at row
    ``i + time_steps``.

    Args:
        data: DataFrame containing the feature columns.
        time_steps: Number of consecutive rows per input window (>= 1).
        feature_cols: Columns to use, in order. Defaults to the notebook-level
            ``numeric_cols`` list when omitted (original behavior).
        target_index: Position, within the selected columns, of the feature to
            predict. The default 0 is the first selected column.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape
        ``(len(data) - time_steps, time_steps, n_features)`` and ``y`` has shape
        ``(len(data) - time_steps,)``. When ``data`` has ``time_steps`` rows or
        fewer, both arrays are empty but keep those ranks, so downstream shape
        checks still hold.

    Raises:
        ValueError: If ``time_steps`` is smaller than 1.
    """
    if time_steps < 1:
        raise ValueError("time_steps must be >= 1")

    cols = numeric_cols if feature_cols is None else list(feature_cols)
    data_values = data[cols].values
    n_features = data_values.shape[1]
    n_samples = len(data_values) - time_steps

    if n_samples <= 0:
        # Not enough rows for a single (window, target) pair.
        return (np.empty((0, time_steps, n_features), dtype=data_values.dtype),
                np.empty((0,), dtype=data_values.dtype))

    # Vectorized windows instead of a Python loop over every row.
    # sliding_window_view yields shape (len - time_steps + 1, n_features, time_steps);
    # the last window has no following row to predict, so it is dropped.
    windows = np.lib.stride_tricks.sliding_window_view(data_values, time_steps, axis=0)
    # Reorder to (samples, time_steps, features) and materialize a writable copy.
    X = np.ascontiguousarray(windows[:n_samples].transpose(0, 2, 1))
    # Target for window i is the row right after it.
    y = data_values[time_steps:, target_index].copy()
    return X, y

# Number of consecutive rows in each input window
time_steps = 10

# Build the (inputs, targets) pairs for each split, then unpack them
train_sequences = create_sequences(train_data, time_steps)
test_sequences = create_sequences(test_data, time_steps)
X_train, y_train = train_sequences
X_test, y_test = test_sequences

print("X_train shape :", X_train.shape)
print("y_train shape :", y_train.shape)


 Exercise 5

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout


In [None]:
# One input feature per scaled column
num_features = len(numeric_cols)

# Two stacked LSTM layers, each followed by dropout, then a single linear
# output unit that predicts one continuous value (Global_active_power)
model = Sequential([
    LSTM(50, activation='tanh', return_sequences=True,
         input_shape=(time_steps, num_features)),
    Dropout(0.2),
    LSTM(50, activation='tanh'),
    Dropout(0.2),
    Dense(1),
])


In [None]:
# Regression setup: mean squared error objective, Adam optimizer
model.compile(loss='mean_squared_error', optimizer='adam')

# Print the layer-by-layer architecture and parameter counts
model.summary()


 Exercise 6

In [None]:
# Training settings, gathered in one place; 10% of the training samples are
# held out for validation
fit_options = dict(epochs=20, batch_size=32, validation_split=0.1, verbose=1)
history = model.fit(X_train, y_train, **fit_options)


In [None]:
# Loss on the held-out test sequences (MSE, per the compile step)
test_loss = model.evaluate(x=X_test, y=y_test)
print("Test Loss:", test_loss)


In [None]:
# Training vs. validation loss per epoch.
# The figure previously reserved a 1x2 subplot grid but drew only the first
# slot, leaving half of the canvas blank. A single Axes is used instead, sized
# to roughly what the lone panel occupied.
fig, ax = plt.subplots(figsize=(6, 5))

ax.plot(history.history['loss'], label='Entraînement')
ax.plot(history.history['val_loss'], label='Validation')
ax.set_title("Loss vs. Époques")
ax.set_xlabel("Époques")
ax.set_ylabel("Loss")
ax.legend()

fig.tight_layout()
plt.show()
