In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
file_path = 'F:\Third_project\household_power_consumption.txt'
df = pd.read_csv(file_path, sep=';', parse_dates={'datetime': ['Date', 'Time']}, na_values='?', low_memory=False)

  df = pd.read_csv(file_path, sep=';', parse_dates={'datetime': ['Date', 'Time']}, na_values='?', low_memory=False)
  df = pd.read_csv(file_path, sep=';', parse_dates={'datetime': ['Date', 'Time']}, na_values='?', low_memory=False)


In [4]:
from sklearn.preprocessing import StandardScaler

# Create engineered features if not already present
df['Is_Peak_Hour'] = df['datetime'].dt.hour.between(18, 22).astype(int)
df['Daily_avg_power'] = df['Global_active_power'].rolling(window=1440, min_periods=1).mean()
df['Rolling_avg_power_60min'] = df['Global_active_power'].rolling(window=60, min_periods=1).mean()

# List of columns to scale
to_scale = [
    'Global_active_power', 'Global_reactive_power', 'Voltage', 'Global_intensity',
    'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3',
    'Daily_avg_power', 'Rolling_avg_power_60min'
]

scaler = StandardScaler()
scaled = scaler.fit_transform(df[to_scale])
scaled_df = pd.DataFrame(scaled, columns=[f"{col}_scaled" for col in to_scale])

# Concatenate scaled features and Is_Peak_Hour
df = pd.concat([df.reset_index(drop=True), scaled_df], axis=1)

In [5]:
# Use only scaled features
features = [
    'Global_reactive_power_scaled', 'Voltage_scaled', 'Global_intensity_scaled',
    'Sub_metering_1_scaled', 'Sub_metering_2_scaled', 'Sub_metering_3_scaled',
    'Daily_avg_power_scaled', 'Rolling_avg_power_60min_scaled', 'Is_Peak_Hour'
]

X = df[features]
y = df['Global_active_power_scaled']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Drop rows with any NaN values in features or target
df = df.dropna(subset=features + ['Global_active_power_scaled'])

In [None]:
# # Fill NaNs with the mean of each column
# df[features + ['Global_active_power_scaled']] = df[features + ['Global_active_power_scaled']].fillna(
#     df[features + ['Global_active_power_scaled']].mean()
# )

In [11]:
print(df[features + ['Global_active_power_scaled']].isnull().sum())

Global_reactive_power_scaled      0
Voltage_scaled                    0
Global_intensity_scaled           0
Sub_metering_1_scaled             0
Sub_metering_2_scaled             0
Sub_metering_3_scaled             0
Daily_avg_power_scaled            0
Rolling_avg_power_60min_scaled    0
Is_Peak_Hour                      0
Global_active_power_scaled        0
dtype: int64


In [16]:
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=4, random_state=42),
    "Neural Network (MLP)": MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=200, random_state=42)
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    results[name] = {
        "RMSE": np.sqrt(mean_squared_error(y_test, preds)),
        "MAE": mean_absolute_error(y_test, preds),
        "R2": r2_score(y_test, preds)
    }

results_df = pd.DataFrame(results).T
results_df.sort_values("RMSE", inplace=True)
results_df

Unnamed: 0,RMSE,MAE,R2
Neural Network (MLP),0.028207,0.017161,0.999209
Gradient Boosting,0.02966,0.018274,0.999125
Random Forest,0.032555,0.01921,0.998946
Linear Regression,0.03779,0.024221,0.99858


In [14]:
# 1. Define your features
features = [
    'Global_reactive_power_scaled', 'Voltage_scaled', 'Global_intensity_scaled',
    'Sub_metering_1_scaled', 'Sub_metering_2_scaled', 'Sub_metering_3_scaled',
    'Daily_avg_power_scaled', 'Rolling_avg_power_60min_scaled', 'Is_Peak_Hour'
]

# 2. Drop rows with any NaN values in features or target
df = df.dropna(subset=features + ['Global_active_power_scaled'])

# 3. Now create X and y
X = df[features]
y = df['Global_active_power_scaled']

# 4. Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
print(X.isnull().sum())
print(y.isnull().sum())

Global_reactive_power_scaled      0
Voltage_scaled                    0
Global_intensity_scaled           0
Sub_metering_1_scaled             0
Sub_metering_2_scaled             0
Sub_metering_3_scaled             0
Daily_avg_power_scaled            0
Rolling_avg_power_60min_scaled    0
Is_Peak_Hour                      0
dtype: int64
0


In [7]:
import matplotlib.pyplot as plt

In [8]:
import seaborn as sns

In [10]:
plt.figure(figsize=(10, 6))
sns.barplot(x=results_df.index, y=results_df["RMSE"], hue=results_df.index, palette="Blues_r", legend=False)
plt.title("Model Comparison - RMSE")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
results_df.sort_values("RMSE").style.highlight_min(color='lightgreen', subset=['RMSE', 'MAE']) \
                                     .highlight_max(color='lightblue', subset=['R2'])

NameError: name 'results_df' is not defined

<Figure size 1000x600 with 0 Axes>

In [18]:
pip install jinja2

Collecting jinja2Note: you may need to restart the kernel to use updated packages.

  Downloading jinja2-3.1.6-py3-none-any.whl (134 kB)
     ---------------------------------------- 0.0/134.9 kB ? eta -:--:--
     ---------------------------------------- 0.0/134.9 kB ? eta -:--:--
     ---------------------------------------- 0.0/134.9 kB ? eta -:--:--
     ---------------------------------------- 0.0/134.9 kB ? eta -:--:--
     ---------------------------------------- 0.0/134.9 kB ? eta -:--:--
     ---------------------------------------- 0.0/134.9 kB ? eta -:--:--
     ---------------------------------------- 0.0/134.9 kB ? eta -:--:--
     ---------------------------------------- 0.0/134.9 kB ? eta -:--:--
     ---------------------------------------- 0.0/134.9 kB ? eta -:--:--
     ---------------------------------------- 0.0/134.9 kB ? eta -:--:--
     ---------------------------------------- 0.0/134.9 kB ? eta -:--:--
     ---------------------------------------- 0.0/134.9 kB ?

ERROR: Exception:
Traceback (most recent call last):
  File "f:\Third_project\env\lib\site-packages\pip\_vendor\urllib3\response.py", line 438, in _error_catcher
    yield
  File "f:\Third_project\env\lib\site-packages\pip\_vendor\urllib3\response.py", line 561, in read
    data = self._fp_read(amt) if not fp_closed else b""
  File "f:\Third_project\env\lib\site-packages\pip\_vendor\urllib3\response.py", line 527, in _fp_read
    return self._fp.read(amt) if amt is not None else self._fp.read()
  File "f:\Third_project\env\lib\site-packages\pip\_vendor\cachecontrol\filewrapper.py", line 90, in read
    data = self.__fp.read(amt)
  File "C:\Users\ADMIN\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 466, in read
    s = self.fp.read(amt)
  File "C:\Users\ADMIN\AppData\Local\Programs\Python\Python310\lib\socket.py", line 705, in readinto
    return self._sock.recv_into(b)
  File "C:\Users\ADMIN\AppData\Local\Programs\Python\Python310\lib\ssl.py", line 1274, in recv_into

In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Gradient Boosting
gb_params = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5]
}

gb_grid = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    gb_params,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

gb_grid.fit(X_train, y_train)
print("🔍 Best Gradient Boosting Params:", gb_grid.best_params_)
print("✅ Best RMSE (Gradient Boosting):", -gb_grid.best_score_)

In [None]:
mlp_params = {
    'hidden_layer_sizes': [(64,), (64, 32)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],
    'max_iter': [300]
}

mlp_grid = GridSearchCV(
    MLPRegressor(random_state=42),
    mlp_params,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

mlp_grid.fit(X_train, y_train)
print("🔍 Best MLP Params:", mlp_grid.best_params_)
print("✅ Best RMSE (MLP):", -mlp_grid.best_score_)