# In [None]:
# --- COMPLETE JUPYTER NOTEBOOK CODE ---
# Save this as: GENML801_Energy_Consumption_Prediction.ipynb

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Machine Learning libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Notebook feedback: reaching this line means every import above succeeded.
print('All libraries imported successfully!')

# Create realistic energy consumption dataset
def create_realistic_energy_data(start_date=datetime(2020, 1, 1),
                                 end_date=datetime(2022, 12, 31),
                                 seed=42):
    """Generate a synthetic hourly household energy-consumption dataset.

    Consumption is a gamma-distributed base load scaled by hour-of-day,
    weekend and seasonal multipliers, plus a small term proportional to how
    far the temperature is from 20 degrees.

    Args:
        start_date: First timestamp of the hourly series (inclusive).
        end_date: Last timestamp of the hourly series (inclusive).
            NOTE: the default is midnight at the *start* of 2022-12-31, so
            that final day contributes a single hourly sample.
        seed: Value passed to ``np.random.seed``; this reseeds NumPy's
            global random state.

    Returns:
        pandas.DataFrame with one row per hour and the columns ``datetime``,
        ``temperature``, ``humidity``, ``global_active_power``, ``hour``,
        ``day_of_week``, ``month`` and ``is_weekend``.
    """
    np.random.seed(seed)
    # A timedelta step avoids the 'H' string alias, which recent pandas
    # releases deprecate in favour of 'h'; a timedelta works on either side.
    dates = pd.date_range(start=start_date, end=end_date, freq=timedelta(hours=1))
    n_samples = len(dates)

    # Draw order is part of the reproducible output: all gamma draws first,
    # then one (temperature, humidity) pair of normal draws per timestamp.
    base_consumption = np.random.gamma(2, 0.3, n_samples)
    noise = np.random.normal(size=(n_samples, 2))

    day_of_year = dates.dayofyear.to_numpy()
    hour = dates.hour.to_numpy()
    month = dates.month.to_numpy()
    is_weekend = dates.dayofweek.to_numpy() >= 5

    # Annual sinusoids with additive Gaussian noise (std 3 and 10).
    temperature = (15 + 10 * np.sin(2 * np.pi * (day_of_year - 80) / 365)
                   + 3 * noise[:, 0])
    humidity = np.clip(
        50 + 20 * np.sin(2 * np.pi * (day_of_year - 100) / 365) + 10 * noise[:, 1],
        20,
        95,
    )

    # Hour-of-day profile: morning peak, evening peak, overnight trough,
    # everything else at the daytime baseline. First matching rule wins.
    daily_multiplier = np.select(
        [(hour >= 7) & (hour <= 9), (hour >= 18) & (hour <= 21), hour <= 5],
        [1.8, 2.0, 0.6],
        default=1.2,
    )
    weekly_multiplier = np.where(is_weekend, 1.3, 1.0)
    seasonal_multiplier = np.select(
        [np.isin(month, [12, 1, 2]), np.isin(month, [6, 7, 8])],
        [1.4, 1.5],
        default=1.1,
    )

    temp_effect = 0.02 * np.abs(temperature - 20)
    global_active_power = (
        base_consumption * daily_multiplier * weekly_multiplier * seasonal_multiplier
        + temp_effect
    )

    df = pd.DataFrame({
        'datetime': dates,
        'temperature': temperature,
        'humidity': humidity,
        'global_active_power': global_active_power,
    })
    df['hour'] = df['datetime'].dt.hour
    df['day_of_week'] = df['datetime'].dt.dayofweek
    df['month'] = df['datetime'].dt.month
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    return df

df = create_realistic_energy_data()
print('Dataset Shape:', df.shape)

# Feature Engineering
# Calendar features derived from the timestamp.
df['day_of_year'] = df['datetime'].dt.dayofyear
df['week_of_year'] = df['datetime'].dt.isocalendar().week
# Maps months to 1..4: Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4.
df['season'] = (df['month'] % 12 + 3) // 3
# Sine/cosine encodings so that hour 23 sits next to hour 0 and
# December next to January.
df['hour_sin'] = np.sin(2 * np.pi * df['hour']/24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour']/24)
df['month_sin'] = np.sin(2 * np.pi * df['month']/12)
df['month_cos'] = np.cos(2 * np.pi * df['month']/12)
# Lags are row-based, so the frame must be in chronological order first.
df.sort_values('datetime', inplace=True)
# With hourly rows these are the previous hour, previous day and previous week.
df['consumption_lag_1'] = df['global_active_power'].shift(1)
df['consumption_lag_24'] = df['global_active_power'].shift(24)
df['consumption_lag_168'] = df['global_active_power'].shift(168)
# Back-fill the leading NaNs created by shift(). DataFrame.bfill replaces
# fillna(method='bfill'), whose `method` argument is deprecated in pandas.
df.bfill(inplace=True)

# Prepare data for model training
def prepare_data_for_frequency(df, frequency='D'):
    """Aggregate the time series to daily, weekly or yearly rows.

    Args:
        df: DataFrame with a ``datetime`` column plus ``global_active_power``,
            ``temperature`` and ``humidity`` columns.
        frequency: ``'D'`` for calendar days, ``'W'`` for weeks ending on
            Sunday. Any other value (including ``'Y'``) selects year-end
            bins; that fallback is kept for backward compatibility.

    Returns:
        DataFrame with ``datetime`` restored as a column, consumption summed
        per bin and temperature / humidity averaged per bin.
    """
    # Offset objects rather than string aliases: the 'Y' alias is deprecated
    # in recent pandas (renamed 'YE'), while the objects are stable.
    offsets = {
        'D': pd.offsets.Day(),
        'W': pd.offsets.Week(weekday=6),
    }
    rule = offsets.get(frequency, pd.offsets.YearEnd())
    aggregations = {
        'global_active_power': 'sum',
        'temperature': 'mean',
        'humidity': 'mean',
    }
    return df.resample(rule, on='datetime').agg(aggregations).reset_index()

# One aggregated frame per reporting granularity, built in D / W / Y order.
daily_df, weekly_df, yearly_df = (
    prepare_data_for_frequency(df, code) for code in ('D', 'W', 'Y')
)

# Model training
def train_and_evaluate_model(df, freq_name):
    """Fit a random forest on weather features, report R², and save it.

    The rows are split in order (first 80% train, remainder test) rather
    than shuffled, so the test set is the most recent part of the frame.

    Args:
        df: DataFrame with ``temperature``, ``humidity`` and
            ``global_active_power`` columns.
        freq_name: Label used in the printed report and, lower-cased, as the
            prefix of the two output files
            (``<name>_energy_model.pkl`` and ``<name>_scaler.pkl``).

    Raises:
        ValueError: If ``df`` has fewer than 2 rows, which leaves no
            training data after the 80/20 split.
    """
    X = df[['temperature', 'humidity']]
    y = df['global_active_power']
    split = int(len(X) * 0.8)
    if split == 0:
        raise ValueError(
            f'{freq_name}: need at least 2 rows for a train/test split, got {len(X)}'
        )
    X_train, X_test = X.iloc[:split], X.iloc[split:]
    y_train, y_test = y.iloc[:split], y.iloc[split:]

    # Fit the scaler on the training rows only so test statistics do not leak.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train_s, y_train)

    # R² is not defined for fewer than two samples (r2_score returns NaN
    # with a warning), so say so explicitly instead of printing 'nan'.
    if len(y_test) < 2:
        print(f'{freq_name} R²: undefined (needs at least 2 test samples, got {len(y_test)})')
    else:
        y_pred = model.predict(X_test_s)
        print(f'{freq_name} R²:', r2_score(y_test, y_pred))

    joblib.dump(model, f'{freq_name.lower()}_energy_model.pkl')
    joblib.dump(scaler, f'{freq_name.lower()}_scaler.pkl')

# Train, score and persist one model per aggregation level, in this order.
for _frame, _label in (
    (daily_df, 'Daily'),
    (weekly_df, 'Weekly'),
    (yearly_df, 'Yearly'),
):
    train_and_evaluate_model(_frame, _label)

print('Models saved successfully!')