In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# ==========================================
# 1. DATA LOADER (Dual Mode)
# ==========================================
def get_data(seed=None):
    """Return the hourly EV-charging demand dataset used by this script.

    By default a synthetic dataset is generated (Option A): 365*24 hourly
    rows starting at 2024-01-01 00:00, with calendar features, a random
    temperature, a random traffic index and a demand value derived from them.

    Args:
        seed: Optional integer. When given, the random columns are drawn from
            a private ``np.random.RandomState(seed)`` so repeated calls return
            identical data. When ``None`` (the default) NumPy's global random
            state is used, exactly as before.

    Returns:
        pandas.DataFrame with columns ``timestamp``, ``hour``,
        ``day_of_week``, ``is_weekend``, ``temperature_c``,
        ``traffic_index`` and ``energy_demand_kwh``.
    """
    # --- OPTION A: GENERATE SYNTHETIC DATA (Run this immediately) ---
    print("Generating synthetic data for demonstration...")

    # np.random (module) and RandomState expose the same normal/randint API,
    # so the rest of the function does not care which one it got.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Lowercase 'h' is the hourly alias; the uppercase 'H' spelling triggers
    # a deprecation warning on recent pandas releases.
    # NOTE: 2024 is a leap year, so 365*24 periods stop at 2024-12-30 23:00.
    dates = pd.date_range(start='2024-01-01', periods=365*24, freq='h')
    df = pd.DataFrame({'timestamp': dates})
    n_rows = len(df)

    # Simulating realistic patterns
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    # dayofweek is 0=Monday ... 6=Sunday, so >= 5 marks Saturday/Sunday.
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    df['temperature_c'] = rng.normal(20, 8, n_rows)  # mean 20 deg C, std 8
    # randint's upper bound is exclusive: 11 is required for a 1-10 scale.
    df['traffic_index'] = rng.randint(1, 11, n_rows)

    # Complex Demand Formula: Demand is higher in evenings, weekends, and cold weather
    evening_peak = np.where(df['hour'].between(16, 21), 10, 2)  # 16:00-21:59 peak
    df['energy_demand_kwh'] = (
        15 +
        evening_peak +
        (df['is_weekend'] * 5) +  # Weekend trips
        (df['temperature_c'] * -0.3) +  # Cold weather = battery drain = more charging
        rng.normal(0, 2, n_rows)  # Random noise
    )

    # --- OPTION B: REAL KAGGLE DATA (Uncomment to use real file) ---
    # df = pd.read_csv('ev_charging_data.csv')
    # df['timestamp'] = pd.to_datetime(df['timestamp_column'])
    # df = df.rename(columns={'kwh_total': 'energy_demand_kwh'})

    return df

# ==========================================
# 2. FEATURE ENGINEERING
# ==========================================
df = get_data()

print(f"Data Loaded: {len(df)} rows.")

# Derive numeric calendar columns from the timestamp: the model is trained on
# plain numbers, not datetime values. Columns that already exist are simply
# overwritten with the same derivation.
for column_name, dt_attribute in (
    ('hour', 'hour'),
    ('month', 'month'),
    ('day_of_week', 'dayofweek'),
):
    df[column_name] = getattr(df['timestamp'].dt, dt_attribute)

# Predictor columns (X) and the column to forecast (y)
target = 'energy_demand_kwh'
features = [
    'hour',
    'day_of_week',
    'month',
    'temperature_c',
    'traffic_index',
]

y = df[target]
X = df[features]

# ==========================================
# 3. TRAIN MODEL (Random Forest)
# ==========================================
# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state pins the split so repeated runs use the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training Random Forest Regressor... (This handles non-linear weather patterns)")
# 100 trees with a fixed seed; fit() returns the fitted estimator itself.
model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# ==========================================
# 4. EVALUATE
# ==========================================
# Score the held-out rows.
predictions = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
accuracy = r2_score(y_true=y_test, y_pred=predictions)  # R^2, reported below as a percentage

print("\n--- Model Results ---")
print(f"Mean Absolute Error: {mae:.2f} kWh (On average, we are off by this amount)")
print(f"Model Accuracy (R2 Score): {accuracy:.2%} (Explains {accuracy:.2%} of variance)")

# Feature importance: label each score with its column name, largest first.
importances = pd.Series(data=model.feature_importances_, index=features)
importances = importances.sort_values(ascending=False)
print("\n--- Key Drivers of Demand ---")
print(importances)

# ==========================================
# 5. EXPORT RESULTS FOR TABLEAU
# ==========================================
# Assemble one table holding the test-set features, the true demand, the
# model's prediction, the residual and the originating timestamp.
# assign() works on a copy of X_test; Series values are matched on the row
# index, and the timestamp is looked up in df through that same index.
results_df = X_test.assign(
    Actual_Demand=y_test,
    Predicted_Demand=predictions,
    Error=lambda frame: frame['Actual_Demand'] - frame['Predicted_Demand'],
    timestamp=lambda frame: df.loc[frame.index, 'timestamp'],
)

# The row index is not written to the file; timestamp identifies each row.
results_df.to_csv('final_forecast_results.csv', index=False)
print("\nSUCCESS: 'final_forecast_results.csv' created. Import this into Tableau.")

Generating synthetic data for demonstration...
Data Loaded: 8760 rows.
Training Random Forest Regressor... (This handles non-linear weather patterns)


  dates = pd.date_range(start='2024-01-01', periods=365*24, freq='H')



--- Model Results ---
Mean Absolute Error: 1.67 kWh (On average, we are off by this amount)
Model Accuracy (R2 Score): 83.87% (Explains 83.87% of variance)

--- Key Drivers of Demand ---
hour             0.471375
temperature_c    0.277320
day_of_week      0.203281
month            0.025893
traffic_index    0.022132
dtype: float64

SUCCESS: 'final_forecast_results.csv' created. Import this into Tableau.
