# Electricity Consumption Prediction - Exploratory Data Analysis

This notebook explores the energy and weather datasets to understand patterns and relationships.

In [None]:
# Import libraries
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
# Silence every warning category for the remainder of the session.
# NOTE(review): a blanket filter also hides deprecation notices from
# pandas/seaborn — consider narrowing it to specific categories.
warnings.simplefilter('ignore')

# Plot appearance: seaborn grid theme plus a wide default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print('✓ Libraries imported successfully!')

## 1. Load Datasets

In [None]:
# Load energy dataset
# The data folder can be redirected with the ENERGY_DATA_DIR environment
# variable; the fallback is the original local folder, so existing setups
# keep working unchanged.
data_dir = os.environ.get(
    'ENERGY_DATA_DIR',
    r's:\Saurabh Pinjarkar\dataset\LATEST_DATASET_ENERGY',
)
energy_path = os.path.join(data_dir, 'energy_dataset.csv')
energy_df = pd.read_csv(energy_path)

# utc=True guarantees a single tz-aware datetime64 column even when the raw
# strings carry different UTC offsets (e.g. across DST changes); without it
# such input does not become a datetime64 column and the `.dt` accessors used
# later in the notebook fail.
# NOTE(review): timestamps (and any hour-of-day derived from them) are now in
# UTC — apply `.dt.tz_convert(<zone>)` here if local time is wanted.
energy_df['time'] = pd.to_datetime(energy_df['time'], utc=True)

print(f"Energy dataset shape: {energy_df.shape}")
print(f"Date range: {energy_df['time'].min()} to {energy_df['time'].max()}")
energy_df.head()

In [None]:
# Load weather dataset
# The data folder can be redirected with the ENERGY_DATA_DIR environment
# variable; the fallback is the original local folder, so existing setups
# keep working unchanged.
data_dir = os.environ.get(
    'ENERGY_DATA_DIR',
    r's:\Saurabh Pinjarkar\dataset\LATEST_DATASET_ENERGY',
)
weather_path = os.path.join(data_dir, 'weather_features.csv')
weather_df = pd.read_csv(weather_path)

# utc=True guarantees a single tz-aware datetime64 column even when the raw
# strings carry different UTC offsets (e.g. across DST changes); without it
# such input does not become a datetime64 column.
weather_df['dt_iso'] = pd.to_datetime(weather_df['dt_iso'], utc=True)

print(f"Weather dataset shape: {weather_df.shape}")
print(f"Date range: {weather_df['dt_iso'].min()} to {weather_df['dt_iso'].max()}")
weather_df.head()

## 2. Data Overview

In [None]:
# Energy dataset info: banner followed by pandas' column/dtype/non-null summary
rule = "=" * 50
print(rule, "ENERGY DATASET INFORMATION", rule, sep="\n")
energy_df.info()

In [None]:
# Weather dataset info: banner followed by pandas' column/dtype/non-null summary
rule = "=" * 50
print(rule, "WEATHER DATASET INFORMATION", rule, sep="\n")
weather_df.info()

In [None]:
# Statistical summary of the energy dataset's numeric columns
# (count, mean, std, min, quartiles, max); shown as the cell output.
energy_stats = energy_df.describe()
energy_stats

## 3. Missing Values Analysis

In [None]:
# Missing values in energy dataset
# Count nulls per column and keep only columns that have any, largest first.
null_counts = energy_df.isnull().sum()
missing_energy = null_counts.loc[null_counts > 0].sort_values(ascending=False)

if missing_energy.empty:
    print('✓ No missing values in energy dataset!')
else:
    # One bar per affected column; labels are slanted so long names stay readable.
    missing_fig, missing_ax = plt.subplots(figsize=(10, 6))
    missing_energy.plot(kind='bar', ax=missing_ax)
    missing_ax.set_title('Missing Values in Energy Dataset')
    missing_ax.set_ylabel('Count')
    plt.setp(missing_ax.get_xticklabels(), rotation=45, ha='right')
    missing_fig.tight_layout()
    plt.show()

## 4. Target Variable Analysis (Electricity Consumption)

In [None]:
# Analyze total load actual (target variable)
target_col = 'total load actual'
target_series = energy_df[target_col]
observed = target_series.dropna()  # the plots below cannot take NaN

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

# Left panel: distribution
hist_ax.hist(observed, bins=50, edgecolor='black')
hist_ax.set(
    title='Distribution of Electricity Consumption',
    xlabel='Consumption (MW)',
    ylabel='Frequency',
)

# Right panel: box plot
box_ax.boxplot(observed)
box_ax.set(
    title='Box Plot of Electricity Consumption',
    ylabel='Consumption (MW)',
)

fig.tight_layout()
plt.show()

# Headline statistics (pandas skips NaN in these reductions by default).
headline_stats = [
    ('Mean consumption', target_series.mean()),
    ('Median consumption', target_series.median()),
    ('Std deviation', target_series.std()),
]
for label, value in headline_stats:
    print(f"{label}: {value:.2f} MW")

## 5. Time Series Analysis

In [None]:
# Plot consumption over time (first month)
# Keep only rows stamped before 1 Feb 2015.
# NOTE(review): the fixed cutoff presumes the data starts in January 2015 — confirm.
before_feb = energy_df['time'] < '2015-02-01'
first_month = energy_df[before_feb]

axis_labels = {'time': 'Date', target_col: 'Consumption (MW)'}
fig = px.line(
    first_month,
    x='time',
    y=target_col,
    title='Electricity Consumption - First Month',
    labels=axis_labels,
)
fig.show()

In [None]:
# Hourly patterns
# Derive the hour (0-23) from the timestamp and average the target per hour.
energy_df['hour'] = energy_df['time'].dt.hour
hourly_avg = energy_df.groupby('hour')[target_col].mean()

hourly_fig, hourly_ax = plt.subplots(figsize=(12, 5))
hourly_ax.plot(hourly_avg.index, hourly_avg.values, marker='o', linewidth=2)
hourly_ax.set(
    title='Average Electricity Consumption by Hour of Day',
    xlabel='Hour',
    ylabel='Average Consumption (MW)',
)
hourly_ax.grid(True, alpha=0.3)
hourly_ax.set_xticks(range(0, 24))  # one tick per hour of the day
hourly_fig.tight_layout()
plt.show()

In [None]:
# Daily patterns (day of week)
# pandas numbers weekdays 0 (Monday) .. 6 (Sunday); `days` follows that order.
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

energy_df['day_of_week'] = energy_df['time'].dt.dayofweek

# Reindex to all seven weekdays so the bar heights always line up with the
# seven x positions; a weekday absent from the data becomes NaN (no bar)
# instead of causing a length mismatch in plt.bar.
daily_avg = (
    energy_df.groupby('day_of_week')[target_col]
    .mean()
    .reindex(range(len(days)))
)

plt.figure(figsize=(10, 5))
plt.bar(range(7), daily_avg.values, color='steelblue')
plt.title('Average Electricity Consumption by Day of Week')
plt.xlabel('Day')
plt.ylabel('Average Consumption (MW)')
plt.xticks(range(7), days, rotation=45)
plt.tight_layout()
plt.show()

## 6. Weather Features Analysis

In [None]:
# Temperature analysis
# NOTE(review): subtracting 273.15 assumes `temp` is in Kelvin — confirm
# against the data source.
weather_df['temp_celsius'] = weather_df['temp'] - 273.15

# One entry per panel, in row-major order of the 2x2 grid:
# (column, bar colour, panel title, x-axis label)
panels = [
    ('temp_celsius', 'orange', 'Temperature Distribution', 'Temperature (°C)'),
    ('humidity', 'blue', 'Humidity Distribution', 'Humidity (%)'),
    ('wind_speed', 'green', 'Wind Speed Distribution', 'Wind Speed (m/s)'),
    ('pressure', 'purple', 'Pressure Distribution', 'Pressure (hPa)'),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, (column, colour, title, xlabel) in zip(axes.flat, panels):
    # Drop NaN first (as the consumption histogram does): hist cannot bin
    # missing values.
    ax.hist(weather_df[column].dropna(), bins=50, color=colour, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 7. Correlation Analysis

In [None]:
# Merge datasets for correlation analysis
weather_cols = ['temp_celsius', 'humidity', 'wind_speed', 'pressure']

# Normalise both join keys to tz-aware UTC so they share one dtype and equal
# instants compare equal, whatever form the timestamp columns arrived in.
energy_keyed = (
    energy_df[['time', target_col]]
    .rename(columns={'time': 'datetime'})
    .assign(datetime=lambda df: pd.to_datetime(df['datetime'], utc=True))
)

# Collapse weather to one row per timestamp (mean of each feature).
# If the weather table holds several rows for the same timestamp — e.g. one
# per station; not verifiable from this notebook — an inner join would
# repeat every consumption value once per weather row and skew the
# correlations. With already-unique timestamps the mean is a no-op.
weather_keyed = (
    weather_df[['dt_iso'] + weather_cols]
    .rename(columns={'dt_iso': 'datetime'})
    .assign(datetime=lambda df: pd.to_datetime(df['datetime'], utc=True))
    .groupby('datetime', as_index=False)[weather_cols]
    .mean()
)

merged_df = pd.merge(energy_keyed, weather_keyed, on='datetime', how='inner')

# Select numeric columns for correlation
corr_cols = [target_col] + weather_cols
correlation_matrix = merged_df[corr_cols].corr()

# Plot heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.3f', cmap='coolwarm',
            square=True, linewidths=1, cbar_kws={'label': 'Correlation'})
plt.title('Correlation Matrix: Consumption vs Weather Features')
plt.tight_layout()
plt.show()

## 8. Key Insights

Based on the exploratory analysis:

1. **Temporal Patterns**: Clear hourly and daily consumption patterns
2. **Weather Impact**: Temperature and humidity correlate with consumption
3. **Seasonality**: Consumption varies by time of day and day of week; longer-term (monthly or yearly) seasonality is not examined in this notebook
4. **Data Quality**: Minimal missing values, clean datasets (only the energy dataset was checked for missing values above)

**Next Steps:**
- Feature engineering with cyclical encoding
- Lag features for temporal dependencies
- Model training with Random Forest
- Hyperparameter optimization

## 9. Save Processed Data (Optional)

In [None]:
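# Save merged dataset for quick access.
# Disabled by default: uncomment the two lines below to write the merged
# frame to disk one directory above the notebook.
# merged_df.to_csv('../processed_data.csv', index=False)
# print('✓ Processed data saved!')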