# Building Energy Consumption - Data Exploration

This notebook explores the building energy consumption dataset and prepares it for model training.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import warnings
# Install a blanket 'ignore' filter so library warnings do not clutter cell output
warnings.filterwarnings(action='ignore')

# Plot appearance: seaborn 'whitegrid' style and a 14x6 default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

## 1. Load and Inspect Data

In [None]:
# Read the energy data CSV into the notebook-wide DataFrame `df`
df = pd.read_csv('../data/building_energy_data.csv')

# Report the (rows, columns) shape, then let the cell display the first 10 rows
print('Dataset shape: {}'.format(df.shape))
print('\nFirst few rows:')
df.head(10)

In [None]:
# Summarise the frame: per-column null counts and describe() statistics
missing_counts = df.isnull().sum()
summary_stats = df.describe()

print('Data Info:')
df.info()  # prints its own report of columns, dtypes and non-null counts
print(f'\nMissing values:\n{missing_counts}')
print(f'\nBasic statistics:\n{summary_stats}')

## 2. Data Quality Assessment

In [None]:
# Check for outliers using IQR method
def identify_outliers(data, column, *, multiplier=1.5):
    """Count the values in ``data[column]`` that fall outside the IQR fences.

    A value is an outlier when it is strictly below
    ``Q1 - multiplier * IQR`` or strictly above ``Q3 + multiplier * IQR``,
    where Q1 and Q3 are the 25th and 75th percentiles of the column.

    Args:
        data: DataFrame holding the column to inspect.
        column: Label of the column to inspect.
        multiplier: Width of the fences in units of IQR. Defaults to 1.5.

    Returns:
        The number of outlying values as a plain ``int``. Missing values are
        never counted, because comparisons against NaN evaluate to False.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = multiplier * (q3 - q1)
    # Count straight from the boolean mask instead of materialising a
    # filtered copy of the whole frame just to take its length.
    outside_fences = (values < q1 - spread) | (values > q3 + spread)
    return int(outside_fences.sum())

# Print one line per numeric column with its IQR-based outlier count
numeric_columns = df.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
    outlier_count = identify_outliers(df, col)
    print('{}: {} outliers'.format(col, outlier_count))

## 3. Distribution Analysis

In [None]:
# Plot distributions: one 50-bin histogram per numeric column
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    # Drop missing values first: a column that is entirely NaN has no finite
    # range to bin over and would abort the whole loop with a ValueError.
    # For other columns the bin counts are unchanged, since NaN never lands
    # in a bin.
    values = df[col].dropna()
    plt.figure(figsize=(10, 4))
    plt.hist(values, bins=50, edgecolor='black', alpha=0.7)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()

## 4. Correlation Analysis

In [None]:
# Correlation heatmap over the numeric columns
plt.figure(figsize=(12, 10))
correlation_matrix = df.corr(numeric_only=True)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.tight_layout()
plt.show()

# Top correlations with target
# Guard on the matrix rather than on df: numeric_only=True leaves a
# non-numeric target column out of the matrix, and indexing it would raise
# KeyError.
if 'energy_consumption' in correlation_matrix.columns:
    target_corr = correlation_matrix['energy_consumption'].sort_values(ascending=False)
    print('Top correlations with energy_consumption:')
    # Remove the target's self-correlation by label, not by position: it is
    # not guaranteed to sort first (a constant target yields NaN, which sorts
    # last; another column can tie at 1.0), so slicing [1:11] could discard a
    # real feature instead.
    print(target_corr.drop('energy_consumption').head(10))

## 5. Temporal Pattern Analysis

In [None]:
# If timestamp exists, analyze temporal patterns
if 'timestamp' in df.columns:
    # Parse once, then derive calendar features from the datetime accessor
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['month'] = df['timestamp'].dt.month

    # Plot hourly pattern
    # The target column is treated as optional in the correlation cell, so
    # guard it here too instead of raising KeyError when it is absent.
    if 'energy_consumption' in df.columns:
        hourly_avg = df.groupby('hour')['energy_consumption'].mean()
        plt.figure(figsize=(12, 5))
        hourly_avg.plot(marker='o', linewidth=2)
        plt.title('Average Energy Consumption by Hour')
        plt.xlabel('Hour of Day')
        plt.ylabel('Energy Consumption (kWh)')
        plt.grid(True)
        plt.show()