# ⚡ High-Quality Exploratory Data Analysis (EDA)
## Electricity Theft Detection

This notebook provides a comprehensive analysis of the electricity consumption data to detect theft. We will cover:
1. **Data Loading & Cleaning**: Handling missing values and parsing dates.
2. **Univariate Analysis**: Distribution of consumption and target classes.
3. **Bivariate Analysis**: Comparing theft vs. normal consumption patterns.
4. **Time Series Analysis**: Visualizing consumption trends over time.
5. **Advanced Visualizations**: Correlation heatmaps and boxplots.
6. **Dimensionality Reduction**: PCA/t-SNE (optional for high-dim data).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Global plot styling: select the ggplot theme and the seaborn "husl"
# palette, then override the default figure size and font size.
plt.style.use('ggplot')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 12,
})

# %matplotlib inline is no longer needed in modern VS Code

## 1. Data Loading & Cleaning

In [None]:
# Load the raw dataset from disk into `df`.
data_path = '../data/raw/electricity_theft_data.csv'
try:
    # Attempt to read with default settings
    df = pd.read_csv(data_path)
except Exception as e:
    # Report the problem, then re-raise: every later cell needs `df`, so
    # swallowing the error here would only resurface as a NameError below.
    print(f"Error loading dataset: {e}")
    raise
else:
    print(f"Dataset loaded successfully. Shape: {df.shape}")

# Display first few rows
df.head()

In [None]:
# Make sure the target column is named 'FLAG', holds integers, and that the
# consumer id (if present) is used as the index rather than as a feature.
if 'FLAG' in df.columns:
    print("Target column 'FLAG' found.")
else:
    print("⚠️ 'FLAG' column NOT found! Checking if it's the last column...")
    # Fallback: assume the last column is the target and rename it.
    df.rename(columns={df.columns[-1]: 'FLAG'}, inplace=True)

# Cast on both paths so later comparisons (FLAG == 0 / FLAG == 1) and plots
# see the same integer labels whether or not the fallback was taken.
df['FLAG'] = df['FLAG'].astype(int)

if 'CONS_NO' in df.columns:
    # Move CONS_NO into the index so it is not treated as a data column.
    df.set_index('CONS_NO', inplace=True)
    print("CONS_NO set as index.")

In [None]:
# Handle Missing Values
missing_values = df.isnull().sum().sum()
print(f"Total missing values: {missing_values}")

# Visualize the missing-value pattern (first 500 rows x first 50 columns).
# This must run BEFORE the imputation below: once the NaNs are filled,
# isnull() is False everywhere and the heatmap would be blank.
plt.figure(figsize=(12, 4))
sns.heatmap(df.iloc[:500, :50].isnull(), cbar=False, cmap='viridis')
plt.title('Missing Value Patterns (Subset of Data)')
plt.show()

# Fill missing values with 0 (assuming a missing reading means no consumption)
df.fillna(0, inplace=True)

## 2. Univariate Analysis

In [None]:
# Class Distribution: one bar per FLAG value, each annotated with its count.
plt.figure(figsize=(8, 5))
ax = sns.countplot(x='FLAG', data=df, palette=['#1f77b4', '#ff7f0e'])
plt.title('Class Distribution: Normal (0) vs Theft (1)', fontsize=16)
plt.xlabel('Class (0: Normal, 1: Theft)')
plt.ylabel('Count')
for p in ax.patches:
    # Bar heights may be reported as floats; format them as whole numbers so
    # the label reads "1234" rather than "1234.0".
    ax.annotate(f'{p.get_height():.0f}', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()

In [None]:
# Bar Plot: Comparative Mean Features by Class
feature_cols = ['avg_consumption', 'std_consumption', 'max_consumption', 'min_consumption']

# The per-user summary columns are only added to `df` by a later cell, so
# melting `df` directly here fails on a top-to-bottom run. Compute the
# summaries into a scratch frame instead; `df` itself is left untouched so
# the later cell still sees only the raw readings.
readings = df.drop(columns=['FLAG'] + feature_cols, errors='ignore')
summary = pd.DataFrame({
    'FLAG': df['FLAG'],
    'avg_consumption': readings.mean(axis=1),
    'std_consumption': readings.std(axis=1),
    'max_consumption': readings.max(axis=1),
    'min_consumption': readings.min(axis=1),
})

# Long format: one row per (user, metric) so seaborn can group by metric.
df_melted = pd.melt(summary, id_vars=['FLAG'], value_vars=feature_cols)
plt.figure(figsize=(10, 6))
sns.barplot(x='variable', y='value', hue='FLAG', data=df_melted, palette=['#1f77b4', '#ff7f0e'])
plt.title('Average Consumption Metric Comparisons')
plt.yscale('log')
plt.show()

## 3. Time Series Analysis & Consumption Patterns

In [None]:
# Extract the raw time series: every column except FLAG and the derived
# summary columns. Excluding the derived columns (ignored if absent) keeps
# this cell idempotent -- on a re-run they would otherwise be fed back into
# their own mean/std/max/min.
derived_cols = ['avg_consumption', 'std_consumption', 'max_consumption', 'min_consumption']
ts_data = df.drop(columns=['FLAG'] + derived_cols, errors='ignore')

# Per-user summary statistics across all readings
df['avg_consumption'] = ts_data.mean(axis=1)
df['std_consumption'] = ts_data.std(axis=1)
df['max_consumption'] = ts_data.max(axis=1)
df['min_consumption'] = ts_data.min(axis=1)

# Visualizing Random Samples from Each Class (raw readings only)
n_samples = 3
normal_samples = df.loc[df['FLAG'] == 0, ts_data.columns].sample(n_samples, random_state=42)
theft_samples = df.loc[df['FLAG'] == 1, ts_data.columns].sample(n_samples, random_state=42)

# One row per sampled pair: normal user on the left, theft user on the right.
fig, axes = plt.subplots(n_samples, 2, figsize=(18, 12), sharey=False)

for i in range(n_samples):
    # Normal
    axes[i, 0].plot(normal_samples.iloc[i].values, color='#1f77b4', linewidth=1)
    axes[i, 0].set_title(f'Normal User {normal_samples.index[i]} Consumption')
    axes[i, 0].set_ylabel('kWh')

    # Theft
    axes[i, 1].plot(theft_samples.iloc[i].values, color='#ff7f0e', linewidth=1)
    axes[i, 1].set_title(f'Theft User {theft_samples.index[i]} Consumption')

plt.tight_layout()
plt.show()

In [None]:
# Hexbin Plot: 2D density of per-user average consumption (x) against the
# standard deviation of consumption (y), with log-scaled bin counts.
plt.figure(figsize=(10, 8))
hb = plt.hexbin(
    df['avg_consumption'],
    df['std_consumption'],
    gridsize=40,
    cmap='YlGnBu',
    bins='log',
)
# Label the axes the hexbins were drawn on, then attach the colorbar.
hb.axes.set(
    title='Density of Consumption Patterns (Avg vs Std)',
    xlabel='Average Consumption',
    ylabel='Std Dev of Consumption',
)
plt.colorbar(hb, label='log10(count)')
plt.show()

## 4. Feature Engineering Visualization
We compare distributions of derived features between classes.

In [None]:
# Derived per-user summary columns compared across the two classes.
feature_cols = ['avg_consumption', 'std_consumption', 'max_consumption', 'min_consumption']

# 2x2 grid: one log-scaled boxplot per derived feature, split by FLAG.
plt.figure(figsize=(16, 12))
for i, col in enumerate(feature_cols, start=1):
    panel = plt.subplot(2, 2, i)
    sns.boxplot(
        x='FLAG',
        y=col,
        data=df,
        palette=['#1f77b4', '#ff7f0e'],
        ax=panel,
    )
    panel.set_title(f'{col} Boxplot')
    panel.set_yscale('log')

plt.tight_layout()
plt.show()

In [None]:
# Violin Plots: 2x2 grid with one log-scaled violin plot per derived
# feature (taken from `feature_cols`), grouped by FLAG.
plt.figure(figsize=(16, 12))
for i, col in enumerate(feature_cols, start=1):
    panel = plt.subplot(2, 2, i)
    sns.violinplot(
        x='FLAG',
        y=col,
        data=df,
        palette=['#1f77b4', '#ff7f0e'],
        split=True,
        ax=panel,
    )
    panel.set_title(f'{col} Density (Violin Plot)')
    panel.set_yscale('log')

plt.tight_layout()
plt.show()

## 5. Correlation Analysis

In [None]:
# Pairwise correlation between the target (FLAG) and the derived features
# listed in `feature_cols`, shown as an annotated heatmap.
corr_matrix = df.loc[:, ['FLAG', *feature_cols]].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
)
plt.title('Correlation Matrix (Target vs Derived Features)')
plt.show()

## 6. Advanced PCA Visualization (Optional)
Visualizing the high-dimensional data in 2D.

In [None]:
# Work on at most `subset_size` rows: a fixed-seed random sample when the
# frame is larger than that, otherwise the full frame.
subset_size = 5000
data_subset = df.sample(subset_size, random_state=42) if len(df) > subset_size else df

# PCA input: everything except the target and the derived summary columns,
# with any remaining NaNs replaced by 0. The labels are kept separately.
X_pca = data_subset.drop(
    columns=['FLAG', 'avg_consumption', 'std_consumption', 'max_consumption', 'min_consumption']
).fillna(0)
y_pca = data_subset['FLAG']

# Standardize each column before projecting
scaler = StandardScaler()
X_pca_scaled = scaler.fit_transform(X_pca)

# Project onto the first two principal components
pca = PCA(n_components=2)
X_pca_2d = pca.fit_transform(X_pca_scaled)

# Plotting frame: the two component scores plus the class label, aligned by
# position (hence `.values`, since the sampled index is not 0..n-1).
pca_df = pd.DataFrame(X_pca_2d, columns=['PC1', 'PC2']).assign(FLAG=y_pca.values)

plt.figure(figsize=(10, 8))
sns.scatterplot(
    x='PC1',
    y='PC2',
    hue='FLAG',
    data=pca_df,
    alpha=0.6,
    palette=['#1f77b4', '#ff7f0e'],
)
# Axis labels report the share of variance explained by each component.
plt.gca().set(
    title='PCA 2D Projection of Electricity Consumption Patterns',
    xlabel=f'PC1 ({pca.explained_variance_ratio_[0]:.2%} Variance)',
    ylabel=f'PC2 ({pca.explained_variance_ratio_[1]:.2%} Variance)',
)
plt.legend(title='Class')
plt.show()