 ## Exploration Notebook 
        
This notebook is where we prototype data ingestion, cleaning, quality checks, and quick visualizations.

**Everything here is a prototype: once a step is stable, it gets productionized into the `src/` folder and the Streamlit dashboard.**

In [None]:
# Install Dependencies

!pip install pandas requests pyyaml plotly streamlit matplotlib seaborn statsmodels

Collecting matplotlib
  Using cached matplotlib-3.10.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting statsmodels
  Using cached statsmodels-0.14.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (9.5 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Using cached fonttools-4.59.0-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (107 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Using cached kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.2 kB)
Collecting pypars

In [15]:
import sys
from pathlib import Path

# Add src/ to the import path so the project modules can be imported.
# The membership check keeps this cell idempotent: re-running it does not
# stack duplicate entries onto sys.path.
src_dir = str(Path("../src").resolve())
if src_dir not in sys.path:
    sys.path.append(src_dir)

import pandas as pd
import requests
import yaml
import plotly

from data_fetcher import fetch_historical_weather, fetch_historical_energy
from data_processor import clean_weather, clean_energy, merge_weather_energy
from data_quality_report import generate_report


In [None]:
# Load API keys & city list.
# The context manager closes the file handle as soon as parsing is done
# instead of leaving it open until garbage collection.
with open("../config/config.yaml", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

# Echo the config as a sanity check, but mask the credentials so they are not
# persisted in the notebook's saved output. These are the two secret keys the
# fetch calls in this notebook read from `config`.
SECRET_KEYS = {"noaa_token", "eia_key"}
{key: ("***" if key in SECRET_KEYS else value) for key, value in config.items()}


In [None]:
# Pick the first configured city for prototyping.
city = config["cities"][0]
# Lower-case, underscore-separated form of the city name (used as a lookup key).
slug = city["name"].lower().replace(" ", "_")

# Single source of truth for the look-back window, so the weather and energy
# requests always ask for the same number of days.
LOOKBACK_DAYS = 90

# Fetch raw weather and energy
df_w = fetch_historical_weather(
    station_id=city["station_id"],
    days=LOOKBACK_DAYS,
    token=config["noaa_token"],
)
df_e = fetch_historical_energy(
    region=city["region"],
    days=LOOKBACK_DAYS,
    api_key=config["eia_key"],
)

# Preview both frames. `display` (provided by the IPython kernel) renders each
# frame with its rich repr; a bare tuple would fall back to plain text.
display(df_w.head(), df_e.head())


In [None]:
# Quick sanity check on the raw pulls: dimensions first, then per-column
# counts of missing values for each frame.
weather_nulls = df_w.isna().sum()
energy_nulls = df_e.isna().sum()

print(f"Weather shape: {df_w.shape}, Energy shape: {df_e.shape}")
print("Missing in weather:\n", weather_nulls)
print("Missing in energy:\n", energy_nulls)


In [None]:
# Clean each raw frame, then merge them into one combined frame.
cw = clean_weather(df_w)
ce = clean_energy(df_e)
df_combined = merge_weather_energy(cw, ce)

# Fail fast if the merged frame cannot support the cells below: the plots read
# these columns, and the map cell takes the last row.
REQUIRED_COLUMNS = {"date", "TMAX", "TMIN", "demand"}
missing_columns = REQUIRED_COLUMNS - set(df_combined.columns)
assert not missing_columns, f"merged frame is missing columns: {sorted(missing_columns)}"
assert not df_combined.empty, "merge produced no rows; check the cleaned weather and energy frames"

# Preview
df_combined.head()


In [None]:
# Post-merge sanity check: dimensions and per-column missing-value counts.
processed_nulls = df_combined.isna().sum()

print(f"Processed shape: {df_combined.shape}")
print("Processed missing:\n", processed_nulls)


In [None]:
# Generate data quality report
import json

# Build the report object, serialize it as indented JSON, then print it.
report = generate_report()
report_json = json.dumps(report, indent=2)
print(report_json)


In [None]:
# Visualize TMAX vs Demand using Plotly
import plotly.express as px

# Scatter of daily max temperature against demand, with an OLS trendline
# overlaid by Plotly Express.
scatter_options = {
    "x": "TMAX",
    "y": "demand",
    "trendline": "ols",
    "title": "TMAX vs Demand",
}
fig = px.scatter(df_combined, **scatter_options)
fig.show()


In [None]:
# Visualize TMAX vs Demand using Matplotlib
import matplotlib.pyplot as plt

# One colour per series, reused for the line and its y-axis label so the two
# y-axes can be told apart.
TEMP_COLOR = "tab:blue"
DEMAND_COLOR = "tab:red"

dates = df_combined["date"]

# Two y-axes sharing one x-axis: temperature on the left, demand on the right.
fig, ax_temp = plt.subplots(figsize=(10,5))
ax_demand = ax_temp.twinx()

ax_temp.plot(dates, df_combined["TMAX"], label="TMAX", color=TEMP_COLOR)
ax_demand.plot(dates, df_combined["demand"], label="Demand", linestyle="--", color=DEMAND_COLOR)

ax_temp.set_xlabel("Date")
ax_temp.set_ylabel("Temperature (°F)", color=TEMP_COLOR)
ax_demand.set_ylabel("Demand", color=DEMAND_COLOR)
plt.title("Daily Temperature & Energy Demand")

# Each axis carries its own legend, placed in opposite corners.
ax_temp.legend(loc="upper left")
ax_demand.legend(loc="upper right")
plt.show()


In [None]:
# Visualize Avg Demand by Temp Bin & Weekday using Seaborn
import seaborn as sns
import matplotlib.pyplot as plt

# Calendar order for the heatmap columns; without it the weekdays come out
# sorted alphabetically (Friday, Monday, Saturday, ...).
WEEKDAY_ORDER = [
    "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
]
TEMP_BIN_EDGES = [-float("inf"), 50, 60, 70, 80, 90, float("inf")]
TEMP_BIN_LABELS = ["<50°F","50-60°F","60-70°F","70-80°F","80-90°F",">90°F"]

# Work on a copy so df_combined is left untouched for the other cells.
df_h = df_combined.copy()
df_h["weekday"] = df_h["date"].dt.day_name()
# Bin each day by the midpoint of its max and min temperature.
df_h["temp_bin"] = pd.cut(
    (df_h["TMAX"] + df_h["TMIN"]) / 2,
    bins=TEMP_BIN_EDGES,
    labels=TEMP_BIN_LABELS,
)

# Mean demand per (temp_bin, weekday), reshaped to temp_bin rows x weekday columns.
# observed=False is stated explicitly because temp_bin is categorical and the
# pandas default for this flag differs between versions; it keeps every bin as
# a row even when no day fell into it.
pivot = (
    df_h.groupby(["temp_bin", "weekday"], observed=False)["demand"]
    .mean()
    .unstack("weekday")
    .reindex(columns=WEEKDAY_ORDER)
)

fig_heat, ax_heat = plt.subplots(figsize=(8,6))
sns.heatmap(pivot, annot=True, fmt=".0f", cmap="coolwarm", ax=ax_heat)
ax_heat.set_title("Avg Demand by Temp Bin & Weekday")
plt.show()


In [None]:
# Visualize Demand by City using Plotly
import plotly.express as px

# Latitude/longitude per city slug, used to place the marker on the map.
city_coords = {
    "new_york": {"lat":40.7128, "lon":-74.0060},
    "chicago":  {"lat":41.8781, "lon":-87.6298},
    "houston":  {"lat":29.7604, "lon":-95.3698},
    "phoenix":  {"lat":33.4484, "lon":-112.0740},
    "seattle":  {"lat":47.6062, "lon":-122.3321},
}

# Name the offending slug instead of raising a bare KeyError further down.
if slug not in city_coords:
    raise KeyError(f"No coordinates configured for city slug {slug!r}; add it to city_coords")

# Pick the most recent row that has a demand value. Sorting by date removes
# the reliance on the frame's row order, and dropping missing demand keeps NaN
# out of the marker-size column (NOTE(review): the newest day may lack energy
# data; confirm against the fetcher).
demand_rows = df_combined.dropna(subset=["demand"]).sort_values("date")
if demand_rows.empty:
    raise ValueError("df_combined has no rows with a demand value to plot")
latest = demand_rows.iloc[-1]  # for one city demo

# Build DataFrame of the latest demand per city (one city in this demo).
coords = city_coords[slug]
map_rows = [{
    "City": city["name"],
    "lat": coords["lat"],
    "lon": coords["lon"],
    "demand": latest["demand"],
}]
map_df = pd.DataFrame(map_rows)

fig_map = px.scatter_geo(
    map_df,
    lat="lat", lon="lon",
    scope="usa",
    size="demand",
    hover_name="City"
)
fig_map.show()
