# HDD-CDD
- HDD/CDD do not directly drive energy stock prices
- They do influence demand, which can move prices
- Stocks respond only when weather alters expectations or cash flows
- The signal is real but subtle
- Poor modeling makes it look useless; good modeling makes it valuable



Heating and cooling degree days (HDD/CDD) for several cities are computed from daily temperature data retrieved with the Meteostat library.
#### Source: Meteostat – public data aggregation service.
- Meteostat collects and harmonizes data from:
- NOAA’s Global Historical Climatology Network (GHCN),
- Environment and Climate Change Canada,
- Deutscher Wetterdienst (DWD, Germany), and
- MET Norway, among others

Accessed via the official meteostat Python library.

In [1]:
# Install packages if missing
import sys, subprocess, importlib

def ensure(p):
    """Import package `p`; if it is not installed, pip-install it for this interpreter.

    Parameters
    ----------
    p : str
        Name used both for the import attempt and for ``pip install``.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``pip install`` command exits with a non-zero status.
    """
    try:
        importlib.import_module(p)
    except ImportError:
        # Only a missing module should trigger an install. A bare `except` would also
        # swallow KeyboardInterrupt and errors raised by an installed-but-broken package.
        subprocess.check_call([sys.executable, "-m", "pip", "install", p])

# Third-party packages must be importable before the import block below runs.
ensure("meteostat")
ensure("pandas")
ensure("tqdm")

# Imports
from datetime import datetime
from pathlib import Path
import pandas as pd
from meteostat import Daily, Stations   # <-- Stations is imported here
from tqdm import tqdm


In [2]:
# --- Output folders (created up front, relative to the current working directory) ---
ROOT = Path.cwd()
RAW_DAILY = ROOT.joinpath("data_raw", "hddcdd", "daily")
CLEAN_MONTH = ROOT.joinpath("data_clean")
RAW_DAILY.mkdir(parents=True, exist_ok=True)
CLEAN_MONTH.mkdir(parents=True, exist_ok=True)

# --- Cities to download: label -> (latitude, longitude) ---
LOCATIONS = dict(
    CorpusChristi_TX=(27.8006, -97.3964),
    Houston_TX=(29.7604, -95.3698),
    NewYork_NY=(40.7128, -74.0060),
)

# --- Time window (starts recently to avoid legacy gaps; ends at run time) ---
START = datetime(year=2021, month=1, day=1)
END = datetime.today()

# --- Degree-day base temperature, in °F ---
BASE_F = 65.0

# --- Optional hard-coded airport codes, used if automatic station picking fails ---
FALLBACK_STATIONS = dict(
    CorpusChristi_TX="KCRP",  # Corpus Christi Intl
    Houston_TX="KIAH",        # George Bush Intercontinental
    NewYork_NY="KJFK",        # JFK
)


In [5]:
def c_to_f(c):
    """Convert a temperature (scalar or pandas Series) from °C to °F."""
    return c * 9/5 + 32

def add_hdd_cdd(df, base_f=65.0):
    """Return a copy of `df` with ``tmean_f``, ``HDD`` and ``CDD`` columns added.

    The daily mean (°C) is taken from ``tavg``. Rows where ``tavg`` is missing fall
    back to ``(tmin + tmax) / 2`` when both columns exist. The mean is converted to
    °F and compared with `base_f`:

    * ``HDD = max(base_f - tmean_f, 0)``
    * ``CDD = max(tmean_f - base_f, 0)``

    Days with no usable temperature get NaN in all three new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Daily data; ``tavg``, ``tmin`` and ``tmax`` are optional and are passed
        through `c_to_f`, i.e. treated as °C.
    base_f : float, default 65.0
        Degree-day base temperature in °F.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` plus ``tmean_f``, ``HDD`` and ``CDD`` (rounded to 2 decimals).
        The input frame is not modified.
    """
    def _as_float(col):
        # Always hand back a float Series: an absent column becomes all-NaN and
        # non-numeric / pd.NA entries become NaN. The previous object-dtype pd.NA
        # placeholder made `.astype(float)` raise TypeError when no temperature
        # column was available.
        if col in df.columns:
            return pd.to_numeric(df[col], errors="coerce").astype(float)
        return pd.Series(float("nan"), index=df.index, dtype=float)

    # Prefer tavg; where it is missing, use the min/max midpoint when available.
    tmean_c = _as_float("tavg")
    if "tmin" in df.columns and "tmax" in df.columns:
        midrange_c = (_as_float("tmin") + _as_float("tmax")) / 2
        tmean_c = tmean_c.where(tmean_c.notna(), midrange_c)

    tmean_f = c_to_f(tmean_c)
    hdd = (base_f - tmean_f).clip(lower=0)
    cdd = (tmean_f - base_f).clip(lower=0)

    out = df.copy()
    out["tmean_f"] = tmean_f.round(2)
    out["HDD"] = hdd.round(2)
    out["CDD"] = cdd.round(2)
    return out

def best_station_df(lat, lon, start, end, max_candidates=8):
    """
    Pick, among the stations nearest to (lat, lon), the one with the most usable days.

    Up to `max_candidates` nearby stations are probed with a Daily query over
    [start, end]. A day counts as usable when at least one of tavg/tmin/tmax is
    present. On a tie, the candidate listed first by the nearby search is kept.

    Returns the winner as a 1-row Stations DataFrame (not an ID string) so it can
    be handed straight to meteostat.Daily(...), or None when there are no
    candidates or none of them returned any rows.
    """
    candidates = Stations().nearby(lat, lon).fetch(max_candidates)
    if candidates is None or len(candidates) == 0:
        return None

    temp_cols = ["tavg", "tmin", "tmax"]
    winner, top_count = None, -1

    for pos in range(len(candidates)):
        # Double brackets keep the row as a 1-row DataFrame rather than a Series.
        station = candidates.iloc[[pos]]
        daily = Daily(station, start, end).fetch()
        if daily.empty:
            continue

        usable_days = daily[temp_cols].notna().any(axis=1).sum()
        if usable_days > top_count:
            winner, top_count = station, usable_days

    return winner


In [7]:
all_daily = []   # one daily DataFrame per city, in LOCATIONS order
chosen = {}      # city label -> metadata of the station that was used

for name, (lat, lon) in tqdm(LOCATIONS.items()):
    st_row = best_station_df(lat, lon, START, END)

    # Fallback: if the nearby search found nothing, look the known airport up by its
    # ICAO code in the station catalog. The previous `Stations().id(code)` call did not
    # appear to match the meteostat Stations interface (TODO confirm for the installed
    # version), and its error was silently swallowed, so the fallback could never
    # produce a station.
    if st_row is None and name in FALLBACK_STATIONS:
        try:
            catalog = Stations().fetch()
            st_row = catalog[catalog["icao"] == FALLBACK_STATIONS[name]].head(1)  # 1-row DF
            if len(st_row) == 0:
                st_row = None
        except Exception as exc:
            # Best effort: report why the fallback failed, then skip this city below.
            print(f"[{name}] Fallback station lookup failed: {exc!r}")
            st_row = None

    if st_row is None:
        print(f"[{name}] No station found for {START.date()}–{END.date()}")
        continue

    # Keep something human-readable about the station we picked (name/icao if present)
    meta_cols = [c for c in ["name","icao","wmo","id"] if c in st_row.columns]
    chosen[name] = st_row[meta_cols].to_dict(orient="records")[0] if meta_cols else {"picked":"ok"}

    # Fetch daily data; the time index becomes a regular "date" column
    df = Daily(st_row, START, END).fetch().reset_index().rename(columns={"time":"date"})
    if df.empty:
        print(f"[{name}] Station returned no data")
        continue

    df = add_hdd_cdd(df, base_f=BASE_F)

    # Fixed output schema: any column the station did not provide is added as missing
    keep = ["date","tmean_f","HDD","CDD","tmin","tmax","tavg","prcp","snow"]
    for k in keep:
        if k not in df.columns: df[k] = pd.NA
    df = df[keep]
    df.insert(0, "location", name)

    outpath = RAW_DAILY / f"{name}_daily_hddcdd.csv"
    df.to_csv(outpath, index=False)
    all_daily.append(df)

print("Chosen stations:", chosen)
print(f"Daily files saved to: {RAW_DAILY}")
if all_daily:
    display(all_daily[0].head())


100%|██████████| 3/3 [03:10<00:00, 63.39s/it]

Chosen stations: {'CorpusChristi_TX': {'name': 'Ingleside / Millsville', 'icao': 'KTFP', 'wmo': None}, 'Houston_TX': {'name': 'Houston Intercontinental', 'icao': 'KIAH', 'wmo': '72243'}, 'NewYork_NY': {'name': 'Newark Airport', 'icao': 'KEWR', 'wmo': '72502'}}
Daily files saved to: C:\Users\siddh\OneDrive - Texas A&M University-Corpus Christi\Documents\SIDDHARTHA\MASTER'S TAMUCC\MASTER'S THESIS\Dr.Pal Thesis\Data\Scenario-2\HDD_CDD\data_raw\hddcdd\daily





Unnamed: 0,location,date,tmean_f,HDD,CDD,tmin,tmax,tavg,prcp,snow
0,CorpusChristi_TX,2021-01-01,49.82,15.18,0.0,4.6,14.9,9.9,,
1,CorpusChristi_TX,2021-01-02,51.26,13.74,0.0,4.6,17.4,10.7,0.0,
2,CorpusChristi_TX,2021-01-03,56.12,8.88,0.0,6.1,20.8,13.4,0.0,
3,CorpusChristi_TX,2021-01-04,63.5,1.5,0.0,11.2,24.6,17.5,0.0,
4,CorpusChristi_TX,2021-01-05,66.56,0.0,1.56,14.2,23.7,19.2,3.8,


In [8]:
# Export one CSV per city with full daily info

import pandas as pd
from pathlib import Path
import glob

# Paths (adjust ROOT if your notebook isn't in the same folder you used before)
ROOT = Path.cwd()
RAW_DAILY = ROOT / "data_raw" / "hddcdd" / "daily"
OUT_DIR   = ROOT / "data_clean" / "hddcdd_daily_by_city"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# 1) Get the combined daily dataframe.
#    Use the in-memory frames when the download cell has run in this kernel and
#    produced data; otherwise reload the raw CSVs it wrote earlier.
try:
    frames = list(all_daily)
except NameError:
    frames = []

if not frames:
    # Covers both "download cell never ran" and "it ran but every city was skipped";
    # previously the latter crashed in pd.concat([]) with "No objects to concatenate".
    files = sorted(glob.glob(str(RAW_DAILY / "*_daily_hddcdd.csv")))  # sorted: stable load order
    if not files:
        raise SystemExit("No daily files found. Run the download cell first.")
    frames = [pd.read_csv(f, parse_dates=["date"]) for f in files]

daily_all = pd.concat(frames, ignore_index=True)

# 2) Keep columns in a fixed order, adding any that are absent as missing values
cols = ["location","date","tmean_f","HDD","CDD","tmin","tmax","tavg","prcp","snow"]
for c in cols:
    if c not in daily_all.columns:
        daily_all[c] = pd.NA
daily_all = daily_all[cols].copy()  # explicit copy so the assignment below never targets a view
daily_all["date"] = pd.to_datetime(daily_all["date"])

# 3) Write one CSV per city, rows ordered by date
saved = []
for loc, g in daily_all.groupby("location", sort=True):
    out_path = OUT_DIR / f"{loc}_daily_hddcdd.csv"
    g.sort_values("date").to_csv(out_path, index=False)
    saved.append(out_path)

print("Saved per-city files:")
for p in saved:
    print(" -", p)


Saved per-city files:
 - C:\Users\siddh\OneDrive - Texas A&M University-Corpus Christi\Documents\SIDDHARTHA\MASTER'S TAMUCC\MASTER'S THESIS\Dr.Pal Thesis\Data\Scenario-2\HDD_CDD\data_clean\hddcdd_daily_by_city\CorpusChristi_TX_daily_hddcdd.csv
 - C:\Users\siddh\OneDrive - Texas A&M University-Corpus Christi\Documents\SIDDHARTHA\MASTER'S TAMUCC\MASTER'S THESIS\Dr.Pal Thesis\Data\Scenario-2\HDD_CDD\data_clean\hddcdd_daily_by_city\Houston_TX_daily_hddcdd.csv
 - C:\Users\siddh\OneDrive - Texas A&M University-Corpus Christi\Documents\SIDDHARTHA\MASTER'S TAMUCC\MASTER'S THESIS\Dr.Pal Thesis\Data\Scenario-2\HDD_CDD\data_clean\hddcdd_daily_by_city\NewYork_NY_daily_hddcdd.csv
