# Climate Data Processing Summary

## 1. Data Source
The analysis uses statewide climate data from the NOAA nClimDiv (Climate Divisional Database). 
These datasets provide monthly statewide averages for:
- average temperature (TAVG)
- cooling degree days (CDD)
- heating degree days (HDD)

The files are distributed in fixed-width text format under the NOAA CIRS/climdiv directory.

## 2. Processing Steps
1. Download statewide files for TAVG, CDD, and HDD.
2. Read each file using NOAA’s fixed-width column specifications (state code, element code, year, and 12 monthly values).
3. Keep only state codes 001–050, which correspond to U.S. states.
4. Reshape each dataset from wide to long format (one row per state-year-month).
5. Clean missing values and convert all fields to numeric types.
6. Merge TAVG, CDD, and HDD into a single panel dataset.
7. Map each state code to its state name and two-letter postal abbreviation.



## 3. Output
The processed dataset is saved as:
``climate_statewide_clean.csv``  
with variables:
state, year, month, tavg, cdd, hdd, state_name, state_abbrev

## Note on Hawaii Data
Although Hawaii (state code 049) appears in the nClimDiv documentation, the statewide TAVG, CDD, and HDD files do not include Hawaii records in the public CIRS/climdiv distribution. As a result, the final dataset contains 49 states.


In [19]:
import requests

def download(url, filename):
    r = requests.get(url)
    with open(filename, "wb") as f:
        f.write(r.content)
    print("Downloaded:", filename)

base = "https://www.ncei.noaa.gov/pub/data/cirs/climdiv/"

files = [
    "climdiv-tmpcst-v1.0.0-20250905",
    "climdiv-cddcst-v1.0.0-20250905",
    "climdiv-hddcst-v1.0.0-20250905",
]

for f in files:
    download(base + f, f)


Downloaded: climdiv-tmpcst-v1.0.0-20250905
Downloaded: climdiv-cddcst-v1.0.0-20250905
Downloaded: climdiv-hddcst-v1.0.0-20250905


In [None]:
import pandas as pd
import requests

# 0. State code → state name
state_name_map = {
    1:  "Alabama", 2:  "Arizona", 3:  "Arkansas", 4:  "California",
    5:  "Colorado", 6:  "Connecticut", 7:  "Delaware", 8:  "Florida",
    9:  "Georgia", 10: "Idaho", 11: "Illinois", 12: "Indiana",
    13: "Iowa", 14: "Kansas", 15: "Kentucky", 16: "Louisiana",
    17: "Maine", 18: "Maryland", 19: "Massachusetts", 20: "Michigan",
    21: "Minnesota", 22: "Mississippi", 23: "Missouri", 24: "Montana",
    25: "Nebraska", 26: "Nevada", 27: "New Hampshire", 28: "New Jersey",
    29: "New Mexico", 30: "New York", 31: "North Carolina", 32: "North Dakota",
    33: "Ohio", 34: "Oklahoma", 35: "Oregon", 36: "Pennsylvania",
    37: "Rhode Island", 38: "South Carolina", 39: "South Dakota",
    40: "Tennessee", 41: "Texas", 42: "Utah", 43: "Vermont", 44: "Virginia",
    45: "Washington", 46: "West Virginia", 47: "Wisconsin", 48: "Wyoming",
    49: "Hawaii", 50: "Alaska"
}

state_abbrev_map = {
    "Alabama": "AL", "Arizona": "AZ", "Arkansas": "AR", "California": "CA",
    "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE", "Florida": "FL",
    "Georgia": "GA", "Idaho": "ID", "Illinois": "IL", "Indiana": "IN",
    "Iowa": "IA", "Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA",
    "Maine": "ME", "Maryland": "MD", "Massachusetts": "MA", "Michigan": "MI",
    "Minnesota": "MN", "Mississippi": "MS", "Missouri": "MO", "Montana": "MT",
    "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ",
    "New Mexico": "NM", "New York": "NY", "North Carolina": "NC",
    "North Dakota": "ND", "Ohio": "OH", "Oklahoma": "OK", "Oregon": "OR",
    "Pennsylvania": "PA", "Rhode Island": "RI", "South Carolina": "SC",
    "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT",
    "Vermont": "VT", "Virginia": "VA", "Washington": "WA",
    "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY",
    "Hawaii": "HI", "Alaska": "AK"
}

# 1. Download NOAA statewide files

base_url = "https://www.ncei.noaa.gov/pub/data/cirs/climdiv/"

files = {
    "tavg": "climdiv-tmpcst-v1.0.0-20250905",
    "cdd":  "climdiv-cddcst-v1.0.0-20250905",
    "hdd":  "climdiv-hddcst-v1.0.0-20250905",
}

for key, fname in files.items():
    print(f"Downloading {fname}...")
    url = base_url + fname
    r = requests.get(url)
    with open(fname, "wb") as f:
        f.write(r.content)
    print("✔ Downloaded", fname)


# 2. Fixed-width specifications
colspecs = [
    (0, 3), (3, 4), (4, 6), (6, 10),
    (10, 17), (17, 24), (24, 31), (31, 38),
    (38, 45), (45, 52), (52, 59), (59, 66),
    (66, 73), (73, 80), (80, 87), (87, 94)
]

names = [
    "state", "division", "element", "year",
    "jan","feb","mar","apr","may","jun",
    "jul","aug","sep","oct","nov","dec"
]


# 3. Load + clean + melt
def load_climdiv(path):
    df = pd.read_fwf(path, colspecs=colspecs, names=names)

    df = df.astype({"state": int, "division": int, "element": int, "year": int})

    df = df[df["state"] <= 50].copy()

    df = df.melt(
        id_vars=["state", "element", "year"],
        value_vars=["jan","feb","mar","apr","may","jun",
                    "jul","aug","sep","oct","nov","dec"],
        var_name="month",
        value_name="value"
    )

    month_map = {"jan":1,"feb":2,"mar":3,"apr":4,"may":5,"jun":6,
                 "jul":7,"aug":8,"sep":9,"oct":10,"nov":11,"dec":12}

    df["month"] = df["month"].map(month_map)
    df["value"] = pd.to_numeric(df["value"], errors="coerce")
    df = df.dropna(subset=["value"])

    return df

# 4. Load and rename
tavg = load_climdiv(files["tavg"]).rename(columns={"value": "tavg"}).drop(columns=["element"])
cdd  = load_climdiv(files["cdd"]).rename(columns={"value": "cdd"}).drop(columns=["element"])
hdd  = load_climdiv(files["hdd"]).rename(columns={"value": "hdd"}).drop(columns=["element"])


# 5. Merge all climate variables
climate = (
    tavg.merge(cdd, on=["state", "year", "month"])
        .merge(hdd, on=["state", "year", "month"])
)


# 6. Add state_name and abbreviation
climate["state_name"] = climate["state"].map(state_name_map)
climate["state_abbrev"] = climate["state_name"].map(state_abbrev_map)

print("Final dataset:", climate.shape)
print(climate.head())

# 7. Save to CSV
climate.to_csv("climate_statewide_clean.csv", index=False)
print("Saved climate_statewide_clean.csv")


Downloading climdiv-tmpcst-v1.0.0-20250905...
✔ Downloaded climdiv-tmpcst-v1.0.0-20250905
Downloading climdiv-cddcst-v1.0.0-20250905...
✔ Downloaded climdiv-cddcst-v1.0.0-20250905
Downloading climdiv-hddcst-v1.0.0-20250905...
✔ Downloaded climdiv-hddcst-v1.0.0-20250905
✔ Final dataset: (76668, 8)
   state  year  month  tavg   cdd    hdd state_name state_abbrev
0      1  1895      1  43.1   5.0  717.0    Alabama           AL
1      1  1896      1  43.5   4.0  693.0    Alabama           AL
2      1  1897      1  41.8   3.0  752.0    Alabama           AL
3      1  1898      1  49.0  19.0  545.0    Alabama           AL
4      1  1899      1  43.8   5.0  690.0    Alabama           AL
✔ Saved climate_statewide_clean.csv


In [None]:
present_states = sorted(climate["state_name"].unique())
print("States found:", len(present_states))
print(present_states)

States found: 49
['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']


# EIA Electricity Data Processing Summary

## 1. Data Source
The electricity data comes from the U.S. Energy Information Administration (EIA) Retail Sales dataset accessed through the EIA v2 API.
The dataset includes monthly statewide electricity retail sales, including:

electricity sales in megawatt-hours (sales_mwh)

number of customers (customers)

revenue in millions of dollars (revenue_million_dollars)
All 50 states and the District of Columbia are included.

## 2. Data Collection Steps

Query the EIA v2 API using monthly frequency and request sales, customers, and revenue.

Loop through the API using an offset parameter to retrieve all rows, since the API returns data in batches.

Combine all retrieved rows into a single dataset.

Save the raw dataset as eia_retail_sales_states_monthly_full.csv.



In [None]:
import requests
import pandas as pd

API_KEY = ""
url = f"https://api.eia.gov/v2/electricity/retail-sales/data/?api_key={API_KEY}"

states = [
    "AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI","ID","IL","IN","IA","KS","KY",
    "LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND",
    "OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY","DC"
]

payload_base = {
    "frequency": "monthly",
    "data": ["sales", "customers", "revenue"],
    "facets": {
        "stateid": states,
        "sectorid": ["ALL"]
    },
    "sort": [{"column": "period", "direction": "asc"}],
    "length": 5000,
}

all_rows = []

offset = 0
while True:
    payload = payload_base.copy()
    payload["offset"] = offset
    r = requests.post(url, json=payload).json()

    if "response" not in r or len(r["response"]["data"]) == 0:
        break
    
    all_rows.extend(r["response"]["data"])
    offset += 5000
    print("Fetched:", len(all_rows))

df = pd.DataFrame(all_rows)
df.to_csv("eia_retail_sales_states_monthly_full.csv", index=False)
df.head()


Fetched: 5000
Fetched: 10000
Fetched: 15000
Fetched: 15096


Unnamed: 0,period,stateid,stateDescription,sectorid,sectorName,sales,customers,revenue,sales-units,customers-units,revenue-units
0,2001-01,AK,Alaska,ALL,all sectors,521.03566,,51.96404,million kilowatt hours,number of customers,million dollars
1,2001-01,AL,Alabama,ALL,all sectors,7362.47302,,407.61261,million kilowatt hours,number of customers,million dollars
2,2001-01,AR,Arkansas,ALL,all sectors,3804.21013,,216.58535,million kilowatt hours,number of customers,million dollars
3,2001-01,AZ,Arizona,ALL,all sectors,4786.79176,,304.10688,million kilowatt hours,number of customers,million dollars
4,2001-01,CA,California,ALL,all sectors,21744.31668,,1893.25678,million kilowatt hours,number of customers,million dollars


## 3. Cleaning Steps

Extract year and month values from the period field (formatted as YYYY-MM).

Rename columns for clarity:

stateDescription → state_name

sales → sales_mwh

revenue → revenue_million_dollars

Remove metadata fields such as sales-units, customers-units, revenue-units, sectorid, and sectorName.

Remove the period field after extracting year and month.

Drop the customers field if it contains no valid data.

Reorder columns into a consistent structure.
## 4. Output
The cleaned dataset is saved as eia_cleaned_state_monthly_electricity.csv.
Variables included: year, month, stateid, state_name, sales_mwh, and revenue_million_dollars.
This dataset is ready to be merged with the cleaned NOAA climate dataset.

In [None]:
import pandas as pd

# 1. Load the raw EIA dataset
df = pd.read_csv("eia_retail_sales_states_monthly_full.csv")

# 2. Extract year and month from the 'period' field
df["year"] = df["period"].str[:4].astype(int)
df["month"] = df["period"].str[5:7].astype(int)

# 3. Rename columns to cleaner, more descriptive names
df = df.rename(columns={
    "stateDescription": "state_name",
    "sales": "sales_mwh",
    "revenue": "revenue_million_dollars"
})

# 4. Columns to remove (only if they exist—prevents errors)
cols_to_drop = [
    "period",              # remove 'period' field
    "sales-units",
    "customers-units",
    "revenue-units",
    "sectorid",
    "sectorName"
]

df = df.drop(columns=[c for c in cols_to_drop if c in df.columns])

# 5. If the 'customers' column contains only missing values, drop it
if "customers" in df.columns and df["customers"].isna().all():
    df = df.drop(columns=["customers"])

# 6. Reorder columns for a cleaner structure
keep_order = [
    "year", "month",
    "stateid", "state_name",
    "sales_mwh", "revenue_million_dollars"
]

df = df[keep_order]

# 7. Save cleaned dataset
df.to_csv("eia_cleaned_state_monthly_electricity.csv", index=False)

df


Unnamed: 0,year,month,stateid,state_name,sales_mwh,revenue_million_dollars
0,2001,1,AK,Alaska,521.03566,51.96404
1,2001,1,AL,Alabama,7362.47302,407.61261
2,2001,1,AR,Arkansas,3804.21013,216.58535
3,2001,1,AZ,Arizona,4786.79176,304.10688
4,2001,1,CA,California,21744.31668,1893.25678
...,...,...,...,...,...,...
15091,2025,8,VT,Vermont,481.97770,91.56739
15092,2025,8,WA,Washington,7434.05290,830.89085
15093,2025,8,WI,Wisconsin,6574.10561,913.08143
15094,2025,8,WV,West Virginia,2786.21362,319.77006
