In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from sklearn.linear_model import LinearRegression

Download the NOAA climdiv county data files and build one dataset for each climate category

In [21]:
# NOTE: these files are re-published frequently and the date stamp at the end of every
# file name changes with each release, so CLIMDIV_VERSION has to be updated to match the
# files currently listed at https://www.ncei.noaa.gov/pub/data/cirs/climdiv/
# e.g. change 'v1.0.0-20250306' to 'v1.0.0-20250404'.
# Data was originally retrieved from the '20250306' release; the version below is set to
# '20250404' for the github repository.
CLIMDIV_BASE_URL = 'https://www.ncei.noaa.gov/pub/data/cirs/climdiv/'
CLIMDIV_VERSION = 'v1.0.0-20250404'

# category name -> (element code used in the NOAA file name, unit label)
CLIMDIV_ELEMENTS = {
    'total_precipitation': ('pcpncy', 'inches'),
    'avg_temp': ('tmpccy', 'F'),
    'max_temp': ('tmaxcy', 'F'),
    'min_temp': ('tmincy', 'F'),
    'cool_deg_days': ('cddccy', 'count'),
    'heat_deg_days': ('hddccy', 'count'),
}

# category name -> (download url, unit label); the single version constant above is the
# only thing that needs editing when NOAA publishes a new release.
url_dict = {
    category: (f'{CLIMDIV_BASE_URL}climdiv-{element}-{CLIMDIV_VERSION}', unit)
    for category, (element, unit) in CLIMDIV_ELEMENTS.items()
}

In [22]:
# NOAA numbers the states with its own 01-50 codes, which are not the state FIPS codes.
# This lookup translates a NOAA state code into the matching two-digit FIPS code.
# FIPS reference:
# https://www.bls.gov/respondents/mwr/electronic-data-interchange/appendix-d-usps-state-abbreviations-and-fips-codes.htm
# The FIPS codes below are listed in NOAA-code order (position 1 -> NOAA '01', ...,
# position 50 -> NOAA '50').
# NOTE(review): no NOAA code maps to FIPS '11' in this table - confirm how the source
# files represent the District of Columbia.
state_code_dict = dict(zip(
    (f'{noaa_code:02d}' for noaa_code in range(1, 51)),
    ('01 04 05 06 08 09 10 12 13 16 '
     '17 18 19 20 21 22 23 24 25 26 '
     '27 28 29 30 31 32 33 34 35 36 '
     '37 38 39 40 41 42 44 45 46 47 '
     '48 49 50 51 53 54 55 56 15 02').split(),
))

In [23]:
# Column layout shared by every category's DataFrame (identical for all downloads, so it
# is built once instead of on every loop iteration).
columns = ['measure', 'unit', 'state', 'county', 'county_fips', 'year', 'jan', 'feb',
           'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

# category name -> DataFrame with one row per (county, year) and one column per month
all_dfs = {}
for category, (data_url, unit) in url_dict.items():
    # The file names change with every NOAA release, so a stale url is the most likely
    # failure. Raise on a bad HTTP status instead of trying to parse an error page, and
    # use a timeout so an unresponsive server cannot hang the notebook.
    response = requests.get(data_url, timeout=60)
    response.raise_for_status()

    line_list = []
    for line in response.text.splitlines():
        fields = line.split()
        if not fields:
            # blank line (e.g. trailing newline at the end of the file)
            continue

        # First field is an id string, the remaining fields are the monthly values.
        # NOTE(review): values are stored exactly as published; if the source uses
        # sentinel numbers for missing months they are NOT converted to NaN here.
        id_string = fields[0]
        months = [float(value) for value in fields[1:]]

        # id layout as sliced below: [0:2] NOAA state code, [2:5] county code,
        # [5:7] unused here (presumably the element code - TODO confirm), [7:] year
        state = id_string[:2]
        county = id_string[2:5]
        year = int(id_string[7:])

        # Build the 5-digit county FIPS from the *FIPS* state code, not the NOAA one.
        county_fips = state_code_dict[state] + county

        line_list.append([category, unit, state, county, county_fips, year] + months)

    all_dfs[category] = pd.DataFrame(line_list, columns=columns)

# Convenience aliases used by the cells below.
total_precip = all_dfs['total_precipitation']
max_temp = all_dfs['max_temp']
min_temp = all_dfs['min_temp']
avg_temp = all_dfs['avg_temp']
cool = all_dfs['cool_deg_days']
hot = all_dfs['heat_deg_days']

Calculate per-county means and linear-trend slopes (2000-2020) for every month in each dataset

In [14]:
# measure name -> DataFrame with one row per county holding, for every month, the mean
# value and the fitted linear trend over the 2000-2020 window
cleaned_dfs = {}

month_names = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
# 12 '<month>_mean' names followed by 12 '<month>_slope' names
column_months = ([m + '_mean' for m in month_names]
                 + [m + '_slope' for m in month_names])

for df in [total_precip, max_temp, min_temp, avg_temp, cool, hot]:

    # every row of a category frame carries the same measure / unit labels
    measure = df.loc[0, 'measure']
    unit = df.loc[0, 'unit']

    # output columns are suffixed with the measure so the frames can be merged later
    column_months_spec = [x + '_' + measure for x in column_months]
    cleaned_columns = ['measure', 'unit', 'county_fips'] + column_months_spec

    # filter for the years 2000-2020
    window = df[(df['year'] >= 2000) & (df['year'] < 2021)]

    # Group by a scalar key (not a one-element list) so the group keys are plain
    # county_fips strings regardless of pandas version.
    grouped = window.groupby('county_fips')
    means_df = grouped[month_names].mean()

    measure_dict = {}
    for county, county_df in grouped:
        # pull mean data
        month_means = list(means_df.loc[county])

        # Find slopes: regress each month's values on the actual calendar year, so the
        # slope is "change per year" even if a county has missing or unordered years.
        # A 2-D y fits all 12 months in one call; coef_ then has shape (12, 1).
        x = county_df[['year']].to_numpy()
        y = county_df[month_names].to_numpy()
        linreg = LinearRegression().fit(x, y)
        month_slopes = list(linreg.coef_[:, 0])

        # list data for each county, keyed by county_fips
        measure_dict[county] = [measure, unit, county] + month_means + month_slopes

    # create df for each measure (avg temp, precipitation, etc); passing columns here
    # also keeps an empty result well-formed instead of failing on column assignment
    measure_df = pd.DataFrame.from_dict(measure_dict, orient='index',
                                        columns=cleaned_columns)
    cleaned_dfs[measure] = measure_df

Merge all dataframes and export

In [17]:
# Per-measure summary frames, in the order they were added to cleaned_dfs.
all_dfs = [*cleaned_dfs.values()]
merge_dfs = all_dfs[1:]

# The first frame seeds the result; its 'measure' / 'unit' label columns are dropped
# because the measure is already encoded in the column names.
base_df = all_dfs[0].drop(columns=['measure', 'unit'])

# Left-join every remaining frame onto the seed by county, one measure at a time.
for measure_frame in merge_dfs:
    base_df = base_df.merge(
        measure_frame.drop(columns=['measure', 'unit']),
        how='left',
        on='county_fips',
    )

In [24]:
# Export the merged table. base_df is expected to be the result of the merge cell
# directly above; the later "without heat/cool days" cell rebinds base_df, so run this
# cell before that one.
# Create the output folder first so the export also works on a fresh checkout.
Path('merge_data').mkdir(parents=True, exist_ok=True)
base_df.to_csv('merge_data/climate_all_data_clean.csv', index = False)

Remove the heating and cooling degree-day measures and export that version too. This version is used for modeling because the heating/cooling degree-day data have too much missing data.

In [19]:
# Leave the degree-day measures out by name rather than by position, so the result does
# not depend on the order in which frames were inserted into cleaned_dfs.
excluded_measures = ('cool_deg_days', 'heat_deg_days')
all_dfs = [frame for measure_name, frame in cleaned_dfs.items()
           if measure_name not in excluded_measures]

# NOTE: this cell rebinds base_df (and all_dfs); the full-data export must be run
# before it.
base_df = all_dfs[0]
base_df = base_df.drop(['measure', 'unit'], axis = 1)
merge_dfs = all_dfs[1:]

# Left-join the remaining measures onto the first frame by county.
for df in merge_dfs:
    df = df.drop(['measure', 'unit'], axis = 1)
    base_df = base_df.merge(df,how = 'left', on='county_fips')

# Create the output folder first so the export also works on a fresh checkout.
Path('merge_data').mkdir(parents=True, exist_ok=True)
base_df.to_csv('merge_data/climate_all_data_clean_without_heat-cool-days.csv', index = False)