In [84]:
import pandas as pd
import os
import numpy as np
import requests

# Working directory of the kernel process; the output CSV path is built relative to it
current_dir = os.getcwd()

Before we begin processing our climate data, there is a problem to address: NOAA, the organization we are scraping data from, uses its own set of state codes rather than the FIPS state codes. To fix this, we build a dictionary that maps NOAA's state codes to FIPS state codes, and then combine the FIPS state code with the county code to create the full FIPS code used in the rest of our datasets.

In [85]:
# Lookup table translating NOAA's own two-digit state codes (keys) into
# two-digit FIPS state codes (values). Both sides are kept as zero-padded
# strings so they can be concatenated with the three-digit county code.
# State labels follow the standard FIPS state code assignments.
state_code_mapping = {
    '01': '01',  # Alabama
    '02': '04',  # Arizona
    '03': '05',  # Arkansas
    '04': '06',  # California
    '05': '08',  # Colorado
    '06': '09',  # Connecticut
    '07': '10',  # Delaware
    '08': '12',  # Florida
    '09': '13',  # Georgia
    '10': '16',  # Idaho
    '11': '17',  # Illinois
    '12': '18',  # Indiana
    '13': '19',  # Iowa
    '14': '20',  # Kansas
    '15': '21',  # Kentucky
    '16': '22',  # Louisiana
    '17': '23',  # Maine
    '18': '24',  # Maryland
    '19': '25',  # Massachusetts
    '20': '26',  # Michigan
    '21': '27',  # Minnesota
    '22': '28',  # Mississippi
    '23': '29',  # Missouri
    '24': '30',  # Montana
    '25': '31',  # Nebraska
    '26': '32',  # Nevada
    '27': '33',  # New Hampshire
    '28': '34',  # New Jersey
    '29': '35',  # New Mexico
    '30': '36',  # New York
    '31': '37',  # North Carolina
    '32': '38',  # North Dakota
    '33': '39',  # Ohio
    '34': '40',  # Oklahoma
    '35': '41',  # Oregon
    '36': '42',  # Pennsylvania
    '37': '44',  # Rhode Island
    '38': '45',  # South Carolina
    '39': '46',  # South Dakota
    '40': '47',  # Tennessee
    '41': '48',  # Texas
    '42': '49',  # Utah
    '43': '50',  # Vermont
    '44': '51',  # Virginia
    '45': '53',  # Washington
    '46': '54',  # West Virginia
    '47': '55',  # Wisconsin
    '48': '56',  # Wyoming
    '50': '02',  # Alaska (NOAA code 49 is not mapped)
}

In [86]:
# URLs we are scraping: one NOAA climdiv file per climate variable.
# The key names the variable and is later used to pick how its monthly
# values are collapsed into a yearly figure; insertion order is the order
# in which the files are downloaded.
# NOTE(review): every file name ends in what looks like a release date
# (20240506) - confirm these URLs still resolve before re-running.
urls = dict(
    Avg_Temp="https://www.ncei.noaa.gov/pub/data/cirs/climdiv/climdiv-tmpccy-v1.0.0-20240506",
    Max_Temp="https://www.ncei.noaa.gov/pub/data/cirs/climdiv/climdiv-tmaxcy-v1.0.0-20240506",
    Min_Temp="https://www.ncei.noaa.gov/pub/data/cirs/climdiv/climdiv-tmincy-v1.0.0-20240506",
    Precipitation="https://www.ncei.noaa.gov/pub/data/cirs/climdiv/climdiv-pcpncy-v1.0.0-20240506",
)

In [87]:
# List of DataFrames we are appending to: one yearly-aggregated frame per
# climate variable, in the same order as `urls`
dfs_climate = []

# Only rows whose year falls inside this inclusive range are kept
FIRST_YEAR, LAST_YEAR = 1966, 2022

# Columns produced while parsing a single line of a NOAA file
KEY_COLUMNS = ['FIPS', 'State_Code', 'County_FIPS', 'Year']
MONTH_COLUMNS = [f'month_{i}' for i in range(1, 13)]

# How each variable's twelve monthly values collapse into one yearly value:
# variable name -> (output column name, row-wise reduction)
YEARLY_AGGREGATIONS = {
    'Avg_Temp': ('Avg_Temp', 'mean'),
    'Max_Temp': ('Max_Temp', 'max'),
    'Min_Temp': ('Min_Temp', 'min'),
    'Precipitation': ('Total_Precip', 'sum'),
}

# Iterate through each URL
for var, url in urls.items():
    # Fetch the file. The timeout keeps a stalled connection from hanging the
    # kernel, and raise_for_status() stops an HTTP error page (e.g. a 404)
    # from being parsed as if it were climate data.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    parsed_data = []

    for line in response.text.strip().splitlines():
        # Tolerate stray blank lines instead of failing on int('')
        if not line.strip():
            continue

        # The first 11 characters identify the record: state code (2),
        # county code (3), two characters that are not used here, year (4)
        identification_number = line[:11]
        year = int(identification_number[7:11])

        # Filter on the year first so rows outside 1966-2022 are discarded
        # before their monthly values are converted to floats
        if not FIRST_YEAR <= year <= LAST_YEAR:
            continue

        state_code = identification_number[:2]
        county_FIPS = identification_number[2:5]

        # Combine state_code and county_FIPS to form the FIPS code.
        # int() drops the leading zero, e.g. '01' + '001' -> 1001.
        # NOTE(review): an unmapped NOAA state code falls back to itself,
        # which could silently yield a FIPS code belonging to another state -
        # confirm every code in the files is covered by state_code_mapping.
        state_fips_code = state_code_mapping.get(state_code, state_code)
        fips_code = int(state_fips_code + county_FIPS)

        row = {'FIPS': fips_code, 'State_Code': state_code, 'County_FIPS': county_FIPS, 'Year': year}
        # Everything after the identification number is whitespace-separated
        # monthly values.
        # NOTE(review): NOAA presumably marks missing months with sentinel
        # values; none are filtered here - confirm none occur in 1966-2022.
        for i, val in enumerate(line[11:].split(), start=1):
            row[f'month_{i}'] = float(val)

        parsed_data.append(row)

    if not parsed_data:
        raise ValueError(f"No rows for {FIRST_YEAR}-{LAST_YEAR} could be parsed from {url}")

    # Create DataFrame
    df = pd.DataFrame(parsed_data)

    # Consolidate data from monthly to yearly
    if var in YEARLY_AGGREGATIONS:
        column, how = YEARLY_AGGREGATIONS[var]
        df[column] = df[MONTH_COLUMNS].agg(how, axis=1)
        df = df[KEY_COLUMNS + [column]]

    # Append DataFrame to the list of DataFrames
    dfs_climate.append(df)

In [88]:
# Columns that identify one county-year and are shared by every
# per-variable DataFrame
join_keys = ['FIPS', 'State_Code', 'County_FIPS', 'Year']

# Start from the first variable's frame and outer-join each remaining frame
# onto it, so a county-year present in any file is kept in the result
merged_df_climate = dfs_climate[0]
for yearly_frame in dfs_climate[1:]:
    merged_df_climate = merged_df_climate.merge(yearly_frame, on=join_keys, how='outer')

# Preview the first rows of the combined table
merged_df_climate.head()

Unnamed: 0,FIPS,State_Code,County_FIPS,Year,Avg_Temp,Max_Temp,Min_Temp,Total_Precip
0,1001,1,1,1966,62.391667,93.3,32.2,57.23
1,1001,1,1,1967,62.541667,88.1,34.9,56.32
2,1001,1,1,1968,61.633333,92.4,29.5,43.67
3,1001,1,1,1969,62.058333,92.5,32.3,48.11
4,1001,1,1,1970,62.666667,92.1,28.1,50.06


In [89]:
# Sanity check: collect every row that has a missing value in any column.
# After the outer merge, a NaN here would mean a county-year was absent
# from at least one of the source files.
has_missing_value = merged_df_climate.isna().any(axis=1)
rows_with_nan = merged_df_climate.loc[has_missing_value].copy()
print(rows_with_nan)

Empty DataFrame
Columns: [FIPS, State_Code, County_FIPS, Year, Avg_Temp, Max_Temp, Min_Temp, Total_Precip]
Index: []


In [90]:
# Put merged DataFrame into a CSV for easier access.
# The file goes into the SharedData folder one level above the working
# directory; the DataFrame index is not written.
climate_csv_path = os.path.join(current_dir, '..', 'SharedData', 'Climate_Data_with_FIPS.csv')
merged_df_climate.to_csv(climate_csv_path, index=False)