In [57]:
import dataclasses
import functools
import geopandas as gpd
import glob
import numpy as np
import pandas as pd
import re
from typing import Callable, Dict, List


us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}

us_abbrev_to_state = {v: k for k, v in us_state_to_abbrev.items()}


def load_shapes():
    # Load CoC shapefile
    coc_shapefile_path = 'data/CoC_GIS_National_Boundary_2022/CoC_GIS_National_Boundary.gdb'
    coc_gdf = gpd.read_file(coc_shapefile_path)
    # Load county shapefile
    county_shapefile_path = 'data/tl_2022_us_county/tl_2022_us_county.shp'
    county_gdf = gpd.read_file(county_shapefile_path)
    # Make sure both GeoDataFrames have the same CRS (Coordinate Reference System)
    county_gdf = county_gdf.to_crs(coc_gdf.crs)
    # Perform spatial join between CoCs and counties
    coc_county_gdf = gpd.sjoin(county_gdf, coc_gdf, how='inner', op='intersects', lsuffix='cnty', rsuffix='coc')
    coc_county_gdf.columns = coc_county_gdf.columns.str.lower()
    coc_county_gdf['county_name'] = coc_county_gdf['name']
    coc_county_gdf['coc_number'] = coc_county_gdf['cocnum']
    coc_county_gdf['coc_name'] = coc_county_gdf['cocname']
    coc_county_gdf = coc_county_gdf.drop(columns=['cocnum', 'cocname', 'name'])
    return coc_county_gdf[['coc_name', 'coc_number', 'county_name', 'state_name']]


def load_hud_coc_data(filename: str, year: int) -> pd.DataFrame:
    df = pd.read_excel(filename, sheet_name=str(year))
    df.columns = [
        col.lower()
            .replace(',', '')
            .replace(' - ', '_')
            .replace(' ', '_')
            .replace('/', '_')
            .replace('-', '_to_')
            .replace('(', '')
            .replace(')', '')
        for col in df.columns
    ]
    if year != 2022:
        df = df.rename(columns={
            f'overall_homeless_black_or_african_american_{year}': f'overall_homeless_black_african_american_or_african_{year}',
            f'overall_homeless_asian_{year}': f'overall_homeless_asian_or_asian_american_{year}',
        })
    year_cols = [
        'overall_homeless',
        'sheltered_total_homeless',
        'unsheltered_homeless',
        'overall_homeless_male',
        'overall_homeless_female',
        'overall_homeless_white',
        'overall_homeless_hispanic_latino',
        'overall_homeless_black_african_american_or_african',
        'overall_homeless_asian_or_asian_american',
    ]
    df = df[['coc_number', 'coc_name', *[f'{col}_{year}' for col in year_cols]]]
    return df


def load_hud_affordable_units_data(filename: str, by: str, year: int) -> pd.DataFrame:
    df = pd.read_excel(filename, header=0, usecols='A:L')
    df.columns = [col.lower() for col in df.columns]
    df = df[df['program_label'] == 'Summary of All HUD Programs']
    if 'quarter' in df.columns:
        assert all(df[df['quarter'].dt.year[0] == year])
    cols_to_return = [
        'state_name',
        f'affordable_units_total_{year}', f'affordable_units_occupied_{year}', f'affordable_units_vacant_{year}',
    ]
    df['state_name'] = df['states'].str[3:]
    df[f'affordable_units_occupied_{year}'] = (df['total_units'] * df['pct_occupied'] / 100).astype(np.int32)
    df[f'affordable_units_vacant_{year}'] = (df['total_units'] - df[f'affordable_units_occupied_{year}']).astype(np.int32)
    df = df.rename(columns={'total_units': f'affordable_units_total_{year}'})
    if by == 'county':
        cols_to_return.append('county_name')
        df['county_name'] = df['name'].str.removesuffix(' County')
        return df[cols_to_return]
    if by == 'state':
        df = df[df['state_name'].isin(us_state_to_abbrev.keys())]
        df = df[cols_to_return]
        df = df.groupby(['state_name']).sum()
        df = df.reset_index()
        return df[cols_to_return]
    raise ValueError(f'Unknown value for `by`, expected `county` or `state`, got `{by}`')


def load_usa_temperature_data(filename: str, by: str, year: int) -> pd.DataFrame:
    df = pd.read_csv(filename)
    df = df.rename(columns={'mean_annual_temperature': f'mean_annual_temperature_{year}'})
    return df
    
    
def load_bls_unemployment_data(filename: str, by: str) -> pd.DataFrame:
    df = pd.read_excel(filename, index_col=0, header=4, usecols='D:I', skipfooter=3)
    df = df[1:]
    df = df.drop(columns=['Unnamed: 5'])
    df = df.astype(np.int32)
    df.columns = [col.lower().replace(' ', '_') for col in df.columns]
    df = df.rename(columns={'force': 'labor_force'})
    df = pd.pivot(df, columns='year', values=['labor_force', 'employed', 'unemployed'])
    df.columns = ['_'.join(map(str, col)) for col in df.columns]
    data_cols = df.columns
    df = df.reset_index(names='state_and_county')
    ddf = df['state_and_county'].str.split(', ', expand=True)
    df = df.assign(
        county_name=ddf[0].str.removesuffix(' County'),
        state_name=ddf[1].map(us_abbrev_to_state),
    )
    df = df.reset_index()
    df = df[['state_name', 'county_name', *data_cols]]
    if by == 'county':
        return df
    if by == 'state':
        df = df.groupby('state_name').sum()
        df = df.drop(columns=['county_name'])
        return df
    raise ValueError(f'Unknown value for `by`, expected `county` or `state`, got `{by}`')


def load_census_population_data(filename: str, by: str, year: int) -> pd.DataFrame:
    years = [year]
    if year < 2021:
        df = pd.read_csv(filename)
        df = df[df['COUNTY'] != 0]
        col_mapping = {f'POPESTIMATE{year}': f'population_{year}' for year in years}
        df = df.rename(columns=col_mapping)
        df = df.assign(
            county_name=df['CTYNAME'].str.removesuffix(' County'),
            state_name=df['STNAME'],
        )
        data_cols = col_mapping.values()
    else:
        df = pd.read_excel(filename, index_col=0, header=3, usecols='A:E', skipfooter=5)
        df = df.drop(columns='Unnamed: 1')
        df.columns = [f'population_{col}' for col in df.columns]
        data_cols = df.columns
        df = df[1:]  # Remove row for entire United States
        df = df.reset_index(names='state_and_county')
        df['state_and_county'] = df['state_and_county'].str.removeprefix('.')
        ddf = df['state_and_county'].str.split(', ', expand=True)
        df = df.assign(
            county_name=ddf[0].str.removesuffix(' County'),
            state_name=ddf[1],
        )
    if by == 'county':
        return df[['state_name', 'county_name', *data_cols]]
    if by == 'state':
        df = df.drop(columns=['county_name'])
        df = df.groupby('state_name').sum().reset_index()
        return df[['state_name', *data_cols]]
    raise ValueError(f'Unknown value for `by`, expected `county` or `state`, got `{by}`')


def load_bea_income_data(filename: str, by: str, year: int) -> pd.DataFrame:
    years = [year]
    if by == 'county' and year > 2020:
        df = pd.read_excel(filename, index_col=0, header=3, usecols='A:D', skipfooter=5)
        df.columns = [f'per_capita_income_{col}' for col in df.columns]
        data_cols = df.columns
        df = df[1:]  # Remove row for entire United States
        df = df.reset_index(names='state_or_county')
        return load_bea_data(df, data_cols)
    if by == 'county':
        df = pd.read_csv(filename)
        df = df[df['Description'] == 'Per capita personal income (dollars) 2/']
        ddf = df['GeoName'].str.split(', ', expand=True)
        df = df.assign(
            county_name=ddf[0],
            state_name=ddf[1].map(us_abbrev_to_state),
        )
        df = df[~df['state_name'].isna()]
        col_mapping = {str(year): f'per_capita_income_{year}' for year in years}
        df = df.rename(columns=col_mapping)
        for col in col_mapping.values():
            df[col] = df[col].astype(np.float64)
        df = df[['state_name', 'county_name', *col_mapping.values()]]
        df = df.reset_index(drop=True)
        return df
    if by == 'state' and year > 2021:
        df = pd.read_excel(filename, index_col=0, header=3, usecols='A:D', skipfooter=5)
        df = df[df['Description'] == 'Per capita personal income (dollars) 2/']
        ddf = df['GeoName'].str.split(', ', expand=True)
        df = df.assign(
            county_name=ddf[0],
            state_name=ddf[1].map(us_abbrev_to_state),
        )
        df = df[df['state_name'].isin(us_state_to_abbrev.keys())]
        col_mapping = {str(year): f'per_capita_income_{year}' for year in years}
        df = df.rename(columns=col_mapping)
        df = df[['state_name', 'county_name', *col_mapping.values()]]
        return df
    if by == 'state':
        df = pd.read_csv(filename)
        df = df[df['Description'] == 'Per capita personal income (dollars) 2/']
        df = df.assign(state_name=df['GeoName'].str.removesuffix(' *'))
        df = df[df['state_name'].isin(us_state_to_abbrev.keys())]
        col_mapping = {str(year): f'per_capita_income_{year}' for year in years}
        df = df.rename(columns=col_mapping)
        df = df[['state_name', *col_mapping.values()]]
        df = df.reset_index(drop=True)
        return df
    raise ValueError(f'Unknown value for `by`, expected `county` or `state`, got `{by}`')


def load_bea_gdp_data(filename: str, by: str) -> pd.DataFrame:
    if by != 'county':
        raise ValueError(f'Unknown value for `by`, expected `county`, got `{by}`')
    df = pd.read_excel(filename, index_col=0, header=3, usecols='A:E', skipfooter=4)
    df.columns = [f'real_gdp_2012_dollars_{col}' for col in df.columns]
    data_cols = df.columns
    df = df * 1000  # Units are thousands of dollars. Convert to just dollars.
    df = df[1:]  # Remove row for entire United States.
    df = df.reset_index(names='state_or_county')
    return load_bea_data(df, data_cols)


def load_bea_data(df: pd.DataFrame, data_cols: List[str]) -> pd.DataFrame:
    df = df.copy(deep=True)
    rows = []
    df['is_state'] = False
    df['county_name'] = df['state_or_county']
    is_state = False
    for i, row in df.iterrows():
        if is_state:
            row.is_state = True
            is_state = False
        if all(row[data_cols].isna()):
            state_name = df.loc[i+1, 'state_or_county']
            is_state = True
        else:
            row['state_name'] = state_name
        rows.append(row)
    parsed_df = pd.DataFrame(rows)
    parsed_df = parsed_df.dropna()
    parsed_df = parsed_df[parsed_df['is_state'] == False]
    parsed_df = parsed_df.drop(columns=['is_state', 'state_or_county'])
    parsed_df = parsed_df[[
        'state_name', 'county_name', *data_cols]]
    return parsed_df


def load_zillow_housing_price_data(index: str, filename: str, by: str, year: int) -> pd.DataFrame:
    years = [year]
    df = pd.read_csv(filename)
    if by == 'county':
        df['county_name'] = df['RegionName'].str.removesuffix(' County')
        df['state_name'] = df['StateName'].map(us_abbrev_to_state)
        index_cols = ['state_name', 'county_name']
    elif by == 'state':
        df['state_name'] = df['RegionName'].str.removesuffix(' County')
        index_cols = ['state_name']
    else:
        raise ValueError(f'Unknown value for `by`, expected `county` or `state`, got `{by}`')
    for year in years:
        df[f'{index}_value_{year}'] = df[[v for v in df.columns if v.startswith(f'{year}-')]].mean(axis=1)
    df = df[[*index_cols, *[f'{index}_value_{year}' for year in years]]]
    return df


def load_opportunity_insights_social_capital_data(filename: str, by: str, year: int) -> pd.DataFrame:
    if by != 'county':
        raise ValueError(f'Unknown value for `by`, expected `county`, got `{by}`')
    df = pd.read_csv(filename)
    ddf = df['county_name'].str.split(', ', expand=True)
    df = df.assign(
        county_name=ddf[0],
        state_name=ddf[1],
    )
    df = df[[
        'state_name', 'county_name',
        'num_below_p50', 'ec_county', 'ec_se_county',
        'child_ec_county', 'child_ec_se_county', 'ec_grp_mem_county',
        'ec_high_county', 'ec_high_se_county', 'child_high_ec_county',
        'child_high_ec_se_county', 'ec_grp_mem_high_county',
        'exposure_grp_mem_county', 'exposure_grp_mem_high_county',
        'child_exposure_county', 'child_high_exposure_county',
        'bias_grp_mem_county', 'bias_grp_mem_high_county', 'child_bias_county',
        'child_high_bias_county', 'clustering_county', 'support_ratio_county',
        'volunteering_rate_county', 'civic_organizations_county',
    ]]
    df.columns = [f'{col}_{year}' if col not in ['state_name', 'county_name'] else col for col in df.columns]
    return df


def load_coc_to_county_df() -> pd.DataFrame:
    fips_to_county_df = pd.read_csv('data/fips2county.tsv', sep='\t')
    fips_to_county_df = fips_to_county_df.rename(columns={
        'StateFIPS': 'state_fips',
        'StateName': 'state_name',
        'CountyName': 'county_name',
        'CountyFIPS': 'county_fips',
    })
    fips_to_county_df = fips_to_county_df[['state_fips', 'state_name', 'county_name', 'county_fips']]
    fips_to_county_df

    coc_mapping_df = pd.read_csv('data/county_coc_match.csv')
    coc_mapping_df = coc_mapping_df[coc_mapping_df['rel_type'] != 4.0].groupby(['coc_name', 'coc_number', 'county_fips']).count().reset_index()
    coc_mapping_df = coc_mapping_df[['coc_name', 'coc_number', 'county_fips']]
    coc_mapping_df

    coc_mapping_df = coc_mapping_df.merge(fips_to_county_df, on=['county_fips'], how='inner')
    return coc_mapping_df


def merge_dfs(dfs: List[pd.DataFrame], cols: List[str]) -> pd.DataFrame:
    merged_df = dfs[0]
    for i, df in enumerate(dfs[1:]):
        merged_df = merged_df.merge(df, on=cols, how='inner')
        print(i, len(merged_df))
    return merged_df


def file_in_year_range(year: int, path_regex: str) -> str:
    files = glob.glob(f'{path_regex}_*')
    for f in files:
        m = re.search(f'{path_regex}_(\d+)_to_(\d+)\.*', f)
        start_year = int(m.group(1))
        end_year = int(m.group(2))
        if year >= start_year and year <= end_year:
            return f
    raise ValueError(f'Unable to find file in path for year {year}, by {by} at path {path_regex}')


def file_path_for_unemployement_data(year: int, by: str) -> str:
    return f'data/by_county/BLS_unemployment_by_county_{year}.xlsx'


def file_path_for_income_data(year: int, by: str) -> str:
    return file_in_year_range(year, f'data/by_{by}/BEA_income_by_{by}')


def file_path_for_gdp_data(year: int, by: str) -> str:
    return file_in_year_range(year, f'data/by_{by}/BEA_GDP_by_{by}')


def file_path_for_social_capital_data(year: int, by: str) -> str:
    return f'data/by_{by}/Opportunity_Insights_social_capital_by_{by}_{year}.csv'


def file_path_for_zillow_data(index: str, year: int, by: str) -> str:
    return file_in_year_range(year, f'data/by_{by}/Zillow_{index}_by_{by}')


def file_path_for_affordable_units_data(year: int, by: str) -> str:
    return f'data/by_{by}/HUD_projects_by_{by}_{year}.xlsx'


def file_path_for_population_data(year: int, by: str) -> str:
    return file_in_year_range(year, 'data/by_county/Census_population_by_county')

def file_path_for_temperature_data(year: int, by: str) -> str:
    return 'data/by_state/Average_annual_temperature_by_state.csv'


In [53]:
@dataclasses.dataclass
class Dataset:
    load_fn: Callable
    file_path_fn: Callable
    sum_cols: List[str] = ()
    mean_cols: List[str] = ()        


ALL_DATASETS = {
    'unemployment': Dataset(
        load_fn=load_bls_unemployment_data,
        file_path_fn=file_path_for_unemployement_data,
        sum_cols=['labor_force', 'employed', 'unemployed'],
    ),
    'income': Dataset(
        load_fn=load_bea_income_data,
        file_path_fn=file_path_for_income_data,
        mean_cols=['per_capita_income'],
    ),
    'gdp': Dataset(
        load_fn=load_bea_gdp_data,
        file_path_fn=file_path_for_gdp_data,
        sum_cols=['real_gdp_2012_dollars'],
    ),
    'social_capital': Dataset(
        load_fn=load_opportunity_insights_social_capital_data,
        file_path_fn=file_path_for_social_capital_data,
        mean_cols=[
            'ec_county',
            'child_ec_county',
            'ec_grp_mem_county',
            'ec_high_county',
            'child_high_ec_county',
            'ec_grp_mem_high_county',
            'exposure_grp_mem_county',
            'exposure_grp_mem_high_county',
            'child_exposure_county',
            'child_high_exposure_county',
            'bias_grp_mem_county',
            'bias_grp_mem_high_county',
            'child_bias_county',
            'child_high_bias_county',
            'clustering_county',
            'support_ratio_county',
            'volunteering_rate_county',
            'civic_organizations_county',
        ],
    ),
    'population': Dataset(
        load_fn=load_census_population_data,
        file_path_fn=file_path_for_population_data,
        sum_cols=['population'],
    ),
    'affordable_units': Dataset(
        load_fn=load_hud_affordable_units_data,
        file_path_fn=file_path_for_affordable_units_data,
        sum_cols=['affordable_units_total', 'affordable_units_occupied', 'affordable_units_vacant'],
    ),
    'all_homes': Dataset(
        load_fn=functools.partial(load_zillow_housing_price_data, 'zhvi_all_homes'),
        file_path_fn=functools.partial(file_path_for_zillow_data, 'zhvi_all_homes'),
        mean_cols=['zhvi_bottom_tier_value'],
    ),
    'bottom_tier_homes': Dataset(
        load_fn=functools.partial(load_zillow_housing_price_data, 'zhvi_bottom_tier'),
        file_path_fn=functools.partial(file_path_for_zillow_data, 'zhvi_bottom_tier'),
        mean_cols=['zhvi_all_homes_value'],
    ),
    'rent': Dataset(
        load_fn=functools.partial(load_zillow_housing_price_data, 'zori_rent'),
        file_path_fn=functools.partial(file_path_for_zillow_data, 'zori_rent'),
        mean_cols=['zori_rent_value'],
    ),
    'temperature': Dataset(
        load_fn=load_usa_temperature_data,
        file_path_fn=file_path_for_temperature_data,
        mean_cols=['mean_annual_temperature'],
    ),
}


def get_sum_cols(dataset_names: List[str], year: int) -> List[str]:
    cols = []
    for dataset_name in dataset_names:
        cols += ALL_DATASETS[dataset_name].sum_cols
    return [f'{col}_{year}' for col in cols]


def get_mean_cols(dataset_names: List[str], year: int) -> List[str]:
    cols = []
    for dataset_name in dataset_names:
        cols += ALL_DATASETS[dataset_name].mean_cols
    return [f'{col}_{year}' for col in cols]


def load_dataset(dataset: Dataset, year: int, by=str) -> pd.DataFrame:
    try:
        return dataset.load_fn(dataset.file_path_fn(year, by), by=by)
    except TypeError:
        return dataset.load_fn(dataset.file_path_fn(year, by), by=by, year=year)


def load_datasets(dataset_names: List[str], year: int, by: str) -> Dict[str, pd.DataFrame]:
    datasets = {}
    for dataset_name in dataset_names:
        datasets[dataset_name] = load_dataset(ALL_DATASETS[dataset_name], year, by)
    return datasets


def create_features_df(datasets: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    coc_to_county_df = load_coc_to_county_df()
    one_off_datasets = ['affordable_units', 'temperature']
    feature_merge_cols = ['state_name', 'county_name'] if by == 'county' else ['state_name']
    dfs = [v for k, v in datasets.items() if k not in one_off_datasets]
    features_df = merge_dfs(dfs + [coc_to_county_df], feature_merge_cols)
    if 'affordable_units' in datasets:
        features_df = features_df.merge(datasets['affordable_units'], how='left', on=feature_merge_cols).fillna(0)
    if 'temperature' in datasets:
        features_df = features_df.merge(datasets['temperature'], how='inner', on=['state_name'])
    return features_df


def create_model_df(features_df: pd.DataFrame, year: int, by: str) -> pd.DataFrame:
    coc_df = load_hud_coc_data('data/HUD_2007-2022-PIT-Counts-by-CoC.xlsx', year)
    coc_data_cols = [col for col in coc_df.columns if col not in ['coc_name', 'coc_number']]
    model_df = features_df.merge(coc_df, on=['coc_number'], how='inner')
    model_df = model_df.rename(columns={'coc_name_x': 'coc_name'})
    cols = ['coc_number', 'coc_name', 'state_name']
    if by == 'county':
        cols.append('county_name')
    model_df = model_df.sort_values(by=cols).reset_index(drop=True)
    return model_df, coc_data_cols


def create_dataset(config: DataConfig) -> pd.DataFrame:
    year = config.year
    by = config.by
    dataset_names = config.dataset_names
    datasets = load_datasets(dataset_names, year, by)
    features_df = create_features_df(datasets)
    model_df, coc_data_cols = create_model_df(features_df, year, by)
    
    coc_model_df = model_df.copy(deep=True)
    sum_cols = get_sum_cols(dataset_names, year)
    mean_cols = get_mean_cols(dataset_names, year)

    if by == 'state':
        grouped_by_df = coc_model_df.groupby(['state_name'])
        coc_data_df = grouped_by_df[coc_data_cols].sum().reset_index()
        state_data_df = grouped_by_df[sum_cols + mean_cols].mean().reset_index()
        df = merge_dfs([coc_data_df, state_data_df], cols=['state_name'])
        df = df.sort_values(by=['state_name'])
    
    elif by == 'county':
        grouped_by_df = coc_model_df.groupby(['coc_number', 'coc_name'])
        total_mean_cols = [f'population_weighted_{col}' for col in mean_cols]
        coc_model_df[total_mean_cols] = coc_model_df[mean_cols].multiply(coc_model_df[f'population_{year}'], axis=0)
        coc_data_df = grouped_by_df[coc_data_cols].mean().reset_index()
        mean_data_df = grouped_by_df[total_mean_cols].sum().reset_index()
        sum_data_df = grouped_by_df[sum_cols].sum().reset_index()
        df = merge_dfs([coc_data_df, mean_data_df, sum_data_df], cols=['coc_number', 'coc_name'])
        df[mean_cols] = df[total_mean_cols].div(df[f'population_{year}'], axis=0)
        df = df.drop(columns=total_mean_cols)
        df = df.sort_values(by=['coc_number'])
        
    df.columns = [col.removesuffix(f'_{year}') for col in df.columns]
    return df


In [58]:
@dataclasses.dataclass
class DataConfig:
    year: int
    by: str
    dataset_names: List[str]
    suffix: str = ''


common_dataset_names = [
    'unemployment',
    'income',
    'all_homes',
    'bottom_tier_homes',
    'population',
    'affordable_units',
    'temperature',
]

additional_dataset_names = [
    'gdp',
    'social_capital',
    'rent',
]

# years = [2016, 2017, 2018, 2019, 2020, 2021]
# years = [2017]
# years = list(range(2011, 2022))
years = [
    2021,
#     2020,  # ValueError: invalid literal for int() with base 10: 'N.A.'
    2019,
    2018,
    2017,
    2016,
    2015,
#     2014,  # Here and earlier: Doesn't have as granular breakdown of homeless demographics
#     2013,
#     2012,
#     2011,
]

data_configs = [
    *[DataConfig(year, 'state', common_dataset_names) for year in years],
#     *[DataConfig(year, 'county', common_dataset_names) for year in years],
#     DataConfig(2021, 'county', common_dataset_names + ['social_capital'], suffix='with_social_capital'),
#     DataConfig(2021, 'county', common_dataset_names + ['rent'], suffix='with_rent'),
]


In [59]:
for config in data_configs:
    print(config)
    year = config.year
    by = config.by
    suffix = f'_{config.suffix}' if config.suffix else ''
    df = create_dataset(config)
    df.to_csv(f'data/model/model_dataset_by_{by}_{year}{suffix}.csv', index=False)    

DataConfig(year=2014, by='state', dataset_names=['unemployment', 'income', 'all_homes', 'bottom_tier_homes', 'population', 'affordable_units', 'temperature'], suffix='')
0 50
1 50
2 50
3 50
4 3102
0 50
DataConfig(year=2013, by='state', dataset_names=['unemployment', 'income', 'all_homes', 'bottom_tier_homes', 'population', 'affordable_units', 'temperature'], suffix='')


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [56]:
load_hud_affordable_units_data(file_path_for_affordable_units_data(2014, 'state'), by='state', year=2014)

Unnamed: 0,state_name,affordable_units_total_2014,affordable_units_occupied_2014,affordable_units_vacant_2014
0,Alabama,93537,84183,9354
1,Alaska,7728,6877,851
2,Arizona,40786,37523,3263
3,Arkansas,50996,45896,5100
4,California,481569,447859,33710
5,Colorado,60751,56498,4253
6,Connecticut,81875,74506,7369
7,Delaware,12919,11627,1292
8,District of Columbia,34114,28314,5800
9,Florida,195423,181743,13680
