In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [2]:
import os

# Folder holding one CDC vaccination CSV per snapshot date.
csv_folder = 'data/CDC'

# Snapshot dates, written exactly as they appear in the CSV file names.
date_list = [
    "05-31-2021",
    "06-30-2021",
    "07-31-2021",
    "08-31-2021",
    "09-30-2021",
    "10-31-2021",
    "11-30-2021",
    "12-31-2021",
    "01-31-2022",
    "02-28-2022",
    "03-31-2022",
    "04-30-2022",
    "05-31-2022",
    "06-29-2022",
    "07-27-2022",
    "08-31-2022",
    "09-28-2022",
]

# Load every snapshot that exists on disk, keyed by its date string.
# Missing files are reported and simply left out of the mapping.
data_dict = {}
for date in date_list:
    file_path = os.path.join(csv_folder, f'vaccinations-{date}.csv')
    if not os.path.exists(file_path):
        print(f"File not found for {date}")
        continue
    df = pd.read_csv(file_path)
    data_dict[date] = df

# data_dict now maps each available date to its DataFrame.
print(data_dict.keys())


dict_keys(['05-31-2021', '06-30-2021', '07-31-2021', '08-31-2021', '09-30-2021', '10-31-2021', '11-30-2021', '12-31-2021', '01-31-2022', '02-28-2022', '03-31-2022', '04-30-2022', '05-31-2022', '06-29-2022', '07-27-2022', '08-31-2022', '09-28-2022'])


In [3]:
# Build a second mapping in which every snapshot is sorted by its 'FIPS' column.
sorted_data_dict = {}
for date, df in data_dict.items():
    sorted_data_dict[date] = df.sort_values(by='FIPS')

# Spot-check the first rows of a few snapshots.
for preview_date in ('05-31-2021', '06-30-2021', '07-31-2021', '02-28-2022', '03-31-2022'):
    print(sorted_data_dict[preview_date].head())

# Report how many rows each snapshot contains.
for date, df in sorted_data_dict.items():
    print(f"Dataset for {date} has {df.shape[0]} rows.")


            Date   FIPS  MMWR_week    Recip_County Recip_State  \
1576  05/31/2021  01001         22  Autauga County          AL   
203   05/31/2021  01003         22  Baldwin County          AL   
184   05/31/2021  01005         22  Barbour County          AL   
780   05/31/2021  01007         22     Bibb County          AL   
2462  05/31/2021  01009         22   Blount County          AL   

      Completeness_pct  Administered_Dose1_Recip  Administered_Dose1_Pop_Pct  \
1576              91.8                   13982.0                        25.0   
203               91.8                   64644.0                        29.0   
184               91.8                    5739.0                        23.2   
780               91.8                    5131.0                        22.9   
2462              91.8                   10659.0                        18.4   

      Administered_Dose1_Recip_5Plus  Administered_Dose1_Recip_5PlusPop_Pct  \
1576                             NaN       

In [4]:
# Stack all per-date snapshots into one long frame with a fresh 0..n-1 index.
merged_df = pd.concat(list(sorted_data_dict.values()), ignore_index=True)
print(merged_df.shape)

# Remove columns that contain no data in any snapshot. The frame is reassigned
# instead of mutated with inplace=True, and the earlier null-count statement
# whose result was silently discarded (only a cell's last expression is
# displayed) has been dropped.
merged_df = merged_df.dropna(axis=1, how='all')

# Missing values per remaining column; shown as the cell's output.
merged_df.isnull().sum()

(55808, 80)


Date                        0
FIPS                        0
MMWR_week                   0
Recip_County                0
Recip_State                 3
                        ...  
Census2019_5PlusPop     23598
Census2019_5to17Pop     33261
Census2019_12PlusPop     1058
Census2019_18PlusPop     1051
Census2019_65PlusPop    23598
Length: 72, dtype: int64

In [5]:
# Every column whose name contains 'Booster' gets its missing values set to 0.
columns_to_fill = list(filter(lambda name: 'Booster' in name, merged_df.columns))
merged_df[columns_to_fill] = merged_df.loc[:, columns_to_fill].fillna(value=0)


In [6]:
print(merged_df.columns.tolist())

# Count missing values per column, then discard (in place) every column that
# has more than 20000 of them.
null_counts = merged_df.isnull().sum()
columns_to_drop = null_counts.loc[null_counts.gt(20000)].index
merged_df.drop(columns=columns_to_drop, inplace=True)


['Date', 'FIPS', 'MMWR_week', 'Recip_County', 'Recip_State', 'Completeness_pct', 'Administered_Dose1_Recip', 'Administered_Dose1_Pop_Pct', 'Administered_Dose1_Recip_5Plus', 'Administered_Dose1_Recip_5PlusPop_Pct', 'Administered_Dose1_Recip_12Plus', 'Administered_Dose1_Recip_12PlusPop_Pct', 'Administered_Dose1_Recip_18Plus', 'Administered_Dose1_Recip_18PlusPop_Pct', 'Administered_Dose1_Recip_65Plus', 'Administered_Dose1_Recip_65PlusPop_Pct', 'Series_Complete_Yes', 'Series_Complete_Pop_Pct', 'Series_Complete_5Plus', 'Series_Complete_5PlusPop_Pct', 'Series_Complete_5to17', 'Series_Complete_5to17Pop_Pct', 'Series_Complete_12Plus', 'Series_Complete_12PlusPop_Pct', 'Series_Complete_18Plus', 'Series_Complete_18PlusPop_Pct', 'Series_Complete_65Plus', 'Series_Complete_65PlusPop_Pct', 'Booster_Doses', 'Booster_Doses_Vax_Pct', 'Booster_Doses_5Plus', 'Booster_Doses_5Plus_Vax_Pct', 'Booster_Doses_12Plus', 'Booster_Doses_12Plus_Vax_Pct', 'Booster_Doses_18Plus', 'Booster_Doses_18Plus_Vax_Pct', 'Boost

In [7]:
# Display the columns that were selected for removal because they had more
# than 20000 missing values.
columns_to_drop

Index(['Administered_Dose1_Recip_5Plus',
       'Administered_Dose1_Recip_5PlusPop_Pct', 'Series_Complete_5Plus',
       'Series_Complete_5PlusPop_Pct', 'Series_Complete_5to17',
       'Series_Complete_5to17Pop_Pct', 'Series_Complete_5PlusPop_Pct_SVI',
       'Series_Complete_5to17Pop_Pct_SVI',
       'Series_Complete_5PlusPop_Pct_UR_Equity',
       'Series_Complete_5to17Pop_Pct_UR_Equity', 'Census2019_5PlusPop',
       'Census2019_5to17Pop', 'Census2019_65PlusPop'],
      dtype='object')

In [8]:
# Missing values per column after the sparse columns were removed.
merged_df.isna().sum()

Date                                          0
FIPS                                          0
MMWR_week                                     0
Recip_County                                  0
Recip_State                                   3
Completeness_pct                            211
Administered_Dose1_Recip                   1941
Administered_Dose1_Pop_Pct                 1094
Administered_Dose1_Recip_12Plus            3395
Administered_Dose1_Recip_12PlusPop_Pct     1663
Administered_Dose1_Recip_18Plus            3037
Administered_Dose1_Recip_18PlusPop_Pct     1299
Administered_Dose1_Recip_65Plus            3030
Administered_Dose1_Recip_65PlusPop_Pct     1292
Series_Complete_Yes                         203
Series_Complete_Pop_Pct                     633
Series_Complete_12Plus                      563
Series_Complete_12PlusPop_Pct               993
Series_Complete_18Plus                      203
Series_Complete_18PlusPop_Pct               633
Series_Complete_65Plus                  

In [9]:
# Rewrite the Date column as 'YYYY-MM-DD' strings.
parsed_dates = pd.to_datetime(merged_df['Date'])
merged_df['Date'] = parsed_dates.dt.strftime('%Y-%m-%d')

# FIPS-major, date-ordered copy of the data; merged_df keeps its own row order.
sorted_merged_df = merged_df.sort_values(by=['FIPS', 'Date'], ascending=True)
print(sorted_merged_df.head())

# Distinct FIPS values present in the combined data.
unique_fips = sorted_merged_df['FIPS'].unique()
print(unique_fips)


             Date   FIPS  MMWR_week    Recip_County Recip_State  \
0      2021-05-31  01001         22  Autauga County          AL   
3282   2021-06-30  01001         26  Autauga County          AL   
6564   2021-07-31  01001         30  Autauga County          AL   
9846   2021-08-31  01001         35  Autauga County          AL   
13128  2021-09-30  01001         39  Autauga County          AL   

       Completeness_pct  Administered_Dose1_Recip  Administered_Dose1_Pop_Pct  \
0                  91.8                   13982.0                        25.0   
3282               91.7                   15910.0                        28.5   
6564               91.7                   17484.0                        31.3   
9846               92.1                   20499.0                        36.7   
13128              92.5                   23988.0                        42.9   

       Administered_Dose1_Recip_12Plus  \
0                              13981.0   
3282                      

In [19]:
# Forward-fill missing values within each county's own time series.
#
# merged_df is still ordered by snapshot date first and FIPS second (the
# FIPS/Date-sorted copy lives in sorted_merged_df), so a frame-wide forward
# fill would copy values from a *different county* on the same date into the
# gap. Grouping by FIPS restricts the fill to earlier dates of the same
# county. This also avoids fillna(method='ffill'), which is deprecated in
# pandas >= 2.1.
value_columns = [col for col in merged_df.columns if col != 'FIPS']
filled_values = (
    merged_df.sort_values('Date', kind='stable')  # ISO date strings sort chronologically
    .groupby('FIPS', sort=False)[value_columns]
    .ffill()
    .reindex(merged_df.index)  # back to merged_df's original row order
)
merged_df[value_columns] = filled_values
# NOTE(review): values missing at the start of a county's series have no
# earlier observation to copy and therefore stay NaN -- decide downstream
# whether to drop or impute them.

# Discard rows whose county is unknown. Comparing as str keeps this cell
# re-runnable after FIPS has already been converted to int.
merged_df = merged_df[merged_df['FIPS'].astype(str) != 'UNK'].copy()

# FIPS is stored as int from here on (leading zeros such as '01001' are lost);
# the later merge joins on this column.
merged_df['FIPS'] = merged_df['FIPS'].astype(int)
print(merged_df.shape)


(54808, 59)


In [16]:
# Folder holding one JHU deaths CSV per snapshot date.
csv_folder = 'data/JHU'

# Snapshot end dates, written exactly as they appear in the CSV file names.
date_list = [
    "05-31-2021",
    "06-30-2021",
    "07-31-2021",
    "08-31-2021",
    "09-30-2021",
    "10-31-2021",
    "11-30-2021",
    "12-31-2021",
    "01-31-2022",
    "02-28-2022",
    "03-31-2022",
    "04-30-2022",
    "05-31-2022",
    "06-29-2022",
    "07-27-2022",
    "08-31-2022",
    "09-28-2022",
]

# Load every deaths file that exists on disk, keyed by its end date.
# Note: this rebinds data_dict, replacing the mapping built for the CDC files.
data_dict = {}
for date in date_list:
    file_path = os.path.join(csv_folder, f'deaths-05-01-2021-to-{date}.csv')
    if not os.path.exists(file_path):
        print(f"File not found for {date}")
        continue
    df = pd.read_csv(file_path)
    data_dict[date] = df

print(data_dict.keys())

dict_keys(['05-31-2021', '06-30-2021', '07-31-2021', '08-31-2021', '09-30-2021', '10-31-2021', '11-30-2021', '12-31-2021', '01-31-2022', '02-28-2022', '03-31-2022', '04-30-2022', '05-31-2022', '06-29-2022', '07-27-2022', '08-31-2022', '09-28-2022'])


In [17]:
# Stamp each loaded frame with the date it is keyed under. The column is
# written into the frames stored in data_dict themselves.
for date, df in data_dict.items():
    df['Date'] = date

# The stamped frames, in data_dict order.
modified_datasets = list(data_dict.values())

# Stack them into a single frame and rewrite Date as 'YYYY-MM-DD' strings.
concatenated_df = pd.concat(modified_datasets)
iso_dates = pd.to_datetime(concatenated_df['Date']).dt.strftime('%Y-%m-%d')
concatenated_df['Date'] = iso_dates

# Distinct FIPS values present in the stacked frame.
unique_fips = concatenated_df['FIPS'].unique()


In [18]:
# Inner-join the vaccination rows with the deaths rows that share the same
# FIPS and Date; rows without a match on both sides are not kept.
joined_df = pd.merge(merged_df, concatenated_df, how='inner', on=['FIPS', 'Date'])
print(joined_df)


             Date   FIPS  MMWR_week         Recip_County Recip_State  \
0      2021-05-31   1001         22       Autauga County          AL   
1      2021-05-31   1003         22       Baldwin County          AL   
2      2021-05-31   1005         22       Barbour County          AL   
3      2021-05-31   1007         22          Bibb County          AL   
4      2021-05-31   1009         22        Blount County          AL   
...           ...    ...        ...                  ...         ...   
54293  2022-09-28  72145         39  Vega Baja Municipio          PR   
54294  2022-09-28  72147         39    Vieques Municipio          PR   
54295  2022-09-28  72149         39   Villalba Municipio          PR   
54296  2022-09-28  72151         39    Yabucoa Municipio          PR   
54297  2022-09-28  72153         39      Yauco Municipio          PR   

       Completeness_pct  Administered_Dose1_Recip  Administered_Dose1_Pop_Pct  \
0                  91.8                   13982.0     

In [20]:
# Persist the joined table twice: as CSV (without the index column) and as a
# pickle of the DataFrame.
joined_df.to_csv(path_or_buf='data/merged_data.csv', index=False)
joined_df.to_pickle(path='data/merged_data.pkl')