In [1]:
import pandas as pd
import numpy as np
import requests #api
import json #json file
from datetime import datetime

import matplotlib.pyplot as plt
from scipy.stats import linregress #stastistical analysis

from pathlib import Path #reading csv - connecting to the csv merges mac and windows

In [2]:
covid_vaccines_path = Path("COVID-19_Vaccines_Up_to_Date_Status.csv")
covid_vaccines_df = pd.read_csv(covid_vaccines_path)
covid_vaccines_df.head()

Unnamed: 0,Date,Location,Demographic_Category,census,Up_to_date,Up_to_date_pct_agegroup
0,09/13/2023,AK,Female_Ages_12-17_yrs,27938.0,2697.0,9.7
1,09/13/2023,AK,Female_Ages_75+_yrs,16346.0,7344.0,44.9
2,09/13/2023,AK,Male_Ages_12-17_yrs,29377.0,2598.0,8.8
3,09/13/2023,CA,Male_Ages_65+_yrs,2602393.0,1331083.0,51.1
4,09/13/2023,CT,Male_Ages_65+_yrs,276002.0,175897.0,63.7


In [3]:
hospitalizations_path = "https://data.cdc.gov/resource/n8mc-b4w4.json"
hospitalization_data = requests.get(hospitalizations_path).json()
#hospitalization_data

In [4]:
hospitalization_df = pd.DataFrame(hospitalization_data)
hospitalization_df.head()

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,case_positive_specimen,underlying_conditions_yn
0,2021-12,FL,12,CHARLOTTE,12015,50 to 64 years,Female,White,Non-Hispanic/Latino,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,,
1,2020-11,MI,26,ALLEGAN,26005,65+ years,Female,White,Non-Hispanic/Latino,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,,
2,2021-03,OH,39,MEDINA,39103,,,,,0.0,Unknown,Unknown,Laboratory-confirmed case,Symptomatic,No,No,,1.0,
3,2022-10,RI,44,WASHINGTON,44009,18 to 49 years,Female,Unknown,Unknown,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,No,,
4,2020-08,WA,53,YAKIMA,53077,0 - 17 years,Male,,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,Missing,0.0,


In [5]:
hospitalization_df.age_group.value_counts()

18 to 49 years    444
50 to 64 years    203
0 - 17 years      202
65+ years         125
Missing            15
NA                 11
Name: age_group, dtype: int64

In [6]:
covid_vaccines_df.Demographic_Category.value_counts()

Female_Ages_12-17_yrs    236
Ages_50-64_yrs           236
Male_Ages_2-4_yrs        236
Ages_25-39_yrs           236
Male_Ages_25-39_yrs      236
Sex_Unknown              236
Female_Ages_25-49_yrs    236
Female_Ages_18-24_yrs    236
Male_Ages_65-74_yrs      236
Ages_25-49_yrs           236
Female_Ages_25-39_yrs    236
Ages_18-24_yrs           236
Ages_65-74_yrs           236
Female_Ages_40-49_yrs    236
Male_Ages_25-49_yrs      236
Male_Ages_<2yrs          236
Sex_Female               236
Male_Ages_75+_yrs        236
Male_Ages_50-64_yrs      236
Age_Unknown              236
Male_Ages_<5yrs          236
Female_Ages_50-64_yrs    236
Female_Ages_75+_yrs      236
Ages_<2yrs               236
Male_Ages_12-17_yrs      236
Male_Ages_65+_yrs        236
Ages_75+_yrs             236
Female_Ages_2-4_yrs      236
Female_Ages_65-74_yrs    236
Ages_12-17_yrs           236
Female_Ages_<2yrs        236
Sex_Male                 236
Female_Ages_<5yrs        236
Ages_65+_yrs             236
Male_Ages_18-2

In [7]:
covid_vaccines_df.set_index('Location').sort_index().head(20)

Unnamed: 0_level_0,Date,Demographic_Category,census,Up_to_date,Up_to_date_pct_agegroup
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AK,09/13/2023,Female_Ages_12-17_yrs,27938.0,2697.0,9.7
AK,09/13/2023,Female_Ages_25-39_yrs,80186.0,9014.0,11.2
AK,07/12/2023,Male_Ages_2-4_yrs,15880.0,53.0,0.3
AK,07/12/2023,Female_Ages_25-49_yrs,121096.0,15552.0,12.8
AK,07/12/2023,Ages_2-4_yrs,31090.0,94.0,0.3
AK,09/13/2023,Female_Ages_65-74_yrs,29793.0,13587.0,45.6
AK,07/12/2023,Sex_Female,350127.0,60375.0,17.2
AK,07/12/2023,Male_Ages_65-74_yrs,31507.0,13996.0,44.4
AK,07/12/2023,Female_Ages_40-49_yrs,40910.0,6725.0,16.4
AK,07/12/2023,Ages_12-17_yrs,57315.0,4992.0,8.7


In [8]:
no_general_gender = covid_vaccines_df.loc[covid_vaccines_df['Demographic_Category'].str.contains('yrs')]
no_general_gender

Unnamed: 0,Date,Location,Demographic_Category,census,Up_to_date,Up_to_date_pct_agegroup
0,09/13/2023,AK,Female_Ages_12-17_yrs,27938.0,2697.0,9.7
1,09/13/2023,AK,Female_Ages_75+_yrs,16346.0,7344.0,44.9
2,09/13/2023,AK,Male_Ages_12-17_yrs,29377.0,2598.0,8.8
3,09/13/2023,CA,Male_Ages_65+_yrs,2602393.0,1331083.0,51.1
4,09/13/2023,CT,Male_Ages_65+_yrs,276002.0,175897.0,63.7
...,...,...,...,...,...,...
10141,06/14/2023,TX,Male_Ages_65-74_yrs,1046971.0,,
10142,06/14/2023,WI,Ages_25-49_yrs,1797281.0,285589.0,15.9
10143,06/14/2023,VI,Male_Ages_5-11_yrs,5225.0,12.0,0.2
10146,06/14/2023,WI,Male_Ages_<2yrs,65295.0,1663.0,2.5


In [9]:
no_general_gender.loc[(no_general_gender.Location == "AK")& (no_general_gender.Date == '09/13/2023')]

Unnamed: 0,Date,Location,Demographic_Category,census,Up_to_date,Up_to_date_pct_agegroup
0,09/13/2023,AK,Female_Ages_12-17_yrs,27938.0,2697.0,9.7
1,09/13/2023,AK,Female_Ages_75+_yrs,16346.0,7344.0,44.9
2,09/13/2023,AK,Male_Ages_12-17_yrs,29377.0,2598.0,8.8
30,09/13/2023,AK,Ages_<5yrs,51080.0,301.0,0.6
117,09/13/2023,AK,Female_Ages_<2yrs,9717.0,73.0,0.8
160,09/13/2023,AK,Male_Ages_25-49_yrs,134560.0,11458.0,8.5
234,09/13/2023,AK,Ages_50-64_yrs,136166.0,27624.0,20.3
335,09/13/2023,AK,Ages_40-49_yrs,85647.0,12070.0,14.1
416,09/13/2023,AK,Ages_25-49_yrs,255656.0,27595.0,10.8
489,09/13/2023,AK,Ages_2-4_yrs,31090.0,160.0,0.5


In [10]:
minus_male = no_general_gender.loc[~no_general_gender['Demographic_Category'].str.contains('Male')]
minus_male

Unnamed: 0,Date,Location,Demographic_Category,census,Up_to_date,Up_to_date_pct_agegroup
0,09/13/2023,AK,Female_Ages_12-17_yrs,27938.0,2697.0,9.7
1,09/13/2023,AK,Female_Ages_75+_yrs,16346.0,7344.0,44.9
5,09/13/2023,DE,Ages_75+_yrs,76048.0,,
6,09/13/2023,HI,Female_Ages_2-4_yrs,25076.0,519.0,2.1
7,09/13/2023,IA,Female_Ages_65-74_yrs,161188.0,89956.0,55.8
...,...,...,...,...,...,...
10134,06/14/2023,PA,Ages_75+_yrs,1039269.0,516640.0,49.7
10136,06/14/2023,WA,Ages_2-4_yrs,280089.0,11859.0,4.2
10137,06/14/2023,TN,Ages_65+_yrs,1143393.0,,
10142,06/14/2023,WI,Ages_25-49_yrs,1797281.0,285589.0,15.9


In [11]:
minus_female = minus_male.loc[~minus_male['Demographic_Category'].str.contains('Female')]
minus_female

Unnamed: 0,Date,Location,Demographic_Category,census,Up_to_date,Up_to_date_pct_agegroup
5,09/13/2023,DE,Ages_75+_yrs,76048.0,,
8,09/13/2023,ID,Ages_12-17_yrs,156543.0,,
11,09/13/2023,KY,Ages_<2yrs,107266.0,812.0,0.8
14,09/13/2023,MA,Ages_<5yrs,357362.0,15956.0,4.5
20,09/13/2023,ND,Ages_5-11_yrs,70715.0,3728.0,5.3
...,...,...,...,...,...,...
10133,06/14/2023,PR,Ages_<5yrs,117482.0,240.0,0.2
10134,06/14/2023,PA,Ages_75+_yrs,1039269.0,516640.0,49.7
10136,06/14/2023,WA,Ages_2-4_yrs,280089.0,11859.0,4.2
10137,06/14/2023,TN,Ages_65+_yrs,1143393.0,,


In [12]:
minus_female['Date'] = pd.to_datetime(minus_female["Date"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  minus_female['Date'] = pd.to_datetime(minus_female["Date"])


In [13]:
minus_female.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3068 entries, 5 to 10142
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Date                     3068 non-null   datetime64[ns]
 1   Location                 3068 non-null   object        
 2   Demographic_Category     3068 non-null   object        
 3   census                   3068 non-null   float64       
 4   Up_to_date               2385 non-null   float64       
 5   Up_to_date_pct_agegroup  2385 non-null   float64       
dtypes: datetime64[ns](1), float64(3), object(2)
memory usage: 167.8+ KB


In [ ]:
minus_female['Date'].dt.year

5        2023
8        2023
11       2023
14       2023
20       2023
         ... 
10133    2023
10134    2023
10136    2023
10137    2023
10142    2023
Name: Date, Length: 3068, dtype: int64

In [ ]:
minus_female['Date'].dt.month

5        9
8        9
11       9
14       9
20       9
        ..
10133    6
10134    6
10136    6
10137    6
10142    6
Name: Date, Length: 3068, dtype: int64

In [None]:
#minus_female["Date"].iloc[0].month

In [None]:
#minus_female["Date"].iloc[0].year

In [None]:
#minus_female['year'] = minus_female["Date"].dt.year
#minus_female['month'] = minus_female["Date"].dt.month
#minus_female

In [ ]:
# delete old date column 
del minus_female['Date']
minus_female.head()

In [ ]:
#drop rows with NaN
minus_female.dropna(how = 'any')
minus_female.head()

#### HELP DROP THE NaN

In [ ]:
#with Nan strings, frst conert them to real NA value:
# minus_female_nan = minus_female.replace('Nan', pd.NA).dropna(axis=1)
# minus_female.head()
#hmm this removed the entire column? 

In [ ]:
#group by month and state -- didnt seem to work? 
vaccines_grouped = minus_female.groupby(['month', 'Location'])
vaccines_grouped.head()

In [25]:
#grouping age demographic category into similar categories to other db
minus_female.Demographic_Category.value_counts()

Ages_75+_yrs      236
Ages_12-17_yrs    236
Ages_<2yrs        236
Ages_<5yrs        236
Ages_5-11_yrs     236
Ages_40-49_yrs    236
Ages_2-4_yrs      236
Ages_65+_yrs      236
Ages_18-24_yrs    236
Ages_25-39_yrs    236
Ages_25-49_yrs    236
Ages_50-64_yrs    236
Ages_65-74_yrs    236
Name: Demographic_Category, dtype: int64

In [26]:
#age groups: 
# 0 - 17 
# 18 - 49
# 50 -64
# 65+