In [11]:
import numpy as np 
import pandas as pd
import requests

In [30]:
# CDC endpoint serving the COVID-19 vaccination records as JSON.
url = 'https://data.cdc.gov/resource/rh2h-3yt2.json'

# Fetch the data. The timeout stops the cell from hanging forever if the
# server never answers.
# NOTE(review): Socrata-style endpoints usually page their results (a default
# row cap applies when no `$limit` query parameter is sent) -- confirm that the
# rows returned here cover the full period needed for the analysis.
resp = requests.get(url, timeout=30)

# Stop here on an HTTP error (4xx/5xx) instead of loading an error payload
# into the data frame below.
resp.raise_for_status()

# Decode the JSON body into Python objects.
data = resp.json()

# Load the decoded records into a data frame.
Covid_Vaccination_df = pd.DataFrame(data)

# Preview the first rows.
Covid_Vaccination_df.head()

Unnamed: 0,date,mmwr_week,location,administered_daily,administered_cumulative,administered_7_day_rolling,admin_dose_1_daily,admin_dose_1_cumulative,admin_dose_1_day_rolling,date_type,administered_daily_change,administered_daily_change_1,series_complete_daily,series_complete_cumulative,series_complete_day_rolling,booster_daily,booster_cumulative,booster_7_day_rolling_average
0,2022-02-17T00:00:00.000,7,AR,8181,4060791,3870,5159,1953703,1512,Report,8181,3870,565,1606459,931,38,598043,1047
1,2022-02-17T00:00:00.000,7,MS,11441,3737753,3744,5350,1728455,1343,Report,11441,3744,550,1506969,750,70,533210,901
2,2022-02-17T00:00:00.000,7,SD,3304,1347735,1253,1934,655657,532,Report,3304,1253,401,527134,324,304,209155,310
3,2022-02-17T00:00:00.000,7,NE,15638,3064271,3486,7226,1327578,1323,Report,15638,3486,1990,1203610,667,2494,575704,903
4,2022-02-17T00:00:00.000,7,NJ,24684,16406772,17156,9113,7630388,4912,Report,24684,17156,5218,6560847,4340,7643,2891618,7743


In [13]:
# Step 1:
# Trim the data frame down to the fields used in the analysis by dropping
# every column that is not needed.

# Drop the unused columns (one label per line so the list is easy to review).
Covid_Vaccination_df = Covid_Vaccination_df.drop(
    columns=[
        'mmwr_week',
        'administered_cumulative',
        'administered_7_day_rolling',
        'admin_dose_1_daily',
        'admin_dose_1_cumulative',
        'admin_dose_1_day_rolling',
        'date_type',
        'administered_daily_change',
        'administered_daily_change_1',
        'series_complete_cumulative',
        'series_complete_day_rolling',
        'booster_cumulative',
        'booster_7_day_rolling_average',
    ]
)

# Preview the remaining columns.
Covid_Vaccination_df.head()

Unnamed: 0,date,location,administered_daily,series_complete_daily,booster_daily
0,2022-02-17T00:00:00.000,AR,8181,565,38
1,2022-02-17T00:00:00.000,MS,11441,550,70
2,2022-02-17T00:00:00.000,SD,3304,401,304
3,2022-02-17T00:00:00.000,NE,15638,1990,2494
4,2022-02-17T00:00:00.000,NJ,24684,5218,7643


In [15]:
# Step 2:
# Give the remaining columns clearer names.
# The result is reassigned rather than renamed with inplace=True, matching the
# reassignment style used when the unused columns were dropped.
Covid_Vaccination_df = Covid_Vaccination_df.rename(
    columns={
        'location': 'state',
        'administered_daily': 'vaccinated_daily',
        'series_complete_daily': 'fullyvaccinated_daily',
    }
)

# Confirm the new column names.
Covid_Vaccination_df.columns

Index(['date', 'state', 'vaccinated_daily', 'fullyvaccinated_daily',
       'booster_daily'],
      dtype='object')

In [17]:
# Step 3:
# Convert the data frame columns to appropriate data types.

# Inspect the current dtypes first: every column loaded from the JSON response
# is reported as the generic 'object' dtype until it is converted.
Covid_Vaccination_df.dtypes

date                     object
state                    object
vaccinated_daily         object
fullyvaccinated_daily    object
booster_daily            object
dtype: object

In [18]:
# Parse the 'date' column (ISO-8601 timestamp strings) into datetime values.
Covid_Vaccination_df['date'] = pd.to_datetime(Covid_Vaccination_df['date'])

# Convert the daily count columns from strings to numeric values. One loop
# replaces three copy-pasted statements, so adding a column is a one-word change.
for count_column in ('vaccinated_daily', 'fullyvaccinated_daily', 'booster_daily'):
    Covid_Vaccination_df[count_column] = pd.to_numeric(Covid_Vaccination_df[count_column])

# Confirm the converted column data types.
Covid_Vaccination_df.dtypes

date                     datetime64[ns]
state                            object
vaccinated_daily                  int64
fullyvaccinated_daily             int64
booster_daily                     int64
dtype: object

In [22]:
# Step 4:
# Data cleaning: count the missing values in each column.
# isna() flags the missing cells and sum() totals them per column, so a result
# of all zeros means no rows need to be imputed or dropped.
Covid_Vaccination_df.isna().sum()

date                     0
state                    0
vaccinated_daily         0
fullyvaccinated_daily    0
booster_daily            0
dtype: int64

In [26]:
# Step 5:
# Keep only the rows for the 50 U.S. states plus the District of Columbia (DC);
# rows with any other location code are removed.
states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", 
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

# Filter with a boolean mask rather than dropping by index label: the mask
# selects rows by position, so it stays correct even if the index ever holds
# duplicate labels. .copy() yields an independent frame, so later column
# assignments do not trigger chained-assignment warnings.
Covid_Vaccination_df = Covid_Vaccination_df[Covid_Vaccination_df['state'].isin(states)].copy()

# Checking the data.
Covid_Vaccination_df.head()

Unnamed: 0,date,state,vaccinated_daily,fullyvaccinated_daily,booster_daily
0,2022-02-17,AR,8181,565,38
1,2022-02-17,MS,11441,550,70
2,2022-02-17,SD,3304,401,304
3,2022-02-17,NE,15638,1990,2494
4,2022-02-17,NJ,24684,5218,7643
