In [44]:
import numpy as np 
import pandas as pd 

In [45]:
# Read the state-level COVID-19 cases-and-deaths-over-time CSV into a data frame.
Covid_byState_df = pd.read_csv(
    'Data/United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv'
)

# Preview the first five rows to confirm the file loaded as expected.
Covid_byState_df.head(5)

Unnamed: 0,submission_date,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death,created_at,consent_cases,consent_deaths
0,02/14/2020,AL,0,0.0,0.0,0,0.0,0,0.0,0.0,0,0.0,02/16/2020 12:00:00 AM,Agree,Agree
1,04/01/2021,CA,3570660,3570660.0,0.0,2234,0.0,58090,58090.0,0.0,154,0.0,04/03/2021 12:00:00 AM,Agree,Agree
2,05/31/2021,CA,3685032,3685032.0,0.0,644,0.0,62011,62011.0,0.0,5,0.0,06/02/2021 12:00:00 AM,Agree,Agree
3,07/14/2020,CA,336447,336447.0,0.0,7285,0.0,7039,7039.0,0.0,25,0.0,07/16/2020 12:00:00 AM,Agree,Agree
4,02/06/2020,NE,0,,,0,,0,,,0,,03/26/2020 04:22:39 PM,Agree,Agree


In [46]:
# Step 1:
# Drop the columns that are not required for this analysis: the conf_*/prob_*
# and pnew_* columns, the consent_* columns and created_at.
#
# Each label is listed exactly once. DataFrame.drop raises KeyError when a
# label is missing, so re-running this cell without reloading the CSV fails.
Covid_byState_df = Covid_byState_df.drop(
    columns=[
        'conf_cases', 'prob_cases', 'pnew_case',
        'conf_death', 'prob_death', 'pnew_death',
        'consent_cases', 'consent_deaths', 'created_at',
    ]
)


In [47]:
# Step 2:
# Give the date column a more appropriate name
# ('submission_date' -> 'reported_date'). The frame is modified in place.
Covid_byState_df.rename(
    {'submission_date': 'reported_date'},
    axis='columns',
    inplace=True,
)

# Show the resulting column labels.
Covid_byState_df.columns

Index(['reported_date', 'state', 'tot_cases', 'new_case', 'tot_death',
       'new_death'],
      dtype='object')

In [48]:
# Step 3:
# Convert the data frame columns to appropriate data types.

# Inspect the current dtype of every column first, to see which columns
# still need converting.
Covid_byState_df.dtypes


reported_date    object
state            object
tot_cases         int64
new_case          int64
tot_death         int64
new_death         int64
dtype: object

In [49]:
# Convert the 'reported_date' column (named 'submission_date' in the raw CSV)
# from strings to datetime64.
# The raw values are month-first MM/DD/YYYY (e.g. '05/31/2021'), so the format
# is passed explicitly instead of relying on inference: parsing is then
# unambiguous, and any value that does not match raises an error.
Covid_byState_df['reported_date'] = pd.to_datetime(
    Covid_byState_df['reported_date'],
    format='%m/%d/%Y',
)

# Confirm that the column now has a datetime dtype.
Covid_byState_df.dtypes


reported_date    datetime64[ns]
state                    object
tot_cases                 int64
new_case                  int64
tot_death                 int64
new_death                 int64
dtype: object

In [50]:
# Step 4:
# Data cleaning: remove, in place, every row whose total case count is 0.
Covid_byState_df.drop(
    index=Covid_byState_df.index[Covid_byState_df['tot_cases'].eq(0)],
    inplace=True,
)

# Sanity check: selecting zero-case rows should now yield an empty frame.
Covid_byState_df.loc[Covid_byState_df['tot_cases'].eq(0)]


Unnamed: 0,reported_date,state,tot_cases,new_case,tot_death,new_death


In [56]:
# Step 5:
# Keep only the rows whose state code is one of the 50 US states or DC
# (51 codes in total); rows with any other code are removed in place.
states = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY",
]

Covid_byState_df.drop(
    index=Covid_byState_df.index[~Covid_byState_df['state'].isin(states)],
    inplace=True,
)

# Display the filtered frame.
Covid_byState_df

Unnamed: 0,reported_date,state,tot_cases,new_case,tot_death,new_death
1,2021-04-01,CA,3570660,2234,58090,154
2,2021-05-31,CA,3685032,644,62011,5
3,2020-07-14,CA,336447,7285,7039,25
5,2020-07-30,ME,3910,22,123,2
6,2020-06-10,VT,1009,10,54,0
...,...,...,...,...,...,...
43854,2020-10-12,DC,16022,38,637,0
43855,2020-12-11,AZ,394804,6986,7245,91
43857,2020-12-15,DC,25339,301,720,4
43858,2021-07-06,OR,209494,117,2782,1
