In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

from sklearn.linear_model import LinearRegression

import statsmodels.formula.api as smf
from sklearn.preprocessing import PolynomialFeatures

from sklearn import metrics

from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings('ignore')

### Reading the COVID master dataset

In [None]:
# Load the combined COVID dataset from the parquet file (path is relative to this notebook's folder)
covid_df = pd.read_parquet("../../../../data/covid_combined_dataset.parquet.gzip")
# Preview the first rows
covid_df.head()

### Filtering Alabama state data

In [None]:
# Keep only the Alabama rows.
# An explicit copy is taken because later cells add columns to this frame;
# writing to a filtered slice of covid_df triggers SettingWithCopyWarning and
# is not guaranteed to behave the same across pandas versions.
al_covid_df = covid_df[covid_df['State'] == "AL"].copy()
al_covid_df.head()

In [None]:
# Daily new cases / deaths are the day-over-day difference of the cumulative counts.
# The difference is taken per county (grouped on countyFIPS) so that a county's
# first row is never differenced against the last row of another county.
# assign() returns a new frame instead of writing columns onto a filtered slice.
county_groups = al_covid_df.groupby('countyFIPS')
al_covid_df = al_covid_df.assign(
    New_cases_per_day=county_groups['Cases'].diff(),
    New_death_per_day=county_groups['Death'].diff(),
)

# .diff() leaves NaN on the first row of every county, so the first date is dropped.
# NOTE(review): assumes every county series starts on 2020-01-22 - confirm against the dataset.
al_covid_df = al_covid_df[al_covid_df['Date'] != '2020-01-22']

# State-wide daily totals: sum over all counties for each date
al_covid_dfBydate = al_covid_df.groupby("Date").sum().reset_index()

al_covid_dfBydate

In [None]:
# Remove the identifier, cumulative-count and population columns; the
# regressions below only use the daily-increment columns.
unneeded_columns = ['countyFIPS', 'StateFIPS', 'Cases', 'Death', 'population']
al_cleaned_data_df = al_covid_dfBydate.drop(columns=unneeded_columns)
al_cleaned_data_df.tail()

In [None]:
# Row label of the first day with a non-zero number of new cases.
# idxmax() on a boolean Series returns the label of the first True value.
has_new_cases = al_cleaned_data_df['New_cases_per_day'] != 0
al_firstcase_index = has_new_cases.idxmax()

# Row label of the first day with a non-zero number of new deaths
has_new_deaths = al_cleaned_data_df['New_death_per_day'] != 0
al_first_death_index = has_new_deaths.idxmax()

### Calculating number of days since first case and first death

In [None]:
# Positional row numbers of the state-wide frame: 0, 1, 2, ...
row_positions = np.arange(len(al_cleaned_data_df))

# Days elapsed since the first case: 0 for every row up to and including the
# first-case row, then 1, 2, 3, ... (vectorized form of the counting loop).
casesSinceDay1 = np.maximum(row_positions - al_firstcase_index, 0)
al_cleaned_data_df.insert(1, "numberOfDaysSinceFirstCase", casesSinceDay1)

# Same counter relative to the row of the first death
deathsSinceDay1 = np.maximum(row_positions - al_first_death_index, 0)
al_cleaned_data_df.insert(3, "numberOfDaysSinceFirstDeath", deathsSinceDay1)

In [None]:
# Preview the first rows of the cleaned state-wide frame
al_cleaned_data_df.head()

### Linear Regression model of AL for cases

In [None]:
# Regression input for cases: the day counter and the daily new cases
case_columns = ['numberOfDaysSinceFirstCase', 'New_cases_per_day']
al_cases_before_drop = al_cleaned_data_df[case_columns]
# Discard the rows whose day counter is 0 (everything up to and including the first-case row)
after_first_case = al_cases_before_drop['numberOfDaysSinceFirstCase'].ne(0)
al_cases = al_cases_before_drop[after_first_case]
al_cases

### Model training

In [None]:
# Feature (first column: days since first case) and target (second column:
# daily new cases), each as an (n, 1) column vector
X = al_cases.iloc[:, [0]].to_numpy()
y = al_cases.iloc[:, [1]].to_numpy()

# Ordinary least-squares straight line through the daily cases
lr_model = LinearRegression()
lr_model.fit(X, y)

# In-sample fitted values for every observed day
cases_prediction = lr_model.predict(X)

In [None]:
#Function for getting confidence interval
def get_confidence_interval(prediction, y_test, test_predictions, pi=.95):
    """Return a constant-width band around ``prediction`` sized from the residual spread.

    Parameters
    ----------
    prediction : array-like
        Values the band is centred on; returned unchanged as the middle element.
    y_test : array-like
        Observed target values.
    test_predictions : array-like
        Model predictions matching ``y_test`` element by element.
    pi : float, default .95
        Two-sided coverage used to look up the normal quantile.

    Returns
    -------
    (lower, prediction, upper) where ``lower``/``upper`` are ``prediction``
    minus/plus ``z * stdev``; ``stdev`` is the residual standard deviation with
    ``n - 2`` degrees of freedom.
    """
    # Flatten both inputs so that an (n, 1) column and an (n,) vector are
    # compared element by element. Without this, numpy broadcasts the
    # subtraction to an (n, n) matrix and the residual sum is badly inflated.
    observed = np.asarray(y_test, dtype=float).ravel()
    fitted = np.asarray(test_predictions, dtype=float).ravel()

    # Residual standard deviation
    sum_errs = np.sum((observed - fitted) ** 2)
    stdev = np.sqrt(sum_errs / (len(observed) - 2))

    # Two-sided normal quantile for the requested coverage
    one_minus_pi = 1 - pi
    ppf_lookup = 1 - (one_minus_pi / 2)
    z_score = stats.norm.ppf(ppf_lookup)
    interval = z_score * stdev

    # Lower and upper bound of the band
    lower, upper = prediction - interval, prediction + interval
    return lower, prediction, upper

### Calculating Root mean square error for the data

In [None]:
# Root-mean-square error between the observed values y and the fitted values cases_prediction
print("RMSE: ",np.sqrt(metrics.mean_squared_error(y,cases_prediction)))

### Plotting graph for Alabama state cases trend line and confidence intervals

In [None]:
# Predict the number of new cases for the 3 weeks (21 days) that follow the
# last observed day. The start day is derived from the data rather than being
# hard-coded, so the forecast stays aligned when the dataset grows.
n_future_days = 21
first_future_day = int(X.max()) + 1
future_dates = np.arange(first_future_day, first_future_day + n_future_days).reshape(-1,1)
future_cases = lr_model.predict(future_dates)

#Confidence Interval
[err_down, prediction, err_up] = get_confidence_interval(cases_prediction, y, cases_prediction, pi=.95)

#Plot the trend line and prediction path
fig = go.Figure([
    go.Scatter(x=X.squeeze(),y=y.squeeze(),name='Actual Cases',mode='markers'),
    go.Scatter(x=X.squeeze(),y=cases_prediction.squeeze(),name='Model',mode='lines'),
    go.Scatter(x=future_dates.squeeze(),y=future_cases.squeeze(),name='Prediction',mode='lines'),
    go.Scatter(x=X.squeeze(),y=err_down.squeeze(), name='LR Model CI_Lower',mode='lines', line={'dash': 'dash', 'color': 'red'}),
    go.Scatter(x=X.squeeze(),y=err_up.squeeze(), name='LR Model CI_Upper',mode='lines', line={'dash': 'dash', 'color': 'red'}),
    ])

fig.update_layout(
    title={'text':"Linear Regression for Alabama state cases",
           'y':0.85,'x':0.5,
           'xanchor':'center', 
           'yanchor':'top'},
    xaxis_title="Number of Days since first case",
    yaxis_title="Number of new Cases",
)
fig.show()

### Linear Regression for Deaths

In [None]:
# Regression input for deaths: the day counter and the daily new deaths
death_columns = ['numberOfDaysSinceFirstDeath', 'New_death_per_day']
al_deaths_before_drop = al_cleaned_data_df[death_columns]
# Discard the rows whose day counter is 0 (everything up to and including the first-death row)
after_first_death = al_deaths_before_drop['numberOfDaysSinceFirstDeath'].ne(0)
al_deaths = al_deaths_before_drop[after_first_death]
al_deaths

In [None]:
# Feature (first column: days since first death) and target (second column:
# daily new deaths), each as an (n, 1) column vector
X = al_deaths.iloc[:, [0]].to_numpy()
y = al_deaths.iloc[:, [1]].to_numpy()

# Re-fit the same estimator object on the deaths series, then take the
# in-sample fitted values for every observed day (fit() returns the estimator)
deaths_prediction = lr_model.fit(X, y).predict(X)

In [None]:
# Root-mean-square error between the observed values y and the fitted values deaths_prediction
print("RMSE for deaths: ",np.sqrt(metrics.mean_squared_error(y,deaths_prediction)))

In [None]:
# Predict the number of new deaths for the 3 weeks (21 days) that follow the
# last observed day. The start day is derived from the data rather than being
# hard-coded, so the forecast stays aligned when the dataset grows.
n_future_days = 21
first_future_day = int(X.max()) + 1
future_dates = np.arange(first_future_day, first_future_day + n_future_days).reshape(-1,1)
future_deaths = lr_model.predict(future_dates)

#Confidence Interval
[err_down, prediction, err_up] = get_confidence_interval(deaths_prediction, y, deaths_prediction, pi=.95)

#Plot the trend line and prediction path (legend and axis name the deaths series)
fig = go.Figure([
    go.Scatter(x=X.squeeze(),y=y.squeeze(),name='Actual Deaths',mode='markers'),
    go.Scatter(x=X.squeeze(),y=deaths_prediction.squeeze(),name='Model',mode='lines'),
    go.Scatter(x=future_dates.squeeze(),y=future_deaths.squeeze(),name='Prediction',mode='lines'),
    go.Scatter(x=X.squeeze(),y=err_down.squeeze(), name='LR Model CI_Lower',mode='lines', line={'dash': 'dash', 'color': 'red'}),
    go.Scatter(x=X.squeeze(),y=err_up.squeeze(), name='LR Model CI_Upper',mode='lines', line={'dash': 'dash', 'color': 'red'}),
    ])

fig.update_layout(
    title={'text':"Linear Regression for Alabama state deaths",
           'y':0.85,'x':0.5,
           'xanchor':'center', 
           'yanchor':'top'},
    xaxis_title="Number of Days since first death",
    yaxis_title="Number of new deaths",
)
fig.show()

### Non-Linear Regression Model of AL for cases

In [None]:
#Define the independent and dependent variable as (n, 1) column vectors
X = al_cases.numberOfDaysSinceFirstCase.values.reshape(-1, 1)
y_d = al_cases.New_cases_per_day.values.reshape(-1, 1)
# Evenly spaced grid over the observed range, used only to draw the fitted curves
x_range = np.linspace(X.min(), X.max(), y_d.shape[0]).reshape(-1, 1)

# Forecast the 9 days that follow the last observed day; the start is derived
# from the data so the forecast never falls inside the training range.
n_future_days = 9
first_future_day = int(X.max()) + 1
future_dates = np.arange(first_future_day, first_future_day + n_future_days).reshape(-1,1)

# Degree whose fit is drawn together with its interval band in the second figure
ci_degree = 3

# fig compares all degrees; fig1 shows the chosen degree with its band
fig = go.Figure()
fig.add_traces(go.Scatter(x=al_cases['numberOfDaysSinceFirstCase'], y=al_cases['New_cases_per_day']
                          , name= "Actual Cases", mode="markers"))
fig1 = go.Figure()
fig1.add_traces(go.Scatter(x=al_cases['numberOfDaysSinceFirstCase'], y=al_cases['New_cases_per_day']
                          , name= "Actual Cases", mode="markers"))
for degree in [1, 2, 3, 4]:

    #Expand the day counter into polynomial terms and fit a linear model on them
    poly = PolynomialFeatures(degree)
    X_poly = poly.fit_transform(X)
    x_range_poly = poly.transform(x_range)
    future_dates_poly = poly.transform(future_dates)

    # PolynomialFeatures already emits the constant (bias) column, hence fit_intercept=False
    model = LinearRegression(fit_intercept=False)
    model.fit(X_poly, al_cases.New_cases_per_day)

    # Fitted values at the observed days (used for RMSE and the band)
    y_fit = model.predict(X_poly)
    # Fitted curve on the plotting grid
    y_poly = model.predict(x_range_poly)

    # Forecast for the days after the last observation
    future_cases = model.predict(future_dates_poly)

    #Calculate the metrics and print
    print("RMSE for degree = ",degree,"is ",np.sqrt(metrics.mean_squared_error(y_d,y_fit)))

    if degree == ci_degree:
        # Observations are flattened so residuals are taken element by element
        [err_down, prediction, err_up] = get_confidence_interval(y_fit, y_d.ravel(), y_fit, pi=.95)
        fig1.add_traces(go.Scatter(x=x_range.squeeze(), y=y_poly, name="Poly Degree:{}".format(degree)))
        fig1.add_traces(go.Scatter(x=future_dates.squeeze(),y=future_cases.squeeze(),name='Future Degree:{} Prediction'.format(degree),mode='lines'))
        fig1.add_traces(go.Scatter(x=X.squeeze(),y=err_down.squeeze(), name='LR Model CI_Lower',mode='lines', line={'dash': 'dash', 'color': 'red'}))
        fig1.add_traces(go.Scatter(x=X.squeeze(),y=err_up.squeeze(), name='LR Model CI_Upper',mode='lines', line={'dash': 'dash', 'color': 'red'}))

    #Plot the trend line and prediction path; the legend entry names the degree
    fig.add_traces(go.Scatter(x=x_range.squeeze(), y=y_poly, name="Poly Degree:{}".format(degree)))
    fig.add_traces(go.Scatter(x=future_dates.squeeze(),y=future_cases.squeeze(),name='Future Degree:{} Prediction'.format(degree),mode='lines'))

# Layouts do not depend on the degree, so they are applied once after the loop
fig.update_layout(
    title={'text':"Non-Linear Regression for Alabama Cases",
           'y':0.92,'x':0.5,
           'xanchor':'center', 
           'yanchor':'top'},
    xaxis_title="Number of Days since first case",
    yaxis_title="Number of new cases",
)
fig1.update_layout(
    title={'text':"Non-Linear Regression for AL Cases",
           'y':0.92,'x':0.5,
           'xanchor':'center', 
           'yanchor':'top'},
    xaxis_title="Number of Days since first case",
    yaxis_title="Number of new cases",
)

fig.show()
fig1.show()

### Non-Linear Regression Model of AL for deaths

In [None]:
#Define the independent and dependent variable as (n, 1) column vectors
X = al_deaths.numberOfDaysSinceFirstDeath.values.reshape(-1, 1)
y_d = al_deaths.New_death_per_day.values.reshape(-1, 1)
# Evenly spaced grid over the observed range, used only to draw the fitted curves
x_range = np.linspace(X.min(), X.max(), y_d.shape[0]).reshape(-1, 1)

# Forecast the 9 days that follow the last observed day; the start is derived
# from the data so the forecast never falls inside the training range.
n_future_days = 9
first_future_day = int(X.max()) + 1
future_dates = np.arange(first_future_day, first_future_day + n_future_days).reshape(-1,1)

# Degree whose fit is drawn together with its interval band in the second figure
ci_degree = 3

# fig compares all degrees; fig1 shows the chosen degree with its band
fig = go.Figure()
fig.add_traces(go.Scatter(x=al_deaths['numberOfDaysSinceFirstDeath'], y=al_deaths['New_death_per_day']
                          , name= "Actual Deaths", mode="markers"))
fig1 = go.Figure()
fig1.add_traces(go.Scatter(x=al_deaths['numberOfDaysSinceFirstDeath'], y=al_deaths['New_death_per_day']
                          , name= "Actual Deaths", mode="markers"))
for degree in [1, 2, 3, 4]:

    #Expand the day counter into polynomial terms and fit a linear model on them
    poly = PolynomialFeatures(degree)
    X_poly = poly.fit_transform(X)
    x_range_poly = poly.transform(x_range)
    future_dates_poly = poly.transform(future_dates)

    # PolynomialFeatures already emits the constant (bias) column, hence fit_intercept=False
    model = LinearRegression(fit_intercept=False)
    model.fit(X_poly, al_deaths.New_death_per_day)

    # Fitted values at the observed days (used for RMSE and the band)
    y_fit = model.predict(X_poly)
    # Fitted curve on the plotting grid
    y_poly = model.predict(x_range_poly)

    # Forecast for the days after the last observation
    future_cases = model.predict(future_dates_poly)

    if degree == ci_degree:
        # Observations are flattened so residuals are taken element by element
        [err_down, prediction, err_up] = get_confidence_interval(y_fit, y_d.ravel(), y_fit, pi=.95)
        fig1.add_traces(go.Scatter(x=x_range.squeeze(), y=y_poly, name="Poly Degree:{}".format(degree)))
        fig1.add_traces(go.Scatter(x=future_dates.squeeze(),y=future_cases.squeeze(),name='Future Degree:{} Prediction'.format(degree),mode='lines'))
        fig1.add_traces(go.Scatter(x=X.squeeze(),y=err_down.squeeze(), name='LR Model CI_Lower',mode='lines', line={'dash': 'dash', 'color': 'red'}))
        fig1.add_traces(go.Scatter(x=X.squeeze(),y=err_up.squeeze(), name='LR Model CI_Upper',mode='lines', line={'dash': 'dash', 'color': 'red'}))

    #Calculate the metrics and print
    print("RMSE for degree =",degree,"is ",np.sqrt(metrics.mean_squared_error(y_d,y_fit)))

    #Plot the trend line and prediction path
    fig.add_traces(go.Scatter(x=x_range.squeeze(), y=y_poly, name="Poly Degree:{}".format(degree)))
    fig.add_traces(go.Scatter(x=future_dates.squeeze(),y=future_cases.squeeze(),name='Future Degree:{} Prediction'.format(degree),mode='lines'))

# Layouts do not depend on the degree, so they are applied once after the loop
fig.update_layout(
    title={'text':"Non-Linear Regression for Alabama Deaths",
           'y':0.92,'x':0.5,
           'xanchor':'center', 
           'yanchor':'top'},
    xaxis_title="Number of Days since first death",
    yaxis_title="Number of new deaths",
)
fig1.update_layout(
    title={'text':"Non-Linear Regression for AL Deaths",
           'y':0.92,'x':0.5,
           'xanchor':'center', 
           'yanchor':'top'},
    xaxis_title="Number of Days since first death",
    yaxis_title="Number of new deaths",
)
fig.show()
fig1.show()

### Observation :
-  From the above plots, the non-linear (polynomial) model of degree 3 fits the data best.

### From stage 2 the top 5 most affected counties are:
  - Hale County
  - Winston County
  - Franklin County
  - Clay County
  - Clarke County

In [None]:
# Remove the rows that contain county name as "statewide unallocated"
AL_county_data= al_covid_df[al_covid_df['County Name'] != 'statewide unallocated']

# The five counties chosen in stage 2 (the names are matched with a trailing space).
# An explicit copy is taken because columns are modified and added below.
top5_county_names = ["Hale County ","Winston County ","Franklin County ","Clay County ","Clarke County "]
top5_county_data = AL_county_data[AL_county_data["County Name"].isin(top5_county_names)].copy()

# Floor negative daily increments at 0. clip() is used instead of chained
# indexing (df[col][mask] = 0), which is not guaranteed to write back to the frame.
top5_county_data['New_cases_per_day'] = top5_county_data['New_cases_per_day'].clip(lower=0)
top5_county_data['New_death_per_day'] = top5_county_data['New_death_per_day'].clip(lower=0)

# Normalize the daily counts of each county to a rate per 100,000 inhabitants,
# matching the "per 100000" axis labels used by the county plots.
PER_CAPITA_SCALE = 100000
top5_county_data['Num_of_Cases_per_day_normalized'] = (top5_county_data['New_cases_per_day']/top5_county_data['population'])*PER_CAPITA_SCALE
top5_county_data['Num_of_Deaths_per_day_normalized'] = (top5_county_data['New_death_per_day']/top5_county_data['population'])*PER_CAPITA_SCALE
top5_county_data

In [None]:
#Function to calculate the Linear and Non-linear model for new cases
def casesNonLinearModel(df_cases,future_dates,county_name):
    """Fit polynomial trends of degree 1-4 to normalized daily cases and plot them.

    Parameters
    ----------
    df_cases : DataFrame
        Must provide the columns ``numOfDaysSinceFirstCase`` and
        ``Num_of_Cases_per_day_normalized``.
    future_dates : numpy array of shape (m, 1)
        Day numbers for which a forecast is drawn.
    county_name : str
        Name inserted into the figure title.

    Returns
    -------
    plotly Figure holding the observed points and, for every degree, the
    fitted curve and the forecast line.

    The RMSE of every degree is printed as a side effect.
    """
    day_column = 'numOfDaysSinceFirstCase'
    target_column = 'Num_of_Cases_per_day_normalized'

    #Define the independent and dependent variable as (n, 1) column vectors
    X = df_cases[day_column].values.reshape(-1, 1)
    y_d = df_cases[target_column].values.reshape(-1, 1)
    # Evenly spaced grid over the observed range, used only to draw the curves
    x_range = np.linspace(X.min(), X.max(), y_d.shape[0]).reshape(-1, 1)

    fig = go.Figure()
    fig.add_traces(go.Scatter(x=df_cases[day_column], y=df_cases[target_column]
                              , name= "Actual Cases", mode="markers"))
    for degree in [1, 2, 3, 4]:

        #Expand the day counter into polynomial terms and fit a linear model on them
        poly = PolynomialFeatures(degree)
        X_poly = poly.fit_transform(X)
        x_range_poly = poly.transform(x_range)
        future_dates_poly = poly.transform(future_dates)

        # PolynomialFeatures already emits the constant (bias) column, hence fit_intercept=False
        model = LinearRegression(fit_intercept=False)
        model.fit(X_poly, df_cases[target_column])

        # Fitted curve on the plotting grid and forecast for the requested days
        y_poly = model.predict(x_range_poly)
        future_cases = model.predict(future_dates_poly)

        # RMSE is measured at the observed days, not on the plotting grid,
        # so it stays correct even if the day counter has gaps
        rmse = np.sqrt(metrics.mean_squared_error(y_d, model.predict(X_poly)))
        print("RMSE for degree = ",degree,"is ",rmse)

        #Plot the trend line and prediction path
        fig.add_traces(go.Scatter(x=x_range.squeeze(), y=y_poly, name="Poly Degree:{}".format(degree)))
        fig.add_traces(go.Scatter(x=future_dates.squeeze(),y=future_cases.squeeze(),name='Future Degree:{} Prediction'.format(degree),mode='lines'))

    # The layout does not depend on the degree, so it is applied once
    fig.update_layout(
        title={'text':"Non-Linear Regression for Alabama {} Cases".format(county_name),
               'y':0.92,'x':0.5,
               'xanchor':'center', 
               'yanchor':'top'},
        xaxis_title="Number of Days since first case",
        yaxis_title="Number of new cases per 100000",
    )
    return fig

In [None]:
 def deathsNonLinearModel(df_deaths,future_dates,county_name):
     """Fit polynomial trends of degree 1-4 to normalized daily deaths and plot them.

     Parameters
     ----------
     df_deaths : DataFrame
         Must provide the columns ``numOfDaysSinceFirstDeath`` and
         ``Num_of_Deaths_per_day_normalized``.
     future_dates : numpy array of shape (m, 1)
         Day numbers for which a forecast is drawn.
     county_name : str
         Name inserted into the figure title.

     Returns
     -------
     plotly Figure holding the observed points and, for every degree, the
     fitted curve and the forecast line.

     The RMSE of every degree is printed as a side effect.
     """
     day_column = 'numOfDaysSinceFirstDeath'
     target_column = 'Num_of_Deaths_per_day_normalized'

     #Define the independent and dependent variable as (n, 1) column vectors
     X = df_deaths[day_column].values.reshape(-1, 1)
     y_d = df_deaths[target_column].values.reshape(-1, 1)
     # Evenly spaced grid over the observed range, used only to draw the curves
     x_range = np.linspace(X.min(), X.max(), y_d.shape[0]).reshape(-1, 1)

     fig = go.Figure()
     fig.add_traces(go.Scatter(x=df_deaths[day_column], y=df_deaths[target_column]
                               , name= "Actual Deaths", mode="markers"))
     for degree in [1, 2, 3, 4]:

         #Expand the day counter into polynomial terms and fit a linear model on them
         poly = PolynomialFeatures(degree)
         X_poly = poly.fit_transform(X)
         x_range_poly = poly.transform(x_range)
         future_dates_poly = poly.transform(future_dates)

         # PolynomialFeatures already emits the constant (bias) column, hence fit_intercept=False
         model = LinearRegression(fit_intercept=False)
         model.fit(X_poly, df_deaths[target_column])

         # Fitted curve on the plotting grid and forecast for the requested days
         y_poly = model.predict(x_range_poly)
         future_cases = model.predict(future_dates_poly)

         # RMSE is measured at the observed days, not on the plotting grid,
         # so it stays correct even if the day counter has gaps
         rmse = np.sqrt(metrics.mean_squared_error(y_d, model.predict(X_poly)))
         print("RMSE for degree =",degree,"is ",rmse)

         #Plot the trend line and prediction path
         fig.add_traces(go.Scatter(x=x_range.squeeze(), y=y_poly, name="Poly Degree:{}".format(degree)))
         fig.add_traces(go.Scatter(x=future_dates.squeeze(),y=future_cases.squeeze(),name='Future Degree:{} Prediction'.format(degree),mode='lines'))

     # The layout does not depend on the degree, so it is applied once
     fig.update_layout(
         title={'text':"Non-Linear Regression for {} Deaths".format(county_name),
                'y':0.92,'x':0.5,
                'xanchor':'center', 
                'yanchor':'top'},
         xaxis_title="Number of Days since first death",
         yaxis_title="Number of new deaths per 100000",
     )
     return fig

### Hale County Cases and Deaths Trends

In [None]:
#Select the Hale County rows; reset_index() renumbers the rows 0..n-1
hale_county_data = top5_county_data[top5_county_data['County Name']=='Hale County '].reset_index()

# Get the index of the row when the first case was detected 
hale_county_data_first_case_index = hale_county_data['Num_of_Cases_per_day_normalized'].ne(0).idxmax() 

# Get the index of the row when the first death was detected 
hale_county_data_first_death_index= hale_county_data['Num_of_Deaths_per_day_normalized'].ne(0).idxmax() 

# Row numbers of this county's own frame, so the counters always have exactly
# one entry per county row (independent of the state-wide frame's length)
row_positions = np.arange(len(hale_county_data))

# Days since the first case: 0 up to and including the first-case row, then 1, 2, 3, ...
numOfCasesSinceDay1 = np.maximum(row_positions - hale_county_data_first_case_index, 0)
hale_county_data.insert(1, "numOfDaysSinceFirstCase", numOfCasesSinceDay1)

# Days since the first death, built the same way
numOfDeathsSinceDay1 = np.maximum(row_positions - hale_county_data_first_death_index, 0)
hale_county_data.insert(3, "numOfDaysSinceFirstDeath", numOfDeathsSinceDay1)

#Select the two columns which are required
hale_county_cases_before_drop = hale_county_data[['numOfDaysSinceFirstCase','Num_of_Cases_per_day_normalized']]
#Drop all the rows up to and including the first case
df_cases = hale_county_cases_before_drop[hale_county_cases_before_drop['numOfDaysSinceFirstCase']!=0]

#Same selection for deaths
hale_county_deaths_before_drop = hale_county_data[['numOfDaysSinceFirstDeath','Num_of_Deaths_per_day_normalized']]
df_deaths = hale_county_deaths_before_drop[hale_county_deaths_before_drop['numOfDaysSinceFirstDeath']!=0]

# Number of modelled days for cases and deaths
print(df_cases.numOfDaysSinceFirstCase.count())
print(df_deaths.numOfDaysSinceFirstDeath.count())

In [None]:
# Forecast windows: the 11 days that follow the last observed day of each
# series. The start days are derived from the data instead of being typed in
# from the counts printed above, so they stay correct when the data changes.
n_future_days = 11
first_future_case_day = int(df_cases['numOfDaysSinceFirstCase'].max()) + 1
first_future_death_day = int(df_deaths['numOfDaysSinceFirstDeath'].max()) + 1
future_dates_cases = np.arange(first_future_case_day, first_future_case_day + n_future_days).reshape(-1,1)
future_dates_deaths = np.arange(first_future_death_day, first_future_death_day + n_future_days).reshape(-1,1)

fig_cases = casesNonLinearModel(df_cases,future_dates_cases,"Hale_county")
fig_cases.show()

In [None]:
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Degree-3 polynomial OLS fit of the normalized daily cases
X = df_cases["numOfDaysSinceFirstCase"].values.reshape(-1,1)
y = df_cases["Num_of_Cases_per_day_normalized"].values.reshape(-1,1)

polynomial_features = PolynomialFeatures(degree=3)
xp = polynomial_features.fit_transform(X)

model = sm.OLS(y, xp).fit()
ypred = model.predict(xp) 

# wls_prediction_std returns (standard error, lower bound, upper bound) in
# that order, so the bounds are unpacked as lower first, upper second
prediction_std, lower, upper = wls_prediction_std(model)

plt.scatter(X,y)
plt.plot(X,ypred)
plt.plot(X,upper,'--',label="Upper") # upper bound of the interval
plt.plot(X,lower,':',label="lower") # lower bound of the interval
plt.title("Confidence Interval for Hale County Cases")
plt.legend(loc='upper left')

In [None]:
# Fit and plot the degree 1-4 polynomial trends for the deaths series, forecasting over future_dates_deaths
fig_deaths = deathsNonLinearModel(df_deaths,future_dates_deaths,"Hale_county")
fig_deaths.show()

In [None]:
#Confidence Interval
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Degree-3 polynomial OLS fit of the normalized daily deaths
X = df_deaths["numOfDaysSinceFirstDeath"].values.reshape(-1,1)
y = df_deaths["Num_of_Deaths_per_day_normalized"].values.reshape(-1,1)

polynomial_features = PolynomialFeatures(degree=3)
xp = polynomial_features.fit_transform(X)

model = sm.OLS(y, xp).fit()
ypred = model.predict(xp) 

# wls_prediction_std returns (standard error, lower bound, upper bound) in
# that order, so the bounds are unpacked as lower first, upper second
prediction_std, lower, upper = wls_prediction_std(model)

plt.scatter(X,y)
plt.plot(X,ypred)
plt.plot(X,upper,'--',label="Upper") # upper bound of the interval
plt.plot(X,lower,':',label="lower") # lower bound of the interval
plt.title("Confidence Interval for Hale County Deaths")
plt.legend(loc='upper left')

### Winston County Cases and Deaths Trends

In [None]:
#Select the Winston County rows; reset_index() renumbers the rows 0..n-1
winston_county_data = top5_county_data[top5_county_data['County Name']=='Winston County '].reset_index()

# Get the index of the row when the first case was detected 
winston_county_data_first_case_index = winston_county_data['Num_of_Cases_per_day_normalized'].ne(0).idxmax() 

# Get the index of the row when the first death was detected 
winston_county_data_first_death_index= winston_county_data['Num_of_Deaths_per_day_normalized'].ne(0).idxmax() 

# Row numbers of this county's own frame, so the counters always have exactly
# one entry per county row (independent of the state-wide frame's length)
row_positions = np.arange(len(winston_county_data))

# Days since the first case: 0 up to and including the first-case row, then 1, 2, 3, ...
numOfCasesSinceDay1 = np.maximum(row_positions - winston_county_data_first_case_index, 0)
winston_county_data.insert(1, "numOfDaysSinceFirstCase", numOfCasesSinceDay1)

# Days since the first death, built the same way
numOfDeathsSinceDay1 = np.maximum(row_positions - winston_county_data_first_death_index, 0)
winston_county_data.insert(3, "numOfDaysSinceFirstDeath", numOfDeathsSinceDay1)

#Select the two columns which are required
winston_county_cases_before_drop = winston_county_data[['numOfDaysSinceFirstCase','Num_of_Cases_per_day_normalized']]
#Drop all the rows up to and including the first case
df_cases = winston_county_cases_before_drop[winston_county_cases_before_drop['numOfDaysSinceFirstCase']!=0]

#Same selection for deaths
winston_county_deaths_before_drop = winston_county_data[['numOfDaysSinceFirstDeath','Num_of_Deaths_per_day_normalized']]
df_deaths = winston_county_deaths_before_drop[winston_county_deaths_before_drop['numOfDaysSinceFirstDeath']!=0]

# Number of modelled days for cases and deaths
print(df_cases.numOfDaysSinceFirstCase.count())
print(df_deaths.numOfDaysSinceFirstDeath.count())

In [None]:
# Forecast windows: the 7 days that follow the last observed day of each
# series. The start days are derived from the data instead of being typed in
# from the counts printed above, so they stay correct when the data changes.
n_future_days = 7
first_future_case_day = int(df_cases['numOfDaysSinceFirstCase'].max()) + 1
first_future_death_day = int(df_deaths['numOfDaysSinceFirstDeath'].max()) + 1
future_dates_cases = np.arange(first_future_case_day, first_future_case_day + n_future_days).reshape(-1,1)
future_dates_deaths = np.arange(first_future_death_day, first_future_death_day + n_future_days).reshape(-1,1)

fig_cases = casesNonLinearModel(df_cases,future_dates_cases,"Winston_county")
fig_cases.show()

In [None]:
#Confidence Interval
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Degree-3 polynomial OLS fit of the normalized daily cases
X = df_cases["numOfDaysSinceFirstCase"].values.reshape(-1,1)
y = df_cases["Num_of_Cases_per_day_normalized"].values.reshape(-1,1)

polynomial_features = PolynomialFeatures(degree=3)
xp = polynomial_features.fit_transform(X)

model = sm.OLS(y, xp).fit()
ypred = model.predict(xp) 

# wls_prediction_std returns (standard error, lower bound, upper bound) in
# that order, so the bounds are unpacked as lower first, upper second
prediction_std, lower, upper = wls_prediction_std(model)

plt.scatter(X,y)
plt.plot(X,ypred)
plt.plot(X,upper,'--',label="Upper") # upper bound of the interval
plt.plot(X,lower,':',label="lower") # lower bound of the interval
plt.title("Confidence Interval for Winston County Cases")
plt.legend(loc='upper left')

In [None]:
# Fit and plot the degree 1-4 polynomial trends for the deaths series, forecasting over future_dates_deaths
fig_deaths = deathsNonLinearModel(df_deaths,future_dates_deaths,"Winston_county")
fig_deaths.show()

In [None]:
#Confidence Interval
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Degree-2 polynomial OLS fit of the normalized daily deaths
X = df_deaths["numOfDaysSinceFirstDeath"].values.reshape(-1,1)
y = df_deaths["Num_of_Deaths_per_day_normalized"].values.reshape(-1,1)

polynomial_features = PolynomialFeatures(degree=2)
xp = polynomial_features.fit_transform(X)

model = sm.OLS(y, xp).fit()
ypred = model.predict(xp) 

# wls_prediction_std returns (standard error, lower bound, upper bound) in
# that order, so the bounds are unpacked as lower first, upper second
prediction_std, lower, upper = wls_prediction_std(model)

plt.scatter(X,y)
plt.plot(X,ypred)
plt.plot(X,upper,'--',label="Upper") # upper bound of the interval
plt.plot(X,lower,':',label="lower") # lower bound of the interval
plt.title("Confidence Interval for Winston County Deaths")
plt.legend(loc='upper left')

### Franklin County Cases and Deaths Trends

In [None]:
#Select the Franklin County rows; reset_index() renumbers the rows 0..n-1
franklin_county_data = top5_county_data[top5_county_data['County Name']=='Franklin County '].reset_index()

# Get the index of the row when the first case was detected 
franklin_county_data_first_case_index = franklin_county_data['Num_of_Cases_per_day_normalized'].ne(0).idxmax() 

# Get the index of the row when the first death was detected 
franklin_county_data_first_death_index= franklin_county_data['Num_of_Deaths_per_day_normalized'].ne(0).idxmax() 

# Row numbers of this county's own frame, so the counters always have exactly
# one entry per county row (independent of the state-wide frame's length)
row_positions = np.arange(len(franklin_county_data))

# Days since the first case: 0 up to and including the first-case row, then 1, 2, 3, ...
numOfCasesSinceDay1 = np.maximum(row_positions - franklin_county_data_first_case_index, 0)
franklin_county_data.insert(1, "numOfDaysSinceFirstCase", numOfCasesSinceDay1)

# Days since the first death, built the same way
numOfDeathsSinceDay1 = np.maximum(row_positions - franklin_county_data_first_death_index, 0)
franklin_county_data.insert(3, "numOfDaysSinceFirstDeath", numOfDeathsSinceDay1)

#Select the two columns which are required
franklin_county_cases_before_drop = franklin_county_data[['numOfDaysSinceFirstCase','Num_of_Cases_per_day_normalized']]
#Drop all the rows up to and including the first case
df_cases = franklin_county_cases_before_drop[franklin_county_cases_before_drop['numOfDaysSinceFirstCase']!=0]

#Same selection for deaths
franklin_county_deaths_before_drop = franklin_county_data[['numOfDaysSinceFirstDeath','Num_of_Deaths_per_day_normalized']]
df_deaths = franklin_county_deaths_before_drop[franklin_county_deaths_before_drop['numOfDaysSinceFirstDeath']!=0]

# Number of modelled days for cases and deaths
print(df_cases.numOfDaysSinceFirstCase.count())
print(df_deaths.numOfDaysSinceFirstDeath.count())

In [None]:
# Forecast windows: the 7 days that follow the last observed day of each
# series. The start days are derived from the data instead of being typed in
# from the counts printed above, so they stay correct when the data changes.
n_future_days = 7
first_future_case_day = int(df_cases['numOfDaysSinceFirstCase'].max()) + 1
first_future_death_day = int(df_deaths['numOfDaysSinceFirstDeath'].max()) + 1
future_dates_cases = np.arange(first_future_case_day, first_future_case_day + n_future_days).reshape(-1,1)
future_dates_deaths = np.arange(first_future_death_day, first_future_death_day + n_future_days).reshape(-1,1)

fig_cases = casesNonLinearModel(df_cases,future_dates_cases,"Franklin_county")
fig_cases.show()

In [None]:
#Confidence Interval
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Degree-3 polynomial OLS fit of the normalized daily cases
X = df_cases["numOfDaysSinceFirstCase"].values.reshape(-1,1)
y = df_cases["Num_of_Cases_per_day_normalized"].values.reshape(-1,1)

polynomial_features = PolynomialFeatures(degree=3)
xp = polynomial_features.fit_transform(X)

model = sm.OLS(y, xp).fit()
ypred = model.predict(xp) 

# wls_prediction_std returns (standard error, lower bound, upper bound) in
# that order, so the bounds are unpacked as lower first, upper second
prediction_std, lower, upper = wls_prediction_std(model)

plt.scatter(X,y)
plt.plot(X,ypred)
plt.plot(X,upper,'--',label="Upper") # upper bound of the interval
plt.plot(X,lower,':',label="lower") # lower bound of the interval
plt.title("Confidence Interval for Franklin County Cases")
plt.legend(loc='upper left')

In [None]:
# Fit and plot the degree 1-4 polynomial trends for the deaths series, forecasting over future_dates_deaths
fig_deaths = deathsNonLinearModel(df_deaths,future_dates_deaths,"Franklin_county")
fig_deaths.show()

In [None]:
#Confidence Interval
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Degree-3 polynomial OLS fit of the normalized daily deaths
X = df_deaths["numOfDaysSinceFirstDeath"].values.reshape(-1,1)
y = df_deaths["Num_of_Deaths_per_day_normalized"].values.reshape(-1,1)

polynomial_features = PolynomialFeatures(degree=3)
xp = polynomial_features.fit_transform(X)

model = sm.OLS(y, xp).fit()
ypred = model.predict(xp) 

# wls_prediction_std returns (standard error, lower bound, upper bound) in
# that order, so the bounds are unpacked as lower first, upper second
prediction_std, lower, upper = wls_prediction_std(model)

plt.scatter(X,y)
plt.plot(X,ypred)
plt.plot(X,upper,'--',label="Upper") # upper bound of the interval
plt.plot(X,lower,':',label="lower") # lower bound of the interval
plt.title("Confidence Interval for Franklin County Deaths")
plt.legend(loc='upper left')

### Clay County Cases and Deaths trends

In [None]:
#Select the Clay County rows; reset_index() renumbers the rows 0..n-1
clay_county_data = top5_county_data[top5_county_data['County Name']=='Clay County '].reset_index()

# Get the index of the row when the first case was detected 
clay_county_data_first_case_index = clay_county_data['Num_of_Cases_per_day_normalized'].ne(0).idxmax() 

# Get the index of the row when the first death was detected 
clay_county_data_first_death_index= clay_county_data['Num_of_Deaths_per_day_normalized'].ne(0).idxmax() 

# Row numbers of this county's own frame, so the counters always have exactly
# one entry per county row (independent of the state-wide frame's length)
row_positions = np.arange(len(clay_county_data))

# Days since the first case: 0 up to and including the first-case row, then 1, 2, 3, ...
numOfCasesSinceDay1 = np.maximum(row_positions - clay_county_data_first_case_index, 0)
clay_county_data.insert(1, "numOfDaysSinceFirstCase", numOfCasesSinceDay1)

# Days since the first death, built the same way
numOfDeathsSinceDay1 = np.maximum(row_positions - clay_county_data_first_death_index, 0)
clay_county_data.insert(3, "numOfDaysSinceFirstDeath", numOfDeathsSinceDay1)

#Select the two columns which are required
clay_county_cases_before_drop = clay_county_data[['numOfDaysSinceFirstCase','Num_of_Cases_per_day_normalized']]
#Drop all the rows up to and including the first case
df_cases = clay_county_cases_before_drop[clay_county_cases_before_drop['numOfDaysSinceFirstCase']!=0]

#Same selection for deaths
clay_county_deaths_before_drop = clay_county_data[['numOfDaysSinceFirstDeath','Num_of_Deaths_per_day_normalized']]
df_deaths = clay_county_deaths_before_drop[clay_county_deaths_before_drop['numOfDaysSinceFirstDeath']!=0]

# Number of modelled days for cases and deaths
print(df_cases.numOfDaysSinceFirstCase.count())
print(df_deaths.numOfDaysSinceFirstDeath.count())

In [None]:
# Forecast windows: the 7 days that follow the last observed day of each
# series. The start days are derived from the data instead of being typed in
# from the counts printed above, so they stay correct when the data changes.
n_future_days = 7
first_future_case_day = int(df_cases['numOfDaysSinceFirstCase'].max()) + 1
first_future_death_day = int(df_deaths['numOfDaysSinceFirstDeath'].max()) + 1
future_dates_cases = np.arange(first_future_case_day, first_future_case_day + n_future_days).reshape(-1,1)
future_dates_deaths = np.arange(first_future_death_day, first_future_death_day + n_future_days).reshape(-1,1)

fig_cases = casesNonLinearModel(df_cases,future_dates_cases,"Clay_county")
fig_cases.show()

In [None]:
#Confidence Interval
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Degree-4 polynomial OLS fit of the normalized daily cases
X = df_cases["numOfDaysSinceFirstCase"].values.reshape(-1,1)
y = df_cases["Num_of_Cases_per_day_normalized"].values.reshape(-1,1)

polynomial_features = PolynomialFeatures(degree=4)
xp = polynomial_features.fit_transform(X)

model = sm.OLS(y, xp).fit()
ypred = model.predict(xp) 

# wls_prediction_std returns (standard error, lower bound, upper bound) in
# that order, so the bounds are unpacked as lower first, upper second
prediction_std, lower, upper = wls_prediction_std(model)

plt.scatter(X,y)
plt.plot(X,ypred)
plt.plot(X,upper,'--',label="Upper") # upper bound of the interval
plt.plot(X,lower,':',label="lower") # lower bound of the interval
# Title names the series so it cannot be confused with the deaths plot
plt.title("Confidence Interval for Clay County Cases")
plt.legend(loc='upper left')

In [None]:
# Fit and plot the degree 1-4 polynomial trends for the deaths series, forecasting over future_dates_deaths
fig_deaths = deathsNonLinearModel(df_deaths,future_dates_deaths,"Clay_county")
fig_deaths.show()

In [None]:
#Confidence Interval
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Degree-3 polynomial OLS fit of the normalized daily deaths
X = df_deaths["numOfDaysSinceFirstDeath"].values.reshape(-1,1)
y = df_deaths["Num_of_Deaths_per_day_normalized"].values.reshape(-1,1)

polynomial_features = PolynomialFeatures(degree=3)
xp = polynomial_features.fit_transform(X)

model = sm.OLS(y, xp).fit()
ypred = model.predict(xp) 

# wls_prediction_std returns (standard error, lower bound, upper bound) in
# that order, so the bounds are unpacked as lower first, upper second
prediction_std, lower, upper = wls_prediction_std(model)

plt.scatter(X,y)
plt.plot(X,ypred)
plt.plot(X,upper,'--',label="Upper") # upper bound of the interval
plt.plot(X,lower,':',label="lower") # lower bound of the interval
plt.title("Confidence Interval for Clay County Deaths")
plt.legend(loc='upper left')

### Clarke County Cases and Deaths trends

In [None]:
# Select the Clarke County rows (the match string includes a trailing space)
clarke_county_data = top5_county_data[top5_county_data['County Name']=='Clarke County '].reset_index()

# Row position of the first non-zero normalized daily case count
clarke_county_data_first_case_index = clarke_county_data['Num_of_Cases_per_day_normalized'].ne(0).idxmax()

# Row position of the first non-zero normalized daily death count
clarke_county_data_first_death_index = clarke_county_data['Num_of_Deaths_per_day_normalized'].ne(0).idxmax()

# Day counters: 0 up to and including the first-event row, then 1, 2, 3, ...
# They are sized from clarke_county_data itself (not from al_cleaned_data_df),
# so the inserts cannot fail on a length mismatch between the two frames.
row_positions = np.arange(len(clarke_county_data))

# Number of days since the first case
numOfCasesSinceDay1 = np.clip(row_positions - clarke_county_data_first_case_index, 0, None).tolist()
clarke_county_data.insert(1, "numOfDaysSinceFirstCase", numOfCasesSinceDay1)

# Number of days since the first death
numOfDeathsSinceDay1 = np.clip(row_positions - clarke_county_data_first_death_index, 0, None).tolist()
clarke_county_data.insert(3, "numOfDaysSinceFirstDeath", numOfDeathsSinceDay1)

# Keep the two columns required by the case model
clarke_county_cases_before_drop = clarke_county_data[['numOfDaysSinceFirstCase','Num_of_Cases_per_day_normalized']]
# Keep only the rows after the first-case row (day counter != 0)
df_cases = clarke_county_cases_before_drop[clarke_county_cases_before_drop['numOfDaysSinceFirstCase']!=0]

# Same selection for the death model
clarke_county_deaths_before_drop = clarke_county_data[['numOfDaysSinceFirstDeath','Num_of_Deaths_per_day_normalized']]
df_deaths = clarke_county_deaths_before_drop[clarke_county_deaths_before_drop['numOfDaysSinceFirstDeath']!=0]

# Number of rows left for each model
print(df_cases.numOfDaysSinceFirstCase.count())
print(df_deaths.numOfDaysSinceFirstDeath.count())

In [None]:
# Forecast the 7 day-counter values that follow the last observed day.
# The ranges are derived from the frames so they stay in step with the data
# instead of relying on hard-coded day numbers.
last_case_day = int(df_cases["numOfDaysSinceFirstCase"].max())
last_death_day = int(df_deaths["numOfDaysSinceFirstDeath"].max())
future_dates_cases = np.arange(last_case_day + 1, last_case_day + 8).reshape(-1,1)
future_dates_deaths = np.arange(last_death_day + 1, last_death_day + 8).reshape(-1,1)

fig_cases = casesNonLinearModel(df_cases,future_dates_cases,"Clarke_county")
fig_cases.show()

In [None]:
# Interval bounds around a degree-4 polynomial OLS fit of Clarke County daily cases
X = df_cases["numOfDaysSinceFirstCase"].values.reshape(-1,1)
y = df_cases["Num_of_Cases_per_day_normalized"].values.reshape(-1,1)

# Expand the day counter into polynomial terms (includes the constant column)
polynomial_features = PolynomialFeatures(degree=4)
xp = polynomial_features.fit_transform(X)

model = sm.OLS(y, xp).fit()
ypred = model.predict(xp)

from statsmodels.sandbox.regression.predstd import wls_prediction_std
# wls_prediction_std returns (std, lower bound, upper bound) in that order
_, lower, upper = wls_prediction_std(model)

plt.scatter(X,y)
plt.plot(X,ypred,linewidth=2)
plt.plot(X,upper,'--',label="Upper") # upper interval bound
plt.plot(X,lower,':',label="lower")  # lower interval bound
plt.title("Confidence Interval for Clarke county cases")
plt.xlabel("Number of days since first case")
plt.ylabel("Normalized new cases per day")
plt.legend(loc='upper left')
plt.show()

In [None]:
# Build and display the Clarke County deaths figure with deathsNonLinearModel
# (defined outside this section), passing the death-model frame and the
# forecast day numbers held in future_dates_deaths.
fig_deaths = deathsNonLinearModel(df_deaths,future_dates_deaths,"Clarke_county")
fig_deaths.show()

In [None]:
# Interval bounds around a degree-3 polynomial OLS fit of Clarke County daily deaths
X = df_deaths["numOfDaysSinceFirstDeath"].values.reshape(-1,1)
y = df_deaths["Num_of_Deaths_per_day_normalized"].values.reshape(-1,1)

# Expand the day counter into polynomial terms (includes the constant column)
polynomial_features = PolynomialFeatures(degree=3)
xp = polynomial_features.fit_transform(X)

model = sm.OLS(y, xp).fit()
ypred = model.predict(xp)

from statsmodels.sandbox.regression.predstd import wls_prediction_std
# wls_prediction_std returns (std, lower bound, upper bound) in that order
_, lower, upper = wls_prediction_std(model)

plt.scatter(X,y)
plt.plot(X,ypred)
plt.plot(X,upper,'--',label="Upper") # upper interval bound
plt.plot(X,lower,':',label="lower")  # lower interval bound
plt.title("Confidence Interval for Clarke county deaths")
plt.xlabel("Number of days since first death")
plt.ylabel("Normalized new deaths per day")
plt.legend(loc='upper left')
plt.show()

### Member task 2 Conclusion
- After analyzing the trend lines and prediction paths of the above counties, we can observe that Hale County is at the highest risk: its number of cases is predicted to increase at a higher rate than in the other counties.
- The county with the second-highest risk is Winston County.

In [None]:
# Load the merged COVID / hospital-beds sample that the state-level models
# and the hypothesis tests below read from.
covid_hospital_merged_df = pd.read_parquet("../../../../data/member/venkat/hospitalBeds_merged_sample.parquet.gzip")
covid_hospital_merged_df

In [None]:
def nonLinearModelStateCases(state,num_degree,prediction):
    """Fit polynomial regressions to a state's daily new cases and plot them.

    Reads the module-level ``covid_hospital_merged_df``, derives daily new
    cases per date for ``state``, and fits one polynomial model per requested
    degree against the number of days since the first case.

    Parameters
    ----------
    state : str
        Value matched against the ``State`` column.
    num_degree : iterable of int
        Polynomial degrees to fit; each adds a trend line and a forecast line.
    prediction : bool
        If truthy, the forecast line runs up to day 800; otherwise it covers
        a 7-step window starting at ``len(data) - 1``.

    Returns
    -------
    plotly.graph_objs.Figure
        Scatter of the actual values plus the fitted and forecast lines.

    Raises
    ------
    ValueError
        If ``covid_hospital_merged_df`` has no rows for ``state``.
    """
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of covid_hospital_merged_df.
    state_hospital_data = covid_hospital_merged_df[covid_hospital_merged_df['State']==state].copy()
    if state_hospital_data.empty:
        raise ValueError("No rows found for state {!r}".format(state))

    # Daily increments of the cumulative counters; negative increments become 0.
    # NOTE(review): .diff() follows the stored row order; if the state slice
    # stacks several counties, confirm the boundary rows are the ones removed
    # by the first-date filter below.
    state_hospital_data['Num_of_Cases_per_day'] = state_hospital_data['Cases'].diff().clip(lower=0)
    state_hospital_data['Num_of_Deaths_per_day'] = state_hospital_data['Death'].diff().clip(lower=0)

    # Drop the first date: .diff() has no valid predecessor for it
    state_hospital_data = state_hospital_data[state_hospital_data['Date'] != '2020-01-22']

    # Total number of new cases and deaths for each day
    state_hospital_data_groupedBydate = state_hospital_data.groupby("Date").sum().reset_index()

    # Row position of the first day with a non-zero number of new cases
    state_first_Cases_index = state_hospital_data_groupedBydate['Num_of_Cases_per_day'].ne(0).idxmax()

    # Day counter: 0 up to and including the first-case row, then 1, 2, 3, ...
    # Sized from this state's own frame rather than from a global frame.
    row_positions = np.arange(len(state_hospital_data_groupedBydate))
    numOfCasesSinceDay1 = np.clip(row_positions - state_first_Cases_index, 0, None)
    state_hospital_data_groupedBydate.insert(3, "numOfDaysSinceFirstCases", numOfCasesSinceDay1)

    # Keep only the rows after the first-case row and the two modelling columns
    after_first_case = state_hospital_data_groupedBydate['numOfDaysSinceFirstCases'] != 0
    df_Cases = state_hospital_data_groupedBydate.loc[after_first_case, ['numOfDaysSinceFirstCases','Num_of_Cases_per_day']]

    # Independent and dependent variables
    X = df_Cases.numOfDaysSinceFirstCases.values.reshape(-1, 1)
    y_d = df_Cases.Num_of_Cases_per_day.values.reshape(-1, 1)
    x_range = np.linspace(X.min(), X.max(), y_d.shape[0]).reshape(-1, 1)

    # Forecast range: up to day 800 for a long extrapolation, else 7 steps
    last_date = len(df_Cases)-1
    predictionDays = 800 if prediction else last_date+7
    future_dates = np.arange(last_date,predictionDays).reshape(-1,1)

    fig = go.Figure()
    fig.add_traces(go.Scatter(x=df_Cases['numOfDaysSinceFirstCases'], y=df_Cases['Num_of_Cases_per_day']
                              , name= "Actual Cases", mode="markers"))
    for degree in num_degree:
        # Polynomial expansion already contains the constant column, hence
        # fit_intercept=False on the linear model.
        poly = PolynomialFeatures(degree)
        X_poly = poly.fit_transform(X)
        x_range_poly = poly.transform(x_range)

        model = LinearRegression(fit_intercept=False)
        model.fit(X_poly, df_Cases.Num_of_Cases_per_day)

        # Fitted number of new cases per day since the first case
        y_poly = model.predict(x_range_poly)

        # Calculate the metric and print it
        print("RMSE for degree =",degree,"is ",np.sqrt(metrics.mean_squared_error(y_d,y_poly)))

        # Plot the trend line
        fig.add_traces(go.Scatter(x=x_range.ravel(), y=y_poly, name="Poly Degree:{}".format(degree)))

        # Plot the prediction path; skipped when the forecast range is empty
        # (for example, more than 800 observed days with prediction enabled)
        if future_dates.size:
            future_cases = model.predict(poly.transform(future_dates))
            fig.add_traces(go.Scatter(x=future_dates.ravel(),y=future_cases.ravel(),name='Deg:{} Prediction'.format(degree),mode='lines'))

    # The layout does not depend on the degree, so it is set once
    fig.update_layout(
        title={'text':"Non-Linear Regression for {} Cases".format(state),
               'y':0.92,'x':0.5,
               'xanchor':'center',
               'yanchor':'top'},
        xaxis_title="Number of Days since first Case",
        yaxis_title="Number of new Cases ",
    )
    return fig

In [None]:
def nonLinearModelStateDeaths(state,num_degree,prediction):
    """Fit polynomial regressions to a state's daily new deaths and plot them.

    Reads the module-level ``covid_hospital_merged_df``, derives daily new
    deaths per date for ``state``, and fits one polynomial model per requested
    degree against the number of days since the first death.

    Parameters
    ----------
    state : str
        Value matched against the ``State`` column.
    num_degree : iterable of int
        Polynomial degrees to fit; each adds a trend line and a forecast line.
    prediction : bool
        If truthy, the forecast line runs up to day 800; otherwise it covers
        a 7-step window starting at ``len(data) - 1``.

    Returns
    -------
    plotly.graph_objs.Figure
        Scatter of the actual values plus the fitted and forecast lines.

    Raises
    ------
    ValueError
        If ``covid_hospital_merged_df`` has no rows for ``state``.
    """
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of covid_hospital_merged_df.
    state_hospital_data = covid_hospital_merged_df[covid_hospital_merged_df['State']==state].copy()
    if state_hospital_data.empty:
        raise ValueError("No rows found for state {!r}".format(state))

    # Daily increments of the cumulative counters; negative increments become 0.
    # NOTE(review): .diff() follows the stored row order; if the state slice
    # stacks several counties, confirm the boundary rows are the ones removed
    # by the first-date filter below.
    state_hospital_data['Num_of_Cases_per_day'] = state_hospital_data['Cases'].diff().clip(lower=0)
    state_hospital_data['Num_of_Deaths_per_day'] = state_hospital_data['Death'].diff().clip(lower=0)

    # Drop the first date: .diff() has no valid predecessor for it
    state_hospital_data = state_hospital_data[state_hospital_data['Date'] != '2020-01-22']

    # Total number of new cases and deaths for each day
    state_hospital_data_groupedBydate = state_hospital_data.groupby("Date").sum().reset_index()

    # Row position of the first day with a non-zero number of new deaths
    state_first_death_index = state_hospital_data_groupedBydate['Num_of_Deaths_per_day'].ne(0).idxmax()

    # Day counter: 0 up to and including the first-death row, then 1, 2, 3, ...
    # Sized from this state's own frame rather than from a global frame.
    row_positions = np.arange(len(state_hospital_data_groupedBydate))
    numOfDeathsSinceDay1 = np.clip(row_positions - state_first_death_index, 0, None)
    state_hospital_data_groupedBydate.insert(3, "numOfDaysSinceFirstDeath", numOfDeathsSinceDay1)

    # Keep only the rows after the first-death row and the two modelling columns
    after_first_death = state_hospital_data_groupedBydate['numOfDaysSinceFirstDeath'] != 0
    df_deaths = state_hospital_data_groupedBydate.loc[after_first_death, ['numOfDaysSinceFirstDeath','Num_of_Deaths_per_day']]

    # Independent and dependent variables
    X = df_deaths.numOfDaysSinceFirstDeath.values.reshape(-1, 1)
    y_d = df_deaths.Num_of_Deaths_per_day.values.reshape(-1, 1)
    x_range = np.linspace(X.min(), X.max(), y_d.shape[0]).reshape(-1, 1)

    # Forecast range: up to day 800 for a long extrapolation, else 7 steps
    last_date = len(df_deaths)-1
    predictionDays = 800 if prediction else last_date+7
    future_dates = np.arange(last_date,predictionDays).reshape(-1,1)

    fig = go.Figure()
    fig.add_traces(go.Scatter(x=df_deaths['numOfDaysSinceFirstDeath'], y=df_deaths['Num_of_Deaths_per_day']
                              , name= "Actual Deaths", mode="markers"))
    for degree in num_degree:
        # Polynomial expansion already contains the constant column, hence
        # fit_intercept=False on the linear model.
        poly = PolynomialFeatures(degree)
        X_poly = poly.fit_transform(X)
        x_range_poly = poly.transform(x_range)

        model = LinearRegression(fit_intercept=False)
        model.fit(X_poly, df_deaths.Num_of_Deaths_per_day)

        # Fitted number of new deaths per day since the first death
        y_poly = model.predict(x_range_poly)

        # Calculate the metric and print it
        print("RMSE for degree =",degree,"is ",np.sqrt(metrics.mean_squared_error(y_d,y_poly)))

        # Plot the trend line
        fig.add_traces(go.Scatter(x=x_range.ravel(), y=y_poly, name="Poly Degree:{}".format(degree)))

        # Plot the prediction path; skipped when the forecast range is empty
        # (for example, more than 800 observed days with prediction enabled)
        if future_dates.size:
            future_deaths = model.predict(poly.transform(future_dates))
            fig.add_traces(go.Scatter(x=future_dates.ravel(),y=future_deaths.ravel(),name='Deg:{} Prediction'.format(degree),mode='lines'))

    # The layout does not depend on the degree, so it is set once
    fig.update_layout(
        title={'text':"Non-Linear Regression for {} Deaths".format(state),
               'y':0.92,'x':0.5,
               'xanchor':'center',
               'yanchor':'top'},
        xaxis_title="Number of Days since first death",
        yaxis_title="Number of new deaths ",
    )
    return fig

### Plotting non linear regression graph and prediction for Alabama state Cases

In [None]:
# Alabama cases: fit polynomial degrees 1-4 (an RMSE is printed per degree)
# and plot them with the short forecast window (prediction=False).
fig_name = nonLinearModelStateCases("AL",[1,2,3,4],False)
fig_name.show()

In [None]:
# Alabama cases: degree-2 model only, with the forecast line extended
# up to day 800 (prediction=True).
fig_name = nonLinearModelStateCases("AL",[2],True)
fig_name.show()

### Observation:

 - From the above plot and the model metrics we can observe that the non-linear model with degree 2 best fits the data, so we will use degree 2 for prediction; we can observe that cases are predicted to increase in the future.

### Plotting non linear regression graph and prediction for Alabama state deaths

In [None]:
# Alabama deaths: fit polynomial degrees 1-4 (an RMSE is printed per degree)
# and plot them with the short forecast window (prediction=False).
fig_name = nonLinearModelStateDeaths("AL",[1,2,3,4],False)
fig_name.show()

In [None]:
# Alabama deaths: degree-3 model only, with the forecast line extended
# up to day 800 (prediction=True).
fig_name = nonLinearModelStateDeaths("AL",[3],True)
fig_name.show()

### Observation:

 - From the above plot and the model metrics we can observe that the non-linear model with degree 3 best fits the data, so we will use degree 3 for prediction; we can observe that deaths are predicted to increase in the future.

### Hypothesis Testing:

In [None]:
# Group by state and sum every column to get one row per state, then drop
# the FIPS identifier columns, whose sums carry no meaning.
# NOTE(review): Cases and Death are treated as cumulative counters elsewhere
# in this notebook (they are .diff()-ed per date); if this frame holds one
# row per date, these sums add cumulative values across dates - confirm
# that is intended.
analysis_data_grp = covid_hospital_merged_df.groupby(['State']).sum().reset_index()
analysis_data_grp = analysis_data_grp.drop(columns=['countyFIPS', 'StateFIPS'])
analysis_data_grp

### Hypothesis to determine if states with the highest population have the highest number of cases

In [None]:
# Welch's two-sample t-test (equal_var=False) on the per-state Cases and
# population columns.
# NOTE(review): ttest_ind tests whether the two columns have equal means;
# confirm that this is the hypothesis the heading above intends to test.
stats.ttest_ind(a=analysis_data_grp['Cases'], b= analysis_data_grp['population'],equal_var=False)

In this case, the p-value is lower than our significance level α (equal to 1 − confidence level, i.e. 0.05), so we reject the null hypothesis.

### Hypothesis to determine if states with the highest number of cases have the highest number of deaths

In [None]:
# Welch's two-sample t-test (equal_var=False) on the per-state Cases and
# Death columns.
# NOTE(review): ttest_ind tests whether the two columns have equal means;
# confirm that this is the hypothesis the heading above intends to test.
stats.ttest_ind(a=analysis_data_grp['Cases'], b= analysis_data_grp['Death'],equal_var=False)

In this case, the p-value is lower than our significance level α (equal to 1 − confidence level, i.e. 0.05), so we reject the null hypothesis.

### Hypothesis to determine if states with the highest population have the highest number of hospital beds available

In [None]:
# Welch's two-sample t-test (equal_var=False) on the per-state population
# and total_beds_7_day_avg columns.
# NOTE(review): ttest_ind tests whether the two columns have equal means;
# confirm that this is the hypothesis the heading above intends to test.
stats.ttest_ind(a=analysis_data_grp['population'], b= analysis_data_grp['total_beds_7_day_avg'],equal_var=False)

In this case, the p-value is lower than our significance level α (equal to 1 − confidence level, i.e. 0.05), so we reject the null hypothesis.

### Hypothesis to determine if states with the highest number of hospital beds have used more ICU beds

In [None]:
# Welch's two-sample t-test (equal_var=False) on the per-state
# total_beds_7_day_avg and icu_beds_used_7_day_avg columns.
# NOTE(review): ttest_ind tests whether the two columns have equal means;
# confirm that this is the hypothesis the heading above intends to test.
stats.ttest_ind(a=analysis_data_grp['total_beds_7_day_avg'], b= analysis_data_grp['icu_beds_used_7_day_avg'],equal_var=False)

In this case, the p-value is greater than our significance level α (equal to 1 − confidence level, i.e. 0.05), so we fail to reject the null hypothesis.