In [39]:
from collections import Counter
from scipy.stats import gamma
from sklearn.preprocessing import MinMaxScaler
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from datetime import date
plt.style.use('ggplot')

In [40]:
df = pd.read_csv('12_final_processed.csv')

In [41]:
#Filtering out data for feb 21 and march 21 
feb_21 = df[(df['Date'] >= '2021-02-01') & (df['Date'] <= '2021-02-28')]
march_21 = df[(df['Date'] >= '2021-03-01') & (df['Date'] <= '2021-03-31')]

In [42]:
#Calculating the mean for each state/column for feb 21 and march 21 

feb_21_mean_death_MI = feb_21['MI deaths'].mean()
feb_21_mean_cases_MI = feb_21['MI confirmed'].mean()


march_21_mean_death_MI = march_21['MI deaths'].mean()
march_21_mean_cases_MI = march_21['MI confirmed'].mean()


feb_21_mean_death_MN = feb_21['MN deaths'].mean()
feb_21_mean_cases_MN = feb_21['MN confirmed'].mean()


march_21_mean_death_MN = march_21['MN deaths'].mean()
march_21_mean_cases_MN = march_21['MN confirmed'].mean()

#calculating corrected variance for use in tests

def variance(col_data):
    sq_sum = 0
    mean_col_data = col_data.mean()
    n = len(col_data)
    for i in col_data:
        sq_sum = sq_sum + (i -mean_col_data)*(i-mean_col_data)
    return sq_sum/(n-1)

**Summary of the results that we run below:**

We *accept* the NULL hypothesis for deaths in MN for two population Wald's test, z-test, t-test and unpaired t-test

We *accept* the NULL hypothesis for cases in MN for two sample unpaired t-test

We *accept* the NULL hypothesis for death in MI for two sample unpaired t-test

We *reject* the NULL hypothesis for all the other cases  





**Wald's one Sample testing for confirmed cases and deaths in MI and MN**

*NULL hypothesis* H0: mean of confirmed deaths/cases for feb 21 = mean of deaths/cases for march 21

*Alternate hypothesis* H1: mean of confirmed deaths/cases for feb 21 != mean of deaths/cases for march 21 

In [43]:
# walds one sample testing for deaths and confirmed cases for both states 

def walds_1_testing(march_21_mean,feb_21_mean,march_21):
  w_1_numerator = march_21_mean - feb_21_mean
  w_1_denominator = np.sqrt(march_21_mean/len(march_21))
  return np.abs(w_1_numerator/w_1_denominator)



#for death in MI
w_1_result_death_MI = walds_1_testing(march_21_mean_death_MI,feb_21_mean_death_MI,march_21)
if(w_1_result_death_MI>1.96):
  print("Walds 1 sample testing for mean of death in MI is w="+str(w_1_result_death_MI) +" which is greater than z-alpha/2 = 1.96 so we reject the NULL hypothesis")
else:
  print("Walds 1 sample testing for mean of death in MI is  w="+str(w_1_result_death_MI)+ " which is less than z-alpha/2 = 1.96 so we accept the NULL hypothesis")

#for cases in MI
w_1_result_cases_MI = walds_1_testing(march_21_mean_cases_MI,feb_21_mean_cases_MI,march_21)
if(w_1_result_cases_MI>1.96):
  print("Walds 1 sample testing for mean of cases in MI is w="+str(w_1_result_cases_MI) +" which is greater than z-alpha/2 = 1.96 so we reject the NULL hypothesis");
else:
  print("Walds 1 sample testing for mean of cases in MI is w="+str(w_1_result_cases_MI)+ " which is less than z-alpha/2 = 1.96 so we accept the NULL hypothesis")



#for death in MN
w_1_result_death_MN = walds_1_testing(march_21_mean_death_MN,feb_21_mean_death_MN,march_21)
if(w_1_result_death_MN>1.96):
  print("\nWalds 1 sample testing for mean of death in MN is w="+str(w_1_result_death_MN) +" which is greater than z-alpha/2 = 1.96 so we reject the NULL hypothesis")
else:
  print("\nWalds 1 sample testing for mean of death in MN is  w="+str(w_1_result_death_MN)+ " which is less than z-alpha/2 = 1.96 so we accept the NULL hypothesis")

#for cases in MN
w_1_result_cases_MN = walds_1_testing(march_21_mean_cases_MN,feb_21_mean_cases_MN,march_21)
if(w_1_result_cases_MN>1.96):
  print("Walds 1 sample testing for mean of cases in MN is w="+str(w_1_result_cases_MN) +" which is greater than z-alpha/2 = 1.96 so we reject the NULL hypothesis");
else:
  print("Walds 1 sample testing for mean of cases in MN is w="+str(w_1_result_cases_MN)+ " which is less than z-alpha/2 = 1.96 so we accept the NULL hypothesis")

Walds 1 sample testing for mean of death in MI is w=19.18673565170543 which is greater than z-alpha/2 = 1.96 so we reject the NULL hypothesis
Walds 1 sample testing for mean of cases in MI is w=193.96139177500143 which is greater than z-alpha/2 = 1.96 so we reject the NULL hypothesis

Walds 1 sample testing for mean of death in MN is w=2.4912289212760137 which is greater than z-alpha/2 = 1.96 so we reject the NULL hypothesis
Walds 1 sample testing for mean of cases in MN is w=51.90948310795567 which is greater than z-alpha/2 = 1.96 so we reject the NULL hypothesis


**Wald's two population testing for confirmed cases and deaths in MI and MN**

*NULL hypothesis* H0: mean of confirmed deaths/cases for feb 21 = mean of deaths/cases for march 21

*Alternate hypothesis* H1: mean of confirmed deaths/cases for feb 21 != mean of deaths/cases for march 21 

In [44]:
##walds 2 population testing for deaths and confirmed cases for both states

def walds_2_testing(march_21_mean,feb_21_mean,march_21, feb_21):
    #using values of both months for calculating standard error
  se = np.sqrt((march_21_mean/len(march_21)) + (feb_21_mean/len(feb_21))) 
  w_2_result = (march_21_mean - feb_21_mean)/se
  return np.abs(w_2_result)


#for death calculation in MI
w_2_result_death_MI = walds_2_testing(march_21_mean_death_MI,feb_21_mean_death_MI,march_21, feb_21)
if(w_2_result_death_MI>1.96):
  print("walds 2 sample testing for mean of death in MI is w="+str(w_2_result_death_MI) +" which is greater than z_alpha/2 = 1.96 so reject the NULL hypothesis")
else:
  print("walds 2 sample testing for mean of death in MI is w="+str(w_2_result_death_MI)+ " which is less than z_alpha/2 = 1.96 so accept the NULL hypothesis")



#for cases calculation in MI
w_2_result_cases_MI = walds_2_testing(march_21_mean_cases_MI,feb_21_mean_cases_MI,march_21, feb_21)
if(w_2_result_cases_MI>1.96):
  print("walds 2 sample testing for mean of cases in MI is w="+str(w_2_result_cases_MI) +" which is greater than z_alpha/2 = 1.96 so reject the NULL hypothesis")
else:
  print("walds 2 sample testing for mean of cases in MI is w="+str(w_2_result_cases_MI)+ " which is less than z_alpha/2 = 1.96 so accept the NULL hypothesis")


#for death calculation in MN
w_2_result_death_MN = walds_2_testing(march_21_mean_death_MN,feb_21_mean_death_MN,march_21, feb_21)
if(w_2_result_death_MN>1.96):
  print("\nwalds 2 sample testing for mean of death in MN is w="+str(w_2_result_death_MN) +" which is greater than z_alpha/2 = 1.96 so reject the NULL hypothesis")
else:
  print("\nwalds 2 sample testing for mean of death in MN is w="+str(w_2_result_death_MN)+ " which is less than z_alpha/2 = 1.96 so accept the NULL hypothesis")



#for cases calculation in MN
w_2_result_cases_MN = walds_2_testing(march_21_mean_cases_MN,feb_21_mean_cases_MN,march_21, feb_21)
if(w_2_result_cases_MN>1.96):
  print("walds 2 sample testing for mean of cases in MN is w="+str(w_2_result_cases_MN) +" which is greater than z_alpha/2 = 1.96 so reject the NULL hypothesis")
else:
  print("walds 2 sample testing for mean of cases in MN is w="+str(w_2_result_cases_MN)+ " which is less than z_alpha/2 = 1.96 so accept the NULL hypothesis")

walds 2 sample testing for mean of death in MI is w=11.143749021603197 which is greater than z_alpha/2 = 1.96 so reject the NULL hypothesis
walds 2 sample testing for mean of cases in MI is w=162.15637581729627 which is greater than z_alpha/2 = 1.96 so reject the NULL hypothesis

walds 2 sample testing for mean of death in MN is w=1.7783708657644213 which is less than z_alpha/2 = 1.96 so accept the NULL hypothesis
walds 2 sample testing for mean of cases in MN is w=38.69473932283188 which is greater than z_alpha/2 = 1.96 so reject the NULL hypothesis


**z-testing for confirmed cases and deaths in MI and MN**

*NULL hypothesis* H0: mean of confirmed deaths/cases for feb 21 = mean of deaths/cases for march 21

*Alternate hypothesis* H1: mean of confirmed deaths/cases for feb 21 != mean of deaths/cases for march 21 

In [48]:
#z testing for deaths and confirmed cases for both states

def z_test(march_21_mean, feb_21_mean, col_name):
    z_num = march_21_mean - feb_21_mean
    z_den = np.sqrt(variance(df[[col_name]].values)/(len(df)))

    z_result = np.abs(z_num/z_den)
    return z_result


#for death in MI
z_result_death_MI = z_test(march_21_mean_death_MI, feb_21_mean_death_MI, 'MI deaths')
if(z_result_death_MI>1.96):
  print("z-test for mean of death in MI is w="+str(z_result_death_MI) +" which is greater than z_alpha/2 = 1.96 so reject the NULL hypothesis")
else:
  print("z-test for mean of death in MI is  w="+str(z_result_death_MI)+ " which is less than z_alpha/2 = 1.96 so accept the NULL hypothesis")

#for cases in MI
z_result_cases_MI = z_test(march_21_mean_cases_MI, feb_21_mean_cases_MI, 'MI confirmed')
if(z_result_cases_MI>1.96):
  print("z-test for mean of cases in MI is w="+str(z_result_cases_MI) +" which is greater than z_alpha/2 = 1.96 so reject the NULL hypothesis")
else:
  print("z-test for mean of cases in MI is w="+str(z_result_cases_MI)+ " which is less than z_alpha/2 = 1.96 so accept the NULL hypothesis")

#for death in MN
z_result_death_MN = z_test(march_21_mean_death_MN, feb_21_mean_death_MN, 'MN deaths')
if(z_result_death_MN>1.96):
  print("z-test for mean of death in MN is w="+str(z_result_death_MN) +" which is greater than z_alpha/2 = 1.96 so reject the NULL hypothesis")
else:
  print("\nz-test for mean of death in MN is  w="+str(z_result_death_MN)+ " which is less than z_alpha/2 = 1.96 so accept the NULL hypothesis")

#for cases in MN
z_result_cases_MN = z_test(march_21_mean_cases_MN, feb_21_mean_cases_MN, 'MN confirmed')
if(z_result_cases_MN>1.96):
  print("z-test for mean of cases in MN is w="+str(z_result_cases_MN) +" which is greater than z_alpha/2 = 1.96 so reject the NULL hypothesis")
else:
  print("z-test for mean of cases in MN is w="+str(z_result_cases_MN)+ " which is less than z_alpha/2 = 1.96 so accept the NULL hypothesis")


z-test for mean of death in MI is w=[5.66552914] which is greater than z_alpha/2 = 1.96 so reject the NULL hypothesis
z-test for mean of cases in MI is w=[16.52716098] which is greater than z_alpha/2 = 1.96 so reject the NULL hypothesis

z-test for mean of death in MN is  w=[1.47842417] which is less than z_alpha/2 = 1.96 so accept the NULL hypothesis
z-test for mean of cases in MN is w=[3.54678737] which is greater than z_alpha/2 = 1.96 so reject the NULL hypothesis


**Assumptions in z-test/ Is z-test applicable?**

z test only works if either the data is large or normally distributed
Here, the data points are greater than 30. So, we can say that z-test is applicable eventhough data is not normally distributed.

**T one sample testing for confirmed cases and deaths in MI and MN**

*NULL hypothesis* H0: mean of confirmed deaths/cases for feb 21 = mean of deaths/cases for march 21

*Alternate hypothesis* H1: mean of confirmed deaths/cases for feb 21 != mean of deaths/cases for march 21 

In [38]:

# T one sample testing for deaths and confirmed cases for both states

def T_1_sample(col,march_21_mean,feb_21_mean,march_21):
  t_1_num = march_21_mean - feb_21_mean
  t_1_den = np.sqrt(variance(march_21[[col]].values)/len(march_21))
  return np.abs(t_1_num/t_1_den)

# for deaths in MI
t_1_result_death_MI =T_1_sample('MI deaths',march_21_mean_death_MI,feb_21_mean_death_MI,march_21)
if(t_1_result_death_MI>2.3596):
  print("T-Test 1 sample testing for mean of death in MI is T1="+str(t_1_result_death_MI) +" which is greater than t(n-1,alpha/2) = 2.3596 so reject the NULL hypothesis")
else:
  print("T-Test 1 sample testing for mean of death in MI is  T1="+str(t_1_result_death_MI)+ " which is less than t(n-1,alpha/2) = 2.3596 so accept the NULL hypothesis")


# for cases in MI
t_1_result_cases_MI =T_1_sample('MI confirmed',march_21_mean_cases_MI,feb_21_mean_cases_MI,march_21)
if(t_1_result_cases_MI>2.3596):
  print("T-Test 1 sample testing for mean of cases in MI is T1="+str(t_1_result_cases_MI) +" which is greater than t(n-1,alpha/2) = 2.3596 so reject the NULL hypothesis")
else:
  print("T-Test 1 sample testing for mean of cases in MI is T1="+str(t_1_result_cases_MI)+ " which is less than t(n-1,alpha/2) = 2.3596 so accept the NULL hypothesis")

# for deaths in MN
t_1_result_death_MN =T_1_sample('MN deaths',march_21_mean_death_MN,feb_21_mean_death_MN,march_21)
if(t_1_result_death_MN>2.3596):
  print("T-Test 1 sample testing for mean of death in MN is T1="+str(t_1_result_death_MN) +" which is greater than t(n-1,alpha/2) = 2.3596 so reject the NULL hypothesis")
else:
  print("\nT-Test 1 sample testing for mean of death in MN is  T1="+str(t_1_result_death_MN)+ " which is less than t(n-1,alpha/2) = 2.3596 so accept the NULL hypothesis")


# for cases in MN
t_1_result_cases_MN =T_1_sample('MN confirmed',march_21_mean_cases_MN,feb_21_mean_cases_MN,march_21)
if(t_1_result_cases_MN>2.3596):
  print("T-Test 1 sample testing for mean of cases in MN is T1="+str(t_1_result_cases_MN) +" which is greater than t(n-1,alpha/2) = 2.3596 so reject the NULL hypothesis")
else:
  print("T-Test 1 sample testing for mean of cases in MN is T1="+str(t_1_result_cases_MN)+ " which is less than t(n-1,alpha/2) = 2.3596 so accept the NULL hypothesis")



T-Test 1 sample testing for mean of death in MI is T1=[5.28586887] which is greater than t(n-1,alpha/2) = 2.3596 so reject the NULL hypothesis
T-Test 1 sample testing for mean of cases in MI is T1=[4.86763697] which is greater than t(n-1,alpha/2) = 2.3596 so reject the NULL hypothesis

T-Test 1 sample testing for mean of death in MN is  T1=[0.35109704] which is less than t(n-1,alpha/2) = 2.3596 so accept the NULL hypothesis
T-Test 1 sample testing for mean of cases in MN is T1=[4.63262675] which is greater than t(n-1,alpha/2) = 2.3596 so reject the NULL hypothesis


**Assumptions/Is t-test applicable?**

For this course T-test assumes that the data is normally distributed. But here we do not have normally distributed data so it is not a right choice to apply t-test


**T two sample testing for confirmed cases and deaths in MI and MN**

*NULL hypothesis* H0: mean of confirmed deaths/cases for feb 21 = mean of deaths/cases for march 21

*Alternate hypothesis* H1: mean of confirmed deaths/cases for feb 21 != mean of deaths/cases for march 21 

In [21]:
# Unpaired T two sample testing for deaths and confirmed cases for both states.
#Here we consider both samples so m=31 and n =28 so threshold will be t(n+m-2, alpha/2)

def unpaired_T(feb_21_mean, march_21_mean, col):
    T2_num = feb_21_mean - march_21_mean

    feb_21_var = variance(feb_21[[col]].values)
    march_21_var = variance(march_21[[col]].values)
    T2_den = np.sqrt(march_21_var/len(march_21) + feb_21_var/len(feb_21))

    #T test unpaired result
    return np.abs(T2_num/T2_den)

# T 2 sample test for deaths in MI
T2_death_MI = unpaired_T(feb_21_mean_death_MI, march_21_mean_death_MI, 'MI deaths')
if(T2_death_MI>2.3022):
  print("T two sample unpaired  testing for mean of death in MI is T="+str(T2_death_MI) +" which is greater than t(57,alpha/2) = 2.3022 so reject the NULL hypothesis")
else:
  print("T two sample unpaired  testing for mean of death in MI is T="+str(T2_death_MI)+ " which is less than t(57,alpha/2) = 2.3022 so accept the NULL hypothesis")


# T 2 sample test for cases in MI
T2_cases_MI = unpaired_T(feb_21_mean_cases_MI, march_21_mean_cases_MI, 'MI confirmed')
if(T2_cases_MI>2.3022):
  print("T two sample unpaired  testing for mean of cases in MI is T="+str(T2_cases_MI) +" which is greater than t(57,alpha/2) = 2.3022 so reject the NULL hypothesis")
else:
  print("T two sample unpaired  testing for mean of cases in MI is T="+str(T2_cases_MI)+ " which is less than t(57,alpha/2) = 2.3022 so accept the NULL hypothesis")

# T 2 sample test for deaths in MN
T2_death_MN = unpaired_T(feb_21_mean_death_MN, march_21_mean_death_MN, 'MI deaths')
if(T2_death_MN>2.3022):
  print("T two sample unpaired  testing for mean of death in MN is T="+str(T2_death_MN) +" which is greater than t(57,alpha/2) = 2.3022 so reject the NULL hypothesis")
else:
  print("\nT two sample unpaired  testing for mean of death in MN is T="+str(T2_death_MN)+ " which is less than t(57,alpha/2) = 2.3022 so accept the NULL hypothesis")


# T 2 sample test for cases in MN
T2_cases_MN = unpaired_T(feb_21_mean_cases_MN, march_21_mean_cases_MN, 'MI confirmed')
if(T2_cases_MN>2.3022):
  print("T two sample unpaired  testing for mean of cases in MN is T="+str(T2_cases_MN) +" which is greater than t(57,alpha/2) = 2.3022 so reject the NULL hypothesis")
else:
  print("T two sample unpaired  testing for mean of cases in MN is T="+str(T2_cases_MN)+ " which is less than t(57,alpha/2) = 2.3022 so accept the NULL hypothesis")


T two sample unpaired  testing for mean of death in MI is T=[2.18521476] which is less than t(57,alpha/2) = 2.3022 so accept the NULL hypothesis
T two sample unpaired  testing for mean of cases in MI is T=[4.67047039] which is greater than t(57,alpha/2) = 2.3022 so reject the NULL hypothesis

T two sample unpaired  testing for mean of death in MN is T=[0.21816034] which is less than t(57,alpha/2) = 2.3022 so accept the NULL hypothesis
T two sample unpaired  testing for mean of cases in MN is T=[0.7358662] which is less than t(57,alpha/2) = 2.3022 so accept the NULL hypothesis
