In [158]:
import pandas as pd
#reading exact columns to reduce memory usage
df = pd.read_csv("covid19.csv", usecols=["sex", "race", "ethnicity", "death_yn"])
# statistics
df.info(verbose=False, memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19020962 entries, 0 to 19020961
Columns: 4 entries, sex to death_yn
dtypes: object(4)
memory usage: 4.3 GB


In [159]:
# Cleaning the data by removing missing values and unknown

df = df[df['death_yn'].isin(["Missing", 'Unknown']) == False]
df = df[df['race'].isin(['Missing', 'Unknown']) == False]
df = df[df['ethnicity'].isin(['Missing', 'Unknown']) == False]
df = df[df['sex'].isin(['Missing', 'Unknown', 'Other']) == False]


df.dropna()



Unnamed: 0,sex,race,ethnicity,death_yn
4,Female,White,Non-Hispanic/Latino,No
10,Female,Black,Non-Hispanic/Latino,No
12,Female,White,Hispanic/Latino,No
13,Female,White,Hispanic/Latino,No
16,Male,White,Non-Hispanic/Latino,No
...,...,...,...,...
19020936,Female,White,Non-Hispanic/Latino,No
19020943,Female,White,Hispanic/Latino,No
19020944,Female,White,Non-Hispanic/Latino,No
19020949,Female,Black,Non-Hispanic/Latino,No


In [160]:
#mapping yes to value 1 and no to value 0
df['death_yn'] = df['death_yn'].replace({'Yes': int(1), 'No': int(0)})
df.dropna()


Unnamed: 0,sex,race,ethnicity,death_yn
4,Female,White,Non-Hispanic/Latino,0.0
10,Female,Black,Non-Hispanic/Latino,0.0
12,Female,White,Hispanic/Latino,0.0
13,Female,White,Hispanic/Latino,0.0
16,Male,White,Non-Hispanic/Latino,0.0
...,...,...,...,...
19020936,Female,White,Non-Hispanic/Latino,0.0
19020943,Female,White,Hispanic/Latino,0.0
19020944,Female,White,Non-Hispanic/Latino,0.0
19020949,Female,Black,Non-Hispanic/Latino,0.0


In [167]:
#dropping rows with values zeroes
death_rates = df.groupby(['sex', 'ethnicity', 'race'])['death_yn'].mean()
death_counts = death_rates.reset_index()

death_counts = death_counts[death_counts['death_yn']!=0]


death_counts



Unnamed: 0,sex,ethnicity,race,death_yn
2,Female,Hispanic/Latino,Black,0.004266
3,Female,Hispanic/Latino,Multiple/Other,0.039101
5,Female,Hispanic/Latino,White,0.020511
6,Female,Non-Hispanic/Latino,American Indian/Alaska Native,0.003781
7,Female,Non-Hispanic/Latino,Asian,0.019124
8,Female,Non-Hispanic/Latino,Black,0.013169
9,Female,Non-Hispanic/Latino,Multiple/Other,0.000371
11,Female,Non-Hispanic/Latino,White,0.01667
14,Male,Hispanic/Latino,Black,0.006614
15,Male,Hispanic/Latino,Multiple/Other,0.050061


In [37]:
#data before filtering

Unnamed: 0,sex,race,ethnicity,death_yn
1,Female,White,Non-Hispanic/Latino,Unknown
3,Male,White,Non-Hispanic/Latino,Missing
4,Female,White,Non-Hispanic/Latino,No
5,Unknown,White,Non-Hispanic/Latino,Missing
7,Female,,,Missing
...,...,...,...,...
19020955,Male,,,Missing
19020956,Female,White,Hispanic/Latino,Missing
19020957,Female,Black,Unknown,No
19020959,Male,White,Non-Hispanic/Latino,Missing


Hypothesis Testing
Null Hypothesis (H0): There is no association between demographics and death probability.
Alternative Hypothesis (Ha): There is an association between demographics and death probability.


In [168]:

from scipy import stats

print(death_counts['death_yn'])
chi2_statistic1, p_value, expected_frequency, observed_frequency  = stats.chi2_contingency(death_counts['death_yn'])

# Significance level (alpha) = 0.05
print(p_value)

2     0.004266
3     0.039101
5     0.020511
6     0.003781
7     0.019124
8     0.013169
9     0.000371
11    0.016670
14    0.006614
15    0.050061
17    0.040949
18    0.014764
19    0.032137
20    0.019730
21    0.000745
23    0.024328
Name: death_yn, dtype: float64
1.0


death_yn,0.0,1.0
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
FemaleHispanic/LatinoAmerican Indian/Alaska Native,699,0
FemaleHispanic/LatinoAsian,184,0
FemaleHispanic/LatinoBlack,5835,25
FemaleHispanic/LatinoMultiple/Other,983,40
FemaleHispanic/LatinoNative Hawaiian/Other Pacific Islander,7,0
FemaleHispanic/LatinoWhite,186865,3913
FemaleNon-Hispanic/LatinoAmerican Indian/Alaska Native,6324,24
FemaleNon-Hispanic/LatinoAsian,48571,947
FemaleNon-Hispanic/LatinoBlack,292240,3900
FemaleNon-Hispanic/LatinoMultiple/Other,18855,7


p-value of 1.0 suggests no evidence against the null hypothesis, indicating a statistically insignificant association between demographics and COVID-19 death probability.
 It signifies that the observed distribution of death outcomes across different demographic groups is highly likely to have occurred by chance.

In [172]:
df = pd.read_csv("covid19.csv",usecols=["death_yn","icu_yn", "hosp_yn"])
df = df[df['death_yn'].isin(["Missing", 'Unknown']) == False]
df = df[df['icu_yn'].isin(["Missing", 'Unknown']) == False]
df = df[df['hosp_yn'].isin(["Missing", 'Unknown']) == False]

df.dropna()


Unnamed: 0,hosp_yn,icu_yn,death_yn
26,No,No,No
70,Yes,No,No
278,No,No,No
284,No,No,No
312,No,No,No
...,...,...,...
19020769,Yes,No,No
19020842,No,No,No
19020859,Yes,No,No
19020907,No,No,No


In [173]:
df['death_yn'] = df['death_yn'].replace({'Yes': int(1), 'No': int(0)})
df.dropna(subset=['icu_yn', 'hosp_yn'], inplace=True)

In [185]:
death_rates = df.groupby(['icu_yn', "hosp_yn"])['death_yn'].mean()
death_rates = death_rates.reset_index()
death_rates

Unnamed: 0,icu_yn,hosp_yn,death_yn
0,No,No,0.00293
1,No,Yes,0.092487
2,Yes,No,0.071942
3,Yes,Yes,0.469952


Hypothesis Testing
Null Hypothesis (H0): There is no association between being hospialized and icu and death probability.
Alternative Hypothesis (Ha): There is an association between being hospialized and icu and death probability.


In [186]:
chi2_statistic1, p_value, expected_frequency, observed_frequency  = stats.chi2_contingency(death_rates['death_yn'])

# Hypothesis Testing
# Null Hypothesis (H0): There is no association between hospitalization and death.
# Alternative Hypothesis (Ha): There is an association between hospitalization and death.

# Significance level (alpha) = 0.05
if p_value < 0.05:
  print("Reject H0. There is a statistically significant association between hospitalization and death.")
else:
  print("Fail to reject H0. Evidence is inconclusive at the 5% significance level indicating a statistically insignificant association between demographics and COVID-19 death probability.It signifies that the observed distribution of death outcomes across different demographic groups is highly likely to have occurred by chance.")

Fail to reject H0. Evidence is inconclusive at the 5% significance level indicating a statistically insignificant association between demographics and COVID-19 death probability.It signifies that the observed distribution of death outcomes across different demographic groups is highly likely to have occurred by chance.


In [86]:
print(p_value, chi2_statistic1, expected_frequency, observed_frequency)

0.0 112331.71573758239 3 [[2.98690140e+05 1.11688597e+04]
 [4.00436557e+04 1.49734428e+03]
 [2.67979497e+02 1.00205029e+01]
 [1.75642245e+04 6.56775479e+02]]
