# Part 1

In [164]:
import datetime
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

In [165]:
dtype = {
    "ID": str,
    "age": str,
    "sex": str,
    "city": str,
    "province": str,
    "country": str,
    "latitude": float,
    "longitude": float,
    "geo_resolution": str,
    "date_onset_symptoms": str,
    "date_admission_hospital": str,
    "date_confirmation": str,
    "symptoms": str,
    "lives_in_Wuhan": str,
    "travel_history_dates": str,
    "travel_history_location": str,
    "reported_market_exposure": str,
    "additional_information": str,
    "chronic_disease": str,
    "source": str,
    "sequence_available": str,
    "outcome": str,
    "date_death_or_discharge": str,
    "notes_for_discussion": str,
    "location": str,
    "admin1": str,
    "admin2": str,
    "admin3": str,
    "country_new": str,
    "admin_id": float,
    "data_moderator_initials": str,
    "travel_history_binary": str
}

In [166]:
df: pd.DataFrame = pd.read_csv(
    "./latestdata.csv",
    dtype=dtype,
    # nrows=1000
)
df

Unnamed: 0,ID,age,sex,city,province,country,latitude,longitude,geo_resolution,date_onset_symptoms,...,date_death_or_discharge,notes_for_discussion,location,admin3,admin2,admin1,country_new,admin_id,data_moderator_initials,travel_history_binary
0,000-1-1,,male,Shek Lei,Hong Kong,China,22.365019,114.133808,point,,...,,,Shek Lei,,,Hong Kong,China,8029.0,,
1,000-1-10,78,male,Vo Euganeo,Veneto,Italy,45.297748,11.658382,point,,...,22.02.2020,,Vo' Euganeo,,,Veneto,Italy,8954.0,,
2,000-1-100,61,female,,,Singapore,1.353460,103.815100,admin0,,...,17.02.2020,,,,,,Singapore,200.0,,
3,000-1-1000,,,Zhengzhou City,Henan,China,34.629310,113.468000,admin2,,...,,,,,Zhengzhou City,Henan,China,10091.0,,
4,000-1-10000,,,Pingxiang City,Jiangxi,China,27.513560,113.902900,admin2,,...,,,,,Pingxiang City,Jiangxi,China,7060.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2676306,010-99995,52,female,Calleria,Coronel Portillo,Peru,-8.378190,-74.539700,point,,...,,,,,,,Peru,14429.0,,False
2676307,010-99996,52,female,Pueblo Libre,Lima,Peru,-12.076530,-77.067350,point,,...,,,,,,,Peru,14695.0,,False
2676308,010-99997,52,female,Comas,Lima,Peru,-11.932980,-77.040850,point,,...,,,,,,,Peru,14400.0,,False
2676309,010-99998,52,male,Callao,Callao,Peru,-12.000740,-77.118240,point,,...,,,,,,,Peru,14516.0,,False


In [167]:
df.info(verbose=True, max_cols=307382, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2676311 entries, 0 to 2676310
Data columns (total 33 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   ID                        2676311 non-null  object 
 1   age                       578018 non-null   object 
 2   sex                       580157 non-null   object 
 3   city                      1698630 non-null  object 
 4   province                  2223647 non-null  object 
 5   country                   2676196 non-null  object 
 6   latitude                  2676250 non-null  float64
 7   longitude                 2676250 non-null  float64
 8   geo_resolution            2676250 non-null  object 
 9   date_onset_symptoms       261599 non-null   object 
 10  date_admission_hospital   116211 non-null   object 
 11  date_confirmation         2567822 non-null  object 
 12  symptoms                  2052 non-null     object 
 13  lives_in_Wuhan            4

In [168]:
df = df.drop('latitude', axis=1)

In [169]:
df = df.drop('longitude', axis=1)

In [170]:
df = df.drop('ID', axis=1)

In [171]:
df = df.drop('data_moderator_initials', axis=1)

In [172]:
df = df.drop('source', axis=1)

In [173]:
df = df.drop('notes_for_discussion', axis=1)

In [174]:
df = df.drop('sequence_available', axis=1)

In [175]:
df = df.drop('additional_information', axis=1)

In [176]:
for col in df:
    if df.dtypes[col] == "object":
        df[col] = df[col].str.lower()

In [177]:
def parse_outcome(outcome):
    if isinstance(outcome, float):
        return "recovered"
    else:
        dead = ["died", "deceased", "death", "dead"]
        hospital = ["receiving treatment", "under treatment", "hospitalized",
                    "symptoms only improved with cough. currently hospitalized for follow-up.",
                    "treated in an intensive care unit (14.02.2020)", "stable condition", "stable",
                    "critical condition, intubated as of 14.02.2020", "severe", "critical condition", "severe illness",
                    "unstable", "migrated", "migrated_other"]
        recovered = ["alive", "discharge", "discharged from hospital", "recovering at home 03.03.2020",
                     "not hospitalized", "released from quarantine", "https://www.mspbs.gov.py/covid-19.php",
                     "discharged"]

        if outcome in dead:
            return "death"
        if outcome in hospital:
            return "hospitalized"
        if outcome in recovered:
            return "recovered"
    return outcome


df["outcome"].apply(parse_outcome).value_counts()

outcome
recovered       2466865
hospitalized     203447
death              5999
Name: count, dtype: int64

In [178]:
df["outcome"] = df["outcome"].apply(parse_outcome)

In [179]:
df["country"] = df["country"].fillna(df["country"].mode()[0])

In [180]:
def get_province(row):
    if row["province"].value_counts().empty:
        return float('NaN')
    else:
        return row["province"].value_counts().idxmax()

df["province"] = df["province"].fillna(df.groupby('country').apply(get_province, include_groups=False))

In [181]:
# Clean age field
def age_to_int(age_str):
    if isinstance(age_str, float):
        return None

    if "-" in age_str:
        age_min, age_max = age_str.split("-")
        if age_min == '':
            return int(age_max)
        if age_max == '':
            return int(age_min)
        age_min, age_max = int(age_min), int(age_max)
        return int((age_min + age_max) / 2)

    if "weeks" in age_str:
        return 0

    if "months" in age_str or "month" in age_str:
        num, _ = age_str.split(" ")
        if int(num) < 12:
            return 0
        return int(int(num) / 12)

    if age_str[-1] == "+" or age_str[-1] == "-":
        return int(age_str[:-1])

    return int(float(age_str))


df["age"] = df["age"].apply(age_to_int)
df["age"] = df["age"].fillna(int(df["age"].mean()))

In [182]:
def parse_bool(x):
    return not isinstance(x, float)

In [183]:
df["chronic_disease_binary"] = df["chronic_disease_binary"].apply(parse_bool)

In [184]:
df["lives_in_Wuhan"] = df["lives_in_Wuhan"].apply(parse_bool)

In [185]:
df["travel_history_binary"] = df["travel_history_binary"].apply(parse_bool)

In [186]:
df["reported_market_exposure"] = df["reported_market_exposure"].apply(parse_bool)

In [187]:
def parse_asymptomatic(x):
    return isinstance(x, float)

df["asymptomatic"] = df["date_onset_symptoms"].apply(parse_bool)

In [188]:
df.info(verbose=True, max_cols=307382, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2676311 entries, 0 to 2676310
Data columns (total 26 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   age                       2676311 non-null  float64
 1   sex                       580157 non-null   object 
 2   city                      1698630 non-null  object 
 3   province                  2223647 non-null  object 
 4   country                   2676311 non-null  object 
 5   geo_resolution            2676250 non-null  object 
 6   date_onset_symptoms       261599 non-null   object 
 7   date_admission_hospital   116211 non-null   object 
 8   date_confirmation         2567822 non-null  object 
 9   symptoms                  2052 non-null     object 
 10  lives_in_Wuhan            2676311 non-null  bool   
 11  travel_history_dates      2611 non-null     object 
 12  travel_history_location   9222 non-null     object 
 13  reported_market_exposure  2

In [189]:
df['date_confirmation'] = pd.to_datetime(df['date_confirmation'], format='%d.%m.%Y', errors='coerce')

In [190]:
df['days_since_subject_zero'] = df["date_confirmation"].apply(lambda d: (d - datetime.datetime.strptime("17.11.2019", '%d.%m.%Y')).days)

In [191]:
df["outcome_binary"] = df['outcome'].apply(lambda x: 0 if x not in ["recovered", "hospitalized"] else 1)

In [192]:
df["hospitalized_binary"] = df['outcome'].apply(lambda x: 1 if x in ["hospitalized"] else 0)

In [193]:
df["death_binary"] = df['outcome'].apply(lambda x: 0 if x in ["recovered", "hospitalized"] else 1)

In [194]:
df

Unnamed: 0,age,sex,city,province,country,geo_resolution,date_onset_symptoms,date_admission_hospital,date_confirmation,symptoms,...,admin2,admin1,country_new,admin_id,travel_history_binary,asymptomatic,days_since_subject_zero,outcome_binary,hospitalized_binary,death_binary
0,45.0,male,shek lei,hong kong,china,point,,,2020-02-14,,...,,hong kong,china,8029.0,False,False,89.0,1,1,0
1,78.0,male,vo euganeo,veneto,italy,point,,,2020-02-21,,...,,veneto,italy,8954.0,False,False,96.0,0,0,1
2,61.0,female,,,singapore,admin0,,,2020-02-14,,...,,,singapore,200.0,False,False,89.0,1,0,0
3,45.0,,zhengzhou city,henan,china,admin2,,,2020-01-26,,...,zhengzhou city,henan,china,10091.0,False,False,70.0,1,0,0
4,45.0,,pingxiang city,jiangxi,china,admin2,,,2020-02-14,,...,pingxiang city,jiangxi,china,7060.0,False,False,89.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2676306,52.0,female,calleria,coronel portillo,peru,point,,,2020-05-17,,...,,,peru,14429.0,True,False,182.0,1,0,0
2676307,52.0,female,pueblo libre,lima,peru,point,,,2020-05-17,,...,,,peru,14695.0,True,False,182.0,1,0,0
2676308,52.0,female,comas,lima,peru,point,,,2020-05-17,,...,,,peru,14400.0,True,False,182.0,1,0,0
2676309,52.0,male,callao,callao,peru,point,,,2020-05-17,,...,,,peru,14516.0,True,False,182.0,1,0,0


# Correlation Part 1A

In [195]:
def is_correlated(var_name1, var_name2, data_frame: pd.DataFrame):
    try:
        new_frame = data_frame[[var_name1, var_name2]]
        new_frame = new_frame.dropna(how="any", axis=0)
        ct = pd.crosstab(index=new_frame[var_name1], columns=new_frame[var_name2])
        print(ct)
        chi2_res = chi2_contingency(ct, )
        p, x, dof_ = chi2_res[1], "correlated" if chi2_res[1] < 0.05 else "not-correlated", chi2_res[2]
        return p, x, dof_
    except ValueError:
        return 0, "no data", 0

In [196]:
outcome_df = df[df['outcome'].notna()]

In [197]:
corr = [[], []]
to_check = ["sex", "country", "age", "lives_in_Wuhan", "travel_history_binary"]
for var1 in df:
    if var1 != "ID" and var1 in to_check:
        p_val, correlation, dof = is_correlated(var1, "outcome", outcome_df)
        if correlation == "correlated":
            corr[0].append((p_val, dof, var1))
        else:
            corr[1].append((p_val, dof, var1))
        print("")

corr[0].sort()
corr[1].sort()

print("Correlated")

for ln in corr[0]:
    print(ln)

print("Non-Correlated")

for ln in corr[1]:
    print(ln)

outcome  death  hospitalized  recovered
age                                    
0.0          6            33       1747
1.0         11           146        927
2.0          0           151       2584
3.0          0           153        725
4.0          0           131       1068
...        ...           ...        ...
105.0        0             0          6
106.0        0             0          1
107.0        0             0          1
120.0        0             0          4
121.0        0             0          1

[110 rows x 3 columns]

outcome  death  hospitalized  recovered
sex                                    
female     546         10788     261635
male       941         20586     285661

outcome               death  hospitalized  recovered
country                                             
afghanistan               0             0          8
albania                   0             0       1054
algeria                   7             0       1260
andorra                   0  

# MCA
Here we will use MCA instead of PCA in order to produce visualizations for discrete variables

In [198]:
import prince
from sklearn.preprocessing import LabelEncoder

In [216]:
continuous_dataset = df[["outcome_binary", "age", "travel_history_binary", "days_since_subject_zero", "asymptomatic", "lives_in_Wuhan"]]
continuous_dataset =  continuous_dataset.dropna(how="any", axis=0).sample(n=5000, random_state=42)
continuous_dataset.corr()

Unnamed: 0,outcome_binary,age,travel_history_binary,days_since_subject_zero,asymptomatic,lives_in_Wuhan
outcome_binary,1.0,-0.0129,-0.006045,-0.032273,-0.003426,0.001402
age,-0.0129,1.0,0.003018,-0.080521,0.026092,0.016733
travel_history_binary,-0.006045,0.003018,1.0,0.377955,0.029831,-0.129943
days_since_subject_zero,-0.032273,-0.080521,0.377955,1.0,-0.109739,-0.075885
asymptomatic,-0.003426,0.026092,0.029831,-0.109739,1.0,0.037113
lives_in_Wuhan,0.001402,0.016733,-0.129943,-0.075885,0.037113,1.0


In [217]:
continuous_dataset.info(verbose=True, max_cols=307382, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 5000 entries, 2387036 to 931788
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   outcome_binary           5000 non-null   int64  
 1   age                      5000 non-null   float64
 2   travel_history_binary    5000 non-null   bool   
 3   days_since_subject_zero  5000 non-null   float64
 4   asymptomatic             5000 non-null   bool   
 5   lives_in_Wuhan           5000 non-null   bool   
dtypes: bool(3), float64(2), int64(1)
memory usage: 170.9 KB


In [219]:
pca = prince.PCA(
    n_components=1000,
    n_iter=5,
    copy=True,
    check_input=True,
    random_state=42,
    engine="sklearn" # same parameter as sklearn.preprocessing.OneHotEncoder
)
pca = pca.fit(continuous_dataset)
pca.eigenvalues_summary

Unnamed: 0_level_0,eigenvalue,% of variance,% of variance (cumulative)
component,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1.447,24.12%,24.12%
1,1.042,17.36%,41.48%
2,1.001,16.68%,58.16%
3,0.979,16.31%,74.46%
4,0.947,15.79%,90.25%
5,0.585,9.75%,100.00%


In [222]:
pca.plot(
    continuous_dataset,
    x_component=0,
    y_component=1
)

In [203]:
pca.column_contributions_.head().style.format('{:.0%}')

component,0,1,2,3,4
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
outcome_binary,0%,7%,81%,11%,0%
age,2%,36%,2%,59%,2%
travel_history_binary,44%,9%,3%,0%,44%
days_since_subject_zero,51%,0%,0%,0%,49%
asymptomatic,2%,48%,15%,30%,5%


#

# Part 2: Bayes Net