In [399]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

In [400]:
dtype = {
    "ID": str,
    "age": str,
    "sex": str,
    "city": str,
    "province": str,
    "country": str,
    "latitude": float,
    "longitude": float,
    "geo_resolution": str,
    "date_onset_symptoms": str,
    "date_admission_hospital": str,
    "date_confirmation": str,
    "symptoms": str,
    "lives_in_Wuhan": str,
    "travel_history_dates": str,
    "travel_history_location": str,
    "reported_market_exposure": str,
    "additional_information": str,
    "chronic_disease": str,
    "source": str,
    "sequence_available": str,
    "outcome": str,
    "date_death_or_discharge": str,
    "notes_for_discussion": str,
    "location": str,
    "admin1": str,
    "admin2": str,
    "admin3": str,
    "country_new": str,
    "admin_id": float,
    "data_moderator_initials": str,
    "travel_history_binary": str
}

In [401]:
df: pd.DataFrame = pd.read_csv(
    "./latestdata.csv",
    dtype=dtype,
    # nrows=1000
)
df

Unnamed: 0,ID,age,sex,city,province,country,latitude,longitude,geo_resolution,date_onset_symptoms,...,date_death_or_discharge,notes_for_discussion,location,admin3,admin2,admin1,country_new,admin_id,data_moderator_initials,travel_history_binary
0,000-1-1,,male,Shek Lei,Hong Kong,China,22.365019,114.133808,point,,...,,,Shek Lei,,,Hong Kong,China,8029.0,,
1,000-1-10,78,male,Vo Euganeo,Veneto,Italy,45.297748,11.658382,point,,...,22.02.2020,,Vo' Euganeo,,,Veneto,Italy,8954.0,,
2,000-1-100,61,female,,,Singapore,1.353460,103.815100,admin0,,...,17.02.2020,,,,,,Singapore,200.0,,
3,000-1-1000,,,Zhengzhou City,Henan,China,34.629310,113.468000,admin2,,...,,,,,Zhengzhou City,Henan,China,10091.0,,
4,000-1-10000,,,Pingxiang City,Jiangxi,China,27.513560,113.902900,admin2,,...,,,,,Pingxiang City,Jiangxi,China,7060.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2676306,010-99995,52,female,Calleria,Coronel Portillo,Peru,-8.378190,-74.539700,point,,...,,,,,,,Peru,14429.0,,False
2676307,010-99996,52,female,Pueblo Libre,Lima,Peru,-12.076530,-77.067350,point,,...,,,,,,,Peru,14695.0,,False
2676308,010-99997,52,female,Comas,Lima,Peru,-11.932980,-77.040850,point,,...,,,,,,,Peru,14400.0,,False
2676309,010-99998,52,male,Callao,Callao,Peru,-12.000740,-77.118240,point,,...,,,,,,,Peru,14516.0,,False


In [402]:
df.info(verbose=True, max_cols=307382, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2676311 entries, 0 to 2676310
Data columns (total 33 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   ID                        2676311 non-null  object 
 1   age                       578018 non-null   object 
 2   sex                       580157 non-null   object 
 3   city                      1698630 non-null  object 
 4   province                  2223647 non-null  object 
 5   country                   2676196 non-null  object 
 6   latitude                  2676250 non-null  float64
 7   longitude                 2676250 non-null  float64
 8   geo_resolution            2676250 non-null  object 
 9   date_onset_symptoms       261599 non-null   object 
 10  date_admission_hospital   116211 non-null   object 
 11  date_confirmation         2567822 non-null  object 
 12  symptoms                  2052 non-null     object 
 13  lives_in_Wuhan            4

In [403]:
# Total of missing values
total_cells = np.prod(df.shape)
total_missing = df.isnull().sum().sum()
# Percentage of missing data
print((total_missing / total_cells) * 100)

56.49519511043826


In [404]:
df = df.drop('latitude', axis=1)

In [405]:
df = df.drop('longitude', axis=1)

In [406]:
df = df.drop('ID', axis=1)

In [407]:
df = df.drop('data_moderator_initials', axis=1)

In [408]:
df = df.drop('source', axis=1)

In [409]:
df = df.drop('notes_for_discussion', axis=1)

In [410]:
df = df.drop('sequence_available', axis=1)

In [411]:
df = df.drop('additional_information', axis=1)

In [442]:
df["outcome"].value_counts()

outcome
recovered       2466865
hospitalized     203447
death              5999
Name: count, dtype: int64

In [413]:
def parse_outcome(outcome):
    if isinstance(outcome, float):
        return "recovered"
    else:
        dead = ["died", "deceased", "death", "dead"]
        hospital = ["receiving treatment", "under treatment", "hospitalized",
                    "symptoms only improved with cough. currently hospitalized for follow-up.",
                    "treated in an intensive care unit (14.02.2020)", "stable condition", "stable",
                    "critical condition, intubated as of 14.02.2020", "severe", "critical condition", "severe illness",
                    "unstable", "migrated", "migrated_other"]
        recovered = ["alive", "discharge", "discharged from hospital", "recovering at home 03.03.2020",
                     "not hospitalized", "released from quarantine", "https://www.mspbs.gov.py/covid-19.php",
                     "discharged"]

        if outcome in dead:
            return "death"
        if outcome in hospital:
            return "hospitalized"
        if outcome in recovered:
            return "recovered"
    return outcome


df["outcome"].apply(parse_outcome).value_counts()

outcome
recovered                                                                   2373014
Hospitalized                                                                 202475
Recovered                                                                     93656
Deceased                                                                       5011
death                                                                           967
Under treatment                                                                 370
hospitalized                                                                    298
Receiving Treatment                                                             257
Alive                                                                           189
Stable                                                                           31
Dead                                                                             13
Migrated_Other                                                      

In [441]:
df["outcome"] = df["outcome"].apply(parse_outcome)

In [415]:
for col in df:
    if df.dtypes[col] == "object":
        df[col] = df[col].str.lower()

In [416]:
# df["sex"] = df["sex"].fillna(df["sex"].mode()[0])

In [417]:
df["country"] = df["country"].fillna(df["country"].mode()[0])

In [418]:
def get_province(row):
    if row["province"].value_counts().empty:
        return float('NaN')
    else:
        return row["province"].value_counts().idxmax()


df["province"] = df["province"].fillna(df.groupby('country').apply(get_province, include_groups=False))

In [419]:
# Clean age field
def age_to_int(age_str):
    if isinstance(age_str, float):
        return None

    if "-" in age_str:
        age_min, age_max = age_str.split("-")
        if age_min == '':
            return int(age_max)
        if age_max == '':
            return int(age_min)
        age_min, age_max = int(age_min), int(age_max)
        return int((age_min + age_max) / 2)

    if "weeks" in age_str:
        return 0

    if "months" in age_str or "month" in age_str:
        num, _ = age_str.split(" ")
        if int(num) < 12:
            return 0
        return int(int(num) / 12)

    if age_str[-1] == "+" or age_str[-1] == "-":
        return int(age_str[:-1])

    return int(float(age_str))


df["age"] = df["age"].apply(age_to_int)
df["age"] = df["age"].fillna(int(df["age"].mean()))

In [420]:
def parse_bool(x):
    return not isinstance(x, float)

In [421]:
df["chronic_disease_binary"] = df["chronic_disease_binary"].apply(parse_bool)

In [422]:
df["asymptomatic"] = df["date_onset_symptoms"].apply(parse_bool)

In [423]:
df["lives_in_Wuhan"] = df["lives_in_Wuhan"].apply(parse_bool)

In [424]:
df["travel_history_binary"] = df["travel_history_binary"].apply(parse_bool)

In [425]:
df["reported_market_exposure"] = df["reported_market_exposure"].apply(parse_bool)

In [426]:
df.info(verbose=True, max_cols=307382, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2676311 entries, 0 to 2676310
Data columns (total 26 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   age                       2676311 non-null  float64
 1   sex                       580157 non-null   object 
 2   city                      1698630 non-null  object 
 3   province                  2223647 non-null  object 
 4   country                   2676311 non-null  object 
 5   geo_resolution            2676250 non-null  object 
 6   date_onset_symptoms       261599 non-null   object 
 7   date_admission_hospital   116211 non-null   object 
 8   date_confirmation         2567822 non-null  object 
 9   symptoms                  2052 non-null     object 
 10  lives_in_Wuhan            2676311 non-null  bool   
 11  travel_history_dates      2611 non-null     object 
 12  travel_history_location   9222 non-null     object 
 13  reported_market_exposure  2

In [427]:
df

Unnamed: 0,age,sex,city,province,country,geo_resolution,date_onset_symptoms,date_admission_hospital,date_confirmation,symptoms,...,outcome,date_death_or_discharge,location,admin3,admin2,admin1,country_new,admin_id,travel_history_binary,asymptomatic
0,45.0,male,shek lei,hong kong,china,point,,,14.02.2020,,...,hospitalized,,shek lei,,,hong kong,china,8029.0,False,False
1,78.0,male,vo euganeo,veneto,italy,point,,,21.02.2020,,...,death,22.02.2020,vo' euganeo,,,veneto,italy,8954.0,False,False
2,61.0,female,,,singapore,admin0,,,14.02.2020,,...,recovered,17.02.2020,,,,,singapore,200.0,False,False
3,45.0,,zhengzhou city,henan,china,admin2,,,26.01.2020,,...,recovered,,,,zhengzhou city,henan,china,10091.0,False,False
4,45.0,,pingxiang city,jiangxi,china,admin2,,,14.02.2020,,...,recovered,,,,pingxiang city,jiangxi,china,7060.0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2676306,52.0,female,calleria,coronel portillo,peru,point,,,17.05.2020,,...,recovered,,,,,,peru,14429.0,True,False
2676307,52.0,female,pueblo libre,lima,peru,point,,,17.05.2020,,...,recovered,,,,,,peru,14695.0,True,False
2676308,52.0,female,comas,lima,peru,point,,,17.05.2020,,...,recovered,,,,,,peru,14400.0,True,False
2676309,52.0,male,callao,callao,peru,point,,,17.05.2020,,...,recovered,,,,,,peru,14516.0,True,False


In [428]:
for col in df:
    print(df[col].value_counts())

age
45.0     2105146
47.0       79415
24.0       56325
69.0       35429
80.0       21346
          ...   
103.0          4
120.0          4
121.0          1
107.0          1
106.0          1
Name: count, Length: 110, dtype: int64
sex
male      307188
female    272969
Name: count, dtype: int64
city
moscow                149478
mumbai                 56178
moscow oblast          29170
chennai                23907
milano                 23124
                       ...  
mogpog                     1
palayan                    1
labangan                   1
sampaloc                   1
san pedro de coris         1
Name: count, Length: 8335, dtype: int64
province
central                        206921
england                        156106
maharashtra                    106515
lima                            99326
lombardia                       87238
                                ...  
laghouat                            1
san andres islas                    1
thessaly and central greece  

In [439]:
def is_correlated(var_name1, var_name2, data_frame: pd.DataFrame):
    try:
        new_frame = data_frame[[var_name1, var_name2]]
        new_frame = new_frame.dropna(how="any", axis=0)
        ct = pd.crosstab(index=new_frame[var_name1], columns=new_frame[var_name2])
        print(ct)
        chi2_res = chi2_contingency(ct, )
        p, x, dof_ = chi2_res[1], "correlated" if chi2_res[1] < 0.05 else "not-correlated", chi2_res[2]
        return p, x, dof_
    except ValueError:
        return 0, "no data", 0

In [443]:
outcome_df = df[df['outcome'].notna()]

In [447]:
corr = [[], []]
to_check = ["sex", "country", "sex", "lives_in_Wuhan", "travel_history_binary"]
for var1 in df:
    if var1 != "ID" and var1 in to_check:
        p_val, correlation, dof = is_correlated(var1, "outcome", outcome_df)
        if correlation == "correlated":
            corr[0].append((p_val, dof, var1))
        else:
            corr[1].append((p_val, dof, var1))
        print("")

corr[0].sort()
corr[1].sort()

print("Correlated")

for ln in corr[0]:
    print(ln)

print("Non-Correlated")

for ln in corr[1]:
    print(ln)

outcome  death  hospitalized  recovered
sex                                    
female     546         10788     261635
male       941         20586     285661

outcome               death  hospitalized  recovered
country                                             
afghanistan               0             0          8
albania                   0             0       1054
algeria                   7             0       1260
andorra                   0             0          1
angola                    0             0          4
...                     ...           ...        ...
venezuela                 0             0          2
vietnam                   0             0         34
virgin islands, u.s.      0             0          2
zambia                    0             0         22
zimbabwe                  4            31          1

[146 rows x 3 columns]

outcome         death  hospitalized  recovered
lives_in_Wuhan                                
False            5954        20

In [None]:
outcome_df.head()