In [1]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

In [2]:
# Column -> dtype mapping handed to pd.read_csv.
# Everything is read as text except the three numeric columns listed in
# _FLOAT_COLUMNS; dates, ages and flag-like columns are kept as raw strings
# and are parsed (if at all) later in the notebook.
_FLOAT_COLUMNS = {"latitude", "longitude", "admin_id"}

# Column names in the order the schema declares them.
_SCHEMA_COLUMNS = [
    "ID",
    "age",
    "sex",
    "city",
    "province",
    "country",
    "latitude",
    "longitude",
    "geo_resolution",
    "date_onset_symptoms",
    "date_admission_hospital",
    "date_confirmation",
    "symptoms",
    "lives_in_Wuhan",
    "travel_history_dates",
    "travel_history_location",
    "reported_market_exposure",
    "additional_information",
    "chronic_disease",
    "source",
    "sequence_available",
    "outcome",
    "date_death_or_discharge",
    "notes_for_discussion",
    "location",
    "admin1",
    "admin2",
    "admin3",
    "country_new",
    "admin_id",
    "data_moderator_initials",
    "travel_history_binary",
]

dtype = {
    column: (float if column in _FLOAT_COLUMNS else str)
    for column in _SCHEMA_COLUMNS
}

In [3]:
# Trial load: only the first 1000 rows, using the explicit column schema.
# NOTE: the following cell re-reads the whole file into the same name `df`,
# so this sample is only useful while iterating on the schema.
df = pd.read_csv("./latestdata.csv", nrows=1000, dtype=dtype)

In [4]:
# Full load: read every row of the CSV with the explicit column schema,
# rebinding `df` (any earlier 1000-row sample is discarded).
df = pd.read_csv("./latestdata.csv", dtype=dtype)

In [5]:
# Show the loaded frame as the cell output (head/tail rows with an elided middle).
df

Unnamed: 0,ID,age,sex,city,province,country,latitude,longitude,geo_resolution,date_onset_symptoms,...,date_death_or_discharge,notes_for_discussion,location,admin3,admin2,admin1,country_new,admin_id,data_moderator_initials,travel_history_binary
0,000-1-1,,male,Shek Lei,Hong Kong,China,22.365019,114.133808,point,,...,,,Shek Lei,,,Hong Kong,China,8029.0,,
1,000-1-10,78,male,Vo Euganeo,Veneto,Italy,45.297748,11.658382,point,,...,22.02.2020,,Vo' Euganeo,,,Veneto,Italy,8954.0,,
2,000-1-100,61,female,,,Singapore,1.353460,103.815100,admin0,,...,17.02.2020,,,,,,Singapore,200.0,,
3,000-1-1000,,,Zhengzhou City,Henan,China,34.629310,113.468000,admin2,,...,,,,,Zhengzhou City,Henan,China,10091.0,,
4,000-1-10000,,,Pingxiang City,Jiangxi,China,27.513560,113.902900,admin2,,...,,,,,Pingxiang City,Jiangxi,China,7060.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2676306,010-99995,52,female,Calleria,Coronel Portillo,Peru,-8.378190,-74.539700,point,,...,,,,,,,Peru,14429.0,,False
2676307,010-99996,52,female,Pueblo Libre,Lima,Peru,-12.076530,-77.067350,point,,...,,,,,,,Peru,14695.0,,False
2676308,010-99997,52,female,Comas,Lima,Peru,-11.932980,-77.040850,point,,...,,,,,,,Peru,14400.0,,False
2676309,010-99998,52,male,Callao,Callao,Peru,-12.000740,-77.118240,point,,...,,,,,,,Peru,14516.0,,False


In [6]:
# Print the frame summary: index range, column names and the dtype of each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2676311 entries, 0 to 2676310
Data columns (total 33 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   ID                        object 
 1   age                       object 
 2   sex                       object 
 3   city                      object 
 4   province                  object 
 5   country                   object 
 6   latitude                  float64
 7   longitude                 float64
 8   geo_resolution            object 
 9   date_onset_symptoms       object 
 10  date_admission_hospital   object 
 11  date_confirmation         object 
 12  symptoms                  object 
 13  lives_in_Wuhan            object 
 14  travel_history_dates      object 
 15  travel_history_location   object 
 16  reported_market_exposure  object 
 17  additional_information    object 
 18  chronic_disease_binary    bool   
 19  chronic_disease           object 
 20  source                  

In [7]:
# Overall share of missing cells in the frame, printed as a percentage.
total_missing = df.isna().sum().sum()  # per-column NaN counts, then summed
total_cells = df.size                  # number of rows times number of columns
print((total_missing / total_cells) * 100)

56.49519511043826


In [8]:
def age_to_int(age_str):
    """Convert a free-text age entry into a whole number of years.

    Accepted forms (surrounding whitespace and letter case are ignored):

    * plain numbers, with or without decimals: ``"45"``, ``"45.0"``
    * closed ranges, mapped to the truncated midpoint: ``"20-29"`` -> 24
    * open-ended entries, mapped to the stated bound:
      ``"80+"``, ``"80-"``, ``"-50"``
    * ages in months, converted to completed years: ``"18 months"`` -> 1
    * ages in weeks, always 0: ``"3 weeks"``, ``"1 week"``

    Parameters
    ----------
    age_str : str, float or None
        Raw cell value. Missing cells read by pandas arrive as float NaN.

    Returns
    -------
    int or None
        Age in whole years, or ``None`` when the value is missing
        (``None``, any float, or an empty / whitespace-only string).

    Raises
    ------
    ValueError
        If the text is not one of the recognised forms.
    """
    # Missing values: NaN cells are floats; keep them as None so that the
    # caller can impute them afterwards.
    if age_str is None or isinstance(age_str, float):
        return None

    text = str(age_str).strip().lower()
    if not text:
        # Empty text would otherwise fail when inspecting the last character.
        return None

    # Any age expressed in weeks is treated as younger than one year.
    if "week" in text:
        return 0

    # For month-based entries only the leading numeric token is parsed; the
    # result is converted to years at the end.
    in_months = "month" in text
    if in_months:
        text = text.split()[0]

    if text.endswith("+"):
        # Open-ended upper range such as "80+": use the stated bound.
        value = float(text[:-1])
    elif "-" in text:
        low, _, high = text.partition("-")
        if not low.strip():
            # "-50": only the upper bound is known.
            value = float(high)
        elif not high.strip():
            # "50-": only the lower bound is known.
            value = float(low)
        else:
            value = (float(low) + float(high)) / 2
    else:
        value = float(text)

    # int() truncates toward zero, so midpoints such as 24.5 become 24 and
    # fewer than 12 months become 0.
    return int(value / 12) if in_months else int(value)


# Parse the raw age strings into whole years; unparsed/missing entries come
# back as missing values.
parsed_age = df["age"].apply(age_to_int)

# Fill the gaps with the truncated mean of the ages that could be parsed.
# NOTE(review): this is plain mean imputation, so every row that had no age
# ends up with the same value.
mean_age = int(parsed_age.mean())
df["age"] = parsed_age.fillna(mean_age)

In [9]:
def is_correlated(var_name1, var_name2, data=None, alpha=0.05):
    """Run a chi-square test of independence between two columns.

    A contingency table of the two columns is built with ``pd.crosstab`` and
    passed to ``scipy.stats.chi2_contingency``.

    Parameters
    ----------
    var_name1, var_name2 : str
        Names of the columns to cross-tabulate.
    data : pandas.DataFrame, optional
        Frame holding both columns. When omitted, the notebook-level ``df``
        is used, which keeps existing two-argument calls working.
    alpha : float, default 0.05
        Significance threshold; a p-value strictly below it is reported as
        "correlated".

    Returns
    -------
    tuple
        ``(p_value, label)`` where ``label`` is ``"correlated"`` or
        ``"not-correlated"``. If building or testing the table raises
        ``ValueError`` (for example because the table is empty), the result
        is ``(0, "no data")``; callers must check the label, because the
        placeholder 0 is not a real p-value.
    """
    frame = df if data is None else data
    try:
        contingency = pd.crosstab(index=frame[var_name1], columns=frame[var_name2])
        # Index 1 of the result is the p-value of the test.
        p_value = chi2_contingency(contingency)[1]
    except ValueError:
        return 0, "no data"

    label = "correlated" if p_value < alpha else "not-correlated"
    return p_value, label

In [10]:
# Keep only the rows that have a recorded outcome, then show the result.
# NOTE: this rebinds `df`, so everything below works on the filtered frame.
df = df.dropna(subset=["outcome"])
df

Unnamed: 0,ID,age,sex,city,province,country,latitude,longitude,geo_resolution,date_onset_symptoms,...,date_death_or_discharge,notes_for_discussion,location,admin3,admin2,admin1,country_new,admin_id,data_moderator_initials,travel_history_binary
0,000-1-1,45.0,male,Shek Lei,Hong Kong,China,22.365019,114.133808,point,,...,,,Shek Lei,,,Hong Kong,China,8029.0,,
1,000-1-10,78.0,male,Vo Euganeo,Veneto,Italy,45.297748,11.658382,point,,...,22.02.2020,,Vo' Euganeo,,,Veneto,Italy,8954.0,,
2,000-1-100,61.0,female,,,Singapore,1.353460,103.815100,admin0,,...,17.02.2020,,,,,,Singapore,200.0,,
113,000-1-101,28.0,male,,,Singapore,1.353460,103.815100,admin0,,...,20.02.2020,,,,,,Singapore,200.0,,
224,000-1-102,56.0,female,,,Singapore,1.353460,103.815100,admin0,,...,02.03.2020,,,,,,Singapore,200.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
670224,005-6014,80.0,female,Chittenden County,Vermont,United States,44.461123,-73.081581,admin2,,...,19.03.2020,,,,Chittenden County,Vermont,United States,2239.0,,False
672529,005-809,64.0,male,Pennington County,South Dakota,United States,44.003584,-102.826120,admin2,,...,,,,,Pennington County,South Dakota,United States,6948.0,,False
672651,005-82,94.0,female,King County,Washington,United States,47.491332,-121.803640,admin2,,...,03.03.2020,,,,King County,Washington,United States,5030.0,,False
672674,005-822,70.0,male,Multnomah County,Oregon,United States,45.546910,-122.414900,admin2,,...,,,,,Multnomah County,Oregon,United States,6375.0,,False


In [12]:
# Test every column except "ID" against "outcome" and collect the
# (p-value, column) pairs in two lists:
#   corr[0] -> columns labelled "correlated"
#   corr[1] -> everything else ("not-correlated" and "no data")
# NOTE(review): "outcome" itself is not skipped, so it is tested against
# itself, and "no data" results (p reported as 0) land in corr[1].
corr = [[], []]
for column in df.columns:
    if column == "ID":
        continue
    p_val, correlation = is_correlated(column, "outcome")
    bucket = corr[0] if correlation == "correlated" else corr[1]
    bucket.append((p_val, column))

# Order each list by ascending p-value (ties broken by column name).
for bucket in corr:
    bucket.sort()

print(corr[0], "\n", corr[1])

[(0.0, 'additional_information'), (0.0, 'admin1'), (0.0, 'admin2'), (0.0, 'admin_id'), (0.0, 'age'), (0.0, 'chronic_disease_binary'), (0.0, 'city'), (0.0, 'country'), (0.0, 'country_new'), (0.0, 'data_moderator_initials'), (0.0, 'date_confirmation'), (0.0, 'date_onset_symptoms'), (0.0, 'geo_resolution'), (0.0, 'latitude'), (0.0, 'location'), (0.0, 'longitude'), (0.0, 'outcome'), (0.0, 'province'), (0.0, 'source'), (0.0, 'travel_history_binary'), (3.8560107107766313e-308, 'symptoms'), (1.5876092869825784e-289, 'travel_history_location'), (1.5025665056853097e-199, 'date_death_or_discharge'), (2.6699875336261378e-107, 'travel_history_dates'), (9.904067618578582e-71, 'sex'), (2.28192235693824e-69, 'notes_for_discussion'), (1.2184421147388877e-34, 'date_admission_hospital'), (3.2000224823838043e-16, 'chronic_disease'), (3.83310375845142e-12, 'lives_in_Wuhan'), (6.599756150209438e-05, 'admin3')] 
 [(0.3590242323724538, 'reported_market_exposure'), (0.5134171190325922, 'sequence_available')]
