In [None]:
import os
import pandas as pd
import numpy as np
import pickle

# os.chdir('/container/mount/point')

from utils.pair_matching import discrepancyMatrix, construct_network, process_matched_pairs, generate_simulated_outcomes
from utils.helper import check_samples_overlap
from utils.plotting import plot_sex_ratio, plot_activity_ratio, plot_smoking_ratio
from utils.plotting import plot_bmi_probability_density

ModuleNotFoundError: No module named 'utils'

In [5]:
%ls container/mount/point/

README.md  [0m[01;34mdata[0m/    [01;34mmodels[0m/    [01;34mpreprocessing[0m/       [01;34mscripts[0m/
[01;34manalysis[0m/  [01;34mdist[0m/    [01;34mpipeline[0m/  [01;34mq2_classo.egg-info[0m/  [01;34mtests[0m/
[01;34mbuild[0m/     [01;34mdocker[0m/  [01;34mplots[0m/     requirements.txt     [01;34mutils[0m/


### IgE data

In [5]:
# covariates = pd.read_csv("data/latent_v6.csv", sep=",", low_memory=False, index_col="u3_16s_id") ### unscaled
covariates = pd.read_csv("data/qa_ige_food.csv", index_col=0)
print("Covariates data: {0}".format(covariates.shape))

asv = pd.read_csv(str("data/feature_table.tsv"), index_col=0, sep='\t')
print("Count data: {0} \n".format(asv.shape))


experiments = ['qa', 'agg', 'single', 'multiple', 'not_only', 'bermuda', 'birch', 'bromelain', 'dust_mite']

df_dict = dict()

for exp in experiments:
    print("Experiment: {0}".format(exp))
    ige = pd.read_csv("data/{0}_ige_food.csv".format(exp), index_col=0)
    print("IgE data: {0}".format(ige.shape))
    
    # if a person has at least 1 allergy W=1, otherwise W=0
    if "qa" in exp:
        ige['W'] = ige["allergy_or_intolerance"]
    elif "bermuda" in exp:
        ige['W'] = ige["bermuda_grass"]
    elif "birch" in exp:
        ige['W'] = ige["birch"]
    elif "bromelain" in exp:
        ige['W'] = ige["bromelain"]
    elif "dust_mite" in exp:
        ige['W'] = ige["dust_mite_pter"]
    else:
        ige['W'] = (ige != 0).any(axis=1).astype(int)

    
    ### merge covariates with IgE
    if "qa" in exp:
        df = ige.copy() # we don't need to merge covariates with covariates
    else:
        ige = ige.replace({0: 1, 1: 0}) # replace 0 with 1 and 1 with 0 for consistency with u3su_d3d2 variable 1 - no, 0 -yes
        df = pd.merge(ige, covariates, left_index=True, right_index=True)
        
    df = check_samples_overlap(df, asv) # check if datasets overlap
    df["W_str"] = df["W"].map({0: "Yes", 1: "No"}) # str allergy status
    df['u3_16s_id'] = df.index
    
    allergic, control = df[df["W"] == 0], df[df["W"] == 1]
    
    print("Number of allergics - {0}".format(len(allergic)))
    print("Number of control - {0}".format(len(control)))
    
    if len(allergic) >= 6:
        df_dict[exp] = {"data": df, "allergic": allergic, "control": control} # add resulted data to dict for plotting
    else:
        print("Excluded!")
    
    with open('data/exp_dict.pkl', 'wb') as f:
        pickle.dump(df_dict, f)
    
    print("\n")

Covariates data: (954, 53)
Count data: (15170, 2034) 

Experiment: qa
IgE data: (954, 53)
Samples that are present in matched pairs, but not in ASVs: 81
Number of allergics - 329
Number of control - 544


Experiment: agg
IgE data: (209, 9)
Number of allergics - 45
Number of control - 163


Experiment: single
IgE data: (197, 9)
Number of allergics - 33
Number of control - 163


Experiment: multiple
IgE data: (176, 9)
Number of allergics - 12
Number of control - 163


Experiment: not_only
IgE data: (508, 108)
Number of allergics - 249
Number of control - 210


Experiment: bermuda
IgE data: (107, 100)
Samples that are present in matched pairs, but not in ASVs: 9
Number of allergics - 63
Number of control - 19


Experiment: birch
IgE data: (107, 100)
Samples that are present in matched pairs, but not in ASVs: 9
Number of allergics - 59
Number of control - 23


Experiment: bromelain
IgE data: (27, 100)
Samples that are present in matched pairs, but not in ASVs: 2
Number of allergics - 22
Nu

Soy and bean allergies are always co-present with other type of IgE, so we exclude it from the further analysis.

We also exclude wheat, milk and egg allergies due to low number of cases. Thus, we are further investigating nut,shrimp, meat and kiwi IgEs.

### Plotting

For reproducing plots for allergy analysis use kora66 dataframe.

In [5]:
show=False
save = False

for exp, item in df_dict.items():
    print("Experiment: {0}".format(exp))
    ###Sex
    sex_fig = plot_sex_ratio(item["data"], show=show, save=save, name="{0}_design_sex_before_matching".format(exp))
    ### Age
    #age_fig = plot_age_ratio(item["data"], show=show, save=save, name="{0}_design_age_before_matching".format(exp))
    ### Phys activity
    phys_fig = plot_activity_ratio(item["data"], show=show, save=save, name="{0}_design_phys_before_matching".format(exp))
    ### Smoking behaviour
    smoking_fig = plot_smoking_ratio(item["data"], show=show, save=save, name="{0}_design_smoking_before_matching".format(exp))
    ### BMI
    #bmi_fig = plot_bmi_probability_density(item["allergic"], item["control"], show=show, save=save, name="{0}_design_bmi_before_matching".format(exp))
    ### Waist-hip ratio
    #whr_fig = plot_waist_hip_ratio_probability_density(item["allergic"], item["control"], show=show, save=save, name="{0}_design_hip_before_matching".format(exp))

Experiment: qa
Experiment: agg
Experiment: single
Experiment: multiple
Experiment: not_only
Experiment: bermuda
Experiment: birch
Experiment: bromelain
Experiment: dust_mite


There are no people with bermuda allergy in 30s group.
People with bromelain allergy have unique BMI and whr (check which one)

### Create pairs of samples

In [11]:
for exp in ['kiwi', 'shrimp', 'nut', 'meat']:
    df_dict[exp] = df_dict["agg"]

In [12]:
n_col = 10000 # number of randomizations
common_columns = ['W', 'sex', 'age', 'bmi', 'BMI_(ter)', 'phys_activ', 'smoking_behaviour']
column_mappings = ["kiwi", "shrimp", "nut", "meat", "bermuda_grass", "birch"]

for exp, item in df_dict.items():
    print("Experiment: {0}".format(exp))
        
    df = item['data'].copy()
    
    specific_column = next((exp for column in column_mappings if column in exp), None)

    if specific_column:
        df_match = df[common_columns + [specific_column]]
    else:
        df_match = df[common_columns]
    
    
    df_match["is_treated"] = df_match["W"].astype(bool)
    df_match["pair_nb"] = np.nan
    
    # Set the thresholds for each covariate, default is Inf (i.e. no matching)
    thresholds =  np.empty((df_match.shape[1], ))
    thresholds[:] = np.nan
    
    # Matching, e.g., the diff = 0 -> the same sex;
    # column_thresholds = { "sex": 0, "age": 5, "BMI_(ter)": 0, "kiwi": 0, "shrimp": 0, "nut": 0, "meat": 0}
    column_thresholds = {"sex": 0}

    # Set thresholds using the dictionary
    for column_name, threshold_value in column_thresholds.items():
        
        if column_name not in df_match.columns:
            continue

        column_index = df_match.columns.get_loc(column_name)
        thresholds[column_index] = threshold_value

    # TO DO describe this step
    treated_units = df_match[df_match["is_treated"] is True]
    control_units = df_match[df_match["is_treated"] is False]

    N_treated, N_control = treated_units.shape[0], control_units.shape[0]
    print("Number of treated units: {0}".format(N_treated))
    print("Number of control units: {0}".format(N_control))
    
    # Optional weights for each covariate when computing the distances
    # WARNING: the order of the items in scaling needs to be the same as the order of the covariates (i.e. columns)
    scaling =  np.ones((df_match.shape[1], ), dtype=int) 

    discrepancies = discrepancyMatrix(treated_units, control_units, thresholds, scaling)

    g, pairs_dict = construct_network(discrepancies, N_treated, N_control)
    matched_df = process_matched_pairs(pairs_dict, treated_units, control_units)

    print("Number of pairs: {0}".format(len(matched_df.W)))
    print("Number of allergics: {0}".format(len(matched_df[matched_df.W == 0])))
    print("Number of non-allergics: {0} \n".format(len(matched_df[matched_df.W == 1])))

    matched_df.to_csv('data/matched_ige_{0}.csv'.format(exp), index=True)

    simulated_outcomes = generate_simulated_outcomes(matched_df, n_col)

    ### Save the result
    simulated_outcomes.to_csv('data/W_paired_ige_{0}.csv'.format(exp), index=True)

Experiment: qa
Experiment: agg
Experiment: single
Experiment: multiple
Experiment: not_only
Experiment: bermuda
Experiment: birch
Experiment: bromelain
Experiment: dust_mite


In [13]:
df_match
thresholds

array([nan, nan, nan, nan, nan, nan, nan, nan, nan])

### Plots after matching

In [None]:
show=False
save = False

for exp, item in df_dict.items():
    
    matched_df = pd.read_csv("data/matched_ige_{0}.csv".format(exp), sep=",", low_memory=False, index_col=0)
    
    print("Experiment: {0}".format(exp))
    ###Sex
    sex_fig = plot_sex_ratio(item["data"], show=show, save=save, name="{0}_design_sex_after_matching".format(exp))
    ### BMI
    bmi_fig = plot_bmi_probability_density(item["allergic"], item["control"], show=show, save=save, name="{0}_design_bmi_after_matching".format(exp))
    
    ### Save the result
    # matched_df.to_csv('data/matched_ige_{0}.csv'.format(key), index=True)
    
    ### TO DO: fix number of bins according to age categories for each dataset
    ### Age 
    # age_fig = plot_age_ratio(item["data"], show=show, save=save, name="{0}_design_age_before_matching".format(key))

In [None]:
### old provenance of experiments

# matched_df.to_csv('../data/matched.csv', index=True)
# matched_df.to_csv('../data/matched_relax.csv', index=True)
# matched_df.to_csv('../data/matched_age_sex_bmi.csv', index=True)
# matched_df.to_csv('data/matched_ige_binary.csv', index=True)

# matched_df.to_csv('data/matched_ige_agg.csv', index=True)
# matched_df.to_csv('data/matched_ige_single.csv', index=True)
# matched_df.to_csv('data/matched_ige_multiple.csv', index=True)
# matched_df.to_csv('data/matched_ige_nut.csv', index=True)

### simulated_outcomes = W_unique
# W_unique.to_csv('../data/W_paired.csv', index=True)
# W_unique.to_csv('../data/W_paired_relax.csv', index=True)
# W_unique.to_csv('../data/W_paired_age_sex_bmi.csv', index=True)
# W_unique.to_csv('../data/W_paired_nuts.csv', index=True)
# W_unique.to_csv('data/W_paired_ige_binary.csv', index=True)

# W_unique.to_csv('data/W_paired_ige_agg.csv', index=True)
# W_unique.to_csv('data/W_paired_ige_single.csv', index=True)
# W_unique.to_csv('data/W_paired_ige_multiple.csv', index=True)
# W_unique.to_csv('data/W_paired_ige_nut.csv', index=True)
# W_unique.to_csv('data/W_paired_ige_nut.csv', index=True)