## Libraries

In [229]:
import qiime2
from tempfile import mkdtemp
from qiime2.plugins import demux, deblur, quality_filter, \
                           metadata, feature_table, alignment, \
                           phylogeny, diversity, emperor, feature_classifier, \
                           taxa, composition
import pandas as pd
import os
import numpy as np
import random

## Functions

In [176]:
def lorenz_formula(row, opt):
    try:
        if opt == 'Male':
            lorenz_weight = int(row['height_cm']) - 100 - ((int(row['height_cm']) - 150)/4)
        else:
            lorenz_weight = int(row['height_cm']) - 100 - ((int(row['height_cm']) - 150)/2)
    except:
        lorenz_weight = -1
    return lorenz_weight

def compute_ideal_weight(dataset):
    normal = list()
    cont = 0
    for _, row in dataset.iterrows():
        cont += 1
        if row['sex'] == 'male':
            ideal_weight = lorenz_formula(row, 'Male')
            value = 1
        elif row['sex'] == 'female':
            ideal_weight = lorenz_formula(row, 'female')
            value = 1
        else:
            value = -1

        if value != -1:
            try:
                if int(row['weight_kg']) <= ideal_weight + 5 and int(row['weight_kg']) >= ideal_weight -5:
                    value = 1
                else:
                    value = -1
            except:
                value = -1
        normal.append(value)
    return normal

## Main

In [238]:
df = pd.read_csv("./data/american_gut.txt", delimiter="\t", header=0)
df.replace(' ', np.nan, inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [136]:
for column in df.columns:
    print(column, end = ', ')

sample_name, acid_reflux, acne_medication, acne_medication_otc, add_adhd, age_cat, age_corrected, age_years, alcohol_consumption, alcohol_frequency, alcohol_types, alcohol_types_beercider, alcohol_types_red_wine, alcohol_types_sour_beers, alcohol_types_spiritshard_alcohol, alcohol_types_unspecified, alcohol_types_white_wine, allergic_to, allergic_to_i_have_no_food_allergies_that_i_know_of, allergic_to_other, allergic_to_peanuts, allergic_to_shellfish, allergic_to_tree_nuts, allergic_to_unspecified, altitude, alzheimers, animal_age, animal_free_text, animal_gender, animal_origin, animal_type, anonymized_name, antibiotic_history, appendix_removed, artificial_sweeteners, asd, assigned_from_geo, autoimmune, birth_year, bmi, bmi_cat, bmi_corrected, body_habitat, body_product, body_site, bowel_movement_frequency, bowel_movement_quality, breastmilk_formula_ensure, cancer, cancer_treatment, cardiovascular_disease, cat, cdiff, census_region, chickenpox, clinical_condition, collection_date, coll

In [198]:
def manage_age(ages):
    int_ages = list()
    for age in ages:
        try:
            age = int(age)
        except:
            age
            age = -1
        int_ages.append(age)
    return int_ages

In [217]:
filt = (df['smoking_frequency'] == 'Never') & (df['alcohol_frequency'] == 'Never') 
sane = df[filt]

filt = (sane['cancer'] == 'I do not have this condition')
sane = sane[filt]

sane['normal'] = compute_ideal_weight(sane)
filt = (sane['normal'] == 1)
sane = sane[filt]

sane['age_years'] = manage_age(sane['age_years'])
filt = (sane['age_years'] >= 20) & (sane['age_years'] <= 40)
sane = sane[filt]

filt = (sane['sex'] == 'male')
sane_man = sane[filt]
sane_woman = sane[filt == False]

242


In [234]:
sample_man = set()
final_sample = pd.Dataframe(columns=df.columns)
for i in range(25):
    element = random.randint(0, len(sane_man))
    while element in sample_man:
        element = random.randint(0, len(sane_man))
    sample_man.add(element)
    

{1,
 3,
 5,
 17,
 19,
 20,
 28,
 30,
 33,
 38,
 40,
 45,
 46,
 48,
 50,
 51,
 57,
 59,
 60,
 61,
 62,
 64,
 65,
 73,
 74}

In [225]:
len(sane_woman) + len(sane_man)

242

In [218]:
np.mean(sane_woman['age_years'])

# mean man and woman: 31.694214876033058
# mean man: 32.013333333333335
# mean woman: 31.550898203592816

31.550898203592816

In [190]:
sane.head()

Unnamed: 0,sample_name,acid_reflux,acne_medication,acne_medication_otc,add_adhd,age_cat,age_corrected,age_years,alcohol_consumption,alcohol_frequency,...,vitamin_b_supplement_frequency,vitamin_d_supplement_frequency,vivid_dreams,weight_cat,weight_change,weight_kg,weight_units,whole_eggs,whole_grain_frequency,normal
554,10317.000002223,I do not have this condition,No,No,Self-diagnosed,30s,36.0,36,No,Never,...,Rarely (a few times/month),Never,Never,,Remained stable,69,kilograms,Regularly (3-5 times/week),Unspecified,1
555,10317.000002224,I do not have this condition,false,false,Self-diagnosed,30s,36.0,36,false,Never,...,Rarely (a few times/month),Never,Never,,Remained stable,69,kilograms,Regularly (3-5 times/week),Not provided,1
3338,10317.000011356,I do not have this condition,No,No,I do not have this condition,60s,64.0,64,No,Never,...,Never,Never,Never,,Remained stable,60,kilograms,Daily,Regularly (3-5 times/week),1
5026,10317.000015938,I do not have this condition,No,No,I do not have this condition,50s,54.0,54,No,Never,...,Occasionally (1-2 times/week),Occasionally (1-2 times/week),Occasionally (1-2 times/week),,Remained stable,58,kilograms,Occasionally (1-2 times/week),Unspecified,1
5546,10317.0,"Diagnosed by a medical professional (doctor, p...",No,No,I do not have this condition,30s,33.0,33,No,Never,...,Regularly (3-5 times/week),Daily,Regularly (3-5 times/week),,Remained stable,56,kilograms,Regularly (3-5 times/week),Rarely (less than once/week),1


In [205]:
filt = ((df['smoking_frequency'] == 'Regularly (3-5 times/week)') | (df['smoking_frequency'] == 'Daily') & (df['alcohol_frequency'] == 'Regularly (3-5 times/week)') | (df['alcohol_frequency'] == 'Daily'))
not_sane = df[filt]

filt = (not_sane['cancer'] == 'I do not have this condition')
not_sane = not_sane[filt] 

not_sane['normal'] = compute_ideal_weight(not_sane)
filt = (not_sane['normal'] == -1)
not_sane = not_sane[filt]

not_sane['age_years'] = manage_age(not_sane['age_years'])
filt = (not_sane['age_years'] >= 20) & (not_sane['age_years'] <= 40)
not_sane = not_sane[filt]



117

In [208]:
np.mean(not_sane['age_years'])

33.162393162393165

In [211]:
not_sane['body_site'].value_counts()

UBERON:feces                       93
UBERON:tongue                       6
UBERON:skin of trunk                4
UBERON:skin of head                 4
UBERON:eye                          2
UBERON:external auditory meatus     2
UBERON:nostril                      2
UBERON:skin of leg                  2
UBERON:skin of hand                 2
Name: body_site, dtype: int64

In [206]:
not_sane.head()

Unnamed: 0,sample_name,acid_reflux,acne_medication,acne_medication_otc,add_adhd,age_cat,age_corrected,age_years,alcohol_consumption,alcohol_frequency,...,vitamin_b_supplement_frequency,vitamin_d_supplement_frequency,vivid_dreams,weight_cat,weight_change,weight_kg,weight_units,whole_eggs,whole_grain_frequency,normal
7525,10317.0,I do not have this condition,No,No,I do not have this condition,30s,33.0,33,Yes,Daily,...,Never,Never,Rarely (a few times/month),,Remained stable,80,kilograms,Regularly (3-5 times/week),Regularly (3-5 times/week),-1
7780,10317.0,I do not have this condition,No,No,I do not have this condition,30s,35.0,35,Yes,Daily,...,Daily,Daily,Rarely (a few times/month),,Remained stable,71,kilograms,Daily,Occasionally (1-2 times/week),-1
8950,10317.0,I do not have this condition,false,false,I do not have this condition,30s,33.0,33,true,Daily,...,Never,Never,Regularly (3-5 times/week),,Remained stable,68,kilograms,Occasionally (1-2 times/week),Occasionally (1-2 times/week),-1
9651,10317.0,I do not have this condition,false,false,"Diagnosed by a medical professional (doctor, p...",30s,32.0,32,true,Daily,...,Never,Daily,Never,,Remained stable,81,kilograms,Occasionally (1-2 times/week),Regularly (3-5 times/week),-1
10448,10317.00003816,Not provided,false,false,Not provided,30s,34.0,34,true,Regularly (3-5 times/week),...,Never,Never,Never,,Remained stable,91,kilograms,Occasionally (1-2 times/week),Occasionally (1-2 times/week),-1


In [241]:
df['mental_illness_type'].value_counts()

Unspecified        9812
Not applicable     1339
LabControl test     681
not applicable      149
not collected         3
Name: mental_illness_type, dtype: int64