## Libraries

In [229]:
import qiime2
from tempfile import mkdtemp
from qiime2.plugins import demux, deblur, quality_filter, \
                           metadata, feature_table, alignment, \
                           phylogeny, diversity, emperor, feature_classifier, \
                           taxa, composition
import pandas as pd
import os
import numpy as np
import random

#### Studies
* exercise_frequency
* flossing_frequency
* vitamin_d_supplement_frequency
* weight_change
* fruit_frequency

## Functions

In [383]:
# 0 --> to int, 1 --> to float
def manage_string_to_int(values, opt):
    int_values = list()
    for value in values:
        try:
            if opt == 0:
                value = int(value)
            elif opt == 1:
                value = float(value)
        except:
            value = -1
        int_values.append(value)
    return int_values

def generate_single_sample(already_taken, from_df):
    element = random.randint(0, len(from_df) - 1)
    while element in already_taken or from_df.iloc[element]['sample_name'] == '10317':
        element = random.randint(0, len(from_df) - 1)
    return element

def get_final_sample(n_samples, starter_dataset_man, starter_dataset_woman):
    sample_man = set()
    sample_woman = set()
    final_sample = pd.DataFrame(columns=starter_dataset_man.columns)
    
    for i in range(n_samples):
        new_sample_man = generate_single_sample(sample_man, starter_dataset_man)
        new_sample_woman = generate_single_sample(sample_woman, starter_dataset_woman)
        
        sample_man.add(new_sample_man)
        sample_woman.add(new_sample_woman)

        final_sample = final_sample.append(starter_dataset_man.iloc[new_sample_man], ignore_index=True)
        final_sample = final_sample.append(starter_dataset_woman.iloc[new_sample_woman], ignore_index=True)
        
    return final_sample

## Main

In [238]:
df = pd.read_csv("./data/american_gut.txt", delimiter="\t", header=0)
df.replace(' ', np.nan, inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [136]:
for column in df.columns:
    print(column, end = ', ')

sample_name, acid_reflux, acne_medication, acne_medication_otc, add_adhd, age_cat, age_corrected, age_years, alcohol_consumption, alcohol_frequency, alcohol_types, alcohol_types_beercider, alcohol_types_red_wine, alcohol_types_sour_beers, alcohol_types_spiritshard_alcohol, alcohol_types_unspecified, alcohol_types_white_wine, allergic_to, allergic_to_i_have_no_food_allergies_that_i_know_of, allergic_to_other, allergic_to_peanuts, allergic_to_shellfish, allergic_to_tree_nuts, allergic_to_unspecified, altitude, alzheimers, animal_age, animal_free_text, animal_gender, animal_origin, animal_type, anonymized_name, antibiotic_history, appendix_removed, artificial_sweeteners, asd, assigned_from_geo, autoimmune, birth_year, bmi, bmi_cat, bmi_corrected, body_habitat, body_product, body_site, bowel_movement_frequency, bowel_movement_quality, breastmilk_formula_ensure, cancer, cancer_treatment, cardiovascular_disease, cat, cdiff, census_region, chickenpox, clinical_condition, collection_date, coll

In [312]:
filt = (df['smoking_frequency'] == 'Never') & (df['alcohol_frequency'] == 'Never') 
sane = df[filt]

filt = (sane['cancer'] == 'I do not have this condition')
sane = sane[filt]

sane['bmi'] = manage_string_to_int(sane['bmi'], 1)
filt = (sane['bmi'] >= 18.5) & (sane['bmi'] <= 25)
sane = sane[filt]

sane['age_years'] = manage_string_to_int(sane['age_years'], 0)
filt = (sane['age_years'] >= 20) & (sane['age_years'] <= 50)
sane = sane[filt]

filt = (sane['sex'] == 'male')
sane_man = sane[filt]
sane_woman = sane[filt == False]

In [405]:
np.mean(sane_woman['age_years'])
# mean man and woman: 36.606531881804045
# mean man: 36.554307116104866
# mean woman: 36.6436170212766
# sample len: 643 (376 female and 267 male)

36.6436170212766

In [436]:
final_sample_sane = get_final_sample(59, sane_man, sane_woman)

In [440]:
final_sample_sane['exercise_frequency'].value_counts()

Regularly (3-5 times/week)       38
Occasionally (1-2 times/week)    31
Daily                            23
Rarely (a few times/month)       15
Never                             8
Not provided                      3
Name: exercise_frequency, dtype: int64

In [347]:
len(final_sample_sane)

80

In [348]:
filt = ((df['smoking_frequency'] == 'Regularly (3-5 times/week)') | (df['smoking_frequency'] == 'Daily') & (df['alcohol_frequency'] == 'Regularly (3-5 times/week)') | (df['alcohol_frequency'] == 'Daily'))
not_sane = df[filt]

filt = (not_sane['cancer'] == 'I do not have this condition')
not_sane = not_sane[filt] 

not_sane['bmi'] = manage_string_to_int(not_sane['bmi'], 1)
filt = ((not_sane['bmi'] < 18.5) | (not_sane['bmi'] > 25)) & (not_sane['bmi'] != -1)
not_sane = not_sane[filt]

not_sane['age_years'] = manage_string_to_int(not_sane['age_years'], 0)
filt = (not_sane['age_years'] >= 20) & (not_sane['age_years'] <= 50)
not_sane = not_sane[filt]

filt = (not_sane['sex'] == 'male')
not_sane_man = not_sane[filt]
not_sane_woman = not_sane[filt == False]

In [349]:
np.mean(not_sane_man['age_years'])
# mean man and woman: 39.808510638297875
# mean man: 39.926829268292686
# mean woman: 39.644067796610166
# sample len: 141 (59 female and 82 male)

39.926829268292686

In [434]:
final_sample_not_sane = get_final_sample(59, not_sane_man, not_sane_woman)

In [441]:
final_sample_not_sane['exercise_frequency'].value_counts()

Occasionally (1-2 times/week)    41
Regularly (3-5 times/week)       31
Rarely (a few times/month)       30
Daily                            13
Never                             3
Name: exercise_frequency, dtype: int64