## Libraries

In [22]:
import qiime2
from tempfile import mkdtemp
from qiime2.plugins import demux, deblur, quality_filter, \
                           metadata, feature_table, alignment, \
                           phylogeny, diversity, emperor, feature_classifier, \
                           taxa, composition
import pandas as pd
import os
import numpy as np
import random
import logging
from datetime import datetime
from Bio import Entrez
from pprint import pprint

#### Studies
* exercise_frequency
* flossing_frequency
* vitamin_d_supplement_frequency
* weight_change
* fruit_frequency

## Functions

In [2]:
def generate_single_sample(already_taken, from_df):
    element = random.randint(0, len(from_df) - 1)
    while element in already_taken or from_df.iloc[element]['sample_name'] == '10317':
        element = random.randint(0, len(from_df) - 1)
    return element

def get_final_sample(n_samples, starter_dataset_man, starter_dataset_woman):
    sample_man = set()
    sample_woman = set()
    final_sample = pd.DataFrame(columns=starter_dataset_man.columns)
    
    for i in range(n_samples):
        new_sample_man = generate_single_sample(sample_man, starter_dataset_man)
        new_sample_woman = generate_single_sample(sample_woman, starter_dataset_woman)
        
        sample_man.add(new_sample_man)
        sample_woman.add(new_sample_woman)

        final_sample = final_sample.append(starter_dataset_man.iloc[new_sample_man], ignore_index=True)
        final_sample = final_sample.append(starter_dataset_woman.iloc[new_sample_woman], ignore_index=True)
        
    return final_sample

def write_age_mean(total, man, woman, typology):
    mean_age_total = round(np.mean(total['age_years']), 4)
    mean_age_man = round(np.mean(man['age_years']), 4)
    mean_age_woman = round(np.mean(woman['age_years']), 4)
    if typology != 'sample':
        logging.info(f'--------------------{typology.upper()}--------------------')
    logging.info(f'Total number of {typology} people: {len(total)}')
    logging.info(f'Total number of {typology} man: {len(man)}')
    logging.info(f'Total number of {typology} woman: {len(woman)}')
    
    logging.info(f'Mean Age for total: {mean_age_total}')
    logging.info(f'Mean Age for man: {mean_age_man}')
    logging.info(f'Mean Age for woman: {mean_age_woman}')
    logging.info(f'\n\n')
    
    print(f'--------------------{typology.upper()}--------------------')
    print(f'Total number of {typology} people: {len(total)}')
    print(f'Total number of {typology} man: {len(man)}')
    print(f'Total number of {typology} woman: {len(woman)}')
    
    print(f'Mean Age for total: {mean_age_total}')
    print(f'Mean Age for man: {mean_age_man}')
    print(f'Mean Age for woman: {mean_age_woman}')

    

def write_sample_info(sample, typology):
    man = sample.query("sex == 'male'")
    woman = sample.query("sex == 'female'")
    write_age_mean(sample, man, woman, 'sample')
    sample.to_csv(f'./result_extraction/{typology}_sample.csv')
    
    

## Main

In [3]:
df = pd.read_csv("./data/american_gut.txt", delimiter="\t", header=0)
df.replace(' ', np.nan, inplace=True)
df.replace('Not provided', np.nan, inplace=True)
df.replace('Unspecified', np.nan, inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
logging.basicConfig(filename='./result_extraction/sampling.log', level=logging.INFO, format='%(message)s')
today = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
logging.info(f'RUN TIME: {today}')

## All columns

In [6]:
for column in df.columns:
    print(column, end = ', ')

sample_name, acid_reflux, acne_medication, acne_medication_otc, add_adhd, age_cat, age_corrected, age_years, alcohol_consumption, alcohol_frequency, alcohol_types, alcohol_types_beercider, alcohol_types_red_wine, alcohol_types_sour_beers, alcohol_types_spiritshard_alcohol, alcohol_types_unspecified, alcohol_types_white_wine, allergic_to, allergic_to_i_have_no_food_allergies_that_i_know_of, allergic_to_other, allergic_to_peanuts, allergic_to_shellfish, allergic_to_tree_nuts, allergic_to_unspecified, altitude, alzheimers, animal_age, animal_free_text, animal_gender, animal_origin, animal_type, anonymized_name, antibiotic_history, appendix_removed, artificial_sweeteners, asd, assigned_from_geo, autoimmune, birth_year, bmi, bmi_cat, bmi_corrected, body_habitat, body_product, body_site, bowel_movement_frequency, bowel_movement_quality, breastmilk_formula_ensure, cancer, cancer_treatment, cardiovascular_disease, cat, cdiff, census_region, chickenpox, clinical_condition, collection_date, coll

## Healthy extraction

In [7]:
healthy = df.query("smoking_frequency == 'Never' and alcohol_frequency == 'Never'")

healthy = healthy.query("cancer == 'I do not have this condition'")

healthy['bmi'] = healthy['bmi'].apply(lambda x: float(x))
healthy = healthy.query("bmi >= 18.5 and bmi <= 24.99")

healthy['age_years'] = healthy['age_years'].apply(lambda x: float(x))
healthy = healthy.query("age_years >= 20 and age_years <= 50")

healthy_man = healthy.query("sex == 'male'")
healthy_woman = healthy.query("sex == 'female'")

In [8]:
write_age_mean(healthy, healthy_man, healthy_woman, 'healthy')

--------------------HEALTHY--------------------
Total number of healthy people: 696
Total number of healthy man: 284
Total number of healthy woman: 412
Mean Age for total: 36.6121
Mean Age for man: 36.6585
Mean Age for woman: 36.5801


In [9]:
healthy['sex'].value_counts()

female    412
male      284
Name: sex, dtype: int64

In [12]:
final_sample_healthy = get_final_sample(20, healthy_man, healthy_woman)

In [13]:
write_sample_info(final_sample_healthy, 'healthy')

--------------------SAMPLE--------------------
Total number of sample people: 40
Total number of sample man: 20
Total number of sample woman: 20
Mean Age for total: 37.65
Mean Age for man: 36.25
Mean Age for woman: 39.05


## Not healthy extraction

In [52]:
not_healthy = df.query("smoking_frequency == 'Occasionally (1-2 times/week)' or smoking_frequency == 'Daily' or smoking_frequency == 'Regularly (3-5 times/week)'")
not_healthy = not_healthy.query("alcohol_frequency == 'Occasionally (1-2 times/week)' or alcohol_frequency == 'Daily' or alcohol_frequency == 'Regularly (3-5 times/week)'")

not_healthy = not_healthy.query("cancer == 'I do not have this condition'")

not_healthy['bmi'] = not_healthy['bmi'].apply(lambda x: float(x))
not_healthy = not_healthy.query("bmi < 18.5 or bmi > 24.99")

not_healthy['age_years'] = not_healthy['age_years'].apply(lambda x: float(x))
not_healthy = not_healthy.query("age_years >= 20 and age_years <= 50")

not_healty = not_healthy.query("body_site == 'UBERON:feces'")

not_healthy_man = not_healthy.query("sex == 'male'")
not_healthy_woman = not_healthy.query("sex == 'female'")

In [53]:
write_age_mean(not_healthy, not_healthy_man, not_healthy_woman, 'not healthy')

--------------------NOT HEALTHY--------------------
Total number of not healthy people: 81
Total number of not healthy man: 44
Total number of not healthy woman: 26
Mean Age for total: 35.642
Mean Age for man: 37.1136
Mean Age for woman: 36.0


In [16]:
final_sample_not_healthy = get_final_sample(20, not_healthy_man, not_healthy_woman)

In [17]:
write_sample_info(final_sample_not_healthy, 'not_healthy')

--------------------SAMPLE--------------------
Total number of sample people: 40
Total number of sample man: 20
Total number of sample woman: 20
Mean Age for total: 36.075
Mean Age for man: 36.55
Mean Age for woman: 35.6


In [49]:
not_healthy['body_site'].value_counts()

UBERON:feces                       70
UBERON:tongue                       2
UBERON:skin of trunk                2
UBERON:skin of head                 2
UBERON:nostril                      1
UBERON:eye                          1
UBERON:skin of leg                  1
UBERON:skin of hand                 1
UBERON:external auditory meatus     1
Name: body_site, dtype: int64

## NCBI Quering

In [33]:
Entrez.email = "giacomo.villa.mi@gmail.com"

def good_print(text):
    print(pprint(text))

def esearch(db, query, num_max = 20):
    handle = Entrez.esearch(db = db, term = query, retmax = num_max)
    record = Entrez.read(handle, validate = True)
    return record

def esummary(db, id_val):
    handle = Entrez.esummary(db = db, id = id_val)
    record = Entrez.read(handle, validate = True)
    return record

def efetch(db, id_val):
    handle = Entrez.efetch(db  = db, id = id_val)
    record = Entrez.read(handle, validate = True)
    return record

In [39]:
healty_sample = pd.read_csv("./result_extraction/not_healthy_sample.csv", header=0)

In [40]:
id_ncbi = healty_sample.iloc[0]['sample_name']

In [44]:
not_healthy_man.head()

Unnamed: 0,sample_name,acid_reflux,acne_medication,acne_medication_otc,add_adhd,age_cat,age_corrected,age_years,alcohol_consumption,alcohol_frequency,...,vioscreen_zinc,vitamin_b_supplement_frequency,vitamin_d_supplement_frequency,vivid_dreams,weight_cat,weight_change,weight_kg,weight_units,whole_eggs,whole_grain_frequency
9142,10317.0,,No,No,,50s,50.0,50.0,Yes,Regularly (3-5 times/week),...,,Occasionally (1-2 times/week),Daily,Rarely (a few times/month),,Remained stable,72.0,kilograms,Regularly (3-5 times/week),Rarely (less than once/week)
9626,10317.0,I do not have this condition,false,false,I do not have this condition,30s,36.0,36.0,true,Regularly (3-5 times/week),...,,Never,Daily,Rarely (a few times/month),,Remained stable,84.0,kilograms,Daily,Regularly (3-5 times/week)
10307,10317.000037933,I do not have this condition,false,false,I do not have this condition,20s,24.0,24.0,true,Occasionally (1-2 times/week),...,,Never,Never,Never,,Decreased more than 10 pounds,100.0,kilograms,Occasionally (1-2 times/week),Daily
10448,10317.00003816,,false,false,,30s,34.0,34.0,true,Regularly (3-5 times/week),...,,Never,Never,Never,,Remained stable,91.0,kilograms,Occasionally (1-2 times/week),Occasionally (1-2 times/week)
11491,10317.0,I do not have this condition,false,false,I do not have this condition,30s,39.0,39.0,true,Regularly (3-5 times/week),...,,Never,Daily,Rarely (a few times/month),,Remained stable,95.0,kilograms,Regularly (3-5 times/week),Occasionally (1-2 times/week)


In [48]:
for i in healthy_woman['sample_name']:
    if i != 10317:
        print(i)

10317.000001179
10317.000014528
10317.000017777
10317.000017778
10317.00002004
10317.000021947
10317.00002209
10317.000022091
10317.000022155
10317.000023592
10317.000023895
10317.000031488
10317.000032666
10317.000032754
10317.000032796
10317.000032816999
10317.000032904
10317.000033102
10317.000033125
10317.00003326
10317.00003336
10317.000033566
10317.000033567
10317.000036038
10317.000036043
10317.000037542
10317.000037925
10317.000037981
10317.000038037
10317.000038146
10317.000038233
10317.000038354
10317.000039528
10317.000039564
10317.000039729
10317.000039774
10317.000039899
10317.000039907
10317.000039924
10317.000039952
10317.000040002
10317.000040043
10317.000040053
10317.000040068
10317.000040073
10317.000040347
10317.000041325
10317.000041330
10317.000041524
10317.000041622
10317.000041667
10317.000041677
10317.0000417
10317.000041729
10317.000041745
10317.000041779
10317.000041784
10317.000041969
10317.000043083
10317.000043224
10317.00004456
10317.000046092
10317.000046

In [34]:
efetch('sra', id_ncbi)

ValueError: As the XML data contained neither a Document Type Definition (DTD) nor an XML Schema, Bio.Entrez is unable to parse these data. We recommend using a generic XML parser from the Python standard library instead, for example ElementTree.