## Libraries

In [1]:
import qiime2
from tempfile import mkdtemp
from qiime2.plugins import demux, deblur, quality_filter, \
                           metadata, feature_table, alignment, \
                           phylogeny, diversity, emperor, feature_classifier, \
                           taxa, composition
import pandas as pd
import os
import numpy as np
import random
import logging
from datetime import datetime
from Bio import Entrez
from pprint import pprint
from sklearn.utils import shuffle

#### Studies
* exercise_frequency
* flossing_frequency
* vitamin_d_supplement_frequency
* weight_change
* fruit_frequency

## Functions

In [4]:
def generate_single_sample(already_taken, from_df):
    element = random.randint(0, len(from_df) - 1)
    while element in already_taken or from_df.iloc[element]['sample_name'] == '10317':
        element = random.randint(0, len(from_df) - 1)
    return element

def get_final_sample(n_samples, starter_dataset_man, starter_dataset_woman):
    sample_man = set()
    sample_woman = set()
    final_sample = pd.DataFrame(columns=starter_dataset_man.columns)
    
    for i in range(n_samples):
        new_sample_man = generate_single_sample(sample_man, starter_dataset_man)
        new_sample_woman = generate_single_sample(sample_woman, starter_dataset_woman)
        
        sample_man.add(new_sample_man)
        sample_woman.add(new_sample_woman)

        final_sample = final_sample.append(starter_dataset_man.iloc[new_sample_man], ignore_index=True)
        final_sample = final_sample.append(starter_dataset_woman.iloc[new_sample_woman], ignore_index=True)
        
    return final_sample

def write_age_mean(total, man, woman, typology):
    mean_age_total = round(np.mean(total['age_years']), 4)
    mean_age_man = round(np.mean(man['age_years']), 4)
    mean_age_woman = round(np.mean(woman['age_years']), 4)
    if typology != 'sample':
        logging.info(f'--------------------{typology.upper()}--------------------')
    logging.info(f'Total number of {typology} people: {len(total)}')
    logging.info(f'Total number of {typology} man: {len(man)}')
    logging.info(f'Total number of {typology} woman: {len(woman)}')
    
    logging.info(f'Mean Age for total: {mean_age_total}')
    logging.info(f'Mean Age for man: {mean_age_man}')
    logging.info(f'Mean Age for woman: {mean_age_woman}')
    logging.info(f'\n\n')
    
    print(f'--------------------{typology.upper()}--------------------')
    print(f'Total number of {typology} people: {len(total)}')
    print(f'Total number of {typology} man: {len(man)}')
    print(f'Total number of {typology} woman: {len(woman)}')
    
    print(f'Mean Age for total: {mean_age_total}')
    print(f'Mean Age for man: {mean_age_man}')
    print(f'Mean Age for woman: {mean_age_woman}')

    

def write_sample_info(sample, typology):
    man = sample.query("sex == 'male'")
    woman = sample.query("sex == 'female'")
    write_age_mean(sample, man, woman, 'sample')
    sample.to_csv(f'./result_extraction/{typology}_sample.csv')
    
    

## Main

In [5]:
df = pd.read_csv("./data/american_gut.txt", delimiter="\t", header=0)
df.replace(' ', np.nan, inplace=True)
df.replace('Not provided', np.nan, inplace=True)
df.replace('Unspecified', np.nan, inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
logging.basicConfig(filename='./result_extraction/sampling.log', level=logging.INFO, format='%(message)s')
today = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
logging.info(f'RUN TIME: {today}')

## All columns

In [7]:
for column in df.columns:
    print(column, end = ', ')

sample_name, acid_reflux, acne_medication, acne_medication_otc, add_adhd, age_cat, age_corrected, age_years, alcohol_consumption, alcohol_frequency, alcohol_types, alcohol_types_beercider, alcohol_types_red_wine, alcohol_types_sour_beers, alcohol_types_spiritshard_alcohol, alcohol_types_unspecified, alcohol_types_white_wine, allergic_to, allergic_to_i_have_no_food_allergies_that_i_know_of, allergic_to_other, allergic_to_peanuts, allergic_to_shellfish, allergic_to_tree_nuts, allergic_to_unspecified, altitude, alzheimers, animal_age, animal_free_text, animal_gender, animal_origin, animal_type, anonymized_name, antibiotic_history, appendix_removed, artificial_sweeteners, asd, assigned_from_geo, autoimmune, birth_year, bmi, bmi_cat, bmi_corrected, body_habitat, body_product, body_site, bowel_movement_frequency, bowel_movement_quality, breastmilk_formula_ensure, cancer, cancer_treatment, cardiovascular_disease, cat, cdiff, census_region, chickenpox, clinical_condition, collection_date, coll

## Healthy extraction

In [44]:
healthy = df.query("smoking_frequency == 'Never' and alcohol_frequency == 'Never'")

healthy = healthy.query("cancer == 'I do not have this condition'")

healthy['bmi'] = healthy['bmi'].apply(lambda x: float(x))
healthy = healthy.query("bmi >= 18.5 and bmi <= 24.99")

healthy['age_years'] = healthy['age_years'].apply(lambda x: float(x))
healthy = healthy.query("age_years >= 20 and age_years <= 50")

healthy = healthy.query("body_site == 'UBERON:feces'")

healthy_man = healthy.query("sex == 'male'")
healthy_woman = healthy.query("sex == 'female'")

In [36]:
write_age_mean(healthy, healthy_man, healthy_woman, 'healthy')

--------------------HEALTHY--------------------
Total number of healthy people: 615
Total number of healthy man: 244
Total number of healthy woman: 371
Mean Age for total: 36.9545
Mean Age for man: 37.0984
Mean Age for woman: 36.8598


In [81]:
for i in healthy['sample_name']:
    print(i)

10317.000002224
10317.000012376
10317.000014528
10317.00002004
10317.00002075
10317.000021947
10317.00002209
10317.000022155
10317.000022223001
10317.000022224
10317.000022225
10317.000022282
10317.000023135
10317.000023269
10317.000023531
10317.000023592
10317.000023741999
10317.000023895
10317.000027696
10317.000029546
10317.000030297
10317.000031332
10317.00003142
10317.000031488
10317.000031782
10317.000032666
10317.000032754
10317.000032796
10317.000032816999
10317.000032903
10317.000032904
10317.000033102
10317.000033104001
10317.000033125
10317.00003326
10317.00003336
10317.000033566
10317.000033786999
10317.000036038
10317.000036043
10317.000036848
10317.000037542
10317.000037925
10317.000037981
10317.000038037
10317.000038115
10317.000038146
10317.000038150
10317.000038233
10317.000038354
10317.000039512
10317.000039528
10317.000039564
10317.000039729
10317.000039774
10317.000039899
10317.000039907
10317.000039924
10317.000039938
10317.000039952
10317.000039994
10317.000040002

In [134]:
esearch('sra', '10317.000014528')

{'Count': '1', 'RetMax': '1', 'RetStart': '0', 'IdList': ['3047563'], 'TranslationSet': [], 'TranslationStack': [{'Term': '10317.000014528[All Fields]', 'Field': 'All Fields', 'Count': '1', 'Explode': 'N'}, 'GROUP'], 'QueryTranslation': '10317.000014528[All Fields]'}

In [117]:
import xml.etree.ElementTree as ET

In [164]:
def efetch(db, id_val):
    handle = Entrez.efetch(db  = db, id = id_val)
    handle = handle.read()
    return handle

In [139]:
c = esummary('sra', '3047563')

In [140]:
good_print(c[0])

{'CreateDate': '2016/08/29',
 'ExpXml': '<Summary><Title>Illumina MiSeq sequencing; qiita_ptid_2060:10317.000014528</Title><Platform instrument_model="Illumina MiSeq">ILLUMINA</Platform><Statistics total_runs="1" total_spots="23535" total_bases="3530250" total_size="2600727" load_done="true" cluster_name="public"/></Summary><Submitter acc="ERA693849" center_name="" contact_name="European Nucleotide Archive" lab_name="European Nucleotide Archive"/><Experiment acc="ERX1667551" ver="2" status="public" name="Illumina MiSeq sequencing; qiita_ptid_2060:10317.000014528"/><Study acc="ERP012803" name="American Gut Project"/><Organism taxid="408170" ScientificName="human gut metagenome"/><Sample acc="ERS1305372" name=""/><Instrument ILLUMINA="Illumina MiSeq"/><Library_descriptor><LIBRARY_NAME>10317.000014528</LIBRARY_NAME><LIBRARY_STRATEGY>OTHER</LIBRARY_STRATEGY><LIBRARY_SOURCE>METAGENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>PCR</LIBRARY_SELECTION><LIBRARY_LAYOUT> <SINGLE/> </LIBRARY_LAYOUT><LIB

In [165]:
efetch('sra', 'ERR1596984')

b'<?xml version="1.0"  ?>\n<EXPERIMENT_PACKAGE_SET>\n<EXPERIMENT_PACKAGE><EXPERIMENT alias="qiita_ptid_2060:10317.000014528" accession="ERX1667551" broker_name=""><IDENTIFIERS><PRIMARY_ID>ERX1667551</PRIMARY_ID></IDENTIFIERS><TITLE>Illumina MiSeq sequencing; qiita_ptid_2060:10317.000014528</TITLE><STUDY_REF accession="ERP012803"><IDENTIFIERS><PRIMARY_ID>ERP012803</PRIMARY_ID></IDENTIFIERS></STUDY_REF><DESIGN><DESIGN_DESCRIPTION>fecal, saliva, skin and environment samples from the American Gut Project</DESIGN_DESCRIPTION><SAMPLE_DESCRIPTOR accession="ERS1305372"><IDENTIFIERS><PRIMARY_ID>ERS1305372</PRIMARY_ID><EXTERNAL_ID namespace="BioSample">SAMEA4393923</EXTERNAL_ID></IDENTIFIERS></SAMPLE_DESCRIPTOR><LIBRARY_DESCRIPTOR><LIBRARY_NAME>10317.000014528</LIBRARY_NAME><LIBRARY_STRATEGY>OTHER</LIBRARY_STRATEGY><LIBRARY_SOURCE>METAGENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>PCR</LIBRARY_SELECTION><LIBRARY_LAYOUT><SINGLE/></LIBRARY_LAYOUT><LIBRARY_CONSTRUCTION_PROTOCOL>Illumina MiSeq 515fbc, 8

In [166]:
root = ET.fromstring(efetch('sra', 'ERR1596984'))

In [168]:
root.tag

'EXPERIMENT_PACKAGE_SET'

## Not healthy extraction

In [None]:
not_healthy = df.query("smoking_frequency == 'Occasionally (1-2 times/week)' or smoking_frequency == 'Daily' or smoking_frequency == 'Regularly (3-5 times/week)'")
not_healthy = not_healthy.query("alcohol_frequency == 'Occasionally (1-2 times/week)' or alcohol_frequency == 'Daily' or alcohol_frequency == 'Regularly (3-5 times/week)'")

not_healthy = not_healthy.query("cancer == 'I do not have this condition'")

not_healthy['bmi'] = not_healthy['bmi'].apply(lambda x: float(x))
not_healthy = not_healthy.query("bmi < 18.5 or bmi > 24.99")

not_healthy['age_years'] = not_healthy['age_years'].apply(lambda x: float(x))
not_healthy = not_healthy.query("age_years >= 20 and age_years <= 50")

not_healty = not_healthy.query("body_site == 'UBERON:feces'")

not_healthy_man = not_healthy.query("sex == 'male'")
not_healthy_woman = not_healthy.query("sex == 'female'")

In [None]:
write_age_mean(not_healthy, not_healthy_man, not_healthy_woman, 'not healthy')

## NCBI Quering

In [103]:
Entrez.email = "giacomo.villa.mi@gmail.com"

def good_print(text):
    print(pprint(text))

def esearch(db, query, num_max = 20):
    handle = Entrez.esearch(db = db, term = query, retmax = num_max)
    record = Entrez.read(handle, validate = True)
    return record

def esummary(db, id_val):
    handle = Entrez.esummary(db = db, id = id_val)
    record = Entrez.read(handle, validate = True)
    return record



In [None]:
healty_sample = pd.read_csv("./result_extraction/not_healthy_sample.csv", header=0)

In [None]:
id_ncbi = healty_sample.iloc[0]['sample_name']

In [None]:
not_healthy_man.head()

In [None]:
for i in healthy_woman['sample_name']:
    if i != 10317:
        print(i)

In [None]:
esummary('sra', 10317.000001179)

In [None]:
esummary('sra', 'SRR004230')