## Libraries

In [210]:
import qiime2
from tempfile import mkdtemp
from qiime2.plugins import demux, deblur, quality_filter, \
                           metadata, feature_table, alignment, \
                           phylogeny, diversity, emperor, feature_classifier, \
                           taxa, composition
import pandas as pd
import os
import numpy as np
import random
import logging
from datetime import datetime
from Bio import Entrez
from pprint import pprint
from sklearn.utils import shuffle
import xml.etree.ElementTree as ET
import os
from tqdm.notebook import tqdm

#### Studies
* exercise_frequency
* flossing_frequency
* vitamin_d_supplement_frequency
* weight_change
* fruit_frequency

## Functions

In [4]:
def generate_single_sample(already_taken, from_df):
    element = random.randint(0, len(from_df) - 1)
    while element in already_taken or from_df.iloc[element]['sample_name'] == '10317':
        element = random.randint(0, len(from_df) - 1)
    return element

def get_final_sample(n_samples, starter_dataset_man, starter_dataset_woman):
    sample_man = set()
    sample_woman = set()
    final_sample = pd.DataFrame(columns=starter_dataset_man.columns)
    
    for i in range(n_samples):
        new_sample_man = generate_single_sample(sample_man, starter_dataset_man)
        new_sample_woman = generate_single_sample(sample_woman, starter_dataset_woman)
        
        sample_man.add(new_sample_man)
        sample_woman.add(new_sample_woman)

        final_sample = final_sample.append(starter_dataset_man.iloc[new_sample_man], ignore_index=True)
        final_sample = final_sample.append(starter_dataset_woman.iloc[new_sample_woman], ignore_index=True)
        
    return final_sample

def write_age_mean(total, man, woman, typology):
    mean_age_total = round(np.mean(total['age_years']), 4)
    mean_age_man = round(np.mean(man['age_years']), 4)
    mean_age_woman = round(np.mean(woman['age_years']), 4)
    if typology != 'sample':
        logging.info(f'--------------------{typology.upper()}--------------------')
    logging.info(f'Total number of {typology} people: {len(total)}')
    logging.info(f'Total number of {typology} man: {len(man)}')
    logging.info(f'Total number of {typology} woman: {len(woman)}')
    
    logging.info(f'Mean Age for total: {mean_age_total}')
    logging.info(f'Mean Age for man: {mean_age_man}')
    logging.info(f'Mean Age for woman: {mean_age_woman}')
    logging.info(f'\n\n')
    
    print(f'--------------------{typology.upper()}--------------------')
    print(f'Total number of {typology} people: {len(total)}')
    print(f'Total number of {typology} man: {len(man)}')
    print(f'Total number of {typology} woman: {len(woman)}')
    
    print(f'Mean Age for total: {mean_age_total}')
    print(f'Mean Age for man: {mean_age_man}')
    print(f'Mean Age for woman: {mean_age_woman}')

def write_sample_info(sample, typology):
    man = sample.query("sex == 'male'")
    woman = sample.query("sex == 'female'")
    write_age_mean(sample, man, woman, 'sample')
    sample.to_csv(f'./result_extraction/{typology}_sample.csv')
    
def get_sequences(sample_name):
    handleSce = esearch('biosample', sample_name)
    biosampleId = handleSce['IdList'][0]
    print(f'Biosample ID {biosampleId}')
    handleSra = Entrez.efetch(db='biosample', id=biosampleId, retmode='xml')
    root = ET.fromstring(handleSra.read())
    identifier = root.findall('.//BioSample//Ids//Id')
    for i in identifier:
        if i.attrib['db'] == 'SRA':
            sraId = i.text
    print(f'SRA ID: {sraId}')
    handleSra = Entrez.esearch(db='sra', term=sraId)
    resultsSra = Entrez.read(handleSra)['IdList']
    for s in resultsSra:
        handlesngSraId = Entrez.efetch(db='sra', id=s, retmode='xml')
        root = ET.fromstring(handlesngSraId.read())
        identifier = root.find('.//EXPERIMENT_PACKAGE//RUN_SET//RUN')
        runId = identifier.attrib['accession']
        command1 = 'fastq-dump --fasta --readids --outdir "./rawData" ' + runId
        os.system(command1)
        command2 = 'fastq-dump --readids --outdir "./rawData" ' + runId
        os.system(command2) 

## Main

In [5]:
df = pd.read_csv("./data/american_gut.txt", delimiter="\t", header=0)
df.replace(' ', np.nan, inplace=True)
df.replace('Not provided', np.nan, inplace=True)
df.replace('Unspecified', np.nan, inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
logging.basicConfig(filename='./result_extraction/sampling.log', level=logging.INFO, format='%(message)s')
today = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
logging.info(f'RUN TIME: {today}')

## All columns

In [7]:
for column in df.columns:
    print(column, end = ', ')

sample_name, acid_reflux, acne_medication, acne_medication_otc, add_adhd, age_cat, age_corrected, age_years, alcohol_consumption, alcohol_frequency, alcohol_types, alcohol_types_beercider, alcohol_types_red_wine, alcohol_types_sour_beers, alcohol_types_spiritshard_alcohol, alcohol_types_unspecified, alcohol_types_white_wine, allergic_to, allergic_to_i_have_no_food_allergies_that_i_know_of, allergic_to_other, allergic_to_peanuts, allergic_to_shellfish, allergic_to_tree_nuts, allergic_to_unspecified, altitude, alzheimers, animal_age, animal_free_text, animal_gender, animal_origin, animal_type, anonymized_name, antibiotic_history, appendix_removed, artificial_sweeteners, asd, assigned_from_geo, autoimmune, birth_year, bmi, bmi_cat, bmi_corrected, body_habitat, body_product, body_site, bowel_movement_frequency, bowel_movement_quality, breastmilk_formula_ensure, cancer, cancer_treatment, cardiovascular_disease, cat, cdiff, census_region, chickenpox, clinical_condition, collection_date, coll

## Healthy extraction

In [44]:
healthy = df.query("smoking_frequency == 'Never' and alcohol_frequency == 'Never'")

healthy = healthy.query("cancer == 'I do not have this condition'")

healthy['bmi'] = healthy['bmi'].apply(lambda x: float(x))
healthy = healthy.query("bmi >= 18.5 and bmi <= 24.99")

healthy['age_years'] = healthy['age_years'].apply(lambda x: float(x))
healthy = healthy.query("age_years >= 20 and age_years <= 50")

healthy = healthy.query("body_site == 'UBERON:feces'")

healthy_man = healthy.query("sex == 'male'")
healthy_woman = healthy.query("sex == 'female'")

In [36]:
write_age_mean(healthy, healthy_man, healthy_woman, 'healthy')

--------------------HEALTHY--------------------
Total number of healthy people: 615
Total number of healthy man: 244
Total number of healthy woman: 371
Mean Age for total: 36.9545
Mean Age for man: 37.0984
Mean Age for woman: 36.8598


In [229]:
valid_id = list()
for try_id in tqdm(healthy['sample_name']):
    handleSce = esearch('biosample', try_id)
    if handleSce['IdList'] != []:
        valid_id.append(try_id)

HBox(children=(FloatProgress(value=0.0, max=615.0), HTML(value='')))




In [231]:
valid_healthy = pd.DataFrame(columns=healthy.columns)
index = 0
for _, row in tqdm(healthy.iterrows()):
    if row['sample_name'] in valid_id:
        valid_healthy.loc[index] = row
        index += 1

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [233]:
valid_healthy['sex'].value_counts()

female    209
male      134
Name: sex, dtype: int64

## Not healthy extraction

In [248]:
not_healthy = df.query("smoking_frequency == 'Occasionally (1-2 times/week)' or smoking_frequency == 'Daily' or smoking_frequency == 'Regularly (3-5 times/week)'")
not_healthy = not_healthy.query("alcohol_frequency == 'Occasionally (1-2 times/week)' or alcohol_frequency == 'Daily' or alcohol_frequency == 'Regularly (3-5 times/week)'")

not_healthy = not_healthy.query("cancer == 'I do not have this condition'")

not_healthy['bmi'] = not_healthy['bmi'].apply(lambda x: float(x))
not_healthy = not_healthy.query("bmi < 18.5 or bmi > 24.99")

not_healthy['age_years'] = not_healthy['age_years'].apply(lambda x: float(x))
not_healthy = not_healthy.query("age_years >= 20 and age_years <= 50")

not_healty = not_healthy.query("body_site == 'UBERON:feces'")

not_healthy_man = not_healthy.query("sex == 'male'")
not_healthy_woman = not_healthy.query("sex == 'female'")

In [249]:
write_age_mean(not_healthy, not_healthy_man, not_healthy_woman, 'not healthy')

--------------------NOT HEALTHY--------------------
Total number of not healthy people: 81
Total number of not healthy man: 44
Total number of not healthy woman: 26
Mean Age for total: 35.642
Mean Age for man: 37.1136
Mean Age for woman: 36.0


In [244]:
valid_id = list()
for try_id in tqdm(not_healthy['sample_name']):
    handleSce = esearch('biosample', try_id)
    if handleSce['IdList'] != []:
        valid_id.append(try_id)

HBox(children=(FloatProgress(value=0.0, max=91.0), HTML(value='')))




In [245]:
valid_not_healthy = pd.DataFrame(columns=healthy.columns)
index = 0
for _, row in tqdm(not_healthy.iterrows()):
    if row['sample_name'] in valid_id:
        valid_not_healthy.loc[index] = row
        index += 1

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [246]:
valid_not_healthy['sex'].value_counts()

male      28
female    17
Name: sex, dtype: int64

## NCBI Quering

In [103]:
Entrez.email = "giacomo.villa.mi@gmail.com"

def good_print(text):
    print(pprint(text))

def esearch(db, query, num_max = 20):
    handle = Entrez.esearch(db = db, term = query, retmax = num_max)
    record = Entrez.read(handle, validate = True)
    return record

def esummary(db, id_val):
    handle = Entrez.esummary(db = db, id = id_val)
    record = Entrez.read(handle, validate = True)
    return record



In [None]:
healty_sample = pd.read_csv("./result_extraction/not_healthy_sample.csv", header=0)

In [None]:
id_ncbi = healty_sample.iloc[0]['sample_name']

In [None]:
not_healthy_man.head()

In [None]:
for i in healthy_woman['sample_name']:
    if i != 10317:
        print(i)

In [None]:
esummary('sra', 10317.000001179)

In [None]:
esummary('sra', 'SRR004230')