## Libraries

In [1]:
import qiime2
from tempfile import mkdtemp
from qiime2.plugins import demux, deblur, quality_filter, \
                           metadata, feature_table, alignment, \
                           phylogeny, diversity, emperor, feature_classifier, \
                           taxa, composition
import pandas as pd
import os
import numpy as np
import random
import logging
from datetime import datetime
from Bio import Entrez
from pprint import pprint
from sklearn.utils import shuffle
import xml.etree.ElementTree as ET
import os
from tqdm.notebook import tqdm
from Bio import SeqIO
import ast
import csv

  import pandas.util.testing as pdt


#### Studies
* exercise_frequency
* flossing_frequency
* vitamin_d_supplement_frequency
* weight_change
* fruit_frequency

## Functions

### Pipeline cleaning

In [77]:
# Puliscel a working directory (result_extraction), elimina tutti i file e le cartelle fatta eccezione per il logfile
# il quale viene solo svuotato
def clean_workspace():
    
    # ci spostiamo nella working directory
    starting_path = os.getcwd()
    path = os.path.join(os.getcwd(), f'result_extraction')
    os.chdir(path)
    
    # prendiamo tutti i file contenuti nella working directory e eliminiamo a meno che non sia il log file
    files = os.listdir()
    for file in files:
        if file != 'sra_querying.log':
            delete_command = f'rm -r {file}'
            os.system(delete_command)
            
    # puliamo il log file
    delete_old_log = 'cat > sra_querying.log'
    os.system(delete_old_log)  
    
    # torniamo nella starting directory ./Human microbiome
    os.chdir(starting_path)

### NCBI utilities

In [78]:
# Funzioni per semplificare l'interrogazioen di NCBI
Entrez.email = "giacomo.villa.mi@gmail.com"

def good_print(text):
    print(pprint(text))

def esearch(db, query, num_max = 20):
    handle = Entrez.esearch(db = db, term = query, retmax = num_max)
    record = Entrez.read(handle, validate = True)
    return record

def esummary(db, id_val):
    handle = Entrez.esummary(db = db, id = id_val)
    record = Entrez.read(handle, validate = True)
    return record

### Final Sample generation

In [79]:
# Prende in input il dataset completo, data la query, 
def write_age_mean(total, typology, experiment):
    
    man = total.query('sex == "male"')
    woman = total.query('sex == "female"')
    
    mean_age_total = round(np.mean(total['age_years']), 4)
    mean_age_man = round(np.mean(man['age_years']), 4)
    mean_age_woman = round(np.mean(woman['age_years']), 4)
    not_valid_sex = total.query("sex != 'female' and sex != 'male'")
    
    if typology != 'sample':
        logging.info(f'--------------------{typology.upper()}--------------------')
    logging.info(f'Total number of {typology} people: {len(total)}')
    logging.info(f'Total number of {typology} man: {len(man)}')
    logging.info(f'Total number of {typology} woman: {len(woman)}')
    logging.info(f'Total number of non valid sex {len(not_valid_sex)}')
    
    logging.info(f'Mean Age for total: {mean_age_total}')
    logging.info(f'Mean Age for man: {mean_age_man}')
    logging.info(f'Mean Age for woman: {mean_age_woman}')
    logging.info(f'\n')
    
    print(f'--------------------{typology.upper()}--------------------')
    print(f'Total number of {typology} people: {len(total)}')
    print(f'Total number of {typology} man: {len(man)}')
    print(f'Total number of {typology} woman: {len(woman)}')
    print(f'Total number of non valid sex {len(not_valid_sex)}')
    
    print(f'Mean Age for total: {mean_age_total}')
    print(f'Mean Age for man: {mean_age_man}')
    print(f'Mean Age for woman: {mean_age_woman}')
    
    starting_path = os.getcwd()
    path = os.path.join(os.getcwd(), f'result_extraction/')
    os.chdir(path)
    directories = os.listdir()
    if f'{experiment}' not in directories:
        os.mkdir(f'{experiment}')
        
    os.chdir(f'{experiment}')
    if typology != 'sample':
        total.to_csv(f"./dataset_query_result_{typology}.csv", index=False, encoding='utf-8')
    
    os.chdir(starting_path)

# Prende in input il dataset da cui campionare e gli index degli element già presi, finché non campiona qualcosa di 
# nuovo non termina
def generate_single_sample(already_taken, from_df):
    element = random.randint(0, len(from_df) - 1)
    while (element in already_taken):
        element = random.randint(0, len(from_df) - 1)
    return element

# Dato il dataset di partenza contenente solo elementi effettivamente presenti su NCBI
# genera un nuovo dataset composto da n_samples elementi
def get_final_sample(started_dataset, n_samples):
    taken = set()
    final_sample = pd.DataFrame(columns=started_dataset.columns)
    for i in tqdm(range(n_samples), desc='Sampling data'):
        new_sample = generate_single_sample(taken, started_dataset)
        taken.add(new_sample)
        final_sample = final_sample.append(started_dataset.iloc[new_sample], ignore_index=True)
    return final_sample

def write_not_valid_ids(not_valid_names):
    files = os.listdir(f'./result_extraction')
    
    if 'not_valid_sample_names.csv' in files:
        all_not_valid_names = pd.read_csv("./result_extraction/not_valid_sample_names.csv", header=0, dtype=str)
        elements = [all_not_valid_names, not_valid_names]
        final_not_valid = pd.concat(elements, ignore_index=False, sort=False)
        final_not_valid.to_csv(f"./result_extraction/not_valid_sample_names.csv", index=False)
    else:
        not_valid_names.to_csv(f"./result_extraction/not_valid_sample_names.csv", index=False)
    
# Dato il dataset di partenza con tutti gli elementi che rispettano la query posta al gut, tiene in considraz
def sampling_data(start_dataset, typology, n_samples, experiment):
    write_age_mean(start_dataset, typology, experiment)
    valid_id = list()
    not_valid_id = list()
    for try_id in tqdm(start_dataset['sample_name'], desc='NCBI ids validation'):
        handleSce = esearch('biosample', try_id)
        if len(handleSce['IdList']) != 0:
            valid_id.append(try_id)
        else:
            not_valid_id.append(str(try_id))
            print(try_id)
            
    print(f'Total rows: {len(start_dataset)}')
    print(f'Valid rows: {len(valid_id)}')
    
    not_valid_names = pd.DataFrame(data={"not_valid_sample_name": not_valid_id, "typology": [typology]*len(not_valid_id)})
    print(len(not_valid_names))
    write_not_valid_ids(not_valid_names)
    
    logging.info(f'Total rows: {len(start_dataset)}')
    logging.info(f'Valid rows: {len(valid_id)}')
            
    valid_start_dataset = pd.DataFrame(columns=start_dataset.columns)
    index = 0
    for _, row in start_dataset.iterrows():
        if row['sample_name'] in valid_id:
            valid_start_dataset.loc[index] = row
            index += 1
            
    valid_start_dataset = shuffle(valid_start_dataset)
    final_sample = get_final_sample(valid_start_dataset, n_samples)
    write_sample_info(final_sample, typology, experiment)

# Funzione richiamata sul sample finale, traduce in csv il campione finale creando un csv di due colonne: sample_name,
# typology (e.g. healthy/not_healthy), richiama write_age_mean per scrivere sul log file l'età media dei soggetti
def write_sample_info(sample, typology, experiment):
    man = sample.query("sex == 'male'")
    woman = sample.query("sex == 'female'")
    write_age_mean(sample, 'sample', experiment)
    
    sample = sample[['sample_name']]
    sample['typology'] = [typology]*len(sample)
    
    files = os.listdir(f'./result_extraction/{experiment}')
    
    if f'final_sample_{experiment}.csv' in files:
        final_sample = pd.read_csv(f"./result_extraction/{experiment}/final_sample_{experiment}.csv", header=0, dtype=str)
        final_sample = final_sample[['sample_name', 'typology']]
        elements = [final_sample, sample]
        final_sample = pd.concat(elements, ignore_index=False, sort=False)
        final_sample.to_csv(f"./result_extraction/{experiment}/final_sample_{experiment}.csv", index=False)
    else:
        sample.to_csv(f"./result_extraction/{experiment}/final_sample_{experiment}.csv", index=False)
        
    close_dashes = '-'*len(typology.upper())
    logging.info(f'--------------------{close_dashes}--------------------')

### SRA operation

In [80]:
# Funzione controller, prende in input il nome dell'esperimento (eg. healthy vs not_healthy) e le tipologie di campione
# (e.g. healthy e not_healty), chiama la funzione che interroga SRA di NCBI, in seguito richiama la funzione per 
# concatenare i file fasta e infine la funzione per prendere le sequenze che ricorrono più spesso. Infine salva su un
# file csv i fasta delle sequenze più popolose aggiungendo il campo che si rifà all'id di Biosample.
# Il file csv delle frequenze più popole è l'input per blast
def sra_querying(experiment, types):
    
    # Legge il csv contente i 30 campioni di una tipologia e i 30 campioni dell'altra tipologia dato l'esperimento
    # (e.g. esperimento: healthy vs not_healthy estrare il csv che contiene i 30 sample_name degli healthy e i 
    # 30 sample_name dei not_healthy)
    final_sample = pd.read_csv(f"./result_extraction/{experiment}/final_sample_{experiment}.csv", header=0, dtype=str)
    
    # crea una lista che conterrà, dati i record, il corrispettivo id di Biosample
    bio_sample_id = list()
    
    sra_ids = list()
    
    # per ogni riga del csv dei 30 campioni di una tipologia e i 30 dell'altra tipologia dato l'esperimento
    for index, row in final_sample.iterrows():
        print(f'File number: {index+1}')
        
        # Gestione del problema sulla lettura di un sample_name con la concatenazione della stringa '001'
        # in generale estrae l'input per la funzione che farà la query su SRA
        record_id = str(row[0])[0:15]
        record_typology = row[1]
        
        # data la singola interrogazione, aggiunge alla lista degli id di bio_sample l'id.
        query_result = get_sequences(record_id, record_typology, experiment)
        bio_sample_id.append(query_result[0])
        sra_ids.append(query_result[1])
        
    # una volta scaricate tutti i file fasta data l'esperimento, per ogni tipologia (e.g healthy/not_healthy) 
    # crea un unico file con tutte le sequenze e poi prende, da questo file, solo quelle più popolose
    for typology in types:
        concatenate_fast_file(typology, 'fasta', experiment)
        get_top_sequences(typology, experiment)
        
    # crea una nuova colonna dove, per ogni sample_name, vi sarà l'id di biosample associato e salva il nuovo csv
    final_sample['bio_sample_id'] = bio_sample_id
    final_sample['runId'] = sra_ids
    final_sample.to_csv(f"./result_extraction/{experiment}/final_sample_{experiment}.csv", index=False)
    
    get_sample_top_sequences_count(experiment, types[0])
    get_sample_top_sequences_count(experiment, types[1])
    
# Funzione che interroga SRA, dato il sample_name. Richiede anche la tipologia del campione (e.g. healthy o not_healthy)
# e il nome dell'esperimento (e.g. healthy_vs_not_healthy), per andare a salvare correttamente nelle cartelle facenti
# riferimento all'esperimento
def get_sequences(sample_name, typology, experiment):
    
    # in funzione della tipologia del campione (e.g. healthy o not_healthy) e dell'esperimento (e.g. healthy_vs_not_healthy)
    # definisce il path corretto dove andare a salvare il risultato
    path = f'"./result_extraction/{experiment}/SRA_{typology}" '
    command1 = f'fastq-dump --fasta --readids --outdir {path}'
    command2 = f'fastq-dump --readids --outdir {path}'
        
    # Query su SRA e print utili
    print(f'Sample id: {sample_name}')
    handleSce = esearch('biosample', sample_name)
    biosampleId = handleSce['IdList'][0]
    print(f'Biosample ID {biosampleId}')
    print(f'Typology: {typology}')
    handleSra = Entrez.efetch(db='biosample', id=biosampleId, retmode='xml')
    root = ET.fromstring(handleSra.read())
    identifier = root.findall('.//BioSample//Ids//Id')
    for i in identifier:
        if i.attrib['db'] == 'SRA':
            sraId = i.text
    handleSra = Entrez.esearch(db='sra', term=sraId)
    resultsSra = Entrez.read(handleSra)['IdList']
    run_ids = list()
    for s in resultsSra:
        handlesngSraId = Entrez.efetch(db='sra', id=s, retmode='xml')
        root = ET.fromstring(handlesngSraId.read())
        identifier = root.find('.//EXPERIMENT_PACKAGE//RUN_SET//RUN')
        runId = identifier.attrib['accession']
        os.system(f'{command1}{runId}')
        os.system(f'{command2}{runId}') 
        print(f'Run ID: {runId}')
        run_ids.append(runId)
    print()
    return [biosampleId, run_ids]
        

# Dato il risultato delle query su SRA, concatena i file fasta facenti riferimento a una certa tipologia di record
# (e.g. healthy o not_healthy) dato un certo esperimento (e.g. healthy_vs_not_healthy)
def concatenate_fast_file(typology, file_format, experiment):
    
    # Prende tutti i file data la tipologia del record (e.g. healthy o not_healthy) contenuti nella cartella dove,
    # dato l'esperimento (e.g. healthy_vs_not_healthy), la query su SRA ha riposto i risultati
    files = os.listdir(f'./result_extraction/{experiment}/SRA_{typology}')
    
    # concatenazione file fasta
    compact_files = list()
    for file in files:
        if file_format in file:
            f = open(f'./result_extraction/{experiment}/SRA_{typology}/{file}', "r")
            compact_files.append(f.read())
            f.close()
    f = open(f'./result_extraction/{experiment}/SRA_{typology}/final_{file_format}_{typology}.{file_format}', 'w')
    for file in compact_files:
        f.write(file)
    f.close()
    
    # Eliminazione dei 
    starting_path = os.getcwd()
    path = os.path.join(os.getcwd(), f'result_extraction/{experiment}/SRA_{typology}')
    os.chdir(path)
    command = 'rm *[0-9].fasta'
    #os.system(command) 
    os.chdir(starting_path)
        
def get_top_sequences(typology, experiment):
    records = list(SeqIO.parse(f"./result_extraction/{experiment}/SRA_{typology}/final_fasta_{typology}.fasta", format="fasta"))
    print(f'Number of sequences for {typology}: {len(records)}')
    logging.info(f'Number of sequences for {typology}: {len(records)}')
    
    sequences = dict()
    for record in tqdm(records, desc='Compacting fasta'):
        if record.seq in sequences:
            sequences[record.seq][0] += 1
        else:
            sequences[record.seq] = [1, f'>{record.description}']
    
    print(f'Number of grouped sequences: {len(sequences)}')
    logging.info(f'Number of grouped sequences: {len(sequences)}')
    
    sequences_ord = {k: v for k, v in sorted(sequences.items(), key=lambda item: item[1], reverse=True)}
    
    cont = 0
    f = open(f'./result_extraction/{experiment}/SRA_{typology}/top_sequences_{typology}.fasta', 'w')
    
    for element in sequences_ord:
        if sequences_ord[element][0] >= 100:
            f.write(f'{sequences_ord[element][1]} number of reps {sequences_ord[element][0]}')
            f.write('\n')
            f.write(str(element))
            f.write('\n')
            cont += 1
    f.close()
    
    print(f'Number of taken sequences: {cont}')
    logging.info(f'Number of taken sequences: {cont}')
    print()
    logging.info('\n')
    
def get_sample_top_sequences_count(experiment, typology):
    top_sequences = list(SeqIO.parse(f"./result_extraction/{experiment}/SRA_{typology}/top_sequences_{typology}.fasta", format="fasta"))
    os.mkdir(f'./result_extraction/{experiment}/SRA_{typology}/tmp')
    
    columns = ['sample_name']
    for i in range(len(top_sequences)):
        columns.append(f'seq {i+1}')
    final_summary = pd.DataFrame(columns = columns)
    
    final_sample = pd.read_csv(f"./result_extraction/{experiment}/final_sample_{experiment}.csv", dtype=str)
    target_sample = final_sample.query(f"typology == '{typology}'")

    for index, row in target_sample.iterrows():

        print(f'Sample number: {index+1}')
        run_ids = ast.literal_eval(row['runId'])
        typology = row['typology']
        sample_name = row['sample_name']
        compact_files = list()
        
        print(f'Sample name: {sample_name}')

        for run_id in run_ids:
            f = open(f'./result_extraction/{experiment}/SRA_{typology}/{run_id}.fasta', "r")
            compact_files.append(f.read())
            f.close()
            print(f'Run ID: {run_id}')

            f = open(f'./result_extraction/{experiment}/SRA_{typology}/tmp/{sample_name}.fasta', 'w')
            for file in compact_files:
                f.write(file)
            f.close()

        sample_sequences = list(SeqIO.parse(f"./result_extraction/{experiment}/SRA_{typology}/tmp/{sample_name}.fasta", format="fasta"))

        top_sequences_in_sample = dict()
        for top_sequence in tqdm(top_sequences):
            top_sequences_in_sample[top_sequence.description] = 0
            for sample_sequence in sample_sequences:
                if top_sequence.seq == sample_sequence.seq:
                    top_sequences_in_sample[top_sequence.description] += 1
        numbers = [sample_name]
        for seq in top_sequences_in_sample:
            numbers.append(top_sequences_in_sample[seq])
        final_summary.loc[len(final_summary)] = numbers

    index = 0
    for column in final_summary.columns:
        if column != 'sample_name':
            sum_column = final_summary[column].sum()
            real_value = int(top_sequences[index].description.split('number of reps')[1].strip())
            if sum_column != real_value:
                print(f'Problem to column: {column}')

            index += 1    

    final_summary.to_csv(f"./result_extraction/{experiment}/{typology}_top_sequences_distribution.csv", index=False)

### Quiime2 operation

In [81]:
def quality_analysis(experiment, types):
    files = os.listdir(f'./result_extraction/{experiment}')
    
    if not(f'seq_artifact_{experiment}.qza' in files):
        manifest_operation(experiment, types)
        artifact = qiime2.Artifact.import_data('SampleData[SequencesWithQuality]', f'./result_extraction/{experiment}/manifest.tsv',
                                        view_type='SingleEndFastqManifestPhred33V2')
        artifact.save(f'./result_extraction/{experiment}/seq_artifact_{experiment}.qza')
    else:
        artifact = qiime2.Artifact.load(f'./result_extraction/{experiment}/seq_artifact_{experiment}.qza')
        
    return artifact
    
    #demux_filter_stats = quality_filter.methods.q_score(artifact)
    #filter_stats = metadata.visualizers.tabulate(demux_filter_stats.filter_stats.view(qiime2.Metadata))
    #filter_stats.visualization



# Questa funzione compatta i file dataset_query_result (sostanzialmente ciò che si estreva da GUT) e quindi crea
# date le due tipologie (e.g. healthy e not_healthy) dato l'esperimento (not_healthy_vs_healthy) un singolo file
# che sarà di fatto la somma dei due dataset di partenza. Ovviamente viene inserito un nuovo campo (nuova colonna)
# dove si specifica la tipologia del record (e.g. healthy o not_healthy)
def manifest_operation(experiment, typology):
    typology_1 = pd.read_csv(f'./result_extraction/{experiment}/dataset_query_result_{typology[0]}.csv', dtype=str)
    typology_2 = pd.read_csv(f'./result_extraction/{experiment}/dataset_query_result_{typology[1]}.csv', dtype=str)
    
    typology_1['typology'] = [typology[0]]*len(typology_1)
    typology_2['typology'] = [typology[1]]*len(typology_2)
    
    frames = [typology_1, typology_2]
    final_dataset = pd.concat(frames)
    if len(final_dataset) == (len(typology_1) + len(typology_2)):
        with open(f'./result_extraction/{experiment}/dataset_query_result_{experiment}.tsv', "w", newline='') as dataset_query_result:
            tsv_output = csv.writer(dataset_query_result, delimiter='\t')
            
            tsv_output.writerow(list(final_dataset.columns))
            cont = 0
            for _, row in final_dataset.iterrows():
                tsv_output.writerow(list(row))
                cont += 1
        
    add_id_sample(experiment)
    create_manifest(experiment)

# Questa funzione, dato l'experimenti (e.g. healthy_vs_not_healthy) crea il file manifest come mostrato nella pagina 
# https://docs.qiime2.org/2020.2/tutorials/importing/. 
def create_manifest(experiment):
    
    # leggiamo il file che contiene i sample_name campionati (il file con sample_name, typology, biosample_id, sra_id)
    final_sample = pd.read_csv(f"./result_extraction/{experiment}/final_sample_{experiment}.csv", dtype=str)
    manifest = list()
    
    # per ogni riga andiamo a copiare i file fastq e a inserirli in una nuova cartella con un nuovo nome
    # sample_name.fasq, andando a compattare nel caso in cui a un sample_name fossero associati più file 
    # fastq
    for index, row in final_sample.iterrows():
        run_ids = ast.literal_eval(row['runId'])
        sample_identificator = row['sample_identificator']
        sample_name = row['sample_name']
        typology = row['typology']
        print(f'Sample identificator: {sample_identificator}')
        print(f'Sample name: {sample_name}')
        print(f'Typology: {typology}')
        print(f'Run ids: {run_ids}')
        print()
        
        directories = os.listdir(f'./result_extraction/{experiment}/SRA_{typology}')
        if 'tmp_fastq' not in directories:
            os.mkdir(f'./result_extraction/{experiment}/SRA_{typology}/tmp_fastq')
        
        compact_files = list()
        for run_id in run_ids:
            f = open(f'./result_extraction/{experiment}/SRA_{typology}/{run_id}.fastq', "r")
            compact_files.append(f.read())
            f.close()
            
            
        f = open(f'./result_extraction/{experiment}/SRA_{typology}/tmp_fastq/{sample_identificator}.fastq', 'w')
        for file in compact_files:
            f.write(file)
        f.close()
        
        # nel manifest appendiamo la riga 'sample_identificator, absolute_path_to_fastq'
        manifest.append([sample_identificator, f'/result_extraction/{experiment}/SRA_{typology}/tmp_fastq/{sample_identificator}.fastq'])
    
    # scriviamo il manifest 
    with open(f'./result_extraction/{experiment}/manifest.tsv', "w",newline='') as manifest_file:
        tsv_output = csv.writer(manifest_file, delimiter='\t')
        tsv_output.writerow(['sample-id', 'absolute-filepath'])
        for line in manifest:
            tsv_output.writerow([line[0], f'{os.getcwd()}{line[1]}'])
            
            
def add_id_sample(experiment):
    final_sample = pd.read_csv(f"./result_extraction/{experiment}/final_sample_{experiment}.csv", dtype=str)
    sample_id = [f'Sample{i}' for i in range(len(final_sample))]
    final_sample['sample_identificator'] = sample_id
    final_sample.to_csv(f"./result_extraction/{experiment}/final_sample_{experiment}.csv", index=False, encoding='utf-8')

## Main

In [58]:
# Carica il dataset gut
df = pd.read_csv("./data/american_gut.txt", delimiter="\t", dtype=str)

# Sostituisce con NAN valori non validi
df.replace(' ', np.nan, inplace=True)
df.replace('Not provided', np.nan, inplace=True)
df.replace('Unspecified', np.nan, inplace=True)

# Elimina dalla working directory tutti i risultati dello scorso esperimento, ripulisce il logfile
# clean_workspace()

In [None]:
# Inizializza il log file
logging.basicConfig(filename='./result_extraction/sra_querying.log', level=logging.INFO, format='%(message)s')
today = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
logging.info(f'RUN TIME: {today}')

## All columns

In [None]:
for column in df.columns:
    print(column, end = ', ')

## Healthy vs not healthy study
### Healthy extraction

In [None]:
# Estraiamo dal dataset i dati di interesse
healthy = df.query("smoking_frequency == 'Never' and alcohol_frequency == 'Never'")

healthy = healthy.query("cancer == 'I do not have this condition'")

healthy['bmi'] = healthy['bmi'].apply(lambda x: float(x))
healthy = healthy.query("bmi >= 18.5 and bmi <= 24.99")

healthy['age_years'] = healthy['age_years'].apply(lambda x: float(x))
healthy = healthy.query("age_years >= 20 and age_years <= 50")

healthy = healthy.query("body_site == 'UBERON:feces'")

In [None]:
sampling_data(healthy, 'healthy', 30, 'healthy_vs_not_healthy')

### Not healthy extraction

In [None]:
not_healthy = df.query("smoking_frequency == 'Occasionally (1-2 times/week)' or smoking_frequency == 'Daily' or smoking_frequency == 'Regularly (3-5 times/week)'")
not_healthy = not_healthy.query("alcohol_frequency == 'Occasionally (1-2 times/week)' or alcohol_frequency == 'Daily' or alcohol_frequency == 'Regularly (3-5 times/week)'")

not_healthy = not_healthy.query("cancer == 'I do not have this condition'")

not_healthy['bmi'] = not_healthy['bmi'].apply(lambda x: float(x))
not_healthy = not_healthy.query("bmi < 18.5 or bmi > 24.99")

not_healthy['age_years'] = not_healthy['age_years'].apply(lambda x: float(x))
not_healthy = not_healthy.query("age_years >= 20 and age_years <= 50")

not_healthy = not_healthy.query("body_site == 'UBERON:feces'")

In [None]:
sampling_data(not_healthy, 'not_healthy', 30, 'healthy_vs_not_healthy')

## Not healthy old vs not healthy young studies
### Not healthy old extraction

In [93]:
not_healthy = df.query("smoking_frequency == 'Occasionally (1-2 times/week)' or smoking_frequency == 'Daily' or smoking_frequency == 'Regularly (3-5 times/week)'")
not_healthy = not_healthy.query("alcohol_frequency == 'Occasionally (1-2 times/week)' or alcohol_frequency == 'Daily' or alcohol_frequency == 'Regularly (3-5 times/week)'")

not_healthy = not_healthy.query("cancer == 'I do not have this condition'")

not_healthy['bmi'] = not_healthy['bmi'].apply(lambda x: float(x))
not_healthy = not_healthy.query("bmi < 18.5 or bmi > 24.99")

not_healthy['age_years'] = not_healthy['age_years'].apply(lambda x: float(x))
not_healthy_old = not_healthy.query("age_years >= 40 and age_years <= 50")

not_healthy_old = not_healthy_old.query("body_site == 'UBERON:feces'")

In [94]:
sampling_data(not_healthy_old, 'not_healthy_old', 17, 'not_healthy_old_vs_not_healthy_young')

--------------------NOT_HEALTHY_OLD--------------------
Total number of not_healthy_old people: 26
Total number of not_healthy_old man: 17
Total number of not_healthy_old woman: 9
Total number of non valid sex 0
Mean Age for total: 46.3077
Mean Age for man: 46.2941
Mean Age for woman: 46.3333


HBox(children=(FloatProgress(value=0.0, description='NCBI ids validation', max=26.0, style=ProgressStyle(descr…

10317.000032727
10317.000052110
10317.000102626
10317.000103722
10317.000108087
10317.000109110
10317.000113732
10317.000114513
10317.000114522

Total rows: 26
Valid rows: 17
9


HBox(children=(FloatProgress(value=0.0, description='Sampling data', max=17.0, style=ProgressStyle(description…


--------------------SAMPLE--------------------
Total number of sample people: 17
Total number of sample man: 12
Total number of sample woman: 5
Total number of non valid sex 0
Mean Age for total: 46.2941
Mean Age for man: 46.25
Mean Age for woman: 46.4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Not healthy young extraction

In [95]:
not_healthy_young = not_healthy.query("age_years >= 20 and age_years <= 30")

not_healthy_young = not_healthy_young.query("body_site == 'UBERON:feces'")

In [96]:
sampling_data(not_healthy_young, 'not_healthy_young', 14, 'not_healthy_old_vs_not_healthy_young')

--------------------NOT_HEALTHY_YOUNG--------------------
Total number of not_healthy_young people: 19
Total number of not_healthy_young man: 11
Total number of not_healthy_young woman: 6
Total number of non valid sex 2
Mean Age for total: 25.7368
Mean Age for man: 26.2727
Mean Age for woman: 23.8333


HBox(children=(FloatProgress(value=0.0, description='NCBI ids validation', max=19.0, style=ProgressStyle(descr…

10317.000033511
10317.000108111
10317.000108185
10317.000108626
10317.000110036

Total rows: 19
Valid rows: 14
5


HBox(children=(FloatProgress(value=0.0, description='Sampling data', max=14.0, style=ProgressStyle(description…


--------------------SAMPLE--------------------
Total number of sample people: 14
Total number of sample man: 10
Total number of sample woman: 2
Total number of non valid sex 2
Mean Age for total: 25.9286
Mean Age for man: 26.1
Mean Age for woman: 22.5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## Mental illness vs food disorders
### Mental illness

In [None]:
mental_illness = df.query("country_residence == 'United States'")

mental_illness = mental_illness.query("body_site == 'UBERON:feces'")

mental_illness = mental_illness.query("mental_illness == 'true' or mental_illness == 'Yes'")

mental_illness['age_years'] = mental_illness['age_years'].apply(lambda x: float(x))

In [None]:
sampling_data(mental_illness, 'mental_illness', 30, 'mental_ill_vs_food_dis')

### Food disorders

In [None]:
food_disorders = df.query("country_residence == 'United States'")

food_disorders = food_disorders.query("body_site == 'UBERON:feces'")

food_disorders = food_disorders.query("mental_illness == 'false' or mental_illness == 'No'")

food_disorders['bmi'] =  food_disorders['bmi'].apply(lambda x : float(x))
food_disorders = food_disorders.query("bmi < 18.5 or bmi > 24.99")

food_disorders = food_disorders.query("(fruit_frequency == 'Never' or fruit_frequency == 'Rarely (less than once/week)')")

food_disorders = food_disorders.query("exercise_frequency=='Rarely (a few times/month)' or exercise_frequency=='Never'")

food_disorders['age_years'] = food_disorders['age_years'].apply(lambda x: float(x))

In [None]:
sampling_data(food_disorders, 'food_disorders', 30, 'mental_ill_vs_food_dis')

## NCBI Quering

In [None]:
sra_querying('mental_ill_vs_food_dis', ['mental_illness', 'food_disorders'])

In [None]:
sra_querying('healthy_vs_not_healthy', ['healthy', 'not_healthy'])

In [97]:
sra_querying('not_healthy_old_vs_not_healthy_young', ['not_healthy_old', 'not_healthy_young'])

File number: 1
Sample id: 10317.000093400
Biosample ID 9541016
Typology: not_healthy_old
Run ID: ERR2579553

File number: 2
Sample id: 10317.000059040
Biosample ID 6366200
Typology: not_healthy_old
Run ID: ERR1842899

File number: 3
Sample id: 10317.000070708
Biosample ID 8728589
Typology: not_healthy_old
Run ID: ERR2404950

File number: 4
Sample id: 10317.000047151
Biosample ID 6365612
Typology: not_healthy_old
Run ID: ERR1842295

File number: 5
Sample id: 10317.000097663
Biosample ID 9657767
Typology: not_healthy_old
Run ID: ERR2697902

File number: 6
Sample id: 10317.000033063
Biosample ID 5158638
Typology: not_healthy_old
Run ID: ERR1417508

File number: 7
Sample id: 10317.000050274
Biosample ID 6366103
Typology: not_healthy_old
Run ID: ERR1842802

File number: 8
Sample id: 10317.000038305
Biosample ID 4916250
Typology: not_healthy_old
Run ID: ERR1389836

File number: 9
Sample id: 10317.000076672
Biosample ID 8945554
Typology: not_healthy_old
Run ID: ERR2523964

File number: 10
Sam

HBox(children=(FloatProgress(value=0.0, description='Compacting fasta', max=1330267.0, style=ProgressStyle(des…


Number of grouped sequences: 999335
Number of taken sequences: 263

Number of sequences for not_healthy_young: 425553


HBox(children=(FloatProgress(value=0.0, description='Compacting fasta', max=425553.0, style=ProgressStyle(desc…


Number of grouped sequences: 141336
Number of taken sequences: 247

Sample number: 1
Sample name: 10317.000093400
Run ID: ERR2579553


HBox(children=(FloatProgress(value=0.0, max=263.0), HTML(value='')))


Sample number: 2
Sample name: 10317.000059040
Run ID: ERR1842899


HBox(children=(FloatProgress(value=0.0, max=263.0), HTML(value='')))


Sample number: 3
Sample name: 10317.000070708
Run ID: ERR2404950


HBox(children=(FloatProgress(value=0.0, max=263.0), HTML(value='')))


Sample number: 4
Sample name: 10317.000047151
Run ID: ERR1842295


HBox(children=(FloatProgress(value=0.0, max=263.0), HTML(value='')))


Sample number: 5
Sample name: 10317.000097663
Run ID: ERR2697902


HBox(children=(FloatProgress(value=0.0, max=263.0), HTML(value='')))


Sample number: 6
Sample name: 10317.000033063
Run ID: ERR1417508


HBox(children=(FloatProgress(value=0.0, max=263.0), HTML(value='')))


Sample number: 7
Sample name: 10317.000050274
Run ID: ERR1842802


HBox(children=(FloatProgress(value=0.0, max=263.0), HTML(value='')))


Sample number: 8
Sample name: 10317.000038305
Run ID: ERR1389836


HBox(children=(FloatProgress(value=0.0, max=263.0), HTML(value='')))


Sample number: 9
Sample name: 10317.000076672
Run ID: ERR2523964


HBox(children=(FloatProgress(value=0.0, max=263.0), HTML(value='')))


Sample number: 10
Sample name: 10317.000092754
Run ID: ERR2696591


HBox(children=(FloatProgress(value=0.0, max=263.0), HTML(value='')))


Sample number: 11
Sample name: 10317.000060228
Run ID: ERR2319389


HBox(children=(FloatProgress(value=0.0, max=263.0), HTML(value='')))


Sample number: 12
Sample name: 10317.000075933
Run ID: ERR2696475


HBox(children=(FloatProgress(value=0.0, max=263.0), HTML(value='')))


Sample number: 13
Sample name: 10317.000074586
Run ID: ERR4019071


HBox(children=(FloatProgress(value=0.0, max=263.0), HTML(value='')))


Sample number: 14
Sample name: 10317.000065320
Run ID: ERR1854752


HBox(children=(FloatProgress(value=0.0, max=263.0), HTML(value='')))


Sample number: 15
Sample name: 10317.000058971
Run ID: ERR2696443


HBox(children=(FloatProgress(value=0.0, max=263.0), HTML(value='')))


Sample number: 16
Sample name: 10317.000066629
Run ID: ERR2242120
Run ID: ERR2239395
Run ID: ERR2057058


HBox(children=(FloatProgress(value=0.0, max=263.0), HTML(value='')))


Sample number: 17
Sample name: 10317.000042631
Run ID: ERR1597057


HBox(children=(FloatProgress(value=0.0, max=263.0), HTML(value='')))


Sample number: 18
Sample name: 10317.000069001
Run ID: ERR2033544


HBox(children=(FloatProgress(value=0.0, max=247.0), HTML(value='')))


Sample number: 19
Sample name: 10317.000054289
Run ID: ERR2319344


HBox(children=(FloatProgress(value=0.0, max=247.0), HTML(value='')))


Sample number: 20
Sample name: 10317.000069002
Run ID: ERR2314217


HBox(children=(FloatProgress(value=0.0, max=247.0), HTML(value='')))


Sample number: 21
Sample name: 10317.000068657
Run ID: ERR2306142


HBox(children=(FloatProgress(value=0.0, max=247.0), HTML(value='')))


Sample number: 22
Sample name: 10317.000093103
Run ID: ERR2696999


HBox(children=(FloatProgress(value=0.0, max=247.0), HTML(value='')))


Sample number: 23
Sample name: 10317.000062086
Run ID: ERR1842637


HBox(children=(FloatProgress(value=0.0, max=247.0), HTML(value='')))


Sample number: 24
Sample name: 10317.000059959
Run ID: ERR2303862


HBox(children=(FloatProgress(value=0.0, max=247.0), HTML(value='')))


Sample number: 25
Sample name: 10317.000037933
Run ID: ERR1842199
Run ID: ERR2056801


HBox(children=(FloatProgress(value=0.0, max=247.0), HTML(value='')))


Sample number: 26
Sample name: 10317.000075909
Run ID: ERR2523933


HBox(children=(FloatProgress(value=0.0, max=247.0), HTML(value='')))


Sample number: 27
Sample name: 10317.000101067
Run ID: ERR4020553


HBox(children=(FloatProgress(value=0.0, max=247.0), HTML(value='')))


Sample number: 28
Sample name: 10317.000047222
Run ID: ERR1842761


HBox(children=(FloatProgress(value=0.0, max=247.0), HTML(value='')))


Sample number: 29
Sample name: 10317.000041832
Run ID: ERR1842728


HBox(children=(FloatProgress(value=0.0, max=247.0), HTML(value='')))


Sample number: 30
Sample name: 10317.000094611
Run ID: ERR2696668


HBox(children=(FloatProgress(value=0.0, max=247.0), HTML(value='')))


Sample number: 31
Sample name: 10317.000097473
Run ID: ERR2697882


HBox(children=(FloatProgress(value=0.0, max=247.0), HTML(value='')))




## Quiime2

In [54]:
artifact = quality_analysis('mental_ill_vs_food_dis', ['mental_illness', 'food_disorders'])

In [55]:
demux_sequences = demux.visualizers.summarize(artifact)
demux_sequences.visualization

In [56]:
artifact = quality_analysis('healthy_vs_not_healthy', ['healthy', 'not_healthy'])

Sample identificator: Sample0
Sample name: 10317.000093022
Typology: healthy
Run ids: ['ERR2696639']

Sample identificator: Sample1
Sample name: 10317.000058340
Typology: healthy
Run ids: ['ERR1846087']

Sample identificator: Sample2
Sample name: 10317.000107254
Typology: healthy
Run ids: ['ERR4019307']

Sample identificator: Sample3
Sample name: 10317.000032796
Typology: healthy
Run ids: ['ERR2239612', 'ERR1417466']

Sample identificator: Sample4
Sample name: 10317.000046476
Typology: healthy
Run ids: ['ERR2032691']

Sample identificator: Sample5
Sample name: 10317.000079781
Typology: healthy
Run ids: ['ERR2304091']

Sample identificator: Sample6
Sample name: 10317.000040396
Typology: healthy
Run ids: ['ERR1841615', 'ERR1843602']

Sample identificator: Sample7
Sample name: 10317.000069615
Typology: healthy
Run ids: ['ERR2319529']

Sample identificator: Sample8
Sample name: 10317.000050243
Typology: healthy
Run ids: ['ERR1841757', 'ERR1843760']

Sample identificator: Sample9
Sample nam

In [57]:
demux_sequences = demux.visualizers.summarize(artifact)
demux_sequences.visualization

In [98]:
artifact = quality_analysis('not_healthy_old_vs_not_healthy_young', ['not_healthy_old', 'not_healthy_young'])

Sample identificator: Sample0
Sample name: 10317.000093400
Typology: not_healthy_old
Run ids: ['ERR2579553']

Sample identificator: Sample1
Sample name: 10317.000059040
Typology: not_healthy_old
Run ids: ['ERR1842899']

Sample identificator: Sample2
Sample name: 10317.000070708
Typology: not_healthy_old
Run ids: ['ERR2404950']

Sample identificator: Sample3
Sample name: 10317.000047151
Typology: not_healthy_old
Run ids: ['ERR1842295']

Sample identificator: Sample4
Sample name: 10317.000097663
Typology: not_healthy_old
Run ids: ['ERR2697902']

Sample identificator: Sample5
Sample name: 10317.000033063
Typology: not_healthy_old
Run ids: ['ERR1417508']

Sample identificator: Sample6
Sample name: 10317.000050274
Typology: not_healthy_old
Run ids: ['ERR1842802']

Sample identificator: Sample7
Sample name: 10317.000038305
Typology: not_healthy_old
Run ids: ['ERR1389836']

Sample identificator: Sample8
Sample name: 10317.000076672
Typology: not_healthy_old
Run ids: ['ERR2523964']

Sample ide

In [99]:
demux_sequences = demux.visualizers.summarize(artifact)
demux_sequences.visualization