## Libraries

In [1]:
import qiime2
from tempfile import mkdtemp
from qiime2.plugins import demux, deblur, quality_filter, \
                           metadata, feature_table, alignment, \
                           phylogeny, diversity, emperor, feature_classifier, \
                           taxa, composition
import pandas as pd
import os
import numpy as np
import random
import logging
from datetime import datetime
from Bio import Entrez
from pprint import pprint
from sklearn.utils import shuffle
import xml.etree.ElementTree as ET
import os
from tqdm.notebook import tqdm
from Bio import SeqIO
import ast
import csv

  import pandas.util.testing as pdt


## Functions

### Pipeline cleaning

In [2]:
def clean_workspace():
    """Reset the ``result_extraction`` working directory.

    Every file and sub-directory inside ``./result_extraction`` is deleted,
    except the log file ``sra_querying.log``, which is kept but emptied
    (it is created when it does not exist yet).

    The process working directory is never changed, so a failure half-way
    through cannot leave the notebook in the wrong directory.

    Raises:
        FileNotFoundError: if ``./result_extraction`` does not exist.
    """
    import shutil  # local import: only this clean-up helper needs it

    workspace = os.path.join(os.getcwd(), 'result_extraction')
    log_name = 'sra_querying.log'

    for entry in os.listdir(workspace):
        if entry == log_name:
            continue
        target = os.path.join(workspace, entry)
        # Delete through the standard library rather than `rm -r <name>`:
        # names containing spaces or shell metacharacters are handled safely.
        if os.path.isdir(target) and not os.path.islink(target):
            shutil.rmtree(target)
        else:
            os.remove(target)

    # Truncate (or create) the log file without relying on `cat >`,
    # which reads from the kernel's stdin.
    with open(os.path.join(workspace, log_name), 'w'):
        pass

### NCBI utilities

In [3]:
# Helper functions that simplify querying NCBI.
# NOTE(review): the contact address passed to Entrez is hard-coded here;
# consider reading it from configuration or an environment variable.
Entrez.email = "giacomo.villa.mi@gmail.com"

def good_print(text):
    """Pretty-print ``text`` to standard output.

    ``pprint`` already writes to stdout and returns ``None``; wrapping it in
    ``print`` (as the previous version did) emitted an extra ``None`` line.
    """
    pprint(text)

def esearch(db, query, num_max = 20):
    """Run an Entrez ESearch and return the parsed record.

    Args:
        db: name of the NCBI database to search (e.g. ``'biosample'``).
        query: search term.
        num_max: maximum number of ids to retrieve (passed as ``retmax``).

    Returns:
        The record produced by ``Entrez.read`` for the search result.
    """
    handle = Entrez.esearch(db = db, term = query, retmax = num_max)
    try:
        return Entrez.read(handle, validate = True)
    finally:
        # Always release the network handle, even if parsing fails.
        handle.close()

def esummary(db, id_val):
    """Run an Entrez ESummary and return the parsed record.

    Args:
        db: name of the NCBI database to query.
        id_val: identifier (or identifiers) of the record to summarise.

    Returns:
        The record produced by ``Entrez.read`` for the summary.
    """
    handle = Entrez.esummary(db = db, id = id_val)
    try:
        return Entrez.read(handle, validate = True)
    finally:
        # Always release the network handle, even if parsing fails.
        handle.close()

### Final Sample generation

In [4]:
def write_age_mean(total, typology, experiment):
    """Report size and mean age of a cohort and persist the cohort.

    The counts (all / male / female / other ``sex`` values) and the mean of
    ``age_years`` (rounded to 4 decimals) are written to the log and printed.
    The folder ``./result_extraction/<experiment>`` is created when missing
    and, unless ``typology`` is ``'sample'``, the cohort is saved there as
    ``dataset_query_result_<typology>.csv``.

    Args:
        total: DataFrame with at least the columns ``sex`` and ``age_years``.
        typology: label of the cohort; ``'sample'`` marks a final sample, for
            which no header is logged and no CSV is written.
        experiment: name of the experiment (used as output sub-folder).
    """
    man = total.query('sex == "male"')
    woman = total.query('sex == "female"')
    not_valid_sex = total.query("sex != 'female' and sex != 'male'")

    mean_age_total = round(np.mean(total['age_years']), 4)
    mean_age_man = round(np.mean(man['age_years']), 4)
    mean_age_woman = round(np.mean(woman['age_years']), 4)

    header = f'--------------------{typology.upper()}--------------------'
    # Built once so that the log file and stdout always carry the same text.
    summary = [
        f'Total number of {typology} people: {len(total)}',
        f'Total number of {typology} man: {len(man)}',
        f'Total number of {typology} woman: {len(woman)}',
        f'Total number of non valid sex {len(not_valid_sex)}',
        f'Mean Age for total: {mean_age_total}',
        f'Mean Age for man: {mean_age_man}',
        f'Mean Age for woman: {mean_age_woman}',
    ]

    # The header is logged only for a full cohort, but always printed.
    if typology != 'sample':
        logging.info(header)
    for line in summary:
        logging.info(line)
    logging.info(f'\n')

    print(header)
    for line in summary:
        print(line)

    # Work with explicit paths instead of os.chdir(): the working directory
    # is never altered, so an exception cannot leave it pointing elsewhere.
    experiment_dir = os.path.join(os.getcwd(), 'result_extraction', f'{experiment}')
    os.makedirs(experiment_dir, exist_ok=True)
    if typology != 'sample':
        total.to_csv(
            os.path.join(experiment_dir, f'dataset_query_result_{typology}.csv'),
            index=False,
            encoding='utf-8',
        )

def generate_single_sample(already_taken, from_df):
    """Draw a random row position of ``from_df`` that was not drawn before.

    Args:
        already_taken: collection of positions that must not be returned.
        from_df: the dataset being sampled (only its length is used).

    Returns:
        An integer in ``[0, len(from_df) - 1]`` that is not in ``already_taken``.

    Raises:
        ValueError: if ``from_df`` is empty or every position is already
            taken (the previous version looped forever in the latter case).
    """
    population = len(from_df)
    if all(position in already_taken for position in range(population)):
        raise ValueError(
            f'No position left to sample: all {population} rows are already taken'
        )

    # Rejection sampling: redraw until an unused position comes up.
    element = random.randint(0, population - 1)
    while element in already_taken:
        element = random.randint(0, population - 1)
    return element

def get_final_sample(started_dataset, n_samples):
    """Build a new dataset of ``n_samples`` distinct random rows.

    Args:
        started_dataset: DataFrame to sample from.
        n_samples: number of distinct rows to draw.

    Returns:
        A DataFrame with the drawn rows, in draw order, re-indexed from 0.

    Raises:
        ValueError: if ``n_samples`` exceeds the number of available rows
            (the previous version never terminated in that case).
    """
    if n_samples > len(started_dataset):
        raise ValueError(
            f'Cannot draw {n_samples} distinct rows from a dataset of '
            f'{len(started_dataset)} rows'
        )

    taken = set()
    positions = []
    for _ in tqdm(range(n_samples), desc='Sampling data'):
        new_sample = generate_single_sample(taken, started_dataset)
        taken.add(new_sample)
        positions.append(new_sample)

    # Select all rows in one go: DataFrame.append was removed in pandas 2.0
    # and growing a frame row by row is quadratic.
    return started_dataset.iloc[positions].reset_index(drop=True)

def write_not_valid_ids(not_valid_names):
    """Append ``not_valid_names`` to ``not_valid_sample_names.csv``.

    The CSV lives in ``./result_extraction``. When it already exists its rows
    are kept and the new ones are added after them; otherwise the file is
    created from ``not_valid_names`` alone.
    """
    out_dir = './result_extraction'
    out_name = 'not_valid_sample_names.csv'
    out_path = f"{out_dir}/{out_name}"

    to_write = not_valid_names
    if out_name in os.listdir(out_dir):
        previous = pd.read_csv(out_path, header=0, dtype=str)
        to_write = pd.concat([previous, not_valid_names], ignore_index=False, sort=False)

    to_write.to_csv(out_path, index=False)
    
def sampling_data(start_dataset, typology, n_samples, experiment):
    """Validate a cohort against NCBI BioSample and draw the final sample.

    Steps:
      1. log/print the cohort statistics (``write_age_mean``);
      2. look up every ``sample_name`` on the ``biosample`` database and
         split the names into found / not found;
      3. record the names that were not found (``write_not_valid_ids``);
      4. keep only the rows whose name was found, shuffle them and draw
         ``n_samples`` of them;
      5. store the drawn sample (``write_sample_info``).

    Args:
        start_dataset: DataFrame with the rows matching the cohort query;
            must contain a ``sample_name`` column.
        typology: label of the cohort (e.g. ``'healthy'``).
        n_samples: number of rows to draw.
        experiment: name of the experiment the cohort belongs to.

    Raises:
        ValueError: if fewer than ``n_samples`` rows are found on NCBI.
    """
    write_age_mean(start_dataset, typology, experiment)

    valid_id = list()
    not_valid_id = list()
    for try_id in tqdm(start_dataset['sample_name'], desc='NCBI ids validation'):
        handleSce = esearch('biosample', try_id)
        if len(handleSce['IdList']) != 0:
            valid_id.append(try_id)
        else:
            not_valid_id.append(str(try_id))
            print(try_id)

    print(f'Total rows: {len(start_dataset)}')
    print(f'Valid rows: {len(valid_id)}')

    not_valid_names = pd.DataFrame(data={"not_valid_sample_name": not_valid_id, "typology": [typology]*len(not_valid_id)})
    print(len(not_valid_names))
    write_not_valid_ids(not_valid_names)

    logging.info(f'Total rows: {len(start_dataset)}')
    logging.info(f'Valid rows: {len(valid_id)}')

    # One vectorised filter replaces the row-by-row rebuild, which tested
    # membership in a list for every row.
    is_valid = start_dataset['sample_name'].isin(set(valid_id))
    valid_start_dataset = start_dataset[is_valid].reset_index(drop=True)

    if n_samples > len(valid_start_dataset):
        raise ValueError(
            f'Only {len(valid_start_dataset)} valid rows for {typology!r}, '
            f'cannot draw {n_samples} samples'
        )

    valid_start_dataset = shuffle(valid_start_dataset)
    final_sample = get_final_sample(valid_start_dataset, n_samples)
    write_sample_info(final_sample, typology, experiment)

def write_sample_info(sample, typology, experiment):
    """Persist the final sample of a cohort as ``sample_name, typology`` rows.

    The statistics of the sample are first reported through
    ``write_age_mean``. The rows are then written to
    ``./result_extraction/<experiment>/final_sample_<experiment>.csv``; when
    that file already exists, the new rows are added after the
    ``sample_name`` / ``typology`` columns of the existing ones.

    Args:
        sample: DataFrame with the drawn rows (needs ``sample_name`` plus the
            columns used by ``write_age_mean``).
        typology: label stored for every row (e.g. ``'healthy'``).
        experiment: name of the experiment.
    """
    write_age_mean(sample, 'sample', experiment)

    # .assign() returns a new frame, so no value is set on a slice of the
    # caller's DataFrame (avoids pandas' SettingWithCopyWarning).
    sample = sample[['sample_name']].assign(typology=typology)

    experiment_dir = f'./result_extraction/{experiment}'
    csv_name = f'final_sample_{experiment}.csv'
    csv_path = f"{experiment_dir}/{csv_name}"

    if csv_name in os.listdir(experiment_dir):
        final_sample = pd.read_csv(csv_path, header=0, dtype=str)
        final_sample = final_sample[['sample_name', 'typology']]
        final_sample = pd.concat([final_sample, sample], ignore_index=False, sort=False)
        final_sample.to_csv(csv_path, index=False)
    else:
        sample.to_csv(csv_path, index=False)

    # Closing rule in the log, as wide as the header written for the cohort.
    close_dashes = '-'*len(typology.upper())
    logging.info(f'--------------------{close_dashes}--------------------')

### SRA operation

In [5]:
def sra_querying(experiment, types):
    """Drive the SRA download and the top-sequence extraction of an experiment.

    For every row of ``final_sample_<experiment>.csv`` the reads are fetched
    through ``get_sequences``. Then, for every typology, the FASTA files are
    concatenated and the most frequent sequences extracted. The CSV is
    rewritten with two extra columns (``bio_sample_id`` and ``runId``) and
    finally the per-sample counts of the top sequences are computed.

    Args:
        experiment: name of the experiment (e.g. ``'healthy_vs_not_healthy'``).
        types: typologies of the experiment (e.g. ``['healthy', 'not_healthy']``).
    """
    sample_csv = f"./result_extraction/{experiment}/final_sample_{experiment}.csv"
    final_sample = pd.read_csv(sample_csv, header=0, dtype=str)

    bio_sample_id = list()  # BioSample id of each row
    sra_ids = list()        # list of run accessions of each row

    for index, row in final_sample.iterrows():
        print(f'File number: {index+1}')

        # The first two columns hold sample name and typology. Use explicit
        # positional access: integer keys on a labelled Series are deprecated.
        # Only the first 15 characters of the name are kept; the original
        # author notes that some names are read with a trailing '001' appended.
        record_id = str(row.iloc[0])[0:15]
        record_typology = row.iloc[1]

        query_result = get_sequences(record_id, record_typology, experiment)
        bio_sample_id.append(query_result[0])
        sra_ids.append(query_result[1])

    # Once everything is downloaded, build one FASTA per typology and keep
    # only its most frequent sequences.
    for typology in types:
        concatenate_fast_file(typology, 'fasta', experiment)
        get_top_sequences(typology, experiment)

    final_sample['bio_sample_id'] = bio_sample_id
    final_sample['runId'] = sra_ids
    final_sample.to_csv(sample_csv, index=False)

    # Every typology is processed, not just the first two entries of `types`.
    for typology in types:
        get_sample_top_sequences_count(experiment, typology)
    
def get_sequences(sample_name, typology, experiment):
    """Download from SRA every run linked to a sample.

    The sample name is resolved to a BioSample id, the BioSample record is
    scanned for its SRA identifier, and each SRA entry found for that
    identifier is dumped with ``fastq-dump`` twice (FASTA and FASTQ) into
    ``./result_extraction/<experiment>/SRA_<typology>``.

    Args:
        sample_name: sample name to look up on the ``biosample`` database.
        typology: typology of the sample (selects the output folder).
        experiment: name of the experiment (selects the output folder).

    Returns:
        ``[biosample_id, run_ids]`` where ``run_ids`` is the list of run
        accessions that were dumped.

    Raises:
        ValueError: if no BioSample record or no SRA identifier is found
            (previously an IndexError / unbound-variable error).
        FileNotFoundError: if the ``fastq-dump`` executable is not installed.
    """
    import subprocess  # local import: only this helper launches processes

    out_dir = f'./result_extraction/{experiment}/SRA_{typology}'

    print(f'Sample id: {sample_name}')
    handleSce = esearch('biosample', sample_name)
    if not handleSce['IdList']:
        raise ValueError(f'No BioSample record found for sample {sample_name!r}')
    biosampleId = handleSce['IdList'][0]
    print(f'Biosample ID {biosampleId}')
    print(f'Typology: {typology}')

    handleSra = Entrez.efetch(db='biosample', id=biosampleId, retmode='xml')
    try:
        root = ET.fromstring(handleSra.read())
    finally:
        handleSra.close()

    # Keep the last <Id db="SRA"> element, as the original loop did.
    sraId = None
    for i in root.findall('.//BioSample//Ids//Id'):
        if i.attrib.get('db') == 'SRA':
            sraId = i.text
    if sraId is None:
        raise ValueError(f'BioSample {biosampleId} has no SRA identifier')

    handleSra = Entrez.esearch(db='sra', term=sraId)
    try:
        resultsSra = Entrez.read(handleSra)['IdList']
    finally:
        handleSra.close()

    run_ids = list()
    for s in resultsSra:
        handlesngSraId = Entrez.efetch(db='sra', id=s, retmode='xml')
        try:
            root = ET.fromstring(handlesngSraId.read())
        finally:
            handlesngSraId.close()
        identifier = root.find('.//EXPERIMENT_PACKAGE//RUN_SET//RUN')
        runId = identifier.attrib['accession']

        # First the FASTA dump, then the FASTQ dump. Arguments are passed as
        # a list so that nothing coming from NCBI is interpreted by a shell.
        for dump_options in (['--fasta', '--readids'], ['--readids']):
            result = subprocess.run(['fastq-dump', *dump_options, '--outdir', out_dir, runId])
            if result.returncode != 0:
                print(f'fastq-dump exited with code {result.returncode} for run {runId}')

        print(f'Run ID: {runId}')
        run_ids.append(runId)
    print()
    return [biosampleId, run_ids]
        

def concatenate_fast_file(typology, file_format, experiment):
    """Concatenate the downloaded sequence files of one typology.

    All files named ``*.<file_format>`` in
    ``./result_extraction/<experiment>/SRA_<typology>`` are appended, in
    alphabetical order, to ``final_<file_format>_<typology>.<file_format>``
    in the same folder.

    Files produced by this pipeline itself (the concatenated output and
    ``top_sequences_<typology>.<file_format>``) are skipped, so running the
    function again does not feed earlier results back into the output.

    Args:
        typology: typology of the records (e.g. ``'healthy'``).
        file_format: extension of the files to merge (e.g. ``'fasta'``).
        experiment: name of the experiment.
    """
    import shutil  # local import: used only for the streamed copy

    folder = f'./result_extraction/{experiment}/SRA_{typology}'
    output_name = f'final_{file_format}_{typology}.{file_format}'
    generated = {output_name, f'top_sequences_{typology}.{file_format}'}
    suffix = f'.{file_format}'

    # Match on the extension (a plain substring test would also pick up
    # unrelated names) and ignore sub-directories.
    sources = sorted(
        name for name in os.listdir(folder)
        if name.endswith(suffix)
        and name not in generated
        and os.path.isfile(os.path.join(folder, name))
    )

    # Stream each file into the output instead of holding all of them in memory.
    with open(os.path.join(folder, output_name), 'w') as merged:
        for name in sources:
            with open(os.path.join(folder, name), 'r') as part:
                shutil.copyfileobj(part, merged)
        
def get_top_sequences(typology, experiment, min_reps=100):
    """Extract the most frequent sequences of a typology.

    Reads ``final_fasta_<typology>.fasta``, groups identical sequences and
    writes those occurring at least ``min_reps`` times to
    ``top_sequences_<typology>.fasta``, most frequent first. Each header is
    the description of the first record seen for that sequence followed by
    ``number of reps <count>``. Counts are also printed and logged.

    Args:
        typology: typology of the records (e.g. ``'healthy'``).
        experiment: name of the experiment.
        min_reps: minimum number of occurrences for a sequence to be kept
            (defaults to 100, the value previously hard-coded).
    """
    sra_dir = f'./result_extraction/{experiment}/SRA_{typology}'
    records = list(SeqIO.parse(f"{sra_dir}/final_fasta_{typology}.fasta", format="fasta"))
    print(f'Number of sequences for {typology}: {len(records)}')
    logging.info(f'Number of sequences for {typology}: {len(records)}')

    # sequence text -> [occurrences, header of the first record seen]
    sequences = dict()
    for record in tqdm(records, desc='Compacting fasta'):
        key = str(record.seq)
        if key in sequences:
            sequences[key][0] += 1
        else:
            sequences[key] = [1, f'>{record.description}']

    print(f'Number of grouped sequences: {len(sequences)}')
    logging.info(f'Number of grouped sequences: {len(sequences)}')

    # Filter before sorting: only the few frequent groups need ordering.
    # The key ([count, header], descending) is the same as before.
    frequent = sorted(
        (item for item in sequences.items() if item[1][0] >= min_reps),
        key=lambda item: item[1],
        reverse=True,
    )

    with open(f'{sra_dir}/top_sequences_{typology}.fasta', 'w') as f:
        for sequence, (reps, header) in frequent:
            f.write(f'{header} number of reps {reps}')
            f.write('\n')
            f.write(sequence)
            f.write('\n')
    cont = len(frequent)

    print(f'Number of taken sequences: {cont}')
    logging.info(f'Number of taken sequences: {cont}')
    print()
    logging.info('\n')
    
def get_sample_top_sequences_count(experiment, typology):
    """Count, for each sample, the occurrences of every top sequence.

    For every row of ``final_sample_<experiment>.csv`` with the given
    typology, the FASTA files of its runs are merged into
    ``SRA_<typology>/tmp/<sample_name>.fasta`` and the occurrences of each
    sequence listed in ``top_sequences_<typology>.fasta`` are counted. The
    resulting table (one row per sample, one ``seq N`` column per top
    sequence) is saved as ``<typology>_top_sequences_distribution.csv``.

    A message is printed for every column whose total differs from the
    ``number of reps`` value recorded in the header of the top sequence.

    Args:
        experiment: name of the experiment.
        typology: typology to process (e.g. ``'healthy'``).
    """
    from collections import Counter  # local import: used only here

    sra_dir = f'./result_extraction/{experiment}/SRA_{typology}'
    top_sequences = list(SeqIO.parse(f"{sra_dir}/top_sequences_{typology}.fasta", format="fasta"))
    # exist_ok: the function can be run again without failing on the folder.
    os.makedirs(f'{sra_dir}/tmp', exist_ok=True)

    columns = ['sample_name'] + [f'seq {i+1}' for i in range(len(top_sequences))]
    rows = []

    final_sample = pd.read_csv(f"./result_extraction/{experiment}/final_sample_{experiment}.csv", dtype=str)
    target_sample = final_sample.query(f"typology == '{typology}'")

    for index, row in target_sample.iterrows():

        print(f'Sample number: {index+1}')
        run_ids = ast.literal_eval(row['runId'])  # the list was stored as text
        sample_name = row['sample_name']
        print(f'Sample name: {sample_name}')

        # Merge the runs of the sample into one FASTA, written exactly once.
        sample_fasta = f'{sra_dir}/tmp/{sample_name}.fasta'
        with open(sample_fasta, 'w') as merged:
            for run_id in run_ids:
                with open(f'{sra_dir}/{run_id}.fasta', "r") as run_file:
                    merged.write(run_file.read())
                print(f'Run ID: {run_id}')

        # One pass over the reads, then a dictionary lookup per top sequence,
        # instead of comparing every top sequence with every read.
        occurrences = Counter(
            str(record.seq) for record in SeqIO.parse(sample_fasta, format="fasta")
        )
        rows.append(
            [sample_name] + [occurrences[str(top.seq)] for top in tqdm(top_sequences)]
        )

    final_summary = pd.DataFrame(rows, columns=columns)

    # Sanity check against the totals stored in the top-sequence headers.
    for column, top_sequence in zip(columns[1:], top_sequences):
        sum_column = final_summary[column].sum()
        real_value = int(top_sequence.description.split('number of reps')[1].strip())
        if sum_column != real_value:
            print(f'Problem to column: {column}')

    final_summary.to_csv(f"./result_extraction/{experiment}/{typology}_top_sequences_distribution.csv", index=False)

### QIIME 2 operations

In [6]:
def quality_analysis(experiment, types):
    """Return the QIIME 2 sequence artifact of an experiment.

    When ``seq_artifact_<experiment>.qza`` is already present in the
    experiment folder it is loaded; otherwise the manifest is generated
    (``manifest_operation``), the sequences are imported from it and the
    resulting artifact is saved before being returned.
    """
    experiment_dir = f'./result_extraction/{experiment}'
    artifact_name = f'seq_artifact_{experiment}.qza'
    artifact_path = f'{experiment_dir}/{artifact_name}'

    # Cached artifact available: nothing to rebuild.
    if artifact_name in os.listdir(experiment_dir):
        return qiime2.Artifact.load(artifact_path)

    manifest_operation(experiment, types)
    artifact = qiime2.Artifact.import_data(
        'SampleData[SequencesWithQuality]',
        f'{experiment_dir}/manifest.tsv',
        view_type='SingleEndFastqManifestPhred33V2',
    )
    artifact.save(artifact_path)
    return artifact

    # Disabled quality-filter statistics, kept for reference:
    #demux_filter_stats = quality_filter.methods.q_score(artifact)
    #filter_stats = metadata.visualizers.tabulate(demux_filter_stats.filter_stats.view(qiime2.Metadata))
    #filter_stats.visualization



def manifest_operation(experiment, typology):
    """Merge the two cohort datasets of an experiment and build the manifest.

    The files ``dataset_query_result_<typology[0]>.csv`` and
    ``dataset_query_result_<typology[1]>.csv`` are read, tagged with a new
    ``typology`` column and stacked; the result is written as the
    tab-separated file ``dataset_query_result_<experiment>.tsv``. Afterwards
    the sample identifiers are added (``add_id_sample``) and the QIIME 2
    manifest is created (``create_manifest``).

    Args:
        experiment: name of the experiment.
        typology: the two typology labels of the experiment.
    """
    experiment_dir = f'./result_extraction/{experiment}'

    first = pd.read_csv(f'{experiment_dir}/dataset_query_result_{typology[0]}.csv', dtype=str)
    second = pd.read_csv(f'{experiment_dir}/dataset_query_result_{typology[1]}.csv', dtype=str)

    first['typology'] = typology[0]
    second['typology'] = typology[1]

    final_dataset = pd.concat([first, second])

    # Write the merged table only if no row went missing while stacking.
    if len(final_dataset) == (len(first) + len(second)):
        tsv_path = f'{experiment_dir}/dataset_query_result_{experiment}.tsv'
        with open(tsv_path, "w", newline='') as dataset_query_result:
            tsv_output = csv.writer(dataset_query_result, delimiter='\t')
            tsv_output.writerow(list(final_dataset.columns))
            tsv_output.writerows(list(row) for _, row in final_dataset.iterrows())

    add_id_sample(experiment)
    create_manifest(experiment)

def create_manifest(experiment):
    """Create the QIIME 2 manifest file of an experiment.

    For every row of ``final_sample_<experiment>.csv`` the FASTQ files of its
    runs are merged into ``SRA_<typology>/tmp_fastq/<sample_identificator>.fastq``.
    The manifest ``manifest.tsv`` (columns ``sample-id`` and
    ``absolute-filepath``) is then written in the experiment folder, in the
    format described at https://docs.qiime2.org/2020.2/tutorials/importing/.

    Args:
        experiment: name of the experiment.
    """
    import shutil  # local import: used only for the streamed copy

    experiment_dir = f'./result_extraction/{experiment}'

    # Sampled rows: sample_name, typology, run ids and sample identifier.
    final_sample = pd.read_csv(f"{experiment_dir}/final_sample_{experiment}.csv", dtype=str)
    manifest = list()

    for _, row in final_sample.iterrows():
        run_ids = ast.literal_eval(row['runId'])  # the list was stored as text
        sample_identificator = row['sample_identificator']
        sample_name = row['sample_name']
        typology = row['typology']
        print(f'Sample identificator: {sample_identificator}')
        print(f'Sample name: {sample_name}')
        print(f'Typology: {typology}')
        print(f'Run ids: {run_ids}')
        print()

        sra_dir = f'{experiment_dir}/SRA_{typology}'
        fastq_dir = f'{sra_dir}/tmp_fastq'
        if not os.path.isdir(fastq_dir):
            os.mkdir(fastq_dir)

        # A sample may have several runs: stream them one after the other
        # into a single FASTQ. The `with` blocks close every file even when
        # a read or write fails.
        with open(f'{fastq_dir}/{sample_identificator}.fastq', 'w') as merged:
            for run_id in run_ids:
                with open(f'{sra_dir}/{run_id}.fastq', "r") as run_file:
                    shutil.copyfileobj(run_file, merged)

        # Path relative to the working directory; made absolute when written.
        manifest.append([sample_identificator, f'/result_extraction/{experiment}/SRA_{typology}/tmp_fastq/{sample_identificator}.fastq'])

    with open(f'{experiment_dir}/manifest.tsv', "w", newline='') as manifest_file:
        tsv_output = csv.writer(manifest_file, delimiter='\t')
        tsv_output.writerow(['sample-id', 'absolute-filepath'])
        for line in manifest:
            tsv_output.writerow([line[0], f'{os.getcwd()}{line[1]}'])
            
            
def add_id_sample(experiment):
    """Add a ``sample_identificator`` column to the final-sample CSV.

    Rows of ``final_sample_<experiment>.csv`` are labelled ``Sample0``,
    ``Sample1``, ... in file order and the file is rewritten in place.
    """
    csv_path = f"./result_extraction/{experiment}/final_sample_{experiment}.csv"
    table = pd.read_csv(csv_path, dtype=str)
    table['sample_identificator'] = list(map('Sample{}'.format, range(len(table))))
    table.to_csv(csv_path, index=False, encoding='utf-8')

## Main

In [None]:
# Load the American Gut table; every column is read as text
df = pd.read_csv("./data/american_gut.txt", delimiter="\t", dtype=str)

# Turn the placeholder values into missing values (NaN)
df.replace([' ', 'Not provided', 'Unspecified'], np.nan, inplace=True)

# Uncomment to delete the results of the previous experiment from the
# working directory and empty the log file
# clean_workspace()

In [None]:
# Set up the log file: INFO level, bare messages without any prefix
logging.basicConfig(
    filename='./result_extraction/sra_querying.log',
    level=logging.INFO,
    format='%(message)s',
)
# Record when this run started
today = format(datetime.now(), "%d/%m/%Y %H:%M:%S")
logging.info(f'RUN TIME: {today}')

## All columns

In [None]:
# List every column name of the dataset on a single comma-separated line
for column in df.columns:
    print(column, end = ', ')

## Healthy vs not healthy study
### Healthy extraction

In [None]:
# Select the "healthy" cohort: never smoking, never drinking, no cancer,
# BMI between 18.5 and 24.99, age 20-50, stool samples only.
# The steps are chained with .assign() so that no column is set on a
# filtered view of `df` (avoids pandas' SettingWithCopyWarning); numeric
# conversion still happens only after the preceding filters.
healthy = (
    df.query("smoking_frequency == 'Never' and alcohol_frequency == 'Never'")
    .query("cancer == 'I do not have this condition'")
    .assign(bmi=lambda frame: frame['bmi'].astype(float))
    .query("bmi >= 18.5 and bmi <= 24.99")
    .assign(age_years=lambda frame: frame['age_years'].astype(float))
    .query("age_years >= 20 and age_years <= 50")
    .query("body_site == 'UBERON:feces'")
)

In [None]:
# Validate the healthy cohort on NCBI and draw 40 subjects for the experiment
sampling_data(healthy, 'healthy', 40, 'healthy_vs_not_healthy')

### Not healthy extraction

In [None]:
# Select the "not healthy" cohort: smokers and drinkers (at least
# occasionally), no cancer, BMI outside 18.5-24.99, age 20-50, stool samples.
# Chained with .assign() so that no column is set on a filtered view of `df`
# (avoids pandas' SettingWithCopyWarning).
not_healthy = (
    df.query("smoking_frequency == 'Occasionally (1-2 times/week)' or smoking_frequency == 'Daily' or smoking_frequency == 'Regularly (3-5 times/week)'")
    .query("alcohol_frequency == 'Occasionally (1-2 times/week)' or alcohol_frequency == 'Daily' or alcohol_frequency == 'Regularly (3-5 times/week)'")
    .query("cancer == 'I do not have this condition'")
    .assign(bmi=lambda frame: frame['bmi'].astype(float))
    .query("bmi < 18.5 or bmi > 24.99")
    .assign(age_years=lambda frame: frame['age_years'].astype(float))
    .query("age_years >= 20 and age_years <= 50")
    .query("body_site == 'UBERON:feces'")
)

In [None]:
# Validate the not-healthy cohort on NCBI and draw 40 subjects for the experiment
sampling_data(not_healthy, 'not_healthy', 40, 'healthy_vs_not_healthy')

## Not healthy old vs not healthy young studies
### Not healthy old extraction

In [None]:
# Rebuild the not-healthy base cohort WITHOUT the age and body-site filters:
# the young cohort further down is derived from this same frame.
# Chained with .assign() so that no column is set on a filtered view of `df`
# (avoids pandas' SettingWithCopyWarning).
not_healthy = (
    df.query("smoking_frequency == 'Occasionally (1-2 times/week)' or smoking_frequency == 'Daily' or smoking_frequency == 'Regularly (3-5 times/week)'")
    .query("alcohol_frequency == 'Occasionally (1-2 times/week)' or alcohol_frequency == 'Daily' or alcohol_frequency == 'Regularly (3-5 times/week)'")
    .query("cancer == 'I do not have this condition'")
    .assign(bmi=lambda frame: frame['bmi'].astype(float))
    .query("bmi < 18.5 or bmi > 24.99")
    .assign(age_years=lambda frame: frame['age_years'].astype(float))
)

# "Old" cohort: age 40-50, stool samples only
not_healthy_old = (
    not_healthy.query("age_years >= 40 and age_years <= 50")
    .query("body_site == 'UBERON:feces'")
)

In [None]:
# Validate the old not-healthy cohort on NCBI and draw 17 subjects
sampling_data(not_healthy_old, 'not_healthy_old', 17, 'not_healthy_old_vs_not_healthy_young')

### Not healthy young extraction

In [None]:
# "Young" cohort: age 20-30, stool samples only, taken from the not-healthy
# base cohort built in the previous section
not_healthy_young = (
    not_healthy.query("age_years >= 20 and age_years <= 30")
    .query("body_site == 'UBERON:feces'")
)

In [None]:
# Validate the young not-healthy cohort on NCBI and draw 14 subjects
sampling_data(not_healthy_young, 'not_healthy_young', 14, 'not_healthy_old_vs_not_healthy_young')

## Mental illness vs food disorders
### Mental illness

In [None]:
# Mental-illness cohort: US residents, stool samples, mental illness reported.
# The age conversion uses .assign() so that no column is set on a filtered
# view of `df` (avoids pandas' SettingWithCopyWarning).
mental_illness = (
    df.query("country_residence == 'United States'")
    .query("body_site == 'UBERON:feces'")
    .query("mental_illness == 'true' or mental_illness == 'Yes'")
    .assign(age_years=lambda frame: frame['age_years'].astype(float))
)

In [None]:
# Validate the mental-illness cohort on NCBI and draw 30 subjects
sampling_data(mental_illness, 'mental_illness', 30, 'mental_ill_vs_food_dis')

### Food disorders

In [None]:
# Food-disorder cohort: US residents, stool samples, no mental illness,
# BMI outside 18.5-24.99, fruit eaten rarely or never, exercise rare or never.
# Chained with .assign() so that no column is set on a filtered view of `df`
# (avoids pandas' SettingWithCopyWarning).
food_disorders = (
    df.query("country_residence == 'United States'")
    .query("body_site == 'UBERON:feces'")
    .query("mental_illness == 'false' or mental_illness == 'No'")
    .assign(bmi=lambda frame: frame['bmi'].astype(float))
    .query("bmi < 18.5 or bmi > 24.99")
    .query("(fruit_frequency == 'Never' or fruit_frequency == 'Rarely (less than once/week)')")
    .query("exercise_frequency=='Rarely (a few times/month)' or exercise_frequency=='Never'")
    .assign(age_years=lambda frame: frame['age_years'].astype(float))
)

In [None]:
# Validate the food-disorder cohort on NCBI and draw 30 subjects
sampling_data(food_disorders, 'food_disorders', 30, 'mental_ill_vs_food_dis')

## NCBI Querying

In [7]:
# Download the reads of every sampled subject of the experiment, then build
# the top-sequence files and the per-sample count tables for both typologies
sra_querying('healthy_vs_not_healthy', ['healthy', 'not_healthy'])

File number: 1
Sample id: 10317.000066612
Biosample ID 7496859
Typology: healthy
Run ID: ERR2092040

File number: 2
Sample id: 10317.000051168
Biosample ID 6365139
Typology: healthy
Run ID: ERR1841778
Run ID: ERR1843783

File number: 3
Sample id: 10317.000065770
Biosample ID 7354089
Typology: healthy
Run ID: ERR2032865

File number: 4
Sample id: 10317.000073253
Biosample ID 8577551
Typology: healthy
Run ID: ERR2314357

File number: 5
Sample id: 10317.000059984
Biosample ID 7439204
Typology: healthy
Run ID: ERR2239360
Run ID: ERR2056948

File number: 6
Sample id: 10317.000101148
Biosample ID 14621644
Typology: healthy
Run ID: ERR4020623

File number: 7
Sample id: 10317.000047064
Biosample ID 5462573
Typology: healthy
Run ID: ERR1551686

File number: 8
Sample id: 10317.000069794
Biosample ID 14618495
Typology: healthy
Run ID: ERR4018981

File number: 9
Sample id: 10317.000033260
Biosample ID 5709843
Typology: healthy
Run ID: ERR1597017

File number: 10
Sample id: 10317.000092700
Biosampl

Run ID: ERR2033469

File number: 79
Sample id: 10317.000050274
Biosample ID 6366103
Typology: not_healthy
Run ID: ERR1842802

File number: 80
Sample id: 10317.000075933
Biosample ID 9653092
Typology: not_healthy
Run ID: ERR2696475

Number of sequences for healthy: 4658723


HBox(children=(FloatProgress(value=0.0, description='Compacting fasta', max=4658723.0, style=ProgressStyle(des…


Number of grouped sequences: 3296480
Number of taken sequences: 731

Number of sequences for not_healthy: 978740


HBox(children=(FloatProgress(value=0.0, description='Compacting fasta', max=978740.0, style=ProgressStyle(desc…


Number of grouped sequences: 249806
Number of taken sequences: 588

Sample number: 1
Sample name: 10317.000066612
Run ID: ERR2092040


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 2
Sample name: 10317.000051168
Run ID: ERR1841778
Run ID: ERR1843783


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 3
Sample name: 10317.000065770
Run ID: ERR2032865


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 4
Sample name: 10317.000073253
Run ID: ERR2314357


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 5
Sample name: 10317.000059984
Run ID: ERR2239360
Run ID: ERR2056948


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 6
Sample name: 10317.000101148
Run ID: ERR4020623


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 7
Sample name: 10317.000047064
Run ID: ERR1551686


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 8
Sample name: 10317.000069794
Run ID: ERR4018981


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 9
Sample name: 10317.000033260
Run ID: ERR1597017


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 10
Sample name: 10317.000092700
Run ID: ERR2697719


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 11
Sample name: 10317.000069303
Run ID: ERR2092106


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 12
Sample name: 10317.000041330
Run ID: ERR1316217


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 13
Sample name: 10317.000074674
Run ID: ERR2309379


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 14
Sample name: 10317.000039729
Run ID: ERR2579655


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 15
Sample name: 10317.000042712
Run ID: ERR1597059


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 16
Sample name: 10317.000101145
Run ID: ERR4020620


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 17
Sample name: 10317.000075930
Run ID: ERR2696890


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 18
Sample name: 10317.000069640
Run ID: ERR2242527
Run ID: ERR2057131


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 19
Sample name: 10317.000044559
Run ID: ERR2579667
Run ID: ERR2404919


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 20
Sample name: 10317.000098180
Run ID: ERR2696729


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 21
Sample name: 10317.000072387
Run ID: ERR2304009


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 22
Sample name: 10317.000094722
Run ID: ERR4019216


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 23
Sample name: 10317.000043083
Run ID: ERR1389989


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 24
Sample name: 10317.000073364
Run ID: ERR2579972


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 25
Sample name: 10317.000106821
Run ID: ERR4019290


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 26
Sample name: 10317.000100395
Run ID: ERR4019231


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 27
Sample name: 10317.000069702
Run ID: ERR2057147


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 28
Sample name: 10317.000058525
Run ID: ERR2319347
Run ID: ERR1854734


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 29
Sample name: 10317.000072118
Run ID: ERR2404965


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 30
Sample name: 10317.000075872
Run ID: ERR2523922


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 31
Sample name: 10317.000097102
Run ID: ERR2697076


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 32
Sample name: 10317.000071471
Run ID: ERR4018999


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 33
Sample name: 10317.000046285
Run ID: ERR2032618


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 34
Sample name: 10317.000058423
Run ID: ERR1854712


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 35
Sample name: 10317.000087299
Run ID: ERR2696520


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 36
Sample name: 10317.000058421
Run ID: ERR1854710


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 37
Sample name: 10317.000058955
Run ID: ERR1854738


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 38
Sample name: 10317.000072431
Run ID: ERR2308673


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 39
Sample name: 10317.000083010
Run ID: ERR2405201


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 40
Sample name: 10317.000052325
Run ID: ERR2523873


HBox(children=(FloatProgress(value=0.0, max=731.0), HTML(value='')))


Sample number: 41
Sample name: 10317.000058971
Run ID: ERR2696443


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 42
Sample name: 10317.000047222
Run ID: ERR1842761


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 43
Sample name: 10317.000068173
Run ID: ERR2319482


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 44
Sample name: 10317.000070708
Run ID: ERR2404950


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 45
Sample name: 10317.000093103
Run ID: ERR2696999


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 46
Sample name: 10317.000059959
Run ID: ERR2303862


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 47
Sample name: 10317.000092754
Run ID: ERR2696591


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 48
Sample name: 10317.000105371
Run ID: ERR4019279


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 49
Sample name: 10317.000038305
Run ID: ERR1389836


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 50
Sample name: 10317.000069002
Run ID: ERR2314217


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 51
Sample name: 10317.000037933
Run ID: ERR1842199
Run ID: ERR2056801


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 52
Sample name: 10317.000033728
Run ID: ERR1315996


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 53
Sample name: 10317.000101067
Run ID: ERR4020553


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 54
Sample name: 10317.000074586
Run ID: ERR4019071


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 55
Sample name: 10317.000038160
Run ID: ERR2313969


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 56
Sample name: 10317.000075909
Run ID: ERR2523933


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 57
Sample name: 10317.000097663
Run ID: ERR2697902


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 58
Sample name: 10317.000098679
Run ID: ERR2697993


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 59
Sample name: 10317.000097473
Run ID: ERR2697882


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 60
Sample name: 10317.000107269
Run ID: ERR4019312


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 61
Sample name: 10317.000065320
Run ID: ERR1854752


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 62
Sample name: 10317.000079833
Run ID: ERR2304099


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 63
Sample name: 10317.000068175
Run ID: ERR2033470


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 64
Sample name: 10317.000076672
Run ID: ERR2523964


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 65
Sample name: 10317.000105372
Run ID: ERR4019280


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 66
Sample name: 10317.000062086
Run ID: ERR1842637


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 67
Sample name: 10317.000069651
Run ID: ERR2314251


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 68
Sample name: 10317.000093400
Run ID: ERR2579553


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 69
Sample name: 10317.000069197
Run ID: ERR4018975


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 70
Sample name: 10317.000054208
Run ID: ERR1842587


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 71
Sample name: 10317.000033063
Run ID: ERR1417508


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 72
Sample name: 10317.000068233
Run ID: ERR2092459


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 73
Sample name: 10317.000042630
Run ID: ERR1597056


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 74
Sample name: 10317.000047463
Run ID: ERR2032698


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 75
Sample name: 10317.000054289
Run ID: ERR2319344


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 76
Sample name: 10317.000042631
Run ID: ERR1597057


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 77
Sample name: 10317.000069001
Run ID: ERR2033544


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 78
Sample name: 10317.000068174
Run ID: ERR2033469


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 79
Sample name: 10317.000050274
Run ID: ERR1842802


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))


Sample number: 80
Sample name: 10317.000075933
Run ID: ERR2696475


HBox(children=(FloatProgress(value=0.0, max=588.0), HTML(value='')))




In [None]:
# Run the SRA querying step for the 'not_healthy_old_vs_not_healthy_young' comparison,
# using the two typology labels 'not_healthy_old' and 'not_healthy_young'.
# NOTE(review): presumably this must run before quality_analysis is called with the
# same name — confirm against the sra_querying definition.
sra_querying('not_healthy_old_vs_not_healthy_young', ['not_healthy_old', 'not_healthy_young'])

In [None]:
# Run the SRA querying step for the 'mental_ill_vs_food_dis' comparison,
# using the two typology labels 'mental_illness' and 'food_disorders'.
# NOTE(review): presumably this must run before quality_analysis is called with the
# same name — confirm against the sra_querying definition.
sra_querying('mental_ill_vs_food_dis', ['mental_illness', 'food_disorders'])

In [10]:
# Run the SRA querying step for the 'cancer_and_cardiovascular' comparison,
# using the two typology labels 'cancer' and 'cardiovascular'.
# The call logs one record per sample (sample id, biosample id, typology, run ids)
# and shows progress bars while it works.
sra_querying('cancer_and_cardiovascular', ['cancer', 'cardiovascular'])

File number: 1
Sample id: 10317.000068682
Biosample ID 8568993
Typology: cancer
Run ID: ERR2303942

File number: 2
Sample id: 10317.000074849
Biosample ID 8569573
Typology: cancer
Run ID: ERR2306239

File number: 3
Sample id: 10317.000093401
Biosample ID 9653618
Typology: cancer
Run ID: ERR2697007

File number: 4
Sample id: 10317.000046446
Biosample ID 7353903
Typology: cancer
Run ID: ERR2032678

File number: 5
Sample id: 10317.000046444
Biosample ID 7353901
Typology: cancer
Run ID: ERR2032676

File number: 6
Sample id: 10317.000102894
Biosample ID 14618726
Typology: cancer
Run ID: ERR4019242

File number: 7
Sample id: 10317.000046431
Biosample ID 7353888
Typology: cancer
Run ID: ERR2032663

File number: 8
Sample id: 10317.000042649
Biosample ID 6367575
Typology: cancer
Run ID: ERR1845970

File number: 9
Sample id: 10317.000093587
Biosample ID 9653264
Typology: cancer
Run ID: ERR2696648

File number: 10
Sample id: 10317.000040121
Biosample ID 7353392
Typology: cancer
Run ID: ERR2313986

HBox(children=(FloatProgress(value=0.0, description='Compacting fasta', max=4014332.0, style=ProgressStyle(des…


Number of grouped sequences: 1868460
Number of taken sequences: 464

Number of sequences for cardiovascular: 718076


HBox(children=(FloatProgress(value=0.0, description='Compacting fasta', max=718076.0, style=ProgressStyle(desc…


Number of grouped sequences: 201889
Number of taken sequences: 392

Sample number: 1
Sample name: 10317.000068682
Run ID: ERR2303942


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 2
Sample name: 10317.000074849
Run ID: ERR2306239


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 3
Sample name: 10317.000093401
Run ID: ERR2697007


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 4
Sample name: 10317.000046446
Run ID: ERR2032678


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 5
Sample name: 10317.000046444
Run ID: ERR2032676


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 6
Sample name: 10317.000102894
Run ID: ERR4019242


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 7
Sample name: 10317.000046431
Run ID: ERR2032663


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 8
Sample name: 10317.000042649
Run ID: ERR1845970


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 9
Sample name: 10317.000093587
Run ID: ERR2696648


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 10
Sample name: 10317.000040121
Run ID: ERR2313986
Run ID: ERR2032153


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 11
Sample name: 10317.000046434
Run ID: ERR2032666


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 12
Sample name: 10317.000051244
Run ID: ERR1842459


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 13
Sample name: 10317.000037493
Run ID: ERR2318004


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 14
Sample name: 10317.000046440
Run ID: ERR2032672


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 15
Sample name: 10317.000076643
Run ID: ERR2405043


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 16
Sample name: 10317.000065565
Run ID: ERR2242102
Run ID: ERR2239382
Run ID: ERR2238686
Run ID: ERR2057011


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 17
Sample name: 10317.000046420
Run ID: ERR2032652


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 18
Sample name: 10317.000046441
Run ID: ERR2032673


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 19
Sample name: 10317.000047140
Run ID: ERR1842292


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 20
Sample name: 10317.000051560
Run ID: ERR1842469


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 21
Sample name: 10317.000046445
Run ID: ERR2032677


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 22
Sample name: 10317.000040351
Run ID: ERR1233446


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 23
Sample name: 10317.000051573
Run ID: ERR1841787
Run ID: ERR1843792


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 24
Sample name: 10317.000046447
Run ID: ERR2032679


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 25
Sample name: 10317.000092706
Run ID: ERR2696578


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 26
Sample name: 10317.000076358
Run ID: ERR2696499


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 27
Sample name: 10317.000087039
Run ID: ERR2304148


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 28
Sample name: 10317.000053460
Run ID: ERR1842835


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 29
Sample name: 10317.000054273
Run ID: ERR2314040


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 30
Sample name: 10317.000050653
Run ID: ERR2319851


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 31
Sample name: 10317.000050525
Run ID: ERR2319778


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 32
Sample name: 10317.000023590
Run ID: ERR1845804


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 33
Sample name: 10317.000013105
Run ID: ERR1079964


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 34
Sample name: 10317.000028806
Run ID: ERR1458766
Run ID: ERR1315860


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 35
Sample name: 10317.000044555
Run ID: ERR1842255


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 36
Sample name: 10317.000082872
Run ID: ERR2405133


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 37
Sample name: 10317.000039806
Run ID: ERR1316052


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 38
Sample name: 10317.000062081
Run ID: ERR1842632


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 39
Sample name: 10317.000031332
Run ID: ERR1389678


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 40
Sample name: 10317.000097233
Run ID: ERR2697834


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 41
Sample name: 10317.000040396
Run ID: ERR1841615
Run ID: ERR1843602


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 42
Sample name: 10317.000050475
Run ID: ERR2319745


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 43
Sample name: 10317.000029157
Run ID: ERR1842682


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 44
Sample name: 10317.000032795
Run ID: ERR1160658


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 45
Sample name: 10317.000082870
Run ID: ERR2405131


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 46
Sample name: 10317.000082871
Run ID: ERR2405132


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 47
Sample name: 10317.000101088
Run ID: ERR4019696


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 48
Sample name: 10317.000050498
Run ID: ERR2319761


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 49
Sample name: 10317.000021280
Run ID: ERR1080303


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))


Sample number: 50
Sample name: 10317.000004800
Run ID: ERR1079930


HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))




## QIIME 2

In [9]:
# Run quality_analysis for the 'mental_ill_vs_food_dis' comparison and keep its result
# for the demux summary cell that follows.
# Caution: every quality_analysis cell rebinds the same name `artifact`, so later cells
# see whichever comparison was executed most recently.
artifact = quality_analysis('mental_ill_vs_food_dis', ['mental_illness', 'food_disorders'])

In [10]:
# Summarize the current `artifact` with the demux plugin and display the resulting
# visualization as this cell's output (it must stay the last expression).
# Caution: `artifact` is whatever the most recently executed quality_analysis cell
# assigned, so run the matching cell immediately before this one.
demux_sequences = demux.visualizers.summarize(artifact)
demux_sequences.visualization

In [7]:
# Run quality_analysis for the 'healthy_vs_not_healthy' comparison and keep its result
# for the demux summary cell that follows.
# Caution: every quality_analysis cell rebinds the same name `artifact`, so later cells
# see whichever comparison was executed most recently.
artifact = quality_analysis('healthy_vs_not_healthy', ['healthy', 'not_healthy'])

In [8]:
# Summarize the current `artifact` with the demux plugin and display the resulting
# visualization as this cell's output (it must stay the last expression).
# Caution: `artifact` is whatever the most recently executed quality_analysis cell
# assigned, so run the matching cell immediately before this one.
demux_sequences = demux.visualizers.summarize(artifact)
demux_sequences.visualization

In [None]:
# Run quality_analysis for the 'not_healthy_old_vs_not_healthy_young' comparison and
# keep its result for the demux summary cell that follows.
# Caution: every quality_analysis cell rebinds the same name `artifact`, so later cells
# see whichever comparison was executed most recently.
artifact = quality_analysis('not_healthy_old_vs_not_healthy_young', ['not_healthy_old', 'not_healthy_young'])

In [None]:
# Summarize the current `artifact` with the demux plugin and display the resulting
# visualization as this cell's output (it must stay the last expression).
# Caution: `artifact` is whatever the most recently executed quality_analysis cell
# assigned, so run the matching cell immediately before this one.
demux_sequences = demux.visualizers.summarize(artifact)
demux_sequences.visualization

In [11]:
# Run quality_analysis for the 'cancer_and_cardiovascular' comparison and keep its
# result for the demux summary cell that follows. The call logs one record per sample
# (sample identifier, sample name, typology, run ids).
# Caution: every quality_analysis cell rebinds the same name `artifact`, so later cells
# see whichever comparison was executed most recently.
artifact = quality_analysis('cancer_and_cardiovascular', ['cancer', 'cardiovascular'])

Sample identificator: Sample0
Sample name: 10317.000068682
Typology: cancer
Run ids: ['ERR2303942']

Sample identificator: Sample1
Sample name: 10317.000074849
Typology: cancer
Run ids: ['ERR2306239']

Sample identificator: Sample2
Sample name: 10317.000093401
Typology: cancer
Run ids: ['ERR2697007']

Sample identificator: Sample3
Sample name: 10317.000046446
Typology: cancer
Run ids: ['ERR2032678']

Sample identificator: Sample4
Sample name: 10317.000046444
Typology: cancer
Run ids: ['ERR2032676']

Sample identificator: Sample5
Sample name: 10317.000102894
Typology: cancer
Run ids: ['ERR4019242']

Sample identificator: Sample6
Sample name: 10317.000046431
Typology: cancer
Run ids: ['ERR2032663']

Sample identificator: Sample7
Sample name: 10317.000042649
Typology: cancer
Run ids: ['ERR1845970']

Sample identificator: Sample8
Sample name: 10317.000093587
Typology: cancer
Run ids: ['ERR2696648']

Sample identificator: Sample9
Sample name: 10317.000040121
Typology: cancer
Run ids: ['ERR2

In [12]:
# Summarize the current `artifact` with the demux plugin and display the resulting
# visualization as this cell's output (it must stay the last expression).
# Caution: `artifact` is whatever the most recently executed quality_analysis cell
# assigned, so run the matching cell immediately before this one.
demux_sequences = demux.visualizers.summarize(artifact)
demux_sequences.visualization