## Libraries

In [3]:
import qiime2
from tempfile import mkdtemp
from qiime2.plugins import demux, deblur, quality_filter, \
                           metadata, feature_table, alignment, \
                           phylogeny, diversity, emperor, feature_classifier, \
                           taxa, composition
import pandas as pd
import os
import numpy as np
import random
import logging
from datetime import datetime
from Bio import Entrez
from pprint import pprint
from sklearn.utils import shuffle
import xml.etree.ElementTree as ET
import os
from tqdm.notebook import tqdm
from Bio import SeqIO
import ast

  import pandas.util.testing as pdt


#### Studies
* exercise_frequency
* flossing_frequency
* vitamin_d_supplement_frequency
* weight_change
* fruit_frequency

## Functions

### Pipeline cleaning

In [4]:
# Puliscel a working directory (result_extraction), elimina tutti i file e le cartelle fatta eccezione per il logfile
# il quale viene solo svuotato
def clean_workspace():
    
    # ci spostiamo nella working directory
    starting_path = os.getcwd()
    path = os.path.join(os.getcwd(), f'result_extraction')
    os.chdir(path)
    
    # prendiamo tutti i file contenuti nella working directory e eliminiamo a meno che non sia il log file
    files = os.listdir()
    for file in files:
        if file != 'sra_querying.log':
            delete_command = f'rm -r {file}'
            os.system(delete_command)
            
    # puliamo il log file
    delete_old_log = 'cat > sra_querying.log'
    os.system(delete_old_log)  
    
    # torniamo nella starting directory ./Human microbiome
    os.chdir(starting_path)

### NCBI utilities

In [5]:
# Funzioni per semplificare l'interrogazioen di NCBI
Entrez.email = "giacomo.villa.mi@gmail.com"

def good_print(text):
    print(pprint(text))

def esearch(db, query, num_max = 20):
    handle = Entrez.esearch(db = db, term = query, retmax = num_max)
    record = Entrez.read(handle, validate = True)
    return record

def esummary(db, id_val):
    handle = Entrez.esummary(db = db, id = id_val)
    record = Entrez.read(handle, validate = True)
    return record

### Final Sample generation

In [6]:
# Prende in input il dataset completo, data la query, 
def write_age_mean(total, typology, experiment):
    
    man = total.query('sex == "male"')
    woman = total.query('sex == "female"')
    
    mean_age_total = round(np.mean(total['age_years']), 4)
    mean_age_man = round(np.mean(man['age_years']), 4)
    mean_age_woman = round(np.mean(woman['age_years']), 4)
    not_valid_sex = total.query("sex != 'female' and sex != 'male'")
    
    if typology != 'sample':
        logging.info(f'--------------------{typology.upper()}--------------------')
    logging.info(f'Total number of {typology} people: {len(total)}')
    logging.info(f'Total number of {typology} man: {len(man)}')
    logging.info(f'Total number of {typology} woman: {len(woman)}')
    logging.info(f'Total number of non valid sex {len(not_valid_sex)}')
    
    logging.info(f'Mean Age for total: {mean_age_total}')
    logging.info(f'Mean Age for man: {mean_age_man}')
    logging.info(f'Mean Age for woman: {mean_age_woman}')
    logging.info(f'\n')
    
    print(f'--------------------{typology.upper()}--------------------')
    print(f'Total number of {typology} people: {len(total)}')
    print(f'Total number of {typology} man: {len(man)}')
    print(f'Total number of {typology} woman: {len(woman)}')
    print(f'Total number of non valid sex {len(not_valid_sex)}')
    
    print(f'Mean Age for total: {mean_age_total}')
    print(f'Mean Age for man: {mean_age_man}')
    print(f'Mean Age for woman: {mean_age_woman}')
    
    starting_path = os.getcwd()
    path = os.path.join(os.getcwd(), f'result_extraction/')
    os.chdir(path)
    directories = os.listdir()
    if f'{experiment}' not in directories:
        os.mkdir(f'{experiment}')
        
    os.chdir(f'{experiment}')
    if typology != 'sample':
        total.to_csv(f"./dataset_query_result_{typology}.csv", index=False, encoding='utf-8')
    
    os.chdir(starting_path)

# Prende in input il dataset da cui campionare e gli index degli element già presi, finché non campiona qualcosa di 
# nuovo non termina
def generate_single_sample(already_taken, from_df):
    element = random.randint(0, len(from_df) - 1)
    while (element in already_taken):
        element = random.randint(0, len(from_df) - 1)
    return element

# Dato il dataset di partenza contenente solo elementi effettivamente presenti su NCBI
# genera un nuovo dataset composto da n_samples elementi
def get_final_sample(started_dataset, n_samples):
    taken = set()
    final_sample = pd.DataFrame(columns=started_dataset.columns)
    for i in tqdm(range(n_samples), desc='Sampling data'):
        new_sample = generate_single_sample(taken, started_dataset)
        taken.add(new_sample)
        final_sample = final_sample.append(started_dataset.iloc[new_sample], ignore_index=True)
    return final_sample

def write_not_valid_ids(not_valid_names):
    files = os.listdir(f'./result_extraction')
    
    if 'not_valid_sample_names.csv' in files:
        all_not_valid_names = pd.read_csv("./result_extraction/not_valid_sample_names.csv", header=0, dtype=str)
        elements = [all_not_valid_names, not_valid_names]
        final_not_valid = pd.concat(elements, ignore_index=False, sort=False)
        final_not_valid.to_csv(f"./result_extraction/not_valid_sample_names.csv", index=False)
    else:
        not_valid_names.to_csv(f"./result_extraction/not_valid_sample_names.csv", index=False)
    
# Dato il dataset di partenza con tutti gli elementi che rispettano la query posta al gut, tiene in considraz
def sampling_data(start_dataset, typology, n_samples, experiment):
    write_age_mean(start_dataset, typology, experiment)
    valid_id = list()
    not_valid_id = list()
    for try_id in tqdm(start_dataset['sample_name'], desc='NCBI ids validation'):
        handleSce = esearch('biosample', try_id)
        if len(handleSce['IdList']) != 0:
            valid_id.append(try_id)
        else:
            not_valid_id.append(str(try_id))
            print(try_id)
            
    print(f'Total rows: {len(start_dataset)}')
    print(f'Valid rows: {len(valid_id)}')
    
    not_valid_names = pd.DataFrame(data={"not_valid_sample_name": not_valid_id, "typology": [typology]*len(not_valid_id)})
    print(len(not_valid_names))
    write_not_valid_ids(not_valid_names)
    
    logging.info(f'Total rows: {len(start_dataset)}')
    logging.info(f'Valid rows: {len(valid_id)}')
            
    valid_start_dataset = pd.DataFrame(columns=start_dataset.columns)
    index = 0
    for _, row in start_dataset.iterrows():
        if row['sample_name'] in valid_id:
            valid_start_dataset.loc[index] = row
            index += 1
            
    valid_start_dataset = shuffle(valid_start_dataset)
    final_sample = get_final_sample(valid_start_dataset, n_samples)
    write_sample_info(final_sample, typology, experiment)

# Funzione richiamata sul sample finale, traduce in csv il campione finale creando un csv di due colonne: sample_name,
# typology (e.g. healthy/not_healthy), richiama write_age_mean per scrivere sul log file l'età media dei soggetti
def write_sample_info(sample, typology, experiment):
    man = sample.query("sex == 'male'")
    woman = sample.query("sex == 'female'")
    write_age_mean(sample, 'sample', experiment)
    
    sample = sample[['sample_name']]
    sample['typology'] = [typology]*len(sample)
    
    files = os.listdir(f'./result_extraction/{experiment}')
    
    if f'final_sample_{experiment}.csv' in files:
        final_sample = pd.read_csv(f"./result_extraction/{experiment}/final_sample_{experiment}.csv", header=0, dtype=str)
        final_sample = final_sample[['sample_name', 'typology']]
        elements = [final_sample, sample]
        final_sample = pd.concat(elements, ignore_index=False, sort=False)
        final_sample.to_csv(f"./result_extraction/{experiment}/final_sample_{experiment}.csv", index=False)
    else:
        sample.to_csv(f"./result_extraction/{experiment}/final_sample_{experiment}.csv", index=False)
        
    close_dashes = '-'*len(typology.upper())
    logging.info(f'--------------------{close_dashes}--------------------')

### SRA operation

In [7]:
# Funzione controller, prende in input il nome dell'esperimento (eg. healthy vs not_healthy) e le tipologie di campione
# (e.g. healthy e not_healty), chiama la funzione che interroga SRA di NCBI, in seguito richiama la funzione per 
# concatenare i file fasta e infine la funzione per prendere le sequenze che ricorrono più spesso. Infine salva su un
# file csv i fasta delle sequenze più popolose aggiungendo il campo che si rifà all'id di Biosample.
# Il file csv delle frequenze più popole è l'input per blast
def sra_querying(experiment, types):
    
    # Legge il csv contente i 30 campioni di una tipologia e i 30 campioni dell'altra tipologia dato l'esperimento
    # (e.g. esperimento: healthy vs not_healthy estrare il csv che contiene i 30 sample_name degli healthy e i 
    # 30 sample_name dei not_healthy)
    final_sample = pd.read_csv(f"./result_extraction/{experiment}/final_sample_{experiment}.csv", header=0, dtype=str)
    
    # crea una lista che conterrà, dati i record, il corrispettivo id di Biosample
    bio_sample_id = list()
    
    sra_ids = list()
    
    # per ogni riga del csv dei 30 campioni di una tipologia e i 30 dell'altra tipologia dato l'esperimento
    for index, row in final_sample.iterrows():
        print(f'File number: {index+1}')
        
        # Gestione del problema sulla lettura di un sample_name con la concatenazione della stringa '001'
        # in generale estrae l'input per la funzione che farà la query su SRA
        record_id = str(row[0])[0:15]
        record_typology = row[1]
        
        # data la singola interrogazione, aggiunge alla lista degli id di bio_sample l'id.
        query_result = get_sequences(record_id, record_typology, experiment)
        bio_sample_id.append(query_result[0])
        sra_ids.append(query_result[1])
        
    # una volta scaricate tutti i file fasta data l'esperimento, per ogni tipologia (e.g healthy/not_healthy) 
    # crea un unico file con tutte le sequenze e poi prende, da questo file, solo quelle più popolose
    for typology in types:
        concatenate_fast_file(typology, 'fasta', experiment)
        get_top_sequences(typology, experiment)
        
    # crea una nuova colonna dove, per ogni sample_name, vi sarà l'id di biosample associato e salva il nuovo csv
    final_sample['bio_sample_id'] = bio_sample_id
    final_sample['runId'] = sra_ids
    final_sample.to_csv(f"./result_extraction/{experiment}/final_sample_{experiment}.csv", index=False)
    
# Funzione che interroga SRA, dato il sample_name. Richiede anche la tipologia del campione (e.g. healthy o not_healthy)
# e il nome dell'esperimento (e.g. healthy_vs_not_healthy), per andare a salvare correttamente nelle cartelle facenti
# riferimento all'esperimento
def get_sequences(sample_name, typology, experiment):
    
    # in funzione della tipologia del campione (e.g. healthy o not_healthy) e dell'esperimento (e.g. healthy_vs_not_healthy)
    # definisce il path corretto dove andare a salvare il risultato
    path = f'"./result_extraction/{experiment}/SRA_{typology}" '
    command1 = f'fastq-dump --fasta --readids --outdir {path}'
    command2 = f'fastq-dump --readids --outdir {path}'
        
    # Query su SRA e print utili
    print(f'Sample id: {sample_name}')
    handleSce = esearch('biosample', sample_name)
    biosampleId = handleSce['IdList'][0]
    print(f'Biosample ID {biosampleId}')
    print(f'Typology: {typology}')
    handleSra = Entrez.efetch(db='biosample', id=biosampleId, retmode='xml')
    root = ET.fromstring(handleSra.read())
    identifier = root.findall('.//BioSample//Ids//Id')
    for i in identifier:
        if i.attrib['db'] == 'SRA':
            sraId = i.text
    handleSra = Entrez.esearch(db='sra', term=sraId)
    resultsSra = Entrez.read(handleSra)['IdList']
    run_ids = list()
    for s in resultsSra:
        handlesngSraId = Entrez.efetch(db='sra', id=s, retmode='xml')
        root = ET.fromstring(handlesngSraId.read())
        identifier = root.find('.//EXPERIMENT_PACKAGE//RUN_SET//RUN')
        runId = identifier.attrib['accession']
        os.system(f'{command1}{runId}')
        os.system(f'{command2}{runId}') 
        print(f'Run ID: {runId}')
        run_ids.append(runId)
    print()
    return [biosampleId, run_ids]
        

# Dato il risultato delle query su SRA, concatena i file fasta facenti riferimento a una certa tipologia di record
# (e.g. healthy o not_healthy) dato un certo esperimento (e.g. healthy_vs_not_healthy)
def concatenate_fast_file(typology, file_format, experiment):
    
    # Prende tutti i file data la tipologia del record (e.g. healthy o not_healthy) contenuti nella cartella dove,
    # dato l'esperimento (e.g. healthy_vs_not_healthy), la query su SRA ha riposto i risultati
    files = os.listdir(f'./result_extraction/{experiment}/SRA_{typology}')
    
    # concatenazione file fasta
    compact_files = list()
    for file in files:
        if file_format in file:
            f = open(f'./result_extraction/{experiment}/SRA_{typology}/{file}', "r")
            compact_files.append(f.read())
            f.close()
    f = open(f'./result_extraction/{experiment}/SRA_{typology}/final_{file_format}_{typology}.{file_format}', 'w')
    for file in compact_files:
        f.write(file)
    f.close()
    
    # Eliminazione dei 
    starting_path = os.getcwd()
    path = os.path.join(os.getcwd(), f'result_extraction/{experiment}/SRA_{typology}')
    os.chdir(path)
    command = 'rm *[0-9].fasta'
    #os.system(command) 
    os.chdir(starting_path)
        
def get_top_sequences(typology, experiment):
    records = list(SeqIO.parse(f"./result_extraction/{experiment}/SRA_{typology}/final_fasta_{typology}.fasta", format="fasta"))
    print(f'Number of sequences for {typology}: {len(records)}')
    logging.info(f'Number of sequences for {typology}: {len(records)}')
    
    sequences = dict()
    for record in tqdm(records, desc='Compacting fasta'):
        if record.seq in sequences:
            sequences[record.seq][0] += 1
        else:
            sequences[record.seq] = [1, f'>{record.description}']
    
    print(f'Number of grouped sequences: {len(sequences)}')
    logging.info(f'Number of grouped sequences: {len(sequences)}')
    
    sequences_ord = {k: v for k, v in sorted(sequences.items(), key=lambda item: item[1], reverse=True)}
    
    cont = 0
    f = open(f'./result_extraction/{experiment}/SRA_{typology}/top_sequences_{typology}.fasta', 'w')
    
    for element in sequences_ord:
        if sequences_ord[element][0] >= 100:
            f.write(f'{sequences_ord[element][1]} number of reps {sequences_ord[element][0]}')
            f.write('\n')
            f.write(str(element))
            f.write('\n')
            cont += 1
    f.close()
    
    print(f'Number of taken sequences: {cont}')
    logging.info(f'Number of taken sequences: {cont}')
    print()
    logging.info('\n')

### Not placed function

In [17]:
def get_sample_top_sequences_count(experiment, typology):
    top_sequences = list(SeqIO.parse(f"./result_extraction/{experiment}/SRA_{typology}/top_sequences_{typology}.fasta", format="fasta"))
    os.mkdir(f'./result_extraction/{experiment}/SRA_{typology}/tmp')
    
    columns = ['sample_name']
    for i in range(len(top_sequences)):
        columns.append(f'seq {i+1}')
    final_summary = pd.DataFrame(columns = columns)
    
    final_sample = pd.read_csv(f"./result_extraction/{experiment}/final_sample_{experiment}.csv", dtype=str)
    target_sample = final_sample.query(f"typology == '{typology}'")

    for index, row in target_sample.iterrows():

        print(f'Sample number: {index+1}')
        run_ids = ast.literal_eval(row['runId'])
        typology = row['typology']
        sample_name = row['sample_name']
        compact_files = list()
        
        print(f'Sample name: {sample_name}')

        for run_id in run_ids:
            f = open(f'./result_extraction/{experiment}/SRA_{typology}/{run_id}.fasta', "r")
            compact_files.append(f.read())
            f.close()
            print(f'Run ID: {run_id}')

            f = open(f'./result_extraction/{experiment}/SRA_{typology}/tmp/{sample_name}.fasta', 'w')
            for file in compact_files:
                f.write(file)
            f.close()

        sample_sequences = list(SeqIO.parse(f"./result_extraction/{experiment}/SRA_{typology}/tmp/{sample_name}.fasta", format="fasta"))

        top_sequences_in_sample = dict()
        for top_sequence in tqdm(top_sequences):
            top_sequences_in_sample[top_sequence.description] = 0
            for sample_sequence in sample_sequences:
                if top_sequence.seq == sample_sequence.seq:
                    top_sequences_in_sample[top_sequence.description] += 1
        numbers = [sample_name]
        for seq in top_sequences_in_sample:
            numbers.append(top_sequences_in_sample[seq])
        final_summary.loc[len(final_summary)] = numbers

    index = 0
    for column in final_summary.columns:
        if column != 'sample_name':
            sum_column = final_summary[column].sum()
            real_value = int(top_sequences[index].description.split('number of reps')[1].strip())
            if sum_column != real_value:
                print(f'Problem to column: {column}')

            index += 1    

    final_summary.to_csv(f"./result_extraction/{experiment}/{typology}_top_sequences_distribution.csv", index=False)

## Main

In [None]:
# Carica il dataset gut
df = pd.read_csv("./data/american_gut.txt", delimiter="\t", dtype=str)

# Sostituisce con NAN valori non validi
df.replace(' ', np.nan, inplace=True)
df.replace('Not provided', np.nan, inplace=True)
df.replace('Unspecified', np.nan, inplace=True)

# Elimina dalla working directory tutti i risultati dello scorso esperimento, ripulisce il logfile
clean_workspace()

In [None]:
# Inizializza il log file
logging.basicConfig(filename='./result_extraction/sra_querying.log', level=logging.INFO, format='%(message)s')
today = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
logging.info(f'RUN TIME: {today}')

## All columns

In [None]:
for column in df.columns:
    print(column, end = ', ')

## Healthy vs not healthy study
### Healthy extraction

In [None]:
# Estraiamo dal dataset i dati di interesse
healthy = df.query("smoking_frequency == 'Never' and alcohol_frequency == 'Never'")

healthy = healthy.query("cancer == 'I do not have this condition'")

healthy['bmi'] = healthy['bmi'].apply(lambda x: float(x))
healthy = healthy.query("bmi >= 18.5 and bmi <= 24.99")

healthy['age_years'] = healthy['age_years'].apply(lambda x: float(x))
healthy = healthy.query("age_years >= 20 and age_years <= 50")

healthy = healthy.query("body_site == 'UBERON:feces'")

In [None]:
sampling_data(healthy, 'healthy', 30, 'healthy_vs_not_healthy')

### Not healthy extraction

In [None]:
not_healthy = df.query("smoking_frequency == 'Occasionally (1-2 times/week)' or smoking_frequency == 'Daily' or smoking_frequency == 'Regularly (3-5 times/week)'")
not_healthy = not_healthy.query("alcohol_frequency == 'Occasionally (1-2 times/week)' or alcohol_frequency == 'Daily' or alcohol_frequency == 'Regularly (3-5 times/week)'")

not_healthy = not_healthy.query("cancer == 'I do not have this condition'")

not_healthy['bmi'] = not_healthy['bmi'].apply(lambda x: float(x))
not_healthy = not_healthy.query("bmi < 18.5 or bmi > 24.99")

not_healthy['age_years'] = not_healthy['age_years'].apply(lambda x: float(x))
not_healthy = not_healthy.query("age_years >= 20 and age_years <= 50")

not_healthy = not_healthy.query("body_site == 'UBERON:feces'")

In [None]:
sampling_data(not_healthy, 'not_healthy', 30, 'healthy_vs_not_healthy')

## Mental illness vs food disorders
### Mental illness

In [None]:
mental_illness = df.query("country_residence == 'United States'")

mental_illness = mental_illness.query("body_site == 'UBERON:feces'")

mental_illness = mental_illness.query("mental_illness == 'true' or mental_illness == 'Yes'")

mental_illness['age_years'] = mental_illness['age_years'].apply(lambda x: float(x))

In [None]:
sampling_data(mental_illness, 'mental_illness', 30, 'mental_ill_vs_food_dis')

### Food disorders

In [None]:
food_disorders = df.query("country_residence == 'United States'")

food_disorders = food_disorders.query("body_site == 'UBERON:feces'")

food_disorders = food_disorders.query("mental_illness == 'false' or mental_illness == 'No'")

food_disorders['bmi'] =  food_disorders['bmi'].apply(lambda x : float(x))
food_disorders = food_disorders.query("bmi < 18.5 or bmi > 24.99")

food_disorders = food_disorders.query("(fruit_frequency == 'Never' or fruit_frequency == 'Rarely (less than once/week)')")

food_disorders = food_disorders.query("exercise_frequency=='Rarely (a few times/month)' or exercise_frequency=='Never'")

food_disorders['age_years'] = food_disorders['age_years'].apply(lambda x: float(x))

In [None]:
sampling_data(food_disorders, 'food_disorders', 30, 'mental_ill_vs_food_dis')

## NCBI Quering

In [None]:
sra_querying('mental_ill_vs_food_dis', ['mental_illness', 'food_disorders'])

# 10317.000067678,mental_illness,7497069,['ERR2092251']
# 10317.000040411,mental_illness,4565263,"['ERR1389919', 'ERR1316175']"
# 10317.000050338,food_disorders,14618671,['ERR4019182']

In [None]:
sra_querying('healthy_vs_not_healthy', ['healthy', 'not_healthy'])

In [7]:
get_sample_top_sequences_count('healthy_vs_not_healthy', 'healthy')

Sample number: 1
Sample name: 10317.000093022
Run id: ERR2696639


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 2
Sample name: 10317.000058340
Run id: ERR1846087


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 3
Sample name: 10317.000107254
Run id: ERR4019307


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 4
Sample name: 10317.000032796
Run id: ERR2239612
Run id: ERR1417466


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 5
Sample name: 10317.000046476
Run id: ERR2032691


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 6
Sample name: 10317.000079781
Run id: ERR2304091


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 7
Sample name: 10317.000040396
Run id: ERR1841615
Run id: ERR1843602


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 8
Sample name: 10317.000069615
Run id: ERR2319529


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 9
Sample name: 10317.000050243
Run id: ERR1841757
Run id: ERR1843760


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 10
Sample name: 10317.000092617
Run id: ERR2696987


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 11
Sample name: 10317.000043083
Run id: ERR1389989


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 12
Sample name: 10317.000058882
Run id: ERR2318042


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 13
Sample name: 10317.000039907
Run id: ERR1316071


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 14
Sample name: 10317.000029546
Run id: ERR1417421


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 15
Sample name: 10317.000068197
Run id: ERR2319486


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 16
Sample name: 10317.000072168
Run id: ERR2579904


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 17
Sample name: 10317.000072038
Run id: ERR2523900


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 18
Sample name: 10317.000065690
Run id: ERR2032849


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 19
Sample name: 10317.000067863
Run id: ERR2032373


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 20
Sample name: 10317.000046475
Run id: ERR2032690


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 21
Sample name: 10317.000058379
Run id: ERR1846096


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 22
Sample name: 10317.000068199
Run id: ERR2033474


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 23
Sample name: 10317.000087274
Run id: ERR2579440


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 24
Sample name: 10317.000074433
Run id: ERR2580084


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 25
Sample name: 10317.000041681
Run id: ERR1458999


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 26
Sample name: 10317.000097633
Run id: ERR2697896


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 27
Sample name: 10317.000100377
Run id: ERR2696765


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 28
Sample name: 10317.000097102
Run id: ERR2697076


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 29
Sample name: 10317.000065672
Run id: ERR2032836


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))


Sample number: 30
Sample name: 10317.000058955
Run id: ERR1854738


HBox(children=(FloatProgress(value=0.0, max=547.0), HTML(value='')))




In [8]:
get_sample_top_sequences_count('healthy_vs_not_healthy', 'not_healthy')

Sample number: 31
Sample name: 10317.000054208
Run id: ERR1842587


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 32
Sample name: 10317.000070659
Run id: ERR2318188


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 33
Sample name: 10317.000050274
Run id: ERR1842802


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 34
Sample name: 10317.000042630
Run id: ERR1597056


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 35
Sample name: 10317.000093103
Run id: ERR2696999


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 36
Sample name: 10317.000059040
Run id: ERR1842899


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 37
Sample name: 10317.000068175
Run id: ERR2033470


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 38
Sample name: 10317.000047222
Run id: ERR1842761


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 39
Sample name: 10317.000093400
Run id: ERR2579553


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 40
Sample name: 10317.000075909
Run id: ERR2523933


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 41
Sample name: 10317.000037933
Run id: ERR1842199
Run id: ERR2056801


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 42
Sample name: 10317.000038160
Run id: ERR2313969


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 43
Sample name: 10317.000058971
Run id: ERR2696443


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 44
Sample name: 10317.000105372
Run id: ERR4019280


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 45
Sample name: 10317.000047151
Run id: ERR1842295


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 46
Sample name: 10317.000070708
Run id: ERR2404950


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 47
Sample name: 10317.000107269
Run id: ERR4019312


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 48
Sample name: 10317.000068174
Run id: ERR2033469


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 49
Sample name: 10317.000041832
Run id: ERR1842728


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 50
Sample name: 10317.000065320
Run id: ERR1854752


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 51
Sample name: 10317.000047463
Run id: ERR2032698


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 52
Sample name: 10317.000093404
Run id: ERR2579554


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 53
Sample name: 10317.000060228
Run id: ERR2319389


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 54
Sample name: 10317.000069197
Run id: ERR4018975


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 55
Sample name: 10317.000068657
Run id: ERR2306142


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 56
Sample name: 10317.000105371
Run id: ERR4019279


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 57
Sample name: 10317.000075933
Run id: ERR2696475


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 58
Sample name: 10317.000062086
Run id: ERR1842637


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 59
Sample name: 10317.000101067
Run id: ERR4020553


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))


Sample number: 60
Sample name: 10317.000038305
Run id: ERR1389836


HBox(children=(FloatProgress(value=0.0, max=464.0), HTML(value='')))




In [19]:
get_sample_top_sequences_count('mental_ill_vs_food_dis', 'mental_illness')

Sample number: 1
Sample name: 10317.000067678
Run ID: ERR2092251


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 2
Sample name: 10317.000089968
Run ID: ERR2524133


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 3
Sample name: 10317.000102683
Run ID: ERR4019724


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 4
Sample name: 10317.000084671
Run ID: ERR2308949


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 5
Sample name: 10317.000074845
Run ID: ERR2580096


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 6
Sample name: 10317.000072390
Run ID: ERR2404967


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 7
Sample name: 10317.000022557
Run ID: ERR2579634


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 8
Sample name: 10317.000049819
Run ID: ERR1459224


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 9
Sample name: 10317.000036888
Run ID: ERR1845908


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 10
Sample name: 10317.000040397
Run ID: ERR1389911
Run ID: ERR1316165


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 11
Sample name: 10317.000039595
Run ID: ERR2032567


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 12
Sample name: 10317.000107973
Run ID: ERR4019336


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 13
Sample name: 10317.000093103
Run ID: ERR2696999


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 14
Sample name: 10317.000067634
Run ID: ERR2092225


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 15
Sample name: 10317.000052139
Run ID: ERR2696437


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 16
Sample name: 10317.000068197
Run ID: ERR2319486


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 17
Sample name: 10317.000052259
Run ID: ERR1841793
Run ID: ERR1843798


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 18
Sample name: 10317.000039622
Run ID: ERR2032574


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 19
Sample name: 10317.000079421
Run ID: ERR2309414


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 20
Sample name: 10317.000062984
Run ID: ERR2092007


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 21
Sample name: 10317.000107280
Run ID: ERR4019314


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 22
Sample name: 10317.000031563
Run ID: ERR2032517


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 23
Sample name: 10317.000089999
Run ID: ERR2524147


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 24
Sample name: 10317.000040070
Run ID: ERR2313983


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 25
Sample name: 10317.000071616
Run ID: ERR2308626


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 26
Sample name: 10317.000087179
Run ID: ERR2524101


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 27
Sample name: 10317.000069719
Run ID: ERR2057157


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 28
Sample name: 10317.000040411
Run ID: ERR1389919
Run ID: ERR1316175


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 29
Sample name: 10317.000002929
Run ID: ERR1842115


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))


Sample number: 30
Sample name: 10317.000040057
Run ID: ERR1389881


HBox(children=(FloatProgress(value=0.0, max=550.0), HTML(value='')))




In [20]:
get_sample_top_sequences_count('mental_ill_vs_food_dis', 'food_disorders')

Sample number: 31
Sample name: 10317.000101190
Run ID: ERR4020648


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 32
Sample name: 10317.000067892
Run ID: ERR2319445


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 33
Sample name: 10317.000072842
Run ID: ERR2314353


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 34
Sample name: 10317.000100334
Run ID: ERR2696756


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 35
Sample name: 10317.000047573
Run ID: ERR2579676
Run ID: ERR1854702


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 36
Sample name: 10317.000054345
Run ID: ERR1846078


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 37
Sample name: 10317.000093606
Run ID: ERR2696650


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 38
Sample name: 10317.000067987
Run ID: ERR2092387


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 39
Sample name: 10317.000039516
Run ID: ERR1845928


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 40
Sample name: 10317.000044319
Run ID: ERR2239784
Run ID: ERR1459116


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 41
Sample name: 10317.000068279
Run ID: ERR2032945


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 42
Sample name: 10317.000094641
Run ID: ERR2697054


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 43
Sample name: 10317.000090080
Run ID: ERR2524177


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 44
Sample name: 10317.000079461
Run ID: ERR2306263


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 45
Sample name: 10317.000065701
Run ID: ERR2032856


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 46
Sample name: 10317.000068295
Run ID: ERR2092071


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 47
Sample name: 10317.000039742
Run ID: ERR2318010


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 48
Sample name: 10317.000072547
Run ID: ERR4019034


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 49
Sample name: 10317.000074360
Run ID: ERR2404995


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 50
Sample name: 10317.000082850
Run ID: ERR2405130


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 51
Sample name: 10317.000052019
Run ID: ERR2523869


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 52
Sample name: 10317.000059163
Run ID: ERR1841829
Run ID: ERR1843850


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 53
Sample name: 10317.000076751
Run ID: ERR2304070


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 54
Sample name: 10317.000071590
Run ID: ERR2319592


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 55
Sample name: 10317.000087191
Run ID: ERR2696929


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 56
Sample name: 10317.000073874
Run ID: ERR2318296


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 57
Sample name: 10317.000084291
Run ID: ERR2308828


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 58
Sample name: 10317.000072255
Run ID: ERR2579927


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 59
Sample name: 10317.000062971
Run ID: ERR2092006


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


Sample number: 60
Sample name: 10317.000050338
Run ID: ERR4019182


HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))


