In [None]:
import pandas as pd
import numpy
import os
import glob
import plotly
import plotly.graph_objects as go
import plotly.express as px
import time
from Bio.Blast.Applications import NcbiblastnCommandline
from urllib.error import HTTPError
from ete3 import NCBITaxa
# Shared ete3 NCBITaxa handle; add_taxonomy() below reads it as the
# module-level name `ncbi`, so this cell must run before that function is called.
ncbi = NCBITaxa()

# Notebook-wide settings needed by the functions below:
# setting this option to None turns off pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None

In [None]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1,
                        length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)

    A total of 0 (nothing to process) is drawn as a fully filled, 100% bar
    instead of raising ZeroDivisionError.
    """
    if total:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # Empty job: there is no meaningful ratio, so show the bar as complete.
        fraction = 1.0
        filledLength = length
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    # Leading '\r' redraws the bar on the same terminal line at every call.
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = printEnd)
    # Print New Line on Complete
    if iteration == total:
        print()


def _confirm_retry(message):
    """Print `message`, ask the user whether to continue, and call quit() if not."""
    print(message)
    alt = str(input("Continue?:"))
    # startswith() instead of alt[0]: an empty answer counts as "no"
    # rather than raising IndexError.
    if not alt.strip().lower().startswith('y'):
        quit()


def seq_blast(pathwd, inputfile, split_len, split_dir):
    """
    Split a FASTA file into fixed-size chunks, BLAST each chunk remotely and
    merge all hits into a single TSV file.

    @params:
        pathwd      - Required  : directory that contains inputfile (Str)
        inputfile   - Required  : FASTA file name, relative to pathwd (Str)
        split_len   - Required  : number of lines per chunk, must be > 0 (Int)
        split_dir   - Required  : sub-directory of pathwd that receives the
                                  chunks and the results (Str)

    Output (all written inside pathwd/split_dir):
        <name>_<i>          chunk i of the input (name = inputfile up to its first '.')
        <name>_<i>.tsv      blastn tabular output for chunk i
        blast_<name>.tsv    merged hits with a header row
                            (seqid, accession, identity, evalue, taxids, sciname, kingdom)

    Side effects: the process working directory is changed to pathwd/split_dir
    and is left there on return. Returns None.

    Raises ValueError if split_len is not a positive integer.

    NOTE(review): chunks are cut by line count, not by FASTA record; presumably
    split_len is chosen as a multiple of the lines per record - confirm.
    """
    os.chdir(pathwd)
    #names of the output file(removing dot):
    outputname = inputfile.split(".")[0]
    print(os.getcwd())

    #make working directory (re-using it when it already exists):
    splitdir = str(split_dir)
    os.makedirs("./" + splitdir, exist_ok=True)

    splitLen = int(split_len)
    if splitLen <= 0:
        raise ValueError("split_len must be a positive integer")

    #read the input; splitlines() does not yield a spurious empty last line
    #and the context manager closes the handle:
    with open(inputfile, 'r') as handle:
        splitfile = handle.read().splitlines()

    #os change directory and write the chunk files:
    os.chdir("./" + splitdir + "/")

    chunk_files = []
    for start in range(0, len(splitfile), splitLen):
        outputData = splitfile[start:start + splitLen]
        # A chunk made only of blank lines would be an empty blastn query.
        if not any(line.strip() for line in outputData):
            continue
        chunk_name = outputname + "_" + str(len(chunk_files) + 1)
        with open(chunk_name, 'w') as output:
            output.write('\n'.join(outputData) + '\n')
        chunk_files.append(chunk_name)

    #time necessary to start blastn
    print("Starting blastn:it will take a while...so wait!!!!\n")
    time.sleep(2)

    #biopython blast:
    # Only the chunks written above are queried; leftovers from an earlier run
    # in the same directory (old *.tsv, the merged table) are ignored.
    outfmt6 = "6 qseqid saccver pident evalue staxids ssciname sskingdom"
    result_files = []
    for file in chunk_files:
        result = file + ".tsv"
        while True:
            try:
                cline = NcbiblastnCommandline(query=file, max_target_seqs="10", db="nt",
                                              outfmt=outfmt6, out=result, remote=True)
                cline()
                break
            except KeyboardInterrupt:
                _confirm_retry("\nKeyboard interruption")
            except HTTPError:
                # HTTPError is a subclass of OSError, so it must be listed first.
                _confirm_retry("\nHTTPE error connection")
            except OSError:
                _confirm_retry("\nConnection error")
        result_files.append(result)

    # Merge in chunk order (1, 2, ..., 10, ...), i.e. in the order of the input
    # file. Zero-byte results (chunks without any hit) are skipped so that
    # read_csv is never handed an empty file.
    columns = ['seqid', 'accession', 'identity', 'evalue', 'taxids', 'sciname', 'kingdom']
    frames = [pd.read_csv(f, sep='\t', names=columns)
              for f in result_files
              if os.path.isfile(f) and os.path.getsize(f) > 0]
    if frames:
        merge = pd.concat(frames, ignore_index=True, sort=False)
    else:
        merge = pd.DataFrame(columns=columns)
    merge.to_csv('blast_' + outputname + '.tsv', sep='\t', index=False)

def reduce_blast(blastoutput):
    """
    Reduce a merged BLAST table to a single hit per query sequence.

    @params:
        blastoutput - Required  : path of a tab-separated file with a header row
                                  containing at least seqid, accession, identity,
                                  evalue, taxids and sciname (Str)

    Hits are grouped by (seqid, sciname, taxids); for every seqid the group
    with the most hits is kept, carrying the first accession, the highest
    identity and the lowest evalue of that group. Rows are returned in order of
    first appearance in the file, with the columns
    seqid, accession, identity, evalue, sciname, taxids.
    """
    hits = pd.read_csv(blastoutput, sep='\t', header=[0])

    # Remember the file position of every hit and give each hit a weight of 1.
    hits['order'] = range(1, int(len(hits)) + 1, 1)
    hits['blast_frequency'] = 1

    how_to_combine = {
        'accession': 'first',
        'identity': 'max',
        'evalue': 'min',
        'blast_frequency': 'sum',
        'order': 'first',
    }
    per_taxon = (hits.groupby(['seqid', 'sciname', 'taxids'])
                     .agg(how_to_combine)
                     .reset_index())

    # Row label of the most frequent taxon for each query sequence.
    winners = per_taxon.groupby(['seqid'])['blast_frequency'].idxmax()
    best = per_taxon.loc[winners].reset_index()
    best = best.sort_values('order')

    return best[['seqid', 'accession', 'identity', 'evalue', 'sciname', 'taxids']]

def add_taxonomy(dataframe):
    """
    Add a 'taxonomy' column built from the NCBI lineage of every taxid.

    @params:
        dataframe   - Required  : table with a 'taxids' column (pandas DataFrame)

    For each row the lineage of the taxid is looked up through the module-level
    `ncbi` object and the names found at the ranks phylum, class, order, family,
    genus and species are joined with ';' (in that order). A rank that is absent
    from the lineage is written as the string "NaN". A taxid that cannot be
    converted to int, or whose lookup raises ValueError, prints 'Value error'
    and yields "NaN" for all six ranks.

    The column is added to `dataframe` in place and the same object is returned.
    """
    wanted_ranks = ("phylum", "class", "order", "family", "genus", "species")
    listtax = []

    for taxid in dataframe['taxids']:
        try:
            lineage = ncbi.get_lineage(int(taxid))
        except (TypeError, ValueError):
            print('Value error')
            # Reset explicitly so the lineage of the previous row is not reused.
            lineage = None

        names_by_rank = dict.fromkeys(wanted_ranks, "NaN")  # Initializing

        if lineage:
            # One lookup per lineage instead of one per lineage node.
            ranks = ncbi.get_rank(lineage)
            names = ncbi.get_taxid_translator(lineage)
            for node in lineage:
                rank = ranks.get(node)
                if rank in names_by_rank and node in names:
                    names_by_rank[rank] = names[node]

        listtax.append(";".join(names_by_rank[rank] for rank in wanted_ranks))

    dataframe['taxonomy'] = listtax
    return dataframe


In [None]:
# Reduce the merged BLAST table to one hit per query sequence.
# The path is the file that seq_blast() writes when it is called with the
# arguments used in the last cell of this notebook.
# NOTE(review): this cell comes before that seq_blast() cell, so on a fresh
# kernel the file must already exist on disk; the absolute path is machine-specific.
df=reduce_blast('/home/alberto/working/healthy/blast_top_sequences_healthy.tsv')
# add_taxonomy() returns the dataframe it was given, so as the last expression
# of the cell the annotated table is displayed.
add_taxonomy(df)

In [None]:
# Split top_sequences_healthy.fasta into chunks of 400 lines inside
# /home/alberto/working/healthy/, BLAST every chunk and merge the hits.
# NOTE(review): hard-coded absolute path; adjust it for the machine in use.
seq_blast( '/home/alberto/working/','top_sequences_healthy.fasta',400, 'healthy' )