In [1]:
# Show plots as part of the notebook (this is a Jupyter-specific operation)
%matplotlib inline
import time
import matplotlib.pyplot as plt
import requests
import xmltodict
# Standard library packages
import os

# Import Pandas and Seaborn
import pandas as pd
import seaborn as sns

# Import Biopython tools for running local BLASTX
from Bio.Blast.Applications import NcbiblastxCommandline
from Bio.Blast.Applications import NcbiblastnCommandline

#For execution time tracking
from datetime import datetime
from pytz import timezone

In [2]:
# Capture the current time in the America/Chicago zone and print it as a
# 12-hour clock reading (used here for execution-time tracking).
now_time = datetime.now(tz=timezone('America/Chicago'))
print(format(now_time, '%I:%M:%S %p'))

09:09:25 PM


In [3]:
%%time
df = pd.read_csv('01_db_blastn_July142020_filtered_1e-20_subj.csv')

CPU times: user 16.7 s, sys: 4.12 s, total: 20.9 s
Wall time: 20.9 s


In [4]:
# Preview the first 20 rows of the hit table.
df.head(n=20)

Unnamed: 0,query,subject,pc_identity,aln_length,mismatches,gaps_opened,query_start,query_end,subject_start,subject_end,e_value,bitscore
0,A00842:193:HMWFWDRXY:1:2101:18322:1094,KJ410683.1,90.58,138,9,4,5,140,65939,66074,1.7799999999999998e-44,180.0
1,A00842:193:HMWFWDRXY:1:2101:18322:1094,KJ410683.1,87.368,95,11,1,8,102,4262,4169,8.500000000000001e-23,108.0
2,A00842:193:HMWFWDRXY:1:2101:18322:1094,NC_031144.1,83.916,143,18,4,2,142,138599,138738,5.0399999999999994e-30,132.0
3,A00842:193:HMWFWDRXY:1:2101:18322:1094,NC_031144.1,83.721,129,16,4,16,142,110876,110751,1.4100000000000001e-25,117.0
4,A00842:193:HMWFWDRXY:1:2101:18322:1094,KX284709.1,83.916,143,18,4,2,142,138599,138738,5.0399999999999994e-30,132.0
5,A00842:193:HMWFWDRXY:1:2101:18322:1094,KX284709.1,83.721,129,16,4,16,142,110876,110751,1.4100000000000001e-25,117.0
6,A00842:193:HMWFWDRXY:1:2101:18322:1094,NC_046751.1,83.217,143,19,4,2,142,151276,151415,2.3500000000000003e-28,126.0
7,A00842:193:HMWFWDRXY:1:2101:18322:1094,NC_031170.1,83.217,143,19,4,2,142,138372,138511,2.3500000000000003e-28,126.0
8,A00842:193:HMWFWDRXY:1:2101:18322:1094,MK783267.1,83.217,143,19,4,2,142,151276,151415,2.3500000000000003e-28,126.0
9,A00842:193:HMWFWDRXY:1:2101:18322:1094,KX284713.1,83.217,143,19,4,2,142,138372,138511,2.3500000000000003e-28,126.0


In [5]:
# Number of distinct subject identifiers (each one needs a lookup below).
df.subject.nunique()

10033

## Retrieving (efetch) taxonomy hierarchy from NCBI

In [6]:
# Entrez database queried by the lookup helpers below.
db = "nuccore"
# E-utilities URL template; the "{}" placeholder receives the endpoint
# (and, where the caller builds one, its query string).
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{}"

In [7]:
def get_taxon_name(tax_id):
    """Return the leading part of a record's GenBank definition line.

    Fetches the record identified by ``tax_id`` from the Entrez database named
    by the module-level ``db`` via the E-utilities ``efetch`` endpoint, parses
    the GenBank XML and returns the text of ``GBSeq_definition`` up to (not
    including) the first comma.

    Parameters
    ----------
    tax_id : str
        Value sent as the efetch ``id`` parameter. Despite the name, this
        notebook passes sequence accessions such as ``'NC_016703.1'``.

    Returns
    -------
    str
        The first comma-separated segment of the definition, or ``tax_id``
        itself when the request or the parsing fails (best-effort lookup).
    """
    # Request GenBank XML explicitly: the parsing below expects a GBSet
    # document. Passing the values through `params` also URL-encodes them.
    params = {"db": db, "id": tax_id, "rettype": "gb", "retmode": "xml"}
    try:
        efetch_response = requests.get(
            base_url.format("efetch.fcgi"), params=params, timeout=30
        )
        efetch_response.raise_for_status()
        efetch_dict = xmltodict.parse(efetch_response.content)
        definition = efetch_dict['GBSet']['GBSeq']['GBSeq_definition']
        return definition.split(',')[0]
    except Exception:
        # Best-effort: fall back to the identifier. `Exception` (rather than a
        # bare `except`) keeps KeyboardInterrupt working during long loops.
        return tax_id
    finally:
        # Throttle after every attempt, including failed ones, so repeated
        # errors do not hammer the NCBI servers.
        time.sleep(0.5)

In [8]:
# Spot-check the name lookup on a single accession.
get_taxon_name(tax_id='NC_016703.1')

'Phaeocystis antarctica plastid'

In [9]:
def get_7level_taxonomy(tax_id):
    """Return the taxonomic lineage string of a record from NCBI.

    Fetches the record identified by ``tax_id`` from the Entrez database named
    by the module-level ``db`` via the E-utilities ``efetch`` endpoint, parses
    the GenBank XML and returns its ``GBSeq_taxonomy`` field unchanged (a
    ``'; '``-separated lineage, e.g.
    ``'Eukaryota; Haptophyceae; Phaeocystales; Phaeocystaceae; Phaeocystis'``).

    Parameters
    ----------
    tax_id : str
        Value sent as the efetch ``id`` parameter. Despite the name, this
        notebook passes sequence accessions such as ``'NC_016703.1'``.

    Returns
    -------
    str
        The lineage string, or ``tax_id`` itself when the request or the
        parsing fails (best-effort lookup).
    """
    # Request GenBank XML explicitly: the parsing below expects a GBSet
    # document. Passing the values through `params` also URL-encodes them.
    params = {"db": db, "id": tax_id, "rettype": "gb", "retmode": "xml"}
    try:
        efetch_response = requests.get(
            base_url.format("efetch.fcgi"), params=params, timeout=30
        )
        efetch_response.raise_for_status()
        efetch_dict = xmltodict.parse(efetch_response.content)
        return efetch_dict['GBSet']['GBSeq']['GBSeq_taxonomy']
    except Exception:
        # Best-effort: fall back to the identifier. `Exception` (rather than a
        # bare `except`) keeps KeyboardInterrupt working during long loops.
        return tax_id
    finally:
        # Throttle after every attempt, including failed ones, so repeated
        # errors do not hammer the NCBI servers.
        time.sleep(0.5)

In [10]:
# Spot-check the lineage lookup on a single accession.
get_7level_taxonomy(tax_id='NC_016703.1')

'Eukaryota; Haptophyceae; Phaeocystales; Phaeocystaceae; Phaeocystis'

In [16]:
# Check what container `.unique()` hands back for the subject column.
type(df['subject'].unique())

numpy.ndarray

In [19]:
%%time
for sub in df.subject.unique():
    df["7Level_taxonomy"] = get_7level_taxonomy(sub)

CPU times: user 38min 10s, sys: 13min 31s, total: 51min 42s
Wall time: 4h 58min 7s


In [21]:
# Write the annotated hit table to CSV, omitting the DataFrame index.
df.to_csv(
    path_or_buf='01_1_db_blastn_July142020_filtered_1e-20_subj_to_7level_taxonomy.csv',
    index=False,
)

In [22]:
# Preview the first 10 rows, including the new 7Level_taxonomy column.
# (The original `df.(10)` was missing the method name and is a SyntaxError.)
df.head(10)

Unnamed: 0,query,subject,pc_identity,aln_length,mismatches,gaps_opened,query_start,query_end,subject_start,subject_end,e_value,bitscore,7Level_taxonomy
0,A00842:193:HMWFWDRXY:1:2101:18322:1094,KJ410683.1,90.58,138,9,4,5,140,65939,66074,1.7799999999999998e-44,180.0,Eukaryota; Rhodophyta; Bangiophyceae; Cyanidia...
1,A00842:193:HMWFWDRXY:1:2101:18322:1094,KJ410683.1,87.368,95,11,1,8,102,4262,4169,8.500000000000001e-23,108.0,Eukaryota; Rhodophyta; Bangiophyceae; Cyanidia...
2,A00842:193:HMWFWDRXY:1:2101:18322:1094,NC_031144.1,83.916,143,18,4,2,142,138599,138738,5.0399999999999994e-30,132.0,Eukaryota; Rhodophyta; Bangiophyceae; Cyanidia...
3,A00842:193:HMWFWDRXY:1:2101:18322:1094,NC_031144.1,83.721,129,16,4,16,142,110876,110751,1.4100000000000001e-25,117.0,Eukaryota; Rhodophyta; Bangiophyceae; Cyanidia...
4,A00842:193:HMWFWDRXY:1:2101:18322:1094,KX284709.1,83.916,143,18,4,2,142,138599,138738,5.0399999999999994e-30,132.0,Eukaryota; Rhodophyta; Bangiophyceae; Cyanidia...
