In [1]:
import time
import json

from genbank.keywords2ids import GenbankSearch
from genbank.ids2protein import ProteinRetrieval
from genbank.proteins2info import InfoRetrieval
from genbank.info2csv import PopulateDatabase
from genbank.evaluate_genbank_search import EvaluateGenbankSearch



# Genbank pipeline
The code in this notebook provides an example of how to create an antibody database with information from Genbank.

In [2]:
# Genbank search query. It is the AND of three parenthesised OR-groups, each
# term tagged with [All Fields]:
#   1. antibody-related terms (Immunoglobulin, antibody, nanobody, ...)
#   2. coronavirus-related terms (COVID-19, SARS, MERS, ...)
#   3. binding/neutralisation-related terms
# Adjacent string literals inside the parentheses are concatenated by the
# parser, so every fragment (except the last) keeps its trailing space.
keywords = (
    '((Immunoglobulin[All Fields] OR antibody[All Fields] '
    'OR antibodies[All Fields] OR nanobody[All Fields] '
    'OR nanobodies[All Fields]) AND (COVID-19[All Fields] '
    'OR coronavirus[All Fields] OR Sars-Cov[All Fields] '
    'OR Mers-Cov[All Fields] OR SARS[All Fields] '
    'OR Sars-CoV-2[All Fields]) AND (neutralizing[All Fields] '
    'OR neutralize[All Fields] OR neutralisation[All Fields] '
    'OR bind[All Fields] OR inhibit[All Fields] '
    'OR anti-Sars-Cov-2[All Fields]))'
)

In [3]:
# Step 1: query Genbank with the keyword expression and collect the IDs of
# all matching entries.
start_time = time.time()

# Build the search object from the query string, then call it to run the search.
# NOTE(review): the results are presumably persisted by the library for the
# following steps (they take no arguments) - confirm in genbank/keywords2ids.
genbanksearch = GenbankSearch(keywords)
genbanksearch()

# Report the wall-clock duration of this step, truncated to whole seconds.
elapsed_seconds = int(time.time() - start_time)
print("--- %s seconds ---" % elapsed_seconds)

number of entries found: 10049
----------
number of IDs retrieved: 10049
--- 2 seconds ---


In [4]:
# Step 2: download the protein handles for all IDs found by the search step.
start_time = time.time()

# ProteinRetrieval is constructed without arguments and then called.
# NOTE(review): it presumably picks up the IDs stored by the previous step -
# confirm in genbank/ids2protein.
proteinretrival = ProteinRetrieval()
proteinretrival()

# Report the wall-clock duration of this step, truncated to whole seconds.
elapsed_seconds = int(time.time() - start_time)
print("--- %s seconds ---" % elapsed_seconds)

number of protein handles retrieved: 10049
----------
--- 348 seconds ---


In [2]:
# Step 3: extract the relevant information from the downloaded protein handles.
start_time = time.time()

# InfoRetrieval is constructed without arguments; the call selects 'anarci'
# as the classification method.
# NOTE(review): the printed statistics suggest this step filters antibodies,
# determines antigens/fragments and pairs sequences - confirm in
# genbank/proteins2info.
inforetreival = InfoRetrieval()
inforetreival(classification_method='anarci')

# Report the wall-clock duration of this step, truncated to whole seconds.
elapsed_seconds = int(time.time() - start_time)
print("--- %s seconds ---" % elapsed_seconds)

Number of entires removed by antibody filter: 3740
Number of entires after antibody filter: 6309
----------
Number of entries where antigen was determined: 6209
Number of entries where antigen was not determined: 100
----------
Number of entries where fragement name was determined: 1729
Number of entries where fragment name was not determined: 4580
----------
Number of nanobodies: 155
Number of antibodies: 6154
----------
Number of sequence pairs: 763
Number of sequences that could not be paired in attempt 1: 4628
----------
Number of entries attempted to pair with SAbDab: 164
Number of entries not pairable with SAbDab: 4464
----------
Number of pairs found with SAbDab: 82
Number of sequences not paired but sequence from PDB added: 0
--- 514 seconds ---


In [3]:
# Step 4: write the extracted information to a csv file.
start_time = time.time()

# PopulateDatabase is constructed without arguments and then called.
# NOTE(review): input and output locations are decided inside the library -
# see genbank/info2csv.
populatedb = PopulateDatabase()
populatedb()

# Report the wall-clock duration of this step, truncated to whole seconds.
elapsed_seconds = int(time.time() - start_time)
print("--- %s seconds ---" % elapsed_seconds)

Number of paired database entires: 1000
----------
Number of paired database entires after duplicate removal: 421
Number of duplicates removed: 579
----------
Number of unpaired database entires: 4464
----------
--- 78 seconds ---


In [7]:
# Step 5: compare the entries found in Genbank against a CoV-AbDab csv export.
start_time = time.time()

# Load the protein handles (stored as JSON) from disk.
with open('genbank/data/protein_handles', 'r') as infile:
    protein_entries = json.load(infile)

# The evaluator receives the CoV-AbDab csv path, the loaded protein entries
# and the search query defined in the keywords cell.
evaluation = EvaluateGenbankSearch(
    'genbank/data/CoV-AbDab_010322.csv',
    protein_entries,
    keywords,
)

# Run the comparison: print the metrics and also save them to `outpath`.
evaluation(
    outpath='genbank/data/protein_search_stats.csv',
    print_metrics=True,
    save_metrics=True,
)

# Report the wall-clock duration of this step, truncated to whole seconds.
elapsed_seconds = int(time.time() - start_time)
print("--- %s seconds ---" % elapsed_seconds)

Compilation is falling back to object mode WITH looplifting enabled because Function "search_in_covabdab" failed type inference due to: non-precise type pyobject
During: typing of argument at /Users/fabian/Desktop/SABS/Antibody project/code/auto-db-pipeline/genbank/evaluate_genbank_search.py (76)

File "genbank/evaluate_genbank_search.py", line 76:
    def search_in_covabdab(self):
        <source elided>
        # loop throught aa seqs
        for aa_seq in self.aa_seqs:
        ^

  @jit  # (nopython=True)
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "search_in_covabdab" failed type inference due to: Cannot determine Numba type of <class 'numba.core.dispatcher.LiftedLoop'>

File "genbank/evaluate_genbank_search.py", line 76:
    def search_in_covabdab(self):
        <source elided>
        # loop throught aa seqs
        for aa_seq in self.aa_seqs:
        ^

  @jit  # (nopython=True)

File "genbank/evaluate_genbank_search.py", line 76:
    

total sequences assessed: 10027)
        number of genbank sequences not in covab dab:
        5316
        number of genbank sequences found in covab dab:
        4711
        match rate: 0.46983145507130747
        number of genebank sequences that have multiple matches in covab dab:
        1081.0
        number of genebank entries with non unique match in covab dab:
        1633
        -------------
        total sequences in covab dab: 5033
        number of Covab Dab VH sequences found: 1522
        number of Covab Dab VL sequences found: 1556
        number of Covab Dab VH VL pairings found: 1423
        number of Covab Dab entires where either VH or VL was found:
        1655
        -------------
        percentage of covab dab entries with pairing found:
        28.27339558911186
        percentage of covab dab entries with either VL or VH found:
        32.88297238227697
        
--- 52 seconds ---
