In [2]:
import time
import json
import http.client
# Force http.client to use HTTP/1.0 by overriding the private version
# attributes on the HTTPConnection class; being class-level, this applies to
# every connection created afterwards in this kernel.
# NOTE(review): presumably a workaround for incomplete/chunked reads while
# downloading from GenBank -- confirm it is still required.
_connection_cls = http.client.HTTPConnection
_connection_cls._http_vsn, _connection_cls._http_vsn_str = 10, 'HTTP/1.0'

from auto_db_pipeline.genbank.keywords2ids import GenbankSearch
from auto_db_pipeline.genbank.json_combination import Combination
from auto_db_pipeline.genbank.ids2protein import ProteinRetrieval
from auto_db_pipeline.genbank.proteins2info import InfoRetrieval
from auto_db_pipeline.genbank.info2csv import PopulateDatabase
from auto_db_pipeline.genbank.evaluate_genbank_search import EvaluateGenbankSearch



# Genbank pipeline
The code in this notebook provides an example of how to create an antibody database with information from GenBank.

In [2]:
# Define keywords for genbank search.
# The query is three parenthesised groups of OR-ed terms joined with AND;
# adjacent string literals are concatenated into one query string.
keywords1 = (
    '('
    # group 1: antibody / nanobody terms
    '(Immunoglobulin[All Fields] OR antibody[All Fields] '
    'OR antibodies[All Fields] OR nanobody[All Fields] '
    'OR nanobodies[All Fields]) '
    # group 2: coronavirus terms
    'AND (COVID-19[All Fields] '
    'OR coronavirus[All Fields] OR Sars-Cov[All Fields] '
    'OR Mers-Cov[All Fields] OR SARS[All Fields] '
    'OR Sars-CoV-2[All Fields]) '
    # group 3: neutralisation / binding terms
    'AND (neutralizing[All Fields] '
    'OR neutralize[All Fields] OR neutralisation[All Fields] '
    'OR bind[All Fields] OR inhibit[All Fields] '
    'OR anti-Sars-Cov-2[All Fields])'
    ')'
)

In [3]:
# Second, broader keyword set; results are saved in files with ...2.json
# The query is four parenthesised groups of OR-ed terms joined with AND.
# Boolean operators are written in uppercase throughout: Entrez only
# recognises AND / OR / NOT as operators when they are uppercase, and the
# previous mix of 'or' and 'OR' made the last two groups unreliable.
# NOTE(review): files generated with the old lowercase-'or' query need to be
# regenerated for their contents to match this query.
# NOTE(review): 'complementary determining region' is probably meant to be
# 'complementarity determining region' -- left unchanged, confirm.
keywords2 = (
    '('
    # group 1: antibody / nanobody terms
    '(Immunoglobulin[All Fields] OR antibody[All Fields] '
    'OR antibodies[All Fields] OR nanobody[All Fields] '
    'OR nanobodies[All Fields] OR MAb[All Fields]) '
    # group 2: coronavirus terms
    'AND (COVID-19[All Fields] '
    'OR coronavirus[All Fields] OR Sars-Cov[All Fields] '
    'OR Mers-Cov[All Fields] OR SARS[All Fields] '
    'OR Sars-CoV-2[All Fields]) '
    # group 3: neutralisation / binding terms
    'AND (neutralizing[All Fields] '
    'OR neutralize[All Fields] OR neutralization[All Fields] '
    'OR bind[All Fields] OR inhibit[All Fields] '
    'OR targeting[All Fields] OR binding[All Fields]) '
    # group 4: sequence / target related terms
    'AND (heavy chain[All Fields] '
    'OR complementary determining region[All Fields] OR gene[All Fields] '
    'OR epitope[All Fields] OR receptor-binding domain[All Fields] '
    'OR rbd[All Fields] OR spike protein[All Fields] '
    'OR VHH[All Fields])'
    ')'
)

In [4]:
# Query genbank with the first keyword set and write the ids of all
# entries found to a json file; the wall-clock duration is printed at the end.
started_at = time.time()

genbanksearch = GenbankSearch(keywords1)
genbanksearch(
    out_file_path='genbank/data/id_list.json',
)

elapsed_seconds = int(time.time() - started_at)
print("--- %s seconds ---" % elapsed_seconds)

number of entries found: 10086
----------
number of IDs retrieved: 10086
--- 2 seconds ---


In [5]:
# Merge the protein ids from the genbank keyword search with the protein ids
# obtained from paper scraping and write the combined id list to disk.
started_at = time.time()

idscombination = Combination(
    'genbank/data/id_list.json',
    'genbank/data/id_list_protein_from_papers.json',
)
idscombination(
    ids_out_file_path='genbank/data/id_list_combined.json',
)

elapsed_seconds = int(time.time() - started_at)
print("--- %s seconds ---" % elapsed_seconds)

Number of ids from keyword search: 32339
Number of ids from paper scraping: 1953
----------
Number of unique ids from paper scraping: 1953
Total ids after comination: 34292
--- 1 seconds ---


In [21]:
# Download the protein handles for the ids listed in
# protein_accessions_papers2.json (db='protein') and save them as json.
started_at = time.time()

proteinretrival = ProteinRetrieval(
    ids_file_path='genbank/data/protein_accessions_papers2.json',
)
proteinretrival(
    db='protein',
    out_file_path='genbank/data/protein_handles_accession_new_papers.json',
)

elapsed_seconds = int(time.time() - started_at)
print("--- %s seconds ---" % elapsed_seconds)

 number of protein handles retrieved: 856
----------
--- 4182 seconds ---


In [2]:
# Extract the relevant information from the downloaded handles. This run
# processes the nucleotide handles from papers (db='nucleotide') with the
# 'anarci' classification method and writes paired, unpaired and nanobody
# records to separate json files.
started_at = time.time()

inforetreival = InfoRetrieval(
    proteins_file_path='genbank/data/nucleotide_handles_from_papers.json',
)
inforetreival(
    db='nucleotide',
    classification_method='anarci',
    paired_out_file_path='genbank/data/AB_paired_nucleotides_from_papers.json',
    unpaired_out_file_path='genbank/data/AB_unpaired_nucleotides_from_papers.json',
    nanobod_out_file_path='genbank/data/nanobody_nucleotides_from_papers.json',
)

elapsed_seconds = int(time.time() - started_at)
print("--- %s seconds ---" % elapsed_seconds)



Number of entires removed by antibody filter: 4327
Number of entires after antibody filter: 385
----------
Number of entries where antigen was determined: 348
Number of entries where antigen was not determined: 37
----------
Number of entries where fragement name was determined: 373
Number of entries where fragment name was not determined: 12
----------
Number of nanobodies: 12
Number of antibodies: 373
----------
Number of sequence pairs: 119
Number of sequences that could not be paired in attempt 1: 135
----------
Number of entries attempted to pair with SAbDab: 0
Number of entries not pairable with SAbDab: 135
----------
Number of pairs found with SAbDab: 0
Number of sequences not paired but sequence from PDB added: 0
--- 88 seconds ---


In [4]:
# Combine information from nucleotide and protein ids.
# Each row is (first input file, second input file, combined output file);
# the same merge is run for the paired, unpaired and nanobody records.
merge_jobs = (
    ('genbank/data/AB_paired_combined.json',
     'genbank/data/AB_paired_nucleotides_from_papers.json',
     'genbank/data/AB_paired_combined_papers_prot_nt.json'),
    ('genbank/data/AB_unpaired_combined.json',
     'genbank/data/AB_unpaired_nucleotides_from_papers.json',
     'genbank/data/AB_unpaired_combined_papers_prot_nt.json'),
    ('genbank/data/nanobody_combined.json',
     'genbank/data/nanobody_nucleotides_from_papers.json',
     'genbank/data/nanobody_combined_papers_prot_nt.json'),
)

for first_path, second_path, merged_path in merge_jobs:
    combination = Combination(first_path, second_path)
    combination(merged_path)

Elements in file 1: 848
Elements in file 2: 119
----------
Number of unique elements in file 2 119
Total elements after comination: 967
Elements in file 1: 61
Elements in file 2: 47
----------
Number of unique elements in file 2 47
Total elements after comination: 108
Elements in file 1: 155
Elements in file 2: 12
----------
Number of unique elements in file 2 12
Total elements after comination: 167


In [3]:
# Populate csv files with the combined information: one csv for the paired
# entries and one for the unpaired entries.
started_at = time.time()

populatedb = PopulateDatabase(
    paired_path='genbank/data/AB_paired_combined_papers_prot_nt.json',
    unpaired_path='genbank/data/AB_unpaired_combined_papers_prot_nt.json',
    nanobod_path='genbank/data/nanobody_combined_papers_prot_nt.json',
)
populatedb(
    out_file_paired='genbank/data/ab_database_combined_papers_prot_nt.csv',
    out_file_unpaired='genbank/data/ab_database_unpaired_combined_papers_prot_nt.csv',
)

elapsed_seconds = int(time.time() - started_at)
print("--- %s seconds ---" % elapsed_seconds)

Number of paired database entires: 1134
----------
Number of paired database entires after duplicate removal: 549
Number of duplicates removed: 585
----------
Number of unpaired database entires: 4611
----------
--- 90 seconds ---


In [5]:
# Compare the genbank entries found with keywords2 against the CoV-AbDab
# csv export; metrics are printed and also saved to a csv file.
started_at = time.time()

# Load the previously downloaded protein handles from json.
with open('genbank/data/protein_handles2.json', 'r') as handles_file:
    protein_entries = json.load(handles_file)

evaluation = EvaluateGenbankSearch(
    'genbank/data/CoV-AbDab_090322.csv',
    protein_entries,
    keywords2,
)
evaluation(
    outpath='genbank/data/protein_search_stats.csv',
    print_metrics=True,
    save_metrics=True,
)

elapsed_seconds = int(time.time() - started_at)
print("--- %s seconds ---" % elapsed_seconds)

Compilation is falling back to object mode WITH looplifting enabled because Function "search_in_covabdab" failed type inference due to: non-precise type pyobject
During: typing of argument at /Users/fabian/Desktop/SABS/Antibody project/code/auto-db-pipeline/genbank/evaluate_genbank_search.py (76)

File "genbank/evaluate_genbank_search.py", line 76:
    def search_in_covabdab(self):
        <source elided>
        # loop throught aa seqs
        for aa_seq in self.aa_seqs:
        ^

  @jit  # (nopython=True)
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "search_in_covabdab" failed type inference due to: Cannot determine Numba type of <class 'numba.core.dispatcher.LiftedLoop'>

File "genbank/evaluate_genbank_search.py", line 76:
    def search_in_covabdab(self):
        <source elided>
        # loop throught aa seqs
        for aa_seq in self.aa_seqs:
        ^

  @jit  # (nopython=True)

File "genbank/evaluate_genbank_search.py", line 76:
    

total sequences assessed: 32339)
        number of genbank sequences not in covab dab:
        28622
        number of genbank sequences found in covab dab:
        3717
        match rate: 0.11493861900491667
        number of genebank sequences that have multiple matches in covab dab:
        952.0
        number of genebank entries with non unique match in covab dab:
        1148
        -------------
        total sequences in covab dab: 5210
        number of Covab Dab VH sequences found: 1641
        number of Covab Dab VL sequences found: 928
        number of Covab Dab VH VL pairings found: 856
        number of Covab Dab entires where either VH or VL was found:
        1713
        -------------
        percentage of covab dab entries with pairing found:
        16.429942418426105
        percentage of covab dab entries with either VL or VH found:
        32.87907869481766
        
--- 553 seconds ---
