In [2]:
from Bio import Entrez
from Bio import SeqIO
from Bio.Seq import Seq
import pandas as pd
import copy
import numpy as np
import json
import pickle
import random
import multiprocessing
from numba import njit, jit

Entrez.email = "fabian.spoendlin@exeter.ox.ac.uk" ## Move this to the init function of your genbank class

"""
## Right now you have all your code in this notebook. What you should be doing is whenever you have written a set of code you can group into a 
## function, you should put that function in a .py file which you then import into your notebook. This way you will organise your code better and
## you won't end up with a notebook with 100+ cells where you can't figure out what you need to run and what is old code. 

## However, you will probably have to edit some of the .py scripts at some point, and with the default setting on jupyter this means you need to 
## restart your kernel to then import the updated script. 

## To avoid this, include the 2 lines below after your imports above. This forces your notebook to autoreload your .py scripts everytime you edit them.

load_ext autoreload
%autoreload 2

"""

"\n## Right now you have all your code in this notebook. What you should be doing is whenever you have written a set of code you can group into a \n## function, you should put that function in a .py file which you then import into your notebook. This way you will organise your code better and\n## you won't end up with a notebook with 100+ cells where you can't figure out what you need to run and what is old code. \n\n## However, you will probably have to edit some of the .py scripts at some point, and with the default setting on jupyter this means you need to \n## restart your kernel to then import the updated script. \n\n## To avoid this, include the 2 lines below after your imports above. This forces your notebook to autoreload your .py scripts everytime you edit them.\n\nload_ext autoreload\n%autoreload 2\n\n"

In [3]:
"""
## It might be worth organising all the code as a class. This allows you too have all genbanksearch code in a single place and also easy to execute and
## change. The below code is an example of how it could be done.


class GenbankSearch:

    def __init__(self, all_keyword_lists):
    
        self.search_query = get_genbank_search_query(all_keyword_lists)
        
        
    def get_number_of_entries(self, db='protein'):
        "
        search protein data base with keywords and find out how many entries are found
        "
        handle = Entrez.esearch(db=db, term=self.search_query, retmax='2')
        record = Entrez.read(handle)
        self.number_of_entries = int(record['Count'])
        print('number of entries:', number_of_entries)
        
    def get_entries(self, db='protein'):
        "
        download all entries from search
        "
        
        handle = Entrez.esearch(db=db, term=self.search_query, retmax=self.number_of_entries)
        record = Entrez.read(handle)

        # 25 searches per second
        a_handle =  Entrez.efetch(db=db, id=record['IdList'], rettype="gb", retmode="xml")
        return Entrez.read(a_handle)
        

    def __call__(self, db='protein'):
    
        self.get_number_of_entries(db)
        entries = self.get_entries(db)
        
        ## More steps
   

## The __init__ function allows you to set some values that won't change
genbanksearch = GenbankSearch(all_keyword_lists)

## The __call__ function allows you to run the class directly without specifying a specific function. 
## Within the __call__ function you should have the pipeline you want to be executed.

genbanksearch('protein')
genbanksearch('nucleotide')
"""




with open('CoV-AbDab_181021.csv', 'r') as f:
    CovAbDab = pd.read_csv(f)   

# missing seqs in Cov abdab are ND, this can be protein seq -> replace with something thats not protein seq
CovAbDab['VH or VHH'].replace(to_replace='ND', value='no sequence available', inplace=True)
CovAbDab['VL'].replace(to_replace='ND', value='no sequence available', inplace=True)
CovAbDab['VH or VHH'].fillna('no sequence available', inplace=True)
CovAbDab['VL'].fillna('no sequence available', inplace=True)

# Protein search pipeline

In [4]:
# perform keyword search

# specify the terms used for the search
search = '((Immunoglobulin[All Fields] OR antibody[All Fields] OR antibodies[All Fields] OR nanobody[All Fields] OR nanobodies[All Fields]) AND (COVID-19[All Fields] OR coronavirus[All Fields] OR Sars-Cov[All Fields] OR Mers-Cov[All Fields] OR SARS[All Fields] OR Sars-CoV-2[All Fields]) AND (neutralizing[All Fields] OR neutralize[All Fields] OR neutralisation[All Fields] OR bind[All Fields] OR inhibit[All Fields] OR anti-Sars-Cov-2[All Fields]))'

"""
## Changing the search query into a function with lists of keywords as input will allow you to be more flexible. 
## The following is an example, where the 'all_keyword_lists' variable should be coming from outside the whole mine genbank class.

keyword_list1 = ['Immunoglobulin','antibody','antibodies','nanobody','nanobodies']
keyword_list2 = ['COVID-19','coronavirus','Sars-Cov','Mers-Cov','SARS','Sars-CoV-2']
keyword_list3 = ['neutralizing','neutralize','neutralisation','bind','inhibit','anti-Sars-Cov-2']

all_keyword_lists = [keyword_list1, keyword_list2, keyword_list3]


def get_genbank_search_query(all_keyword_lists):

    search = []
    for keyword_list in all_keyword_lists:
        or_keywords = ' OR '.join(["{}[All Fields]".format(keyword) for keyword in keyword_list])

        search.append('('+or_keywords+')')


    return '('+' AND '.join(search)+')'

search = get_genbank_search_query(all_keyword_lists)
"""

# search protein data base with keywords and find out how many entries are found
handle = Entrez.esearch(db='protein', term=search, retmax='2')
record = Entrez.read(handle)
number_of_entries = int(record['Count'])
print('number of entries:', number_of_entries)

number of entries: 9143


In [5]:
# download all entries from search

handle = Entrez.esearch(db='protein', term=search, retmax=25)#number_of_entries)
record = Entrez.read(handle)

# 25 searches per second
protein_handle =  Entrez.efetch(db="protein", id=record['IdList'], rettype="gb", retmode="xml")
proteins = Entrez.read(protein_handle)

In [10]:
# compare found sequences to covab dab

# use numba to speed up computation
"""
It needs to be set to nopython=True for it run fast. However, this gives some struggles with the code.
"""

@jit(nopython=True)
def search_in_covabdab(aa_seqs, VH_arr, VL_arr, VH_found, VL_found, sequences_not_in_covabdab):

    sequences_not_in_covabdab = 0

    # loop throught aa seqs
    for aa_seq in aa_seqs:
        sequence_found = False

        # loop throught covab dab entries
        for i in range(len(VH_arr)):

            # in case VH is in covab dab increase the VH count of this entry by 1
            if VH_arr[i] in aa_seq:
                VH_found[i] = VH_found[i] + 1
                sequence_found = True

            # in case VL is in covab dab increase the VL count of this entry by 1
            if VL_arr[i] in aa_seq:
                VL_found[i] = VL_found[i] + 1
                sequence_found = True

        # sequence that has no match with vh or vl is counted as a not found sequence
        if not sequence_found:
            sequences_not_in_covabdab = sequences_not_in_covabdab + 1

        
    return VH_found, VL_found, sequences_not_in_covabdab

# prepare all variables for jit
sequences_not_in_covabdab = 0
VL_arr = CovAbDab['VL'].to_numpy()
VH_arr = CovAbDab['VH or VHH'].to_numpy()
VH_found = np.zeros((len(VH_arr)))
VL_found = np.zeros((len(VL_arr)))
aa_seqs = []
for i in range(len(proteins)):
    aa_seq = Seq(proteins[i]['GBSeq_sequence'])
    aa_seqs.append(str.upper(str(aa_seq)))

# run function
VH_found, VL_found, sequences_not_in_covabdab = search_in_covabdab(aa_seqs, VH_arr, VL_arr, VH_found, VL_found, sequences_not_in_covabdab)

# format results
sequences_in_covabdab = len(proteins) - sequences_not_in_covabdab
CovAbDab_stats_pr = copy.deepcopy(CovAbDab)
CovAbDab_stats_pr['VH_found'] = VH_found
CovAbDab_stats_pr['VL_found'] = VL_found




#print summary statistics

if True:
    print('total sequences assessed:', len(proteins))
    print('number of genbank sequences not in covab dab:', sequences_not_in_covabdab)
    print('number of genbank sequences found in covab dab:', sequences_in_covabdab)
    print('match rate:', sequences_in_covabdab / len(proteins))
    # if the total number of counts in VH and VL columns is higher than the genbank sequences that have a match in covab dab
    # then a genbank sequence must have several matches in covab dab
    print('number of genebank sequences that have multiple matches in covab dab:', (sum(CovAbDab_stats_pr['VH_found'])+sum(CovAbDab_stats_pr['VL_found'])-sequences_in_covabdab))
    # if the number of genbank sequences with a match in covab dab is higher than the number of covab dab VH and VLs that were found
    # then a number of genbank sequences must have matched to the same covab dab sequenc
    # stat is wrong
    print('number of genebank entries with non unique match in covab dab:', (sequences_in_covabdab - (len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr['VH_found'] > 0)]) + len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr['VL_found'] > 0)]))))
    print('-------------')
    print('total sequences in covab dab:', len(CovAbDab_stats_pr))
    print('number of Covab Dab VH sequences found:', len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr['VH_found'] > 0)]))
    print('number of Covab Dab VL sequences found:', len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr['VL_found'] > 0)]))
    VH_VL_pairings = len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr['VH_found'] > 0) & (CovAbDab_stats_pr['VL_found'] > 0)])
    print('number of Covab Dab VH VL pairings found:', VH_VL_pairings)
    VH_or_VL = len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr['VH_found'] > 0) | (CovAbDab_stats_pr['VL_found'] > 0)])
    print('number of Covab Dab entires where either VH or VL was found:', VH_or_VL)
    print('-------------')
    print('percentage of covab dab entries with pairing found:', VH_VL_pairings / len(CovAbDab_stats_pr) * 100 )
    print('percentage of covab dab entries with either VL or VH found:', VH_or_VL / len(CovAbDab_stats_pr) * 100)

TypingError: Failed in nopython mode pipeline (step: nopython frontend)
[1m[1mnon-precise type array(pyobject, 1d, C)[0m
[0m[1mDuring: typing of argument at <ipython-input-10-bb58127ff652> (7)[0m
[1m
File "<ipython-input-10-bb58127ff652>", line 7:[0m
[1mdef search_in_covabdab(aa_seqs, VH_arr, VL_arr, VH_found, VL_found, sequences_not_in_covabdab):
    <source elided>

[1m    sequences_not_in_covabdab = 0
[0m    [1m^[0m[0m


In [13]:
# save stats

if True:
    with open('data/protein_search_stats.csv', 'r+') as fo:
        fo.read()
        fo.write(f'{search}, {len(CovAbDab_stats_pr)}, {sequences_not_in_covabdab}, {sequences_in_covabdab}, {sequences_in_covabdab / len(CovAbDab_stats_pr)}, {(sum(CovAbDab_stats_pr["VH_found"]) + sum(CovAbDab_stats_pr["VL_found"]) - sequences_in_covabdab)}, {(sequences_in_covabdab - (len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr["VH_found"] > 0)])) + len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr["VL_found"] > 0)]))}, {len(CovAbDab_stats_pr)}, {len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr["VH_found"] > 0)])}, {len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr["VL_found"] > 0)])}, {len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr["VH_found"] > 0) & (CovAbDab_stats_pr["VL_found"] > 0)])}, {len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr["VH_found"] > 0) | (CovAbDab_stats_pr["VL_found"] > 0)])}, {VH_VL_pairings / len(CovAbDab_stats_pr) * 100}, {VH_or_VL / len(CovAbDab_stats_pr) * 100}\n')

# Nucleotide search pipeline

In [18]:
# perform keyword search

# specify the terms used for the search
search = '((Immunoglobulin[All Fields] OR antibody[All Fields] OR antibodies[All Fields] OR nanobody[All Fields] OR nanobodies[All Fields]) AND (COVID-19[All Fields] OR coronavirus[All Fields] OR Sars-Cov[All Fields] OR Mers-Cov[All Fields] OR SARS[All Fields] OR Sars-CoV-2[All Fields]) AND (neutralizing[All Fields] OR neutralize[All Fields] OR neutralisation[All Fields] OR bind[All Fields] OR inhibit[All Fields] OR anti-Sars-Cov-2[All Fields]))'

# search protein data base with keywords and find out how many entries are found
handle = Entrez.esearch(db='nucleotide', term=search, retmax='2')
record = Entrez.read(handle)
number_of_entries = int(record['Count'])
print('number of entries:', number_of_entries)

number of entries: 4305


In [19]:
# download all entries from search

handle = Entrez.esearch(db='nucleotide', term=search, retmax=number_of_entries)
record = Entrez.read(handle)

# 25 searches per second
nucleotide_handle =  Entrez.efetch(db="nucleotide", id=record['IdList'], rettype="gb", retmode="xml")
nucleotides = Entrez.read(nucleotide_handle)




# compare found sequences to covab dab

# use numba to speed up computation
@jit
def search_in_covabdab(aa_seqs, VH_arr, VL_arr, VH_found, VL_found, sequences_not_in_covabdab):

    sequences_not_in_covabdab = 0

    # loop throught aa seqs
    for aa_seq in aa_seqs:
        sequence_found = False

        # loop throught covab dab entries
        for i in range(len(VH_arr)):

            # in case VH is in covab dab increase the VH count of this entry by 1
            if VH_arr[i] in aa_seq:
                VH_found[i] = VH_found[i] + 1
                sequence_found = True

            # in case VL is in covab dab increase the VL count of this entry by 1
            if str(VL_arr[i]) in aa_seq:
                VL_found[i] = VL_found[i] + 1
                sequence_found = True

        # sequence that has no match with vh or vl is counted as a not found sequence
        if not sequence_found:
            sequences_not_in_covabdab = sequences_not_in_covabdab + 1

        
    return VH_found, VL_found, sequences_not_in_covabdab

# prepare all variables for jit
sequences_not_in_covabdab = 0
VL_arr = CovAbDab['VL'].to_numpy()
VH_arr = CovAbDab['VH or VHH'].to_numpy()
VH_found = np.zeros((len(VH_arr)))
VL_found = np.zeros((len(VL_arr)))
aa_seqs = []
for i in range(len(nucleotides)):
    nt_seq = Seq(nucleotides[i]['GBSeq_sequence'])
    lenght_sequence = len(nt_seq)
    remove_bases = lenght_sequence % 3
    if remove_bases > 0:
        nt_seq = nt_seq[:-remove_bases]
    aa_seqs.append(str(nt_seq.translate()))

# run function
VH_found, VL_found, sequences_not_in_covabdab = search_in_covabdab(aa_seqs, VH_arr, VL_arr, VH_found, VL_found, sequences_not_in_covabdab)

# format results
sequences_in_covabdab = len(nucleotides) - sequences_not_in_covabdab
CovAbDab_stats_nt= copy.deepcopy(CovAbDab)
CovAbDab_stats_nt['VH_found'] = VH_found
CovAbDab_stats_nt['VL_found'] = VL_found



#print statistics

if True:
    print('total sequences assessed:', len(nucleotides))
    print('number of genbank sequences not in covab dab:', sequences_not_in_covabdab)
    print('number of genbank sequences found in covab dab:', sequences_in_covabdab)
    print('match rate:', sequences_in_covabdab / len(nucleotides))
    # if the total number of counts in VH and VL columns is higher than the genbank sequences that have a match in covab dab
    # then a genbank sequence must have several matches in covab dab
    print('number of genebank sequences that have multiple matches in covab dab:', (sum(CovAbDab_stats_nt['VH_found'])+sum(CovAbDab_stats_nt['VL_found'])-sequences_in_covabdab))
    # if the number of genbank sequences with a match in covab dab is higher than the number of covab dab VH and VLs that were found
    # then a number of genbank sequences must have matched to the same covab dab sequenc
    print('number of genebank entries with non unique match in covab dab:', (sequences_in_covabdab - (len(CovAbDab_stats_nt.loc[(CovAbDab_stats_nt['VH_found'] > 0)]) + len(CovAbDab_stats_nt.loc[(CovAbDab_stats_nt['VL_found'] > 0)]))))
    print('-------------')
    print('total sequences in covab dab:', len(CovAbDab_stats_nt))
    print('number of Covab Dab VH sequences found:', len(CovAbDab_stats_nt.loc[(CovAbDab_stats_nt['VH_found'] > 0)]))
    print('number of Covab Dab VL sequences found:', len(CovAbDab_stats_nt.loc[(CovAbDab_stats_nt['VL_found'] > 0)]))
    VH_VL_pairings = len(CovAbDab_stats_nt.loc[(CovAbDab_stats_nt['VH_found'] > 0) & (CovAbDab_stats_nt['VL_found'] > 0)])
    print('number of Covab Dab VH VL pairings found:', VH_VL_pairings)
    VH_or_VL = len(CovAbDab_stats_nt.loc[(CovAbDab_stats_nt['VH_found'] > 0) | (CovAbDab_stats_nt['VL_found'] > 0)])
    print('number of Covab Dab entires where either VH or VL was found:', VH_or_VL)
    print('-------------')
    print('percentage of covab dab entries with pairing found:', VH_VL_pairings / len(CovAbDab_stats_nt) * 100 )
    print('percentage of covab dab entries with either VL or VH found:', VH_or_VL / len(CovAbDab_stats_nt) * 100)


IncompleteRead: IncompleteRead(286 bytes read)

In [20]:
# error if some of the genbank entries are to big

In [None]:
# save statistics

if False:
    with open('data/nucleotide_search_stats.csv', 'r+') as fo:
        fo.read()
        fo.write(f'{search}, {len(nucleotides)}, {sequences_not_in_covabdab }, {sequences_in_covabdab }, {sequences_in_covabdab / len(nucleotides)}, {(sum(CovAbDab_stats["VH_found"]) + sum(CovAbDab_stats["VL_found"]) - sequences_in_covabdab)}, {(sequences_in_covabdab - (len(CovAbDab_stats.loc[(CovAbDab_stats["VH_found"] > 0)])) + len(CovAbDab_stats.loc[(CovAbDab_stats["VL_found"] > 0)]))}, {len(CovAbDab_stats)}, {len(CovAbDab_stats.loc[(CovAbDab_stats["VH_found"] > 0)])}, {len(CovAbDab_stats.loc[(CovAbDab_stats["VL_found"] > 0)])}, {len(CovAbDab_stats.loc[(CovAbDab_stats["VH_found"] > 0) & (CovAbDab_stats["VL_found"] > 0)])}, {len(CovAbDab_stats.loc[(CovAbDab_stats["VH_found"] > 0) | (CovAbDab_stats["VL_found"] > 0)])}, {VH_VL_pairings / len(CovAbDab_stats) * 100}, {VH_or_VL / len(CovAbDab_stats) * 100}\n')

# Search both protein and nucelotide - much slower than the individual search first

In [16]:
# perform keyword search

# specify the terms used for the search
search = '((Immunoglobulin[All Fields] OR antibody[All Fields] OR antibodies[All Fields] OR nanobody[All Fields] OR nanobodies[All Fields]) AND (COVID-19[All Fields] OR coronavirus[All Fields] OR Sars-Cov[All Fields] OR Mers-Cov[All Fields] OR SARS[All Fields] OR Sars-CoV-2[All Fields]) AND (neutralizing[All Fields] OR neutralize[All Fields] OR neutralisation[All Fields] OR bind[All Fields] OR inhibit[All Fields] OR anti-Sars-Cov-2[All Fields]))'

# search protein data base with keywords and find out how many entries are found
handle_pr = Entrez.esearch(db='protein', term=search, retmax='2')
record_pr = Entrez.read(handle_pr)
number_of_proteins = int(record_pr['Count'])
print('number of proteins:', number_of_proteins)


# search protein data base with keywords and find out how many entries are found
handle_nt = Entrez.esearch(db='nucleotide', term=search, retmax='2')
record_nt = Entrez.read(handle_nt)
number_of_nucleotides = int(record_nt['Count'])
print('number of nucelotides:', number_of_nucleotides)

number of proteins: 8979
number of nucelotides: 4305


In [17]:
# download all entries from search

# ----> remove function to make it faster
def download_entries(database):

    if database == 'protein':
        handle = Entrez.esearch(db='protein', term=search, retmax=number_of_proteins)
        record = Entrez.read(handle)

        # 25 searches per second
        entry_handle =  Entrez.efetch(db="protein", id=record['IdList'], rettype="gb", retmode="xml")
        entries = Entrez.read(entry_handle)

    if database == 'nucleotide':
        handle = Entrez.esearch(db='nucleotide', term=search, retmax=number_of_nucleotides)
        record = Entrez.read(handle)

        # 25 searches per second
        entry_handle =  Entrez.efetch(db="nucleotide", id=record['IdList'], rettype="gb", retmode="xml")
        entries = Entrez.read(entry_handle)

    search_results[database] = entries


search_results = {}
for database in ['protein','nucleotide']:
    download_entries(database)

# run searches on multiprocessing
#pool = multiprocessing.Pool()
#pool.map(download_entries, ['protein', 'nucleotide'])
#pool.close()

# compare found sequences to covab dab

# use numba to speed up computation
@jit
def search_in_covabdab(aa_seqs, VH_arr, VL_arr, VH_found, VL_found, sequences_not_in_covabdab):

    # loop throught aa seqs
    for aa_seq in aa_seqs:
        sequence_found = False

        # loop throught covab dab entries
        for i in range(len(VH_arr)):

            # in case VH is in covab dab increase the VH count of this entry by 1
            if VH_arr[i] in aa_seq:
                VH_found[i] = VH_found[i] + 1
                sequence_found = True

            # in case VL is in covab dab increase the VL count of this entry by 1
            if str(VL_arr[i]) in aa_seq:
                VL_found[i] = VL_found[i] + 1
                sequence_found = True

        # sequence that has no match with vh or vl is counted as a not found sequence
        if not sequence_found:
            sequences_not_in_covabdab = sequences_not_in_covabdab + 1

        
    return VH_found, VL_found, sequences_not_in_covabdab

# prepare all variables for jit
VL_arr = CovAbDab['VL'].to_numpy()
VH_arr = CovAbDab['VH or VHH'].to_numpy()
VH_found_pr = np.zeros((len(VH_arr)))
VL_found_pr = np.zeros((len(VL_arr)))
VH_found_nt = np.zeros((len(VH_arr)))
VL_found_nt = np.zeros((len(VL_arr)))


sequences_not_in_covabdab_nt = 0
aa_seqs_nt = []
for i in range(len(search_results['nucleotide'])):
    nt_seq = Seq(search_results['nucleotide'][i]['GBSeq_sequence'])
    lenght_sequence = len(nt_seq)
    remove_bases = lenght_sequence % 3
    if remove_bases > 0:
        nt_seq = nt_seq[:-remove_bases]
    aa_seqs_nt.append(str(nt_seq.translate()))

# prepare all variables for jit
sequences_not_in_covabdab_pr = 0
aa_seqs_pr = []
for i in range(len(search_results['protein'])):
    aa_seq = Seq(search_results['protein'][i]['GBSeq_sequence'])
    aa_seqs_pr.append(str.upper(str(aa_seq)))

# run function
VH_found_nt, VL_found_nt, sequences_not_in_covabdab_nt = search_in_covabdab(aa_seqs_nt, VH_arr, VL_arr, VH_found_nt, VL_found_nt, sequences_not_in_covabdab_nt)
VH_found_pr, VL_found_pr, sequences_not_in_covabdab_pr = search_in_covabdab(aa_seqs_pr, VH_arr, VL_arr, VH_found_pr, VL_found_pr, sequences_not_in_covabdab_pr)

# format results
sequences_in_covabdab_nt = len(search_results['nucleotide']) - sequences_not_in_covabdab_nt
sequences_in_covabdab_pr = len(search_results['protein']) - sequences_not_in_covabdab_pr
total_sequences = len(search_results['nucleotide']) + len(search_results['protein'])
total_sequences_in_covabdab = sequences_in_covabdab_pr + sequences_in_covabdab_nt
total_sequences_not_in_covabdab = sequences_not_in_covabdab_nt + sequences_not_in_covabdab_pr
CovAbDab_stats= copy.deepcopy(CovAbDab)

CovAbDab_stats['VH_found_nucleotide'] = VH_found_nt
CovAbDab_stats['VL_found_nucleotide'] = VL_found_nt
CovAbDab_stats['VH_found_protein'] = VH_found_pr
CovAbDab_stats['VL_found_protein'] = VL_found_pr
CovAbDab_stats['VH_found'] = VH_found_pr + VH_found_nt # total
CovAbDab_stats['VL_found'] = VL_found_pr + VL_found_nt # total



#print statistics

if True:
    print('total sequences assessed:', total_sequences)
    print('number of genbank sequences not in covab dab:', total_sequences_not_in_covabdab)
    print('number of genbank sequences found in covab dab:', total_sequences_in_covabdab)
    print('match rate:', total_sequences_in_covabdab / total_sequences)
    # if the total number of counts in VH and VL columns is higher than the genbank sequences that have a match in covab dab
    # then a genbank sequence must have several matches in covab dab
    print('number of genebank sequences that have multiple matches in covab dab:', (sum(CovAbDab_stats['VH_found'])+sum(CovAbDab_stats['VL_found'])-total_sequences_in_covabdab))
    # if the number of genbank sequences with a match in covab dab is higher than the number of covab dab VH and VLs that were found
    # then a number of genbank sequences must have matched to the same covab dab sequenc
    print('number of genebank entries with non unique match in covab dab:', (total_sequences_in_covabdab - (len(CovAbDab_stats.loc[(CovAbDab_stats['VH_found'] > 0)]) + len(CovAbDab_stats.loc[(CovAbDab_stats['VL_found'] > 0)]))))
    print('-------------')
    print('total sequences in covab dab:', len(CovAbDab_stats))
    print('number of Covab Dab VH sequences found based on nucleotide:', len(CovAbDab_stats.loc[(CovAbDab_stats['VH_found_nucleotide'] > 0)]))
    print('number of Covab Dab VL sequences found based on nucleotide:', len(CovAbDab_stats.loc[(CovAbDab_stats['VL_found_nucleotide'] > 0)]))
    print('number of Covab Dab VH sequences found based on protein:', len(CovAbDab_stats.loc[(CovAbDab_stats['VH_found_protein'] > 0)]))
    print('number of Covab Dab VL sequences found based on protein:', len(CovAbDab_stats.loc[(CovAbDab_stats['VL_found_protein'] > 0)]))
    print('number of Covab Dab VH sequences found total:', len(CovAbDab_stats.loc[(CovAbDab_stats['VH_found'] > 0)]))
    print('number of Covab Dab VL sequences found total:', len(CovAbDab_stats.loc[(CovAbDab_stats['VL_found'] > 0)]))
    VH_VL_pairings = len(CovAbDab_stats.loc[(CovAbDab_stats['VH_found'] > 0) & (CovAbDab_stats['VL_found'] > 0)])
    print('number of Covab Dab VH VL pairings found:', VH_VL_pairings)
    VH_or_VL = len(CovAbDab_stats.loc[(CovAbDab_stats['VH_found'] > 0) | (CovAbDab_stats['VL_found'] > 0)])
    print('number of Covab Dab entires where either VH or VL was found:', VH_or_VL)
    print('-------------')
    print('percentage of covab dab entries with pairing found:', VH_VL_pairings / len(CovAbDab_stats) * 100 )
    print('percentage of covab dab entries with either VL or VH found:', VH_or_VL / len(CovAbDab_stats) * 100)

IncompleteRead: IncompleteRead(1603 bytes read)

In [None]:
# save statistics

if True:
    with open('data/protein_nucleotide_search_stats.csv', 'r+') as fo:
        fo.read()
        fo.write(f'{search}, {total_sequences}, {total_sequences_not_in_covabdab}, {total_sequences_in_covabdab}, {total_sequences_in_covabdab / total_sequences}, {(sum(CovAbDab_stats["VH_found"])+sum(CovAbDab_stats["VL_found"])-total_sequences_in_covabdab)}, {total_sequences_in_covabdab - (len(CovAbDab_stats.loc[(CovAbDab_stats["VH_found"] > 0)]) + len(CovAbDab_stats.loc[(CovAbDab_stats["VL_found"] > 0)]))}, {len(CovAbDab_stats)}, {len(CovAbDab_stats.loc[(CovAbDab_stats["VH_found"] > 0)])}, {len(CovAbDab_stats.loc[(CovAbDab_stats["VL_found"] > 0)])}, {len(CovAbDab_stats.loc[(CovAbDab_stats["VH_found"] > 0) & (CovAbDab_stats["VL_found"] > 0)])}, {len(CovAbDab_stats.loc[(CovAbDab_stats["VH_found"] > 0) | (CovAbDab_stats["VL_found"] > 0)])}, {VH_VL_pairings / len(CovAbDab_stats) * 100}, {VH_or_VL / len(CovAbDab_stats) * 100}\n')

For this search the nucleotide search does not find any new sequences compared to the protein search

## Next tasks
1. optimise keywords to find more sequences
2. look at proteins found in genbank that are not in covab dab, are these false positives or relevant antibodies missing from covab dab
3. find a way to combine heavy and light chain sequnce
4. extract all necesary info
5. filter for false positives
6. extend to other databases