In [13]:
from Bio import Entrez
from Bio import SeqIO
from Bio.Seq import Seq
import pandas as pd
import copy
import numpy as np
import json
import pickle
import random
import multiprocessing
from numba import njit, jit

Entrez.email = "fabian.spoendlin@exeter.ox.ac.uk"


In [45]:
with open('CoV-AbDab_181021.csv', 'r') as f:
    CovAbDab = pd.read_csv(f)   

# missing seqs in Cov abdab are ND, this can be protein seq -> replace with something thats not protein seq
CovAbDab['VH or VHH'].replace(to_replace='ND', value='no sequence available', inplace=True)
CovAbDab['VL'].replace(to_replace='ND', value='no sequence available', inplace=True)
CovAbDab['VH or VHH'].fillna('no sequence available', inplace=True)
CovAbDab['VL'].fillna('no sequence available', inplace=True)

In [5]:
#search nucleotides and check how many of the sequences are in Covab Dab
handle = Entrez.esearch(db='nucleotide', term='anti-sars-cov-2[All Fields] AND immunoglobulin[All Fields]', retmax='50')
record = Entrez.read(handle)

nucleotides = []

for ID in record['IdList']:
    nucelotide = Entrez.efetch(db="nucleotide", id=ID, rettype="gb", retmode="text")
    nucelotide_formated = SeqIO.read(nucelotide,'genbank')
    nucleotides.append(nucelotide_formated)


In [6]:
summary_n = []

for entry in nucleotides:

    lenght_sequence = len(entry.seq)
    remove_bases = lenght_sequence % 3
    if remove_bases == 0:
        aa_seq = str(entry.seq.translate()) # problems if nucleotide sequence is not in correct frame
    else:
        aa_seq = str(entry.seq[:-remove_bases].translate())

    containes = []
    for VH in CovAbDab['VH or VHH']:
        if VH in aa_seq:
            containes.append('heavy chain')
    for VL in CovAbDab['VL']:
        if str(VL) in aa_seq: # nan in VL that is formated as float
            containes.append('light chain')
    
    if 'heavy chain' in containes:
        summary_n.append('heavy chain')
        
    elif 'light chain' in containes:
        summary_n.append('light chain') 

    else:
        summary_n.append('not in covab-dab')
        
print(summary_n.count('not in covab-dab'),len(summary_n))


17 50


In [62]:
#search proteins and check how many of the sequences are in Covab Dab
handle = Entrez.esearch(db='protein', term='anti-sars-cov-2[All Fields] AND immunoglobulin[All Fields]', retmax='50')
record = Entrez.read(handle)

proteins = []

for ID in record['IdList']:
    protein = Entrez.efetch(db="protein", id=ID, rettype="gb", retmode="text")
    protein_formated = SeqIO.read(protein,'genbank')
    proteins.append(protein_formated)



In [66]:
summary_p = []

for entry in proteins:

    aa_seq = str(entry.seq)

    containes = []
    for VH in CovAbDab['VH or VHH']:
        if VH in aa_seq:
            containes.append('heavy chain')
    for VL in CovAbDab['VL']:
        if str(VL) in aa_seq: # nan in VL that is formated as float
            containes.append('light chain')
    
    if 'heavy chain' in containes:
        summary_p.append('heavy chain')
        
    elif 'light chain' in containes:
        summary_p.append('light chain') 

    else:
        summary_p.append('not in covab-dab')

print(summary_p.count('not in covab-dab'),len(summary_p))

13 50


In [67]:
#nucelotide and protein database contains sequence of whole chain not just VL and VH, thus this comparison does not work
summary_2_n = []

for entry in nucleotides:

    aa_seq = str(entry.seq.translate())
   
    if aa_seq in CovAbDab['VH or VHH']:
        summary_2_n.append('heavy chain')

    elif aa_seq in str(CovAbDab['VL']):
        summary_2_n.append('light chain') 

    else:
        summary_2_n.append('not in covab-dab')

print(summary_2_n.count('not in covab-dab'),len(summary_2_n))


summary_2_p = []

for entry in proteins:

    aa_seq = str(entry.seq)
   
    if aa_seq in CovAbDab['VH or VHH']:
        summary_2_p.append('heavy chain')

    elif aa_seq in str(CovAbDab['VL']):
        summary_2_p.append('light chain') 

    else:
        summary_2_p.append('not in covab-dab')

print(summary_2_p.count('not in covab-dab'),len(summary_2_p))

50 50
50 50




# Protein search pipeline

In [50]:
# perform keyword search

# specify the terms used for the search
search = 'anti-sars-cov-2[All Fields] AND immunoglobulin[All Fields]'

# search protein data base with keywords and find out how many entries are found
handle = Entrez.esearch(db='protein', term='anti-sars-cov-2[All Fields] AND immunoglobulin[All Fields]', retmax='2')
record = Entrez.read(handle)
number_of_entries = int(record['Count'])
print('number of entries:', number_of_entries)

number of entries: 2685


In [51]:
# download all entries from search

handle = Entrez.esearch(db='protein', term=search, retmax=number_of_entries)
record = Entrez.read(handle)

# 25 searches per second
protein_handle =  Entrez.efetch(db="protein", id=record['IdList'], rettype="gb", retmode="xml")
proteins = Entrez.read(protein_handle)




# compare found sequences to covab dab

# use numba to speed up computation
@jit
def search_in_covabdab(aa_seqs, VH_arr, VL_arr, VH_found, VL_found, sequences_not_in_covabdab):

    sequences_not_in_covabdab = 0

    # loop throught aa seqs
    for aa_seq in aa_seqs:
        sequence_found = False

        # loop throught covab dab entries
        for i in range(len(VH_arr)):

            # in case VH is in covab dab increase the VH count of this entry by 1
            if VH_arr[i] in aa_seq:
                VH_found[i] = VH_found[i] + 1
                sequence_found = True

            # in case VL is in covab dab increase the VL count of this entry by 1
            if VL_arr[i] in aa_seq:
                VL_found[i] = VL_found[i] + 1
                sequence_found = True

        # sequence that has no match with vh or vl is counted as a not found sequence
        if not sequence_found:
            sequences_not_in_covabdab = sequences_not_in_covabdab + 1

        
    return VH_found, VL_found, sequences_not_in_covabdab

# prepare all variables for jit
sequences_not_in_covabdab = 0
VL_arr = CovAbDab['VL'].to_numpy()
VH_arr = CovAbDab['VH or VHH'].to_numpy()
VH_found = np.zeros((len(VH_arr)))
VL_found = np.zeros((len(VL_arr)))
aa_seqs = []
for i in range(len(proteins)):
    aa_seq = Seq(proteins[i]['GBSeq_sequence'])
    aa_seqs.append(str.upper(str(aa_seq)))

# run function
VH_found, VL_found, sequences_not_in_covabdab = search_in_covabdab(aa_seqs, VH_arr, VL_arr, VH_found, VL_found, sequences_not_in_covabdab)

# format results
sequences_in_covabdab = len(proteins) - sequences_not_in_covabdab
CovAbDab_stats_pr = copy.deepcopy(CovAbDab)
CovAbDab_stats_pr['VH_found'] = VH_found
CovAbDab_stats_pr['VL_found'] = VL_found




#print summary statistics

if True:
    print('total sequences assessed:', len(proteins))
    print('number of genbank sequences not in covab dab:', sequences_not_in_covabdab)
    print('number of genbank sequences found in covab dab:', sequences_in_covabdab)
    print('match rate:', sequences_in_covabdab / len(proteins))
    # if the total number of counts in VH and VL columns is higher than the genbank sequences that have a match in covab dab
    # then a genbank sequence must have several matches in covab dab
    print('number of genebank sequences that have multiple matches in covab dab:', (sum(CovAbDab_stats_pr['VH_found'])+sum(CovAbDab_stats_pr['VL_found'])-sequences_in_covabdab))
    # if the number of genbank sequences with a match in covab dab is higher than the number of covab dab VH and VLs that were found
    # then a number of genbank sequences must have matched to the same covab dab sequenc
    # stat is wrong
    print('number of genebank entries with non unique match in covab dab:', (sequences_in_covabdab - (len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr['VH_found'] > 0)]) + len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr['VL_found'] > 0)]))))
    print('-------------')
    print('total sequences in covab dab:', len(CovAbDab_stats_pr))
    print('number of Covab Dab VH sequences found:', len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr['VH_found'] > 0)]))
    print('number of Covab Dab VL sequences found:', len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr['VL_found'] > 0)]))
    VH_VL_pairings = len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr['VH_found'] > 0) & (CovAbDab_stats_pr['VL_found'] > 0)])
    print('number of Covab Dab VH VL pairings found:', VH_VL_pairings)
    VH_or_VL = len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr['VH_found'] > 0) | (CovAbDab_stats_pr['VL_found'] > 0)])
    print('number of Covab Dab entires where either VH or VL was found:', VH_or_VL)
    print('-------------')
    print('percentage of covab dab entries with pairing found:', VH_VL_pairings / len(CovAbDab_stats_pr) * 100 )
    print('percentage of covab dab entries with either VL or VH found:', VH_or_VL / len(CovAbDab_stats_pr) * 100)

Compilation is falling back to object mode WITH looplifting enabled because Function "search_in_covabdab" failed type inference due to: non-precise type array(pyobject, 1d, C)
During: typing of argument at /var/folders/8x/40q8fgwd2wg9ptnzw_b8cl480000gn/T/ipykernel_92357/1416774354.py (19)

File "../../../../../../../../var/folders/8x/40q8fgwd2wg9ptnzw_b8cl480000gn/T/ipykernel_92357/1416774354.py", line 19:
<source missing, REPL/exec in use?>

  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "search_in_covabdab" failed type inference due to: Cannot determine Numba type of <class 'numba.core.dispatcher.LiftedLoop'>

File "../../../../../../../../var/folders/8x/40q8fgwd2wg9ptnzw_b8cl480000gn/T/ipykernel_92357/1416774354.py", line 22:
<source missing, REPL/exec in use?>

  @jit

File "../../../../../../../../var/folders/8x/40q8fgwd2wg9ptnzw_b8cl480000gn/T/ipykernel_92357/1416774354.py", line 19:
<source missing, REPL/exec in use?>

Fall-back fr

total sequences assessed: 2685
number of genbank sequences not in covab dab: 409
number of genbank sequences found in covab dab: 2276
match rate: 0.8476722532588454
number of genebank sequences that have multiple matches in covab dab: 636.0
number of genebank entries with non unique match in covab dab: 13
-------------
total sequences in covab dab: 4198
number of Covab Dab VH sequences found: 1110
number of Covab Dab VL sequences found: 1153
number of Covab Dab VH VL pairings found: 1059
number of Covab Dab entires where either VH or VL was found: 1204
-------------
percentage of covab dab entries with pairing found: 25.226298237255833
percentage of covab dab entries with either VL or VH found: 28.680323963792283


In [None]:
# save stats

if False:
    with open('data/protein_search_stats.csv', 'r+') as fo:
        fo.read()
        fo.write(f'''{search} - samples: {number_of_samples}, 
                    {iteration_count}, 
                    {sequences_not_in_covabdab}, 
                    {sequences_in_covabdab}, 
                    {sequences_in_covabdab / iteration_count}, 
                    {(sum(CovAbDab_stats_pr["VH_found"]) + sum(CovAbDab_stats_pr["VL_found"]) - sequences_in_covabdab)}, 
                    {(sequences_in_covabdab - (len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr["VH_found"] > 0)])) + len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr["VL_found"] > 0)]))}, 
                    {len(CovAbDab_stats_pr)}, {len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr["VH_found"] > 0)])}, 
                    {len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr["VL_found"] > 0)])}, 
                    {len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr["VH_found"] > 0) & (CovAbDab_stats_pr["VL_found"] > 0)])}, 
                    {len(CovAbDab_stats_pr.loc[(CovAbDab_stats_pr["VH_found"] > 0) | (CovAbDab_stats_pr["VL_found"] > 0)])}, 
                    {VH_VL_pairings / len(CovAbDab_stats_pr) * 100}, {VH_or_VL / len(CovAbDab_stats_pr) * 100}''')

# Nucleotide search pipeline

In [73]:
# perform keyword search

# specify the terms used for the search
search = 'anti-sars-cov-2[All Fields] AND immunoglobulin[All Fields]'

# search protein data base with keywords and find out how many entries are found
handle = Entrez.esearch(db='nucleotide', term='anti-sars-cov-2[All Fields] AND immunoglobulin[All Fields]', retmax='2')
record = Entrez.read(handle)
number_of_entries = int(record['Count'])
print('number of entries:', number_of_entries)

number of entries: 2312


In [75]:
# download all entries from search

handle = Entrez.esearch(db='nucleotide', term=search, retmax=number_of_entries)
record = Entrez.read(handle)

# 25 searches per second
nucleotide_handle =  Entrez.efetch(db="nucleotide", id=record['IdList'], rettype="gb", retmode="xml")
nucleotides = Entrez.read(nucleotide_handle)




# compare found sequences to covab dab

# use numba to speed up computation
@jit
def search_in_covabdab(aa_seqs, VH_arr, VL_arr, VH_found, VL_found, sequences_not_in_covabdab):

    sequences_not_in_covabdab = 0

    # loop throught aa seqs
    for aa_seq in aa_seqs:
        sequence_found = False

        # loop throught covab dab entries
        for i in range(len(VH_arr)):

            # in case VH is in covab dab increase the VH count of this entry by 1
            if VH_arr[i] in aa_seq:
                VH_found[i] = VH_found[i] + 1
                sequence_found = True

            # in case VL is in covab dab increase the VL count of this entry by 1
            if str(VL_arr[i]) in aa_seq:
                VL_found[i] = VL_found[i] + 1
                sequence_found = True

        # sequence that has no match with vh or vl is counted as a not found sequence
        if not sequence_found:
            sequences_not_in_covabdab = sequences_not_in_covabdab + 1

        
    return VH_found, VL_found, sequences_not_in_covabdab

# prepare all variables for jit
sequences_not_in_covabdab = 0
VL_arr = CovAbDab['VL'].to_numpy()
VH_arr = CovAbDab['VH or VHH'].to_numpy()
VH_found = np.zeros((len(VH_arr)))
VL_found = np.zeros((len(VL_arr)))
aa_seqs = []
for i in range(len(nucleotides)):
    nt_seq = Seq(nucleotides[i]['GBSeq_sequence'])
    lenght_sequence = len(nt_seq)
    remove_bases = lenght_sequence % 3
    if remove_bases > 0:
        nt_seq = nt_seq[:-remove_bases]
    aa_seqs.append(str(nt_seq.translate()))

# run function
VH_found, VL_found, sequences_not_in_covabdab = search_in_covabdab(aa_seqs, VH_arr, VL_arr, VH_found, VL_found, sequences_not_in_covabdab)

# format results
sequences_in_covabdab = len(nucleotides) - sequences_not_in_covabdab
CovAbDab_stats_nt= copy.deepcopy(CovAbDab)
CovAbDab_stats_nt['VH_found'] = VH_found
CovAbDab_stats_nt['VL_found'] = VL_found



#print statistics

if True:
    print('total sequences assessed:', len(nucleotides))
    print('number of genbank sequences not in covab dab:', sequences_not_in_covabdab)
    print('number of genbank sequences found in covab dab:', sequences_in_covabdab)
    print('match rate:', sequences_in_covabdab / len(nucleotides))
    # if the total number of counts in VH and VL columns is higher than the genbank sequences that have a match in covab dab
    # then a genbank sequence must have several matches in covab dab
    print('number of genebank sequences that have multiple matches in covab dab:', (sum(CovAbDab_stats_nt['VH_found'])+sum(CovAbDab_stats_nt['VL_found'])-sequences_in_covabdab))
    # if the number of genbank sequences with a match in covab dab is higher than the number of covab dab VH and VLs that were found
    # then a number of genbank sequences must have matched to the same covab dab sequenc
    print('number of genebank entries with non unique match in covab dab:', (sequences_in_covabdab - (len(CovAbDab_stats_nt.loc[(CovAbDab_stats_nt['VH_found'] > 0)]) + len(CovAbDab_stats_nt.loc[(CovAbDab_stats_nt['VL_found'] > 0)]))))
    print('-------------')
    print('total sequences in covab dab:', len(CovAbDab_stats_nt))
    print('number of Covab Dab VH sequences found:', len(CovAbDab_stats_nt.loc[(CovAbDab_stats_nt['VH_found'] > 0)]))
    print('number of Covab Dab VL sequences found:', len(CovAbDab_stats_nt.loc[(CovAbDab_stats_nt['VL_found'] > 0)]))
    VH_VL_pairings = len(CovAbDab_stats_nt.loc[(CovAbDab_stats_nt['VH_found'] > 0) & (CovAbDab_stats_nt['VL_found'] > 0)])
    print('number of Covab Dab VH VL pairings found:', VH_VL_pairings)
    VH_or_VL = len(CovAbDab_stats_nt.loc[(CovAbDab_stats_nt['VH_found'] > 0) | (CovAbDab_stats_nt['VL_found'] > 0)])
    print('number of Covab Dab entires where either VH or VL was found:', VH_or_VL)
    print('-------------')
    print('percentage of covab dab entries with pairing found:', VH_VL_pairings / len(CovAbDab_stats_nt) * 100 )
    print('percentage of covab dab entries with either VL or VH found:', VH_or_VL / len(CovAbDab_stats_nt) * 100)


Compilation is falling back to object mode WITH looplifting enabled because Function "search_in_covabdab" failed type inference due to: non-precise type array(pyobject, 1d, C)
During: typing of argument at /var/folders/8x/40q8fgwd2wg9ptnzw_b8cl480000gn/T/ipykernel_92357/4195458418.py (19)

File "../../../../../../../../var/folders/8x/40q8fgwd2wg9ptnzw_b8cl480000gn/T/ipykernel_92357/4195458418.py", line 19:
<source missing, REPL/exec in use?>

  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "search_in_covabdab" failed type inference due to: Cannot determine Numba type of <class 'numba.core.dispatcher.LiftedLoop'>

File "../../../../../../../../var/folders/8x/40q8fgwd2wg9ptnzw_b8cl480000gn/T/ipykernel_92357/4195458418.py", line 22:
<source missing, REPL/exec in use?>

  @jit

File "../../../../../../../../var/folders/8x/40q8fgwd2wg9ptnzw_b8cl480000gn/T/ipykernel_92357/4195458418.py", line 19:
<source missing, REPL/exec in use?>

Fall-back fr

total sequences assessed: 2312
number of genbank sequences not in covab dab: 316
number of genbank sequences found in covab dab: 1996
match rate: 0.8633217993079585
number of genebank sequences that have multiple matches in covab dab: 614.0
number of genebank entries with non unique match in covab dab: -55
-------------
total sequences in covab dab: 4198
number of Covab Dab VH sequences found: 1002
number of Covab Dab VL sequences found: 1049
number of Covab Dab VH VL pairings found: 976
number of Covab Dab entires where either VH or VL was found: 1075
-------------
percentage of covab dab entries with pairing found: 23.249166269652218
percentage of covab dab entries with either VL or VH found: 25.607432110528823


In [None]:
# save statistics

if False:
    with open('data/nucleotide_search_stats.csv', 'r+') as fo:
        fo.read()
        fo.write(f'''{search} - samples: {number_of_samples}, 
                    {iteration_count}, 
                    {sequences_not_in_covabdab }, 
                    {sequences_in_covabdab }, 
                    {sequences_in_covabdab / iteration_count}, 
                    {(sum(CovAbDab_stats["VH_found"]) + sum(CovAbDab_stats["VL_found"]) - sequences_in_covabdab)}, 
                    {(sequences_in_covabdab - (len(CovAbDab_stats.loc[(CovAbDab_stats["VH_found"] > 0)])) + len(CovAbDab_stats.loc[(CovAbDab_stats["VL_found"] > 0)]))}, 
                    {len(CovAbDab_stats)}, {len(CovAbDab_stats.loc[(CovAbDab_stats["VH_found"] > 0)])}, 
                    {len(CovAbDab_stats.loc[(CovAbDab_stats["VL_found"] > 0)])}, 
                    {len(CovAbDab_stats.loc[(CovAbDab_stats["VH_found"] > 0) & (CovAbDab_stats["VL_found"] > 0)])}, 
                    {len(CovAbDab_stats.loc[(CovAbDab_stats["VH_found"] > 0) | (CovAbDab_stats["VL_found"] > 0)])}, 
                    {VH_VL_pairings / len(CovAbDab_stats) * 100}, {VH_or_VL / len(CovAbDab_stats) * 100}''')

# Search both protein and nucelotide

In [54]:
# perform keyword search

# specify the terms used for the search
search = 'anti-sars-cov-2[All Fields] AND immunoglobulin[All Fields]'

# search protein data base with keywords and find out how many entries are found
handle_pr = Entrez.esearch(db='protein', term='anti-sars-cov-2[All Fields] AND immunoglobulin[All Fields]', retmax='2')
record_pr = Entrez.read(handle_pr)
number_of_proteins = int(record_pr['Count'])
print('number of proteins:', number_of_proteins)


# search protein data base with keywords and find out how many entries are found
handle_nt = Entrez.esearch(db='nucleotide', term='anti-sars-cov-2[All Fields] AND immunoglobulin[All Fields]', retmax='2')
record_nt = Entrez.read(handle_nt)
number_of_nucleotides = int(record_nt['Count'])
print('number of nucelotides:', number_of_nucleotides)

number of proteins: 2685
number of nucelotides: 2312


In [70]:
# download all entries from search
def download_entries(database):

    if database == 'protein':
        handle = Entrez.esearch(db='protein', term=search, retmax=number_of_proteins)
        record = Entrez.read(handle)

        # 25 searches per second
        entry_handle =  Entrez.efetch(db="protein", id=record['IdList'], rettype="gb", retmode="xml")
        entries = Entrez.read(entry_handle)

    if database == 'nucleotide':
        handle = Entrez.esearch(db='nucleotide', term=search, retmax=number_of_nucleotides)
        record = Entrez.read(handle)

        # 25 searches per second
        entry_handle =  Entrez.efetch(db="nucleotide", id=record['IdList'], rettype="gb", retmode="xml")
        entries = Entrez.read(entry_handle)

    search_results[database] = entries


search_results = {}
for database in ['protein','nucleotide']:
    download_entries(database)

# run searches on multiprocessing
#pool = multiprocessing.Pool()
#pool.map(download_entries, ['protein', 'nucleotide'])
#pool.close()

# compare found sequences to covab dab

# use numba to speed up computation
@jit
def search_in_covabdab(aa_seqs, VH_arr, VL_arr, VH_found, VL_found, sequences_not_in_covabdab):

    # loop throught aa seqs
    for aa_seq in aa_seqs:
        sequence_found = False

        # loop throught covab dab entries
        for i in range(len(VH_arr)):

            # in case VH is in covab dab increase the VH count of this entry by 1
            if VH_arr[i] in aa_seq:
                VH_found[i] = VH_found[i] + 1
                sequence_found = True

            # in case VL is in covab dab increase the VL count of this entry by 1
            if str(VL_arr[i]) in aa_seq:
                VL_found[i] = VL_found[i] + 1
                sequence_found = True

        # sequence that has no match with vh or vl is counted as a not found sequence
        if not sequence_found:
            sequences_not_in_covabdab = sequences_not_in_covabdab + 1

        
    return VH_found, VL_found, sequences_not_in_covabdab

# prepare all variables for jit
VL_arr = CovAbDab['VL'].to_numpy()
VH_arr = CovAbDab['VH or VHH'].to_numpy()
VH_found_pr = np.zeros((len(VH_arr)))
VL_found_pr = np.zeros((len(VL_arr)))
VH_found_nt = np.zeros((len(VH_arr)))
VL_found_nt = np.zeros((len(VL_arr)))


sequences_not_in_covabdab_nt = 0
aa_seqs_nt = []
for i in range(len(search_results['nucleotide'])):
    nt_seq = Seq(search_results['nucleotide'][i]['GBSeq_sequence'])
    lenght_sequence = len(nt_seq)
    remove_bases = lenght_sequence % 3
    if remove_bases > 0:
        nt_seq = nt_seq[:-remove_bases]
    aa_seqs_nt.append(str(nt_seq.translate()))

# prepare all variables for jit
sequences_not_in_covabdab_pr = 0
aa_seqs_pr = []
for i in range(len(search_results['protein'])):
    aa_seq = Seq(search_results['protein'][i]['GBSeq_sequence'])
    aa_seqs_pr.append(str.upper(str(aa_seq)))

# run function
VH_found_nt, VL_found_nt, sequences_not_in_covabdab_nt = search_in_covabdab(aa_seqs_nt, VH_arr, VL_arr, VH_found_nt, VL_found_nt, sequences_not_in_covabdab_nt)
VH_found_pr, VL_found_pr, sequences_not_in_covabdab_pr = search_in_covabdab(aa_seqs_pr, VH_arr, VL_arr, VH_found_pr, VL_found_pr, sequences_not_in_covabdab_pr)

# format results
sequences_in_covabdab_nt = len(search_results['nucleotide']) - sequences_not_in_covabdab_nt
sequences_in_covabdab_pr = len(search_results['protein']) - sequences_not_in_covabdab_pr
total_sequences = len(search_results['nucleotide']) + len(search_results['protein'])
total_sequences_in_covabdab = sequences_in_covabdab_pr + sequences_in_covabdab_nt
total_sequences_not_in_covabdab = sequences_not_in_covabdab_nt + sequences_not_in_covabdab_pr
CovAbDab_stats= copy.deepcopy(CovAbDab)

CovAbDab_stats['VH_found_nucleotide'] = VH_found_nt
CovAbDab_stats['VL_found_nucleotide'] = VL_found_nt
CovAbDab_stats['VH_found_protein'] = VH_found_pr
CovAbDab_stats['VL_found_protein'] = VL_found_pr
CovAbDab_stats['VH_found'] = VH_found_pr + VH_found_nt # total
CovAbDab_stats['VL_found'] = VL_found_pr + VL_found_nt # total



#print statistics

if True:
    print('total sequences assessed:', total_sequences)
    print('number of genbank sequences not in covab dab:', total_sequences_not_in_covabdab)
    print('number of genbank sequences found in covab dab:', total_sequences_in_covabdab)
    print('match rate:', total_sequences_in_covabdab / total_sequences)
    # if the total number of counts in VH and VL columns is higher than the genbank sequences that have a match in covab dab
    # then a genbank sequence must have several matches in covab dab
    print('number of genebank sequences that have multiple matches in covab dab:', (sum(CovAbDab_stats['VH_found'])+sum(CovAbDab_stats['VL_found'])-total_sequences_in_covabdab))
    # if the number of genbank sequences with a match in covab dab is higher than the number of covab dab VH and VLs that were found
    # then a number of genbank sequences must have matched to the same covab dab sequenc
    print('number of genebank entries with non unique match in covab dab:', (total_sequences_in_covabdab - (len(CovAbDab_stats.loc[(CovAbDab_stats['VH_found'] > 0)]) + len(CovAbDab_stats.loc[(CovAbDab_stats['VL_found'] > 0)]))))
    print('-------------')
    print('total sequences in covab dab:', len(CovAbDab_stats))
    print('number of Covab Dab VH sequences found based on nucleotide:', len(CovAbDab_stats.loc[(CovAbDab_stats['VH_found_nucleotide'] > 0)]))
    print('number of Covab Dab VL sequences found based on nucleotide:', len(CovAbDab_stats.loc[(CovAbDab_stats['VL_found_nucleotide'] > 0)]))
    print('number of Covab Dab VH sequences found based on protein:', len(CovAbDab_stats.loc[(CovAbDab_stats['VH_found_protein'] > 0)]))
    print('number of Covab Dab VL sequences found based on protein:', len(CovAbDab_stats.loc[(CovAbDab_stats['VL_found_protein'] > 0)]))
    print('number of Covab Dab VH sequences found total:', len(CovAbDab_stats.loc[(CovAbDab_stats['VH_found'] > 0)]))
    print('number of Covab Dab VL sequences found total:', len(CovAbDab_stats.loc[(CovAbDab_stats['VL_found'] > 0)]))
    VH_VL_pairings = len(CovAbDab_stats.loc[(CovAbDab_stats['VH_found'] > 0) & (CovAbDab_stats['VL_found'] > 0)])
    print('number of Covab Dab VH VL pairings found:', VH_VL_pairings)
    VH_or_VL = len(CovAbDab_stats.loc[(CovAbDab_stats['VH_found'] > 0) | (CovAbDab_stats['VL_found'] > 0)])
    print('number of Covab Dab entires where either VH or VL was found:', VH_or_VL)
    print('-------------')
    print('percentage of covab dab entries with pairing found:', VH_VL_pairings / len(CovAbDab_stats) * 100 )
    print('percentage of covab dab entries with either VL or VH found:', VH_or_VL / len(CovAbDab_stats) * 100)

Compilation is falling back to object mode WITH looplifting enabled because Function "search_in_covabdab" failed type inference due to: non-precise type array(pyobject, 1d, C)
During: typing of argument at /var/folders/8x/40q8fgwd2wg9ptnzw_b8cl480000gn/T/ipykernel_92357/3240083081.py (39)

File "../../../../../../../../var/folders/8x/40q8fgwd2wg9ptnzw_b8cl480000gn/T/ipykernel_92357/3240083081.py", line 39:
<source missing, REPL/exec in use?>

  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "search_in_covabdab" failed type inference due to: Cannot determine Numba type of <class 'numba.core.dispatcher.LiftedLoop'>

File "../../../../../../../../var/folders/8x/40q8fgwd2wg9ptnzw_b8cl480000gn/T/ipykernel_92357/3240083081.py", line 39:
<source missing, REPL/exec in use?>

  @jit

File "../../../../../../../../var/folders/8x/40q8fgwd2wg9ptnzw_b8cl480000gn/T/ipykernel_92357/3240083081.py", line 39:
<source missing, REPL/exec in use?>

Fall-back fr

total sequences assessed: 4997
number of genbank sequences not in covab dab: 725
number of genbank sequences found in covab dab: 4272
match rate: 0.8549129477686612
number of genebank sequences that have multiple matches in covab dab: 1250.0
number of genebank entries with non unique match in covab dab: 2009
-------------
total sequences in covab dab: 4198
number of Covab Dab VH sequences found based on nucleotide: 1002
number of Covab Dab VL sequences found based on nucleotide: 1049
number of Covab Dab VH sequences found based on protein: 1110
number of Covab Dab VL sequences found based on protein: 1153
number of Covab Dab VH sequences found total: 1110
number of Covab Dab VL sequences found total: 1153
number of Covab Dab VH VL pairings found: 1059
number of Covab Dab entires where either VH or VL was found: 1204
-------------
percentage of covab dab entries with pairing found: 25.226298237255833
percentage of covab dab entries with either VL or VH found: 28.680323963792283


For this search the nucleotide search does not find any new sequences compared to the protein search

### Version without numpa (takes 3 min / 1000 seqs)

In [25]:
# copy covabdab dataframe to add stats
CovAbDab_stats_nt= copy.deepcopy(CovAbDab)

# add columns to count how many times a certain VH or VL was found in the genbank
CovAbDab_stats_nt['VH_found'] = 0
CovAbDab_stats_nt['VL_found'] = 0

sequences_not_in_covabdab = 0
iteration_count = 0

# loop throught the feched proteins
for nucleotide in nucleotides:

    iteration_count += 1

    nt_seq = nucleotide['GBSeq_sequence']
    lenght_sequence = len(nt_seq)
    remove_bases = lenght_sequence % 3
    
    if remove_bases > 0:
        nt_seq = nt_seq[:-remove_bases]

    nt_seq = Seq(nt_seq)
    aa_seq = nt_seq.translate()

    sequence_found = False

    # loop throught covab dab entries
    for i in range(len(CovAbDab_stats_nt)):
        
        # in case VH is in covab dab increase the VH count of this entry by 1
        if CovAbDab_stats_nt.iloc[i,8] in aa_seq:
            CovAbDab_stats_nt.iloc[i,-2] = CovAbDab_stats_nt.iloc[i,-2] + 1
            sequence_found = True

        # in case VL is in covab dab increase the VL count of this entry by 1
        if str(CovAbDab_stats_nt.iloc[i,9]) in aa_seq:
            CovAbDab_stats_nt.iloc[i,-1] = CovAbDab_stats_nt.iloc[i,-1] + 1
            sequence_found = True

    # sequence that has no match with vh or vl is counted as a not found sequence
    if not sequence_found:
        sequences_not_in_covabdab += 1

sequences_in_covabdab = iteration_count - sequences_not_in_covabdab

KeyboardInterrupt: 

## Next tasks
1. optimise keywords to find more sequences
2. use multiprocessing
3. look at proteins found in genbank that are not in covab dab, are these false positives or relevant antibodies missing from covab dab

In [17]:
#get more infomrtion for each entry
handle = Entrez.esearch(db='nucleotide', term='anti-sars-cov-2[All Fields] AND immunoglobulin[All Fields]', retmax='10')
record = Entrez.read(handle)

genbank_entries_2 = []

for ID in record['IdList']:
    genbank_entries_2.append(Entrez.efetch(db="nucleotide", id=ID, rettype="gb", retmode="text").read())

print(genbank_entries_2[0])

LOCUS       MZ751050                 321 bp    mRNA    linear   ROD 17-AUG-2021
DEFINITION  Mus musculus clone 15G9/10D2 anti-SARS-CoV-2 spike protein
            immunoglobulin light chain variable region mRNA, partial cds.
ACCESSION   MZ751050
VERSION     MZ751050.1
KEYWORDS    .
SOURCE      Mus musculus (house mouse)
  ORGANISM  Mus musculus
            Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
            Mammalia; Eutheria; Euarchontoglires; Glires; Rodentia; Myomorpha;
            Muroidea; Muridae; Murinae; Mus; Mus.
REFERENCE   1  (bases 1 to 321)
  AUTHORS   Zhang,G., Wang,A. and Jiang,M.
  TITLE     Epitope profiling reveals the critical antigenic determinants in
            SARSCoV-2 RBD-based antigen
  JOURNAL   Unpublished
REFERENCE   2  (bases 1 to 321)
  AUTHORS   Zhang,G., Wang,A. and Jiang,M.
  TITLE     Direct Submission
  JOURNAL   Submitted (09-AUG-2021) College of Animal Science and Veterinary
            Medicine, Henan Agricultural Univers