In [137]:
from Bio import Entrez
from Bio import SeqIO
import pandas as pd
import copy

# Contact address that Biopython attaches to the Entrez requests made below.
Entrez.email = "fabian.spoendlin@exeter.ox.ac.uk"

# Load the CoV-AbDab CSV export into a DataFrame; every later cell reads it
# through the name `CovAbDab`.
with open('CoV-AbDab_181021.csv', mode='r') as csv_file:
    CovAbDab = pd.read_csv(csv_file)

In [None]:
# Download the PubMed entry 32571838 as raw text.
# The `with` block closes the network handle once the response has been read,
# instead of leaving the connection open for the lifetime of the kernel.
with Entrez.efetch(db='pubmed', id='32571838') as handle:
    record = handle.read()


In [None]:
# List the protein records that NCBI links to PubMed article 32571838.
# The elink handle is closed as soon as its XML has been parsed.
with Entrez.elink(db='protein', dbfrom='pubmed', id='32571838') as link_handle:
    related = Entrez.read(link_handle)

# One line per link set: target database, link name, number of linked records.
for linksetdb in related[0]["LinkSetDb"]:
    print(linksetdb["DbTo"], linksetdb["LinkName"], len(linksetdb["Link"]))


In [None]:
# List the nucleotide records that NCBI links to PubMed article 32571838.
# The elink handle is closed as soon as its XML has been parsed.
with Entrez.elink(db='nucleotide', dbfrom='pubmed', id='32571838') as link_handle:
    related = Entrez.read(link_handle)

# One line per link set: target database, link name, number of linked records.
for linksetdb in related[0]["LinkSetDb"]:
    print(linksetdb["DbTo"], linksetdb["LinkName"], len(linksetdb["Link"]))

# Show the full first link set as the cell output.
related[0]

In [134]:
#search nucleotides and check how many of the sequences are in CoV-AbDab
# Every Entrez call returns a network handle; the `with` blocks close each one as
# soon as its response has been parsed instead of leaving 51 connections open.
with Entrez.esearch(db='nucleotide', term='anti-sars-cov-2[All Fields] AND immunoglobulin[All Fields]', retmax='50') as handle:
    record = Entrez.read(handle)

# Parsed GenBank records, in the order the search returned their IDs.
nucleotides = []

for ID in record['IdList']:
    # One GenBank flat-file record per ID; SeqIO.read expects exactly one record.
    with Entrez.efetch(db="nucleotide", id=ID, rettype="gb", retmode="text") as nucleotide:
        nucleotides.append(SeqIO.read(nucleotide, 'genbank'))


In [135]:
# Classify every fetched nucleotide record by whether its translation contains a
# CoV-AbDab heavy- or light-chain variable domain as a substring.

# Empty CSV cells are read as float NaN and cannot be used as a substring
# pattern, so keep only real strings. The lists are built once, outside the loop.
vh_seqs = [seq for seq in CovAbDab['VH or VHH'] if isinstance(seq, str)]
vl_seqs = [seq for seq in CovAbDab['VL'] if isinstance(seq, str)]

# One label per record: 'heavy chain', 'light chain' or 'not in covab-dab'.
summary_n = []

for entry in nucleotides:

    # Drop a trailing partial codon so only complete codons are translated.
    remove_bases = len(entry.seq) % 3
    coding_seq = entry.seq[:-remove_bases] if remove_bases else entry.seq
    # Translation starts at the first base: a record that is not in frame 1
    # yields a wrong protein and will be reported as 'not in covab-dab'.
    aa_seq = str(coding_seq.translate())

    # A heavy-chain hit takes precedence over a light-chain hit; any() stops at
    # the first match instead of scanning the whole database column.
    if any(VH in aa_seq for VH in vh_seqs):
        summary_n.append('heavy chain')
    elif any(VL in aa_seq for VL in vl_seqs):
        summary_n.append('light chain')
    else:
        summary_n.append('not in covab-dab')

# Number of records without any match, followed by the number of records checked.
print(summary_n.count('not in covab-dab'),len(summary_n))


17 50


In [119]:
#search proteins and check how many of the sequences are in CoV-AbDab
# Every Entrez call returns a network handle; the `with` blocks close each one as
# soon as its response has been parsed instead of leaving the connections open.
with Entrez.esearch(db='protein', term='anti-sars-cov-2[All Fields] AND immunoglobulin[All Fields]', retmax='50') as handle:
    record = Entrez.read(handle)

# Parsed GenBank protein records, in the order the search returned their IDs.
proteins = []

for ID in record['IdList']:
    # One GenBank flat-file record per ID; SeqIO.read expects exactly one record.
    with Entrez.efetch(db="protein", id=ID, rettype="gb", retmode="text") as protein:
        proteins.append(SeqIO.read(protein, 'genbank'))



In [131]:
# Classify every fetched protein record by whether it contains a CoV-AbDab
# heavy- or light-chain variable domain as a substring.

# Empty CSV cells are read as float NaN and cannot be used as a substring
# pattern, so keep only real strings. The lists are built once, outside the loop.
vh_seqs = [seq for seq in CovAbDab['VH or VHH'] if isinstance(seq, str)]
vl_seqs = [seq for seq in CovAbDab['VL'] if isinstance(seq, str)]

# One label per record: 'heavy chain', 'light chain' or 'not in covab-dab'.
summary_p = []

for entry in proteins:

    aa_seq = str(entry.seq)

    # A heavy-chain hit takes precedence over a light-chain hit; any() stops at
    # the first match instead of scanning the whole database column.
    if any(VH in aa_seq for VH in vh_seqs):
        summary_p.append('heavy chain')
    elif any(VL in aa_seq for VL in vl_seqs):
        summary_p.append('light chain')
    else:
        summary_p.append('not in covab-dab')

# Number of records without any match, followed by the number of records checked.
print(summary_p.count('not in covab-dab'),len(summary_p))

8 50


In [136]:
# Exact-match variant: a record only counts when its whole translation equals a
# CoV-AbDab VH or VL sequence. Records that hold more than the variable domain
# (e.g. a whole chain) can never match this way.
#
# `x in series` tests the Series *index* and `str(series)` is only the truncated
# printed form, so membership is checked against sets of the column values.
vh_exact = {seq for seq in CovAbDab['VH or VHH'] if isinstance(seq, str)}
vl_exact = {seq for seq in CovAbDab['VL'] if isinstance(seq, str)}

# One label per record: 'heavy chain', 'light chain' or 'not in covab-dab'.
summary_2_n = []

for entry in nucleotides:

    # Drop a trailing partial codon so only complete codons are translated.
    remove_bases = len(entry.seq) % 3
    coding_seq = entry.seq[:-remove_bases] if remove_bases else entry.seq
    aa_seq = str(coding_seq.translate())

    if aa_seq in vh_exact:
        summary_2_n.append('heavy chain')

    elif aa_seq in vl_exact:
        summary_2_n.append('light chain')

    else:
        summary_2_n.append('not in covab-dab')

# Number of records without an exact match, followed by the number checked.
print(summary_2_n.count('not in covab-dab'),len(summary_2_n))


# Exact-match variant for the protein records: a record only counts when its
# whole sequence equals a CoV-AbDab VH or VL sequence.
#
# `x in series` tests the Series *index* and `str(series)` is only the truncated
# printed form, so membership is checked against sets of the column values.
vh_exact = {seq for seq in CovAbDab['VH or VHH'] if isinstance(seq, str)}
vl_exact = {seq for seq in CovAbDab['VL'] if isinstance(seq, str)}

# One label per record: 'heavy chain', 'light chain' or 'not in covab-dab'.
summary_2_p = []

for entry in proteins:

    aa_seq = str(entry.seq)

    if aa_seq in vh_exact:
        summary_2_p.append('heavy chain')

    elif aa_seq in vl_exact:
        summary_2_p.append('light chain')

    else:
        summary_2_p.append('not in covab-dab')

# Number of records without an exact match, followed by the number checked.
print(summary_2_p.count('not in covab-dab'),len(summary_2_p))

50 50
50 50




In [173]:
# Independent copy of the database with two hit counters appended as the last
# two columns (VH_found, then VL_found), both starting at zero. The loaded
# `CovAbDab` frame itself is left untouched.
CovAbDab_stats = CovAbDab.copy(deep=True).assign(VH_found=0, VL_found=0)

In [170]:
#search proteins and check how many of the sequences are in CoV-AbDab
# Every Entrez call returns a network handle; the `with` blocks close each one as
# soon as its response has been parsed instead of leaving the connections open.
with Entrez.esearch(db='protein', term='anti-sars-cov-2[All Fields] AND immunoglobulin[All Fields]', retmax='50') as handle:
    record = Entrez.read(handle)

# Parsed GenBank protein records, in the order the search returned their IDs.
proteins = []

for ID in record['IdList']:
    # One GenBank flat-file record per ID; SeqIO.read expects exactly one record.
    with Entrez.efetch(db="protein", id=ID, rettype="gb", retmode="text") as protein:
        proteins.append(SeqIO.read(protein, 'genbank'))



In [239]:
# Not used in this cell; kept because other cells may read it.
sequences_not_in_covabdab = 0

# Convert each fetched protein record to a plain string once, instead of once
# per (database row, protein) pair.
protein_seqs = [str(entry.seq) for entry in proteins]


def _count_hits(variable_domain):
    """Return how many fetched proteins contain `variable_domain` as a substring.

    Non-string values (empty CSV cells are read as float NaN) count as 0 hits.
    """
    if not isinstance(variable_domain, str):
        return 0
    return sum(variable_domain in seq for seq in protein_seqs)


# The counters are assigned, not incremented, so re-running this cell gives the
# same numbers instead of adding to the previous run. Columns are addressed by
# name rather than by position (8/9 and -2/-1).
# NOTE(review): this assumes positions 8 and 9 are 'VH or VHH' and 'VL' - confirm.
# NOTE(review): if the database stores short placeholder strings for undetermined
# sequences, they would match almost any protein as a substring - confirm and
# filter them out if so.
CovAbDab_stats['VH_found'] = CovAbDab_stats['VH or VHH'].map(_count_hits)
CovAbDab_stats['VL_found'] = CovAbDab_stats['VL'].map(_count_hits)


In [240]:
#CovAbDab_stats.loc[(CovAbDab_stats['VH_found'] > 9)]

In [184]:
# First line: total VH hit count summed over all database rows.
# Second line: number of database rows with at least one VH hit.
vh_hit_counts = CovAbDab_stats['VH_found']
print(vh_hit_counts.sum())
print((vh_hit_counts > 0).sum())

2396
487


In [227]:
# Spot check of one pair: length of fetched protein #17 versus the length of the
# value in row 2345, column 8 of the stats table (the column the VH comparison
# reads).
spot_protein = proteins[17]
print(len(spot_protein.seq))
spot_vh = CovAbDab_stats.iat[2345, 8]
print(len(spot_vh))

119
127


In [None]:
# issue: there are too many hits when the protein sequences are compared with CoV-AbDab
# this is likely because light and heavy chains are compared individually
# light and heavy chains might occur several times in CoV-AbDab, but the combinations are unique
# --> try to figure out how the corresponding VH and VL sequences from GenBank can be identified and combined
# then do the same comparison again

In [228]:
#get more information for each entry
# Every Entrez call returns a network handle; the `with` blocks close each one as
# soon as its response has been read instead of leaving the connections open.
with Entrez.esearch(db='nucleotide', term='anti-sars-cov-2[All Fields] AND immunoglobulin[All Fields]', retmax='10') as handle:
    record = Entrez.read(handle)

# Raw GenBank flat-file text of each hit, in the order the search returned them.
genbank_entries_2 = []

for ID in record['IdList']:
    with Entrez.efetch(db="nucleotide", id=ID, rettype="gb", retmode="text") as entry_handle:
        genbank_entries_2.append(entry_handle.read())

# Show the first entry as an example; an empty search result would otherwise
# raise IndexError here.
if genbank_entries_2:
    print(genbank_entries_2[0])
else:
    print('no GenBank entries returned for this search')

LOCUS       MZ751050                 321 bp    mRNA    linear   ROD 17-AUG-2021
DEFINITION  Mus musculus clone 15G9/10D2 anti-SARS-CoV-2 spike protein
            immunoglobulin light chain variable region mRNA, partial cds.
ACCESSION   MZ751050
VERSION     MZ751050.1
KEYWORDS    .
SOURCE      Mus musculus (house mouse)
  ORGANISM  Mus musculus
            Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
            Mammalia; Eutheria; Euarchontoglires; Glires; Rodentia; Myomorpha;
            Muroidea; Muridae; Murinae; Mus; Mus.
REFERENCE   1  (bases 1 to 321)
  AUTHORS   Zhang,G., Wang,A. and Jiang,M.
  TITLE     Epitope profiling reveals the critical antigenic determinants in
            SARSCoV-2 RBD-based antigen
  JOURNAL   Unpublished
REFERENCE   2  (bases 1 to 321)
  AUTHORS   Zhang,G., Wang,A. and Jiang,M.
  TITLE     Direct Submission
  JOURNAL   Submitted (09-AUG-2021) College of Animal Science and Veterinary
            Medicine, Henan Agricultural Univers