In [5]:
import pandas as pd
import random
from Bio import Entrez
import json
from http.client import HTTPResponse
import http.client
http.client.HTTPConnection._http_vsn = 10
http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0'
import io

In [20]:
db = pd.read_csv('data/ab_database2.csv')
cov = pd.read_csv('data/CoV-AbDab_090322.csv')
cov.fillna('NaN', inplace=True)
db.fillna('NaN', inplace=True)

In [21]:
print('Sequence pairs found with genbank scraping:', len(db))

Sequence pairs found with genbank scraping: 427


In [22]:
# compare sequences in ab_database.csv to covabdab
VH_ids_not_found = []
VL_ids_not_found = []
VH_ids_found = []
VL_ids_found = []

for i in range(len(db)):
    db_entry = db.iloc[i]
    VH = db_entry.VH.upper()
    VL = db_entry.VL.upper()
    VH_id = db_entry.Genbank_protein_id_vh
    VL_id = db_entry.Genbank_protein_id_vl

    pairing_found = False

    for i in range(len(cov)):
        cov_entry = cov.iloc[i]
        VH_cov = cov_entry.loc['VH or VHH'].upper()
        VL_cov = cov_entry.loc['VL'].upper()

        if (VH in VH_cov and VL in VL_cov) or (VH_cov in VH and VL_cov in VL) or (VL == 'NaN' and VH_cov in VH) or (VL == 'NaN' and VH_cov == VH):
            pairing_found = True
            VH_ids_found.append(VH_id)
            VL_ids_found.append(VL_id)
            break

    if not pairing_found:
        VH_ids_not_found.append(VH_id)
        VL_ids_not_found.append(VL_id)

In [23]:
print('Number of sequence pairs in covabdab')
len(VL_ids_found)

Number of sequence pairs in covabdab


356

In [24]:
print('Number of sequence pairs not in covabdab')
len(VH_ids_not_found)

Number of sequence pairs not in covabdab


71

In [25]:
rand_ids_VH = random.sample(VH_ids_not_found, 12)
print(rand_ids_VH)

['7E3L_D', '7O7F_H', '6NB4_H', '7W9E_D', '7KKK_F', '6W7Y_A', '7TLY_A', '7M3I_H', '7LKA_E', '7DZY_O', '7LU9_q', '7TN0_M']


In [27]:
# select 12 random heavy chains that are not in covabdab and print their definition an title
Entrez.email = 'fabian.spoendlin@exeter.ox.ac.uk'
entries_handle = Entrez.efetch(db='protein', id=rand_ids_VH, rettype="gb", retmode="xml")
rand_VH = Entrez.read(entries_handle)

for e in rand_VH:
    print(e['GBSeq_definition'])
    print(e['GBSeq_references'][0]['GBReference_title'])
    

Chain D, 58G6 heavy chain
Potent SARS-CoV-2 neutralizing antibodies with protective efficacy against newly emerged mutational variants
Chain H, Fab antibody fragment heavy chain
Structural basis of the activation of the CC chemokine receptor 5 by a chemokine agonist
Chain H, LCA60 heavy chain
Unexpected Receptor Functional Mimicry Elucidates Activation of Coronavirus Fusion
Chain D, Anti-H5N1 hemagglutinin monoclonal anitbody H5M9 heavy chain
Structural basis for SARS-CoV-2 Delta variant recognition of ACE2 receptor and broadly neutralizing antibodies
Chain F, Synthetic nanobody Nb6
An ultrapotent synthetic nanobody neutralizes SARS-CoV-2 by stabilizing inactive Spike
Chain A, CR3022 Heavy chain
Structure and Antigenicity of the SARS-CoV-2 Receptor Binding Domain
Chain A, S309 Fab heavy chain
Structural basis of SARS-CoV-2 Omicron immune evasion and receptor engagement
Chain H, CV2-75 Fab Heavy chain
Isolation and characterization of cross-neutralizing coronavirus antibodies from COVID

In [28]:
# select 8 random heavy chains that are not in covabdab and print their definition an title
rand_ids_VL = random.sample(VL_ids_not_found, 10)
print(rand_ids_VL)

['7N4M_L', '7R8N_P', 'NaN', '7R6X_L', '7S4S_L', '7E8F_E', '7MMO_E', 'NaN', '7KFW_L', '6XCA_L']


In [29]:
entries_handle = Entrez.efetch(db='protein', id=rand_ids_VL, rettype="gb", retmode="xml")
rand_VL = Entrez.read(entries_handle)


for e in rand_VL:
    print(e['GBSeq_locus'])
    print(e['GBSeq_definition'])
    print(e['GBSeq_references'][0]['GBReference_title'])

7N4M_L
Chain L, WRAIR-2151 antibody Fab light chain
Low-dose in vivo protection and neutralization across SARS-CoV-2 variants by monoclonal antibody combinations
7R8N_P
Chain P, C051 Fab Light Chain
Development of potency, breadth and resilience to viral escape mutations in SARS-CoV-2 neutralizing antibodies
7R6X_L
Chain L, Monoclonal antibody S304 Fab light chain
SARS-CoV-2 RBD antibodies that maximize breadth and resistance to escape
7S4S_L
Chain L, CoV11 light chain
Crystal Structure of SARS-CoV-2 S receptor-binding domain (RBD) in complex CoV11 Fab
7E8F_E
Chain E, L
Humoral immune response to circulating SARS-CoV-2 variants elicited by inactivated and RBD-subunit vaccines
7MMO_E
Chain E, LY-CoV1404 Fab light chain
LY-CoV1404 (bebtelovimab) potently neutralizes SARS-CoV-2 variants
7KFW_L
Chain L, light chain of antibody C1A-B3 Fab
Molecular basis for a germline-biased neutralizing antibody response to SARS-CoV-2
6XCA_L
Chain L, C105 Light Chain
Structures of Human Antibodies Bound t

In [10]:
with open('data/protein_handles_30488.json', 'r') as f:
    batch1 = json.load(f)

len(batch1)

30488

In [19]:
batch1[22304]['GBSeq_sequence']

'mdlfmriftigtvtlkqgeikdatpsdfvratatipiqaslpfgwlivgvallavfqsaskiitlkkrwqlalskgvhfvcnllllfvtvyshlllvaagleapflylyalvyflqsinfvriimrlwlcwkcrsknpllydanyflcwhtncydycipynsvtssivitsgdgttspisehdyqiggytekwesgvkdcvvlhsyftsdyyqlystqlstdtgvehvtffiynkivdepeehvqihtidgssgvvnpvmepiydepttttsvpl'

In [3]:
import json
with open('data/refseq_ids_papers2.json', 'r') as f:
    batch2 = json.load(f)

len(batch2)

525

In [13]:
comb = batch1 + batch2
len(comb)

32339

In [36]:
with open('data/protein_handles_from_papers.json', 'r') as f:
    prot = json.load(f)

len(prot)

779

In [4]:
with open('data/protein_handles.json', 'r') as f:
    prot = json.load(f)


In [16]:
for pro in prot:
    try:
        xrefs = pro['GBSeq_references'][0]['GBReference_xref']

        for xref in xrefs:
            if xref['GBXref_dbname'] == 'doi':
                print(xref['GBXref_id'])
    except KeyError:
        pass


10.1038/s41586-020-2008-3
10.1038/s41586-020-2008-3
10.1128/JVI.68.7.4525-4537.1994
10.1128/JVI.68.8.5216-5224.1994
10.1002/eji.1830250904
10.1016/s0092-8674(05)80090-x
10.1084/jem.173.4.1017
10.1111/j.1432-1033.1991.tb16139.x
10.1083/jcb.105.3.1183
10.1073/pnas.86.3.840
10.1016/s0092-8674(00)80534-6
10.1073/pnas.022637199
10.1083/jcb.108.2.703
10.1126/science.1085952
10.1111/j.1432-1033.1991.tb16139.x
10.1016/0888-7543(95)80171-h
10.1128/JVI.65.12.6881-6891.1991
10.1007/BF00404425
10.3390/antib11010013
10.3390/antib11010013
10.3390/antib11010013
10.3390/antib11010013
10.3390/antib11010013
10.3390/antib11010013
10.3390/antib11010013
10.3390/antib11010013
10.3390/antib11010013
10.3390/antib11010013
10.3390/antib11010013
10.3390/antib11010013
10.1080/19420862.2021.2021601
10.1080/19420862.2021.2021601
10.1080/19420862.2021.2021601
10.1080/19420862.2021.2021601
10.1080/19420862.2021.2021601
10.1080/19420862.2021.2021601
10.1016/j.celrep.2022.110428
10.1016/j.celrep.2022.110428
10.1016/j.c

In [11]:
xref

[{'GBXref_dbname': 'doi', 'GBXref_id': '10.1038/s41586-020-2008-3'}]