## Auto-AbDab

**Pipeline for the automatic generation of disease-specific antibody databases**

In [1]:
from auto_abdab import (
    utils,
    patents_pipeline, 
    get_supp_seqs,
    keywords2papers,
    keywords2pdbs,
    genbank_pipeline,
    collate_results
    )



1. Load keywords and known antigens from the test files

In [2]:
# Input files for the COVID example run (paths are relative to this notebook).
KEYWORDS_FILE = '../src/covid_keywords.txt'
KNOWN_ANTIGENS_FILE = '../src/covid_known_antigens.txt'

# Both names are reused by the search cells further down.
keywords_disease = utils.load_keywords(KEYWORDS_FILE)
known_antigens = utils.load_known_antigens(KNOWN_ANTIGENS_FILE)

In [4]:
# Display the loaded disease keywords (a list of upper-case search terms,
# as shown in the output below).
keywords_disease

['COVID-19',
 'CORONAVIRUS',
 'SARS-COV',
 'MERS-COV',
 'SARS',
 'SARS-COV-2',
 'SPIKE PROTEIN',
 'RBD',
 'RECEPTOR-BINDING DOMAIN',
 'MERS',
 'MIDDLE EAST RESPIRATORY SYNDROME',
 'SEVERE ACUTE RESPIRATORY SYNDROME']

In [5]:
# Display the known-antigen table: a dict mapping an antigen label to a list
# of lowercase terms (see the output below).
known_antigens

{'COVID': ['coronavirus', 'sars-cov', 'sars'],
 'MERS': ['mers-cov', 'mers'],
 'SARS-COV-2': ['sars-cov-2', 'covid-19'],
 'SARS-COV-1': ['sars-cov-1'],
 'Spike protein': ['spike', 'spike protein'],
 'RBD': ['receptor binding domain', 'rbd']}

2. Search patents

In [None]:
# Run the patent search with the disease keywords.
# NOTE(review): the return value is not captured here; presumably the pipeline
# persists its results for the collation step — confirm.
patents_pipeline.get_seq_from_patents(keywords_disease)

3. Search supplementary information (SI) of papers

In [6]:
# Find paper links: query PubMed and bioRxiv with the disease keywords.
# NOTE(review): the recorded run of this cell stopped with an HTTP 400 from the
# NCBI esearch endpoint (request used retmax=50000), apparently inside
# get_pubmed(); that has to be resolved in keywords2papers, not here.
k2p = keywords2papers.Keywords2Papers(keywords_disease)
pubmed_results = k2p.get_pubmed()
biorxiv_results = k2p.get_biorxiv()

# The import cell only brings the `get_supp_seqs` module into scope, so the
# helpers must be module-qualified; as bare names they raise NameError on a
# fresh kernel.
# NOTE(review): assumes `papers2urls` and `get_seqs_from_supp` are defined in
# `get_supp_seqs` — confirm against the package.
paper_urls = get_supp_seqs.papers2urls(pubmed_results, biorxiv_results)

# Get sequences from supplementary material.
get_supp_seqs.get_seqs_from_supp(paper_urls)

getting pubmed
fetching pubmed
((COVID-19) OR (CORONAVIRUS) OR (SARS-COV) OR (MERS-COV) OR (SARS) OR (SARS-COV-2) OR (SPIKE PROTEIN) OR (RBD) OR (RECEPTOR-BINDING DOMAIN) OR (MERS) OR (MIDDLE EAST RESPIRATORY SYNDROME) OR (SEVERE ACUTE RESPIRATORY SYNDROME)) AND ((antibody) OR (antibodies) OR (nanobody) OR (immunoglobulin) OR (MAb) OR (nanobodies)) AND ((neutralizing) OR (neutralize) OR (neutralization) OR (bind) OR (binding) OR (inhibit) OR (targeting)) AND ((heavy chain) OR (complementarity determining region) OR (gene) OR (epitope) OR (receptor-binding domain) OR (rbd) OR (spike protein) OR (VHH))
['title', 'authors', 'date', 'abstract', 'journal', 'doi']


HTTPError: 400 Client Error: Bad Request for url: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?tool=MyTool&email=abc%40def.gh&db=pubmed&term=%28%28COVID-19%29+OR+%28CORONAVIRUS%29+OR+%28SARS-COV%29+OR+%28MERS-COV%29+OR+%28SARS%29+OR+%28SARS-COV-2%29+OR+%28SPIKE+PROTEIN%29+OR+%28RBD%29+OR+%28RECEPTOR-BINDING+DOMAIN%29+OR+%28MERS%29+OR+%28MIDDLE+EAST+RESPIRATORY+SYNDROME%29+OR+%28SEVERE+ACUTE+RESPIRATORY+SYNDROME%29%29+AND+%28%28antibody%29+OR+%28antibodies%29+OR+%28nanobody%29+OR+%28immunoglobulin%29+OR+%28MAb%29+OR+%28nanobodies%29%29+AND+%28%28neutralizing%29+OR+%28neutralize%29+OR+%28neutralization%29+OR+%28bind%29+OR+%28binding%29+OR+%28inhibit%29+OR+%28targeting%29%29+AND+%28%28heavy+chain%29+OR+%28complementarity+determining+region%29+OR+%28gene%29+OR+%28epitope%29+OR+%28receptor-binding+domain%29+OR+%28rbd%29+OR+%28spike+protein%29+OR+%28VHH%29%29&retmax=50000&retmode=json

4. Search PDB

In [None]:
# Get or update the PDB chains matching the disease keywords.
# NOTE(review): save=True presumably writes the result to disk for the
# collation step — confirm in keywords2pdbs.
keywords2pdbs.get_or_update_pdb_chains(keywords_disease, save=True)

5. Search GenBank

In [None]:
# Directory passed to the GenBank pipeline as its output location.
GENBANK_OUTPUT_DIR = '../data/genbank/'

# Search GenBank using the disease keywords and the known-antigen table.
genbank_pipeline.get_seqs_from_genbank(
    keywords_disease,
    known_antigens,
    output_path=GENBANK_OUTPUT_DIR,
)

6. Create database

In [2]:
# File name handed to the collation step for the final database.
FINAL_DB_CSV = '../data/final_antibody_db.csv'

# Kept as the last expression so the returned table is rendered below the cell.
collate_results.collate_results(outfile_name=FINAL_DB_CSV)

Running dataframes through IgBLAST and ANARCI...
PATENT
Limiting hmmer search to species ['human', 'mouse'] was requested but hits did not achieve a high enough bitscore. Reverting to using any species
Limiting hmmer search to species ['human', 'mouse'] was requested but hits did not achieve a high enough bitscore. Reverting to using any species
Total number of sequences input:  4764
Total sequences N/A from input:  239
Total sequences lost after failing ANARCI filtering:  184
SI
Total number of sequences input:  916
Total sequences N/A from input:  1
Total sequences lost after failing ANARCI filtering:  0
PDB
Limiting hmmer search to species ['human', 'mouse'] was requested but hits did not achieve a high enough bitscore. Reverting to using any species
Limiting hmmer search to species ['human', 'mouse'] was requested but hits did not achieve a high enough bitscore. Reverting to using any species
Total number of sequences input:  1706
Total sequences N/A from input:  0
Total sequences 

Unnamed: 0,Scrape source,URL,Source,Reference,DOI,PDB ID,GenBank ID VH,GenBank ID VL,Binds to,Origin,...,CDRH1,CDRH2,CDRH3,CDRL1,CDRL2,CDRL3,Heavy J gene,Heavy V gene,Light J gene,Light V gene
0,Patent,https://patents.google.com/patent/US11034762B1/en,"(334/394) in one instance, the antibody or ant...",,,,,,,,...,GGTFSSYA,INPGGGNT,AGEWGISTPMDV,QSVLYSSINKNY,WAS,QQYYGFPLT,IGHJ6*01 (human),IGHV1-69*01 (human),IGKJ4*01 (human),IGKV4-1*01 (human)
1,Patent,https://patents.google.com/patent/US11034762B1/en,"(318/378) in one instance, the antibody or ant...",,,,,,,,...,GGTFSSYA,IYPGDSTT,YSGSGISTPMDV,QSVLYSSINKNY,WAS,RQYYSTPWT,IGHJ6*01 (human),IGHV1-69*01 (human),IGKJ4*01 (human),IGKV4-1*01 (human)
2,Patent,https://patents.google.com/patent/US11034762B1/en,"(322/382) in one instance, the antibody or ant...",,,,,,,,...,GYTFTGHY,IYPGDSEA,AGGSGISTPMDV,QSVLYTSNNNNY,WGS,RQYYSTPWT,IGHJ6*01 (human),IGHV1-46*01 (human),IGKJ4*01 (human),IGKV4-1*01 (human)
3,Patent,https://patents.google.com/patent/US11034762B1/en,"(351/411) in one instance, the antibody or ant...",,,,,,,,...,GYGFITYW,IYPGDSET,AGGSGLFTPMDV,QSVLYSSINKNY,WAS,QQYSTTPYT,IGHJ6*01 (human),IGHV1-69*01 (human),IGKJ4*01 (human),IGKV4-1*01 (human)
4,Patent,https://patents.google.com/patent/US11034762B1/en,"(323/383) in one instance, the antibody or ant...",,,,,,,,...,GGTFSSYA,IIPIFGTT,AGEWGISTPMDV,QSVLYSSINKNY,WAN,QQYYSTPLT,IGHJ6*01 (human),IGHV1-69*01 (human),IGKJ4*01 (human),IGKV4-1*01 (human)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6210,GenBank,,,"Hong,J. et al.; Generation and characterizatio...",,,,QIQ28233,"['COVID', 'SARS-COV-2', 'RBD']",Oryctolagus cuniculus (rabbit),...,,,,qsigtr,yas,lgehsyssgdgrta,,,IGKJ4*01 (human),IGKV1-5*01 (human)
6211,GenBank,,,"Hong,J. et al.; Generation and characterizatio...",,,,QIQ28232,"['COVID', 'SARS-COV-2', 'RBD']",Oryctolagus cuniculus (rabbit),...,,,,ediysn,das,qtyystvtra,,,IGKJ4*01 (human),IGKV1-5*01 (human)
6212,GenBank,,,"Hong,J. et al.; Generation and characterizatio...",,,,QIQ28231,"['COVID', 'SARS-COV-2', 'RBD']",Oryctolagus cuniculus (rabbit),...,,,,eniysf,tas,qqtdtysnvdna,,,IGKJ4*01 (human),IGKV1-13*02 (human)
6213,GenBank,,,"Hong,J. et al.; Generation and characterizatio...",,,,QIQ28230,"['COVID', 'SARS-COV-2', 'RBD']",Oryctolagus cuniculus (rabbit),...,,,,qsvydnnw,sas,aggysgnifa,,,IGKJ5*01 (human),IGKV1-12*01 (human)
