In [24]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from datetime import datetime
import pandas as pd
import os

In [2]:
path = f'{os.getcwd()}/PDBs'
df = pd.read_csv(f'{path}/APD3.csv')

In [3]:
def multi_replace(text, replaces) -> (str, dict):
    for key in replaces.keys():
        text = text.replace(key, replaces[key])
    return text

In [4]:
# Fix type error in line 2745
df.loc[2745, 'Reference'] = 'Not Found'

In [5]:
doi = {
    'Index': [],
    'DOI': []
}

replaces = {
    'doi: ': 'doi:',
    '.Pub': ' ',
    'doi:': 'doi: ',
    'doi.org/': 'doi: ',
    '.Abs': ' '
}

# Get DOIs from DataFrame
for i in range(0, len(df['Reference'])):
    for ref_info in df['Reference'][i].split('. '):
        if 'doi' in ref_info:
            ref_info = multi_replace(ref_info, replaces)
            doi_text = ref_info.split("doi:")[1].split(' ')[1]
            doi['Index'].append(i)
            doi['DOI'].append(f'https://doi.org/{doi_text}')

In [6]:
references = {}

replaces = {
    'Pub-Med': 'PubMed',
    ' PubMed': 'PubMed',
    ' Pub-Med': 'PubMed',
    ' OnlineJournal': 'OnlineJournal',
    ' Publiser': 'Publiser',
    ' SciDirect': 'ScienceDirect',
    '1': ' '
}

for index in range(0, len(df['Reference'])):
    # Ignore references what already have DOI
    if index in doi['Index']:
        continue
    
    # Get website name
    ref_info = [x for x in df['Reference'][index].split('.') if x != '']
    ref_name = ref_info[len(ref_info)-1]
    
    # Execute replaces
    if not ' ' in ref_name or ref_name in replaces.keys():
        ref_name = multi_replace(ref_name, replaces)
        
    # Verify names what don't have references
    if 'Submit' in ref_name:
        ref_name = 'Submitted'
    elif ' ' in ref_name or len(ref_name) < 5:
        ref_name = 'Not Found'
    
    # Verify if name already stays at list
    exists = [x for x in references.keys() if x in ref_name]
    if not exists:
        references[ref_name] = [index]
    else:
        references[exists[0]].append(index)

In [8]:
for k in references.keys():
    print(k, len(references[k]))

PubMed 1954
OnlineJournal 1
Not Found 850
Submitted 15
Publiser 12
ScienceDirect 5
GenBank 1


In [20]:
driver = webdriver.Chrome()

In [9]:
# PubMed
for index in references['PubMed']:
    # Search article from reference
    driver.get('https://pubmed.ncbi.nlm.nih.gov')
    search = driver.find_element(By.XPATH, '/html/body/div[2]/main/div[1]/div/form/div/div[1]/div/span/input')
    search.send_keys(df['Reference'][index])
    driver.find_element(By.XPATH, '/html/body/div[2]/main/div[1]/div/form/div/div[1]/div/button').click()
    try:
        # Get DOI
        doi_text = driver.find_element(By.CLASS_NAME, 'identifier.doi').text.split(' ')[1]
        doi['Index'].append(index)
        doi['DOI'].append(f'https://doi.org/{doi_text}')
        print(f'{index} - {doi_text}')
    except:
        print(f'Don\'t finded DOI for index {index}')

0 - 10.1111/j.1432-1033.1994.tb19924.x
1 - 10.1111/j.1432-1033.1990.tb15315.x
2 - 10.1016/0014-5793(95)00666-w
3 - 10.1016/0014-5793(95)00666-w
4 - 10.1002/j.1460-2075.1991.tb07932.x
5 - 10.1002/j.1460-2075.1989.tb08368.x
6 - 10.1002/j.1460-2075.1989.tb08368.x
Don't finded DOI for index 7
8 - 10.1128/iai.57.10.3142-3146.1989
9 - 10.1128/iai.57.10.3142-3146.1989
Don't finded DOI for index 10
11 - 10.1046/j.1432-1327.2000.01536.x
12 - 10.1046/j.1432-1327.2000.01536.x
13 - 10.1046/j.1432-1327.2000.01536.x
14 - 10.1046/j.1432-1327.2000.01536.x
15 - 10.1046/j.1432-1327.2000.01536.x
16 - 10.1046/j.1432-1327.2000.01536.x
17 - 10.1046/j.1432-1327.2000.01536.x
18 - 10.1046/j.1432-1327.2000.01536.x
19 - 10.1046/j.1432-1327.2000.01536.x
20 - 10.1046/j.1432-1327.2000.01536.x
21 - 10.1046/j.1432-1327.2000.01536.x
23 - 10.1073/pnas.192301899
24 - 10.1073/pnas.192301899
25 - 10.1111/j.1365-2672.1992.tb05007.x
26 - 10.1111/j.1365-2672.1995.tb01671.x
27 - 10.7164/antibiotics.46.1756
28 - 10.1128/AAC.44

365 - 10.1074/jbc.271.45.28375
366 - 10.1074/jbc.271.45.28375
367 - 10.1016/s0014-5793(97)01310-0
Don't finded DOI for index 368
Don't finded DOI for index 369
373 - 10.1046/j.1432-1033.2002.03177.x
374 - 10.1046/j.1432-1033.2002.03177.x
375 - 10.1074/jbc.M100216200
376 - 10.1074/jbc.M100216200
377 - 10.1074/jbc.M100216200
378 - 10.1074/jbc.M100216200
379 - 10.1074/jbc.M100216200
380 - 10.1074/jbc.M100216200
381 - 10.1074/jbc.M100216200
382 - 10.1074/jbc.M100216200
383 - 10.1074/jbc.M100216200
384 - 10.1046/j.1432-1033.2002.03177.x
385 - 10.1074/jbc.M100216200
386 - 10.1074/jbc.M100216200
387 - 10.1074/jbc.M100216200
388 - 10.1074/jbc.M100216200
389 - 10.1074/jbc.M100216200
390 - 10.1074/jbc.M100216200
391 - 10.1007/pl00000764
392 - 10.1007/pl00000764
393 - 10.1007/pl00000764
394 - 10.1007/pl00000764
395 - 10.1111/j.1432-1033.1991.tb16442.x
404 - 10.1006/bbrc.1998.9362
Don't finded DOI for index 405
407 - 10.1006/bbrc.1998.9362
413 - 10.1016/0965-1748(93)90032-n
414 - 10.1016/0965-1748

778 - 10.1128/AAC.47.8.2464-2470.2003
779 - 10.1080/00365549850161557
780 - 10.1128/AAC.47.8.2464-2470.2003
782 - 10.1128/AAC.47.8.2464-2470.2003
783 - 10.1128/AAC.47.8.2464-2470.2003
784 - 10.1128/AAC.47.8.2464-2470.2003
785 - 10.1128/AAC.47.8.2464-2470.2003
786 - 10.1128/AAC.47.8.2464-2470.2003
787 - 10.1128/AAC.47.8.2464-2470.2003
788 - 10.1128/AAC.47.8.2464-2470.2003
789 - 10.1128/AAC.47.8.2464-2470.2003
790 - 10.1128/AAC.47.8.2464-2470.2003
791 - 10.1128/AAC.47.8.2464-2470.2003
794 - 10.1016/s0196-9781(00)00316-8
807 - 10.1016/j.molimm.2007.11.007
808 - 10.1016/j.molimm.2006.09.031
811 - 10.1111/j.1432-1033.1996.t01-1-00516.x
812 - 10.1021/bi035400o
813 - 10.1016/j.bbapap.2003.09.004
814 - 10.1016/j.bbapap.2003.09.004
815 - 10.1016/j.bbapap.2003.09.004
816 - 10.1034/j.1399-3011.2001.00947.x
817 - 10.1034/j.1399-3011.2001.00947.x
818 - 10.1034/j.1399-3011.2001.00947.x
819 - 10.1034/j.1399-3011.2001.00947.x
820 - 10.1034/j.1399-3011.2001.00947.x
821 - 10.1034/j.1399-3011.2001.00947.

1220 - 10.1016/j.bbrc.2004.05.141
1221 - 10.1016/j.bbrc.2004.05.141
1222 - 10.1016/j.bbrc.2004.05.141
1226 - 10.1002/prot.340010305
1238 - 10.1371/journal.pone.0003217
1242 - 10.1016/j.peptides.2008.08.007
1243 - 10.1016/j.peptides.2008.08.007
1244 - 10.1016/j.peptides.2008.08.007
1245 - 10.1074/jbc.M709103200
1252 - 10.1016/j.regpep.2003.12.003
1253 - 10.1016/j.regpep.2003.12.003
1254 - 10.1016/j.regpep.2003.12.003
1255 - 10.1016/j.regpep.2003.12.003
1256 - 10.1016/j.regpep.2003.12.003
1257 - 10.1016/j.regpep.2003.12.003
Don't finded DOI for index 1258
1268 - 10.1002/cbic.200800476
1270 - 10.1016/s1570-9639(02)00432-6
1271 - 10.1016/s1570-9639(02)00432-6
1272 - 10.1016/s1570-9639(02)00432-6
1273 - 10.1016/s1570-9639(02)00432-6
1274 - 10.1016/s1570-9639(02)00432-6
1275 - 10.1016/s1570-9639(02)00432-6
1277 - 10.1515/bchm.1997.378.9.989
1278 - 10.1515/bchm.1997.378.9.989
1279 - 10.1515/bchm.1997.378.9.989
1280 - 10.1515/bchm.1997.378.9.989
1288 - 10.1110/ps.24401
1290 - 10.1074/mcp.M6003

1677 - 10.1016/j.bbagen.2010.12.002
1686 - 10.1016/0014-5793(93)81299-f
1687 - 10.1016/0014-5793(93)81299-f
1688 - 10.1016/0014-5793(93)81299-f
1689 - 10.1016/0014-5793(93)81299-f
1690 - 10.1016/0014-5793(93)81299-f
1691 - 10.1016/0014-5793(93)81299-f
1702 - 10.1016/j.peptides.2010.12.013
1703 - 10.1016/j.peptides.2010.12.013
1704 - 10.1016/j.peptides.2010.12.013
1705 - 10.1016/j.peptides.2010.12.013
1706 - 10.1016/j.peptides.2010.12.013
1707 - 10.1016/j.peptides.2010.12.013
1708 - 10.1016/j.peptides.2010.12.013
1709 - 10.1016/j.peptides.2010.12.013
1710 - 10.1016/j.peptides.2010.12.013
1711 - 10.1128/AEM.01962-10
1717 - 10.1074/jbc.M209239200
1718 - 10.1016/j.dci.2011.01.017
1719 - 10.1016/j.dci.2006.08.002
1720 - 10.1016/j.dci.2011.01.017
1721 - 10.1016/j.dci.2011.01.017
1722 - 10.1016/j.dci.2011.01.017
1723 - 10.1021/ja802966g
1724 - 10.1016/j.dci.2011.01.017
1725 - 10.1016/j.dci.2011.01.017
1726 - 10.1016/j.dci.2011.01.017
1727 - 10.1021/ja056780z
1728 - 10.1021/ja056780z
1729 - 10

Don't finded DOI for index 2014
2015 - 10.1016/s0965-1748(03)00028-6
Don't finded DOI for index 2017
Don't finded DOI for index 2018
Don't finded DOI for index 2019
Don't finded DOI for index 2020
Don't finded DOI for index 2021
Don't finded DOI for index 2022
Don't finded DOI for index 2023
2024 - 10.4049/jimmunol.181.2.1083
2025 - 10.1016/j.peptides.2012.04.010
2026 - 10.1016/j.peptides.2009.10.008
Don't finded DOI for index 2027
Don't finded DOI for index 2028
2029 - 10.1016/j.molimm.2012.07.003
2031 - 10.1007/s00253-011-3863-5
2032 - 10.5483/bmbrep.2003.36.6.603
2033 - 10.1016/j.peptides.2005.05.009
2034 - 10.1046/j.1365-313x.1999.00569.x
2035 - 10.1046/j.1365-313x.1999.00569.x
2036 - 10.1046/j.1365-313x.1999.00569.x
2038 - 10.1016/j.fsi.2012.08.022
2039 - 10.1128/am.28.2.165-168.1974
2040 - 10.1128/am.28.2.165-168.1974
2042 - 10.1016/j.fsi.2012.01.007
2043 - 10.1016/j.jprot.2012.08.004
2044 - 10.1016/j.jprot.2012.08.004
2045 - 10.1016/j.jprot.2012.08.004
2046 - 10.1016/j.jprot.201

Don't finded DOI for index 2335
2341 - 10.1016/j.bbrc.2005.03.057
2343 - 10.1016/j.dci.2010.12.011
2344 - 10.1016/j.dci.2010.12.011
2346 - 10.1128/AEM.07782-11
2350 - 10.1111/j.1432-1033.1995.tb20476.x
2351 - 10.1074/jbc.271.45.28533
2352 - 10.1074/jbc.275.15.10745
2353 - 10.1128/IAI.67.7.3542-3547.1999
Don't finded DOI for index 2354
2355 - 10.1016/j.peptides.2012.03.016
2356 - 10.1016/j.peptides.2012.03.016
2357 - 10.1016/j.peptides.2011.11.012
2358 - 10.1128/JB.01375-09
2361 - 10.1016/j.toxicon.2013.08.056
2362 - 10.1016/j.toxicon.2013.08.056
2363 - 10.1016/j.toxicon.2013.08.056
2364 - 10.1016/j.toxicon.2013.08.056
2365 - 10.1016/j.toxicon.2013.08.056
2366 - 10.1016/j.toxicon.2013.08.056
2367 - 10.1016/j.toxicon.2013.08.056
2368 - 10.1016/j.toxicon.2013.08.056
2369 - 10.1007/s11274-013-1558-z
2370 - 10.1016/j.dci.2004.11.001
2387 - 10.1111/mmi.12621
2394 - 10.1271/bbb.120681
2399 - 10.1371/journal.pone.0083044
2400 - 10.1074/jbc.M414064200
2403 - 10.1073/pnas.1319584111
2407 - 10.10

Don't finded DOI for index 2660
2661 - 10.1016/j.dci.2016.02.016
Don't finded DOI for index 2662
2663 - 10.1128/JB.188.1.328-334.2006
2664 - 10.1016/j.fsi.2016.02.022
2665 - 10.1371/journal.pone.0149729
2666 - 10.1046/j.1432-1327.1998.2530452.x
2668 - 10.1016/j.toxicon.2016.02.021
2669 - 10.1007/s00203-016-1206-8
2670 - 10.1016/j.peptides.2007.09.010
2671 - 10.1556/018.67.2016.1.10
2672 - 10.1007/s11033-016-3967-1
2674 - 10.1021/acs.jnatprod.5b01129
Don't finded DOI for index 2675
2676 - 10.1021/acs.jafc.5b05717
2677 - 10.1007/s00203-014-0958-2
2678 - 10.1002/mnfr.201500182
2679 - 10.1016/j.fsi.2014.01.003
2680 - 10.1021/acs.jafc.6b00730
2681 - 10.1371/journal.pone.0151820
2682 - 10.1371/journal.pone.0151820
2683 - 10.1371/journal.pone.0151820
2684 - 10.1111/febs.13720
2685 - 10.1111/febs.13720
2686 - 10.1111/febs.13720
2687 - 10.1126/science.aaa4690
2688 - 10.1016/S1875-5364(16)30030-9
2689 - 10.1016/S1875-5364(16)30030-9
2690 - 10.1007/s10930-016-9662-1
Don't finded DOI for index 269

In [19]:
driver.close()

In [32]:
df_doi = pd.DataFrame(doi)
df_doi.to_csv(f'{path}/APD3_DOI.csv', index=False)

In [35]:
df_doi

Unnamed: 0,Index,DOI
0,156,https://doi.org/10.1021/bi00100a014
1,163,https://doi.org/10.1016/s0167-4781(97)00194-2
2,186,https://doi.org/10.1128/iai.42.1.10-14.1983
3,187,https://doi.org/10.1128/iai.42.1.10-14.1983
4,205,https://doi.org/10.3181/00379727-60-15091.
...,...,...
2524,3170,https://doi.org/10.1371/journal.pone.0169582
2525,3171,https://doi.org/10.1038/s41598-020-60623-0
2526,3397,https://doi.org/10.1016/j.peptides.2011.08.015
2527,3398,https://doi.org/10.1016/j.peptides.2011.08.015


In [33]:
# Add DOI column to DataFrame
doi_index = 0
doi_values = []
for i in range(0, len(df['Reference'])):
    if i == df_doi['Index'][doi_index]:
        doi_values.append(df_doi['DOI'][doi_index])
        doi_index += 1
    else:
        doi_values.append('Not Found')

df['DOI'] = doi_values
        
for i in range(0, len(doi_values)):
    print(i, doi_values[i])

0 Not Found
1 Not Found
2 Not Found
3 Not Found
4 Not Found
5 Not Found
6 Not Found
7 Not Found
8 Not Found
9 Not Found
10 Not Found
11 Not Found
12 Not Found
13 Not Found
14 Not Found
15 Not Found
16 Not Found
17 Not Found
18 Not Found
19 Not Found
20 Not Found
21 Not Found
22 Not Found
23 Not Found
24 Not Found
25 Not Found
26 Not Found
27 Not Found
28 Not Found
29 Not Found
30 Not Found
31 Not Found
32 Not Found
33 Not Found
34 Not Found
35 Not Found
36 Not Found
37 Not Found
38 Not Found
39 Not Found
40 Not Found
41 Not Found
42 Not Found
43 Not Found
44 Not Found
45 Not Found
46 Not Found
47 Not Found
48 Not Found
49 Not Found
50 Not Found
51 Not Found
52 Not Found
53 Not Found
54 Not Found
55 Not Found
56 Not Found
57 Not Found
58 Not Found
59 Not Found
60 Not Found
61 Not Found
62 Not Found
63 Not Found
64 Not Found
65 Not Found
66 Not Found
67 Not Found
68 Not Found
69 Not Found
70 Not Found
71 Not Found
72 Not Found
73 Not Found
74 Not Found
75 Not Found
76 Not Found
77 Not Fo

In [34]:
dt = datetime.now()
df.to_csv(f'{path}/APD3_releases/APD3_{dt.strftime("%Y_%m_%d")}.csv', index=False)
df.to_csv(f'{path}/APD3.csv', index=False)

In [36]:
df

Unnamed: 0,APD ID,Name/Class,Source,Sequence,Length,Net charge,Hydrophobic residue%,Boman Index,3D Structure,Method,SwissProt ID,Activity,Crucial residues,Additional info,Title,Author,Reference,DOI
0,AP00001,"Dermaseptin-B2 (XXA, DRS-B2, Dermaseptin B2, D...","skin, Giant leaf frog, Phyllomedusa bicolor, S...",GLWSKIKEVGKEAAKAAAKAAGKAALGAVSEAV,33,4,54%,0.23,Helix,NMR,SwissProt ID: P31107 Go to SwissProt,"Anti-Gram+ & Gram-, Antifungal, candidacidal, ...",N-terminal segment,"History: A frog used for ""hunting magic"" by se...",Isolation and structure of novel defensive pep...,"Mor, A., Nicolas, P.1994","Eur J Biochem 1994, 219 (1-2):145-54. PubMed.",Not Found
1,AP00002,"Abaecin (natural AMPs; Pro-rich; insects, arth...","honeybee, Apis mellifera L.",YVPLPNVPQPGRRPFPTFPGQGPFNPKIKWPQGY,34,4,23%,1.19,Rich,,SwissProt ID: P15450 Go to SwissProt,Anti-Gram+ & Gram-,,Rich in P (29.4%).\nActivity: Active against A...,"Isolation and characterization of abaecin, a m...","Casteels P, Ampe C, Riviere L, Van Damme J, El...",Eur J Biochem. 1990 Jan 26;187(2):381-6. PubMed.,Not Found
2,AP00003,"Hs-AFP1 (HsAFP1, H. sanguinea antifungal prote...",Heuchera sanguinea,DGVKLCDVPSGTWSGHCGSSSKCSQQCKDREHFAYGGACHYQFPSV...,54,6,33%,1.95,Bridge,,Reference ID: Ref,Antifungal,,In medium A supplemented with 1 mM CaCl2 and 5...,Isolation and characterisation of plant defens...,"Osborn RW, De Samblanx GW, Thevissen K, Goderi...",FEBS Lett. 1995 Jul 17;368(2):257-62. PubMed.,Not Found
3,AP00004,"Ct-AMP1 (CtAMP1, C. ternatea-antimicrobial pep...",Clitoria ternatea,NLCERASLTWTGNCGNTGHCDTQCRNWESAKHGACHKRGNWKCFCYFDC,49,5,36%,2.43,Bridge,,Reference ID: Ref,"Anti-Gram+, Antifungal",,In medium A supplemented with 1 mM CaCl2 and 5...,Isolation and characterisation of plant defens...,"Osborn RW, De Samblanx GW, Thevissen K, Goderi...",FEBS Lett. 1995 Jul 17;368(2):257-62. PubMed.,Not Found
4,AP00005,"Andropin (natural AMPs; insects, arthropods, i...","Fruit fly, Drosophila melanogaster",VFIDILDKVENAIHNAAQVGIGFAKPFEKLINPK,34,1,50%,0.55,Unknown,,SwissProt ID: P21663 Go to SwissProt,Anti-Gram+,,Active against B. megatherium Bml 1 (MIC 11 uM...,"The andropin gene and its product, a male-spec...","Samakovlis, C., Kylsten, P., Kimbrell, DA., En...",EMBO J. 1991; 10:163-169. PubMed.,Not Found
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3574,AP03575,"Spampcin(65-86) (Spa22; Synthetic, shorter fra...",truncation,RLRAPPPFHKRCVCLCRTAPPP,22,6,41%,2.18,Unknown,,Reference ID: Ref,"Anti-Gram+ & Gram-, Antifungal",,APD analysis reveals this sequence is most sim...,A Novel Antimicrobial Peptide Spampcin56-86 fr...,"Jiang M, Chen R, Zhang J, Chen F, Wang KJ. 2022",Int J Mol Sci. 2022 Nov 1;23(21):13316. doi: 1...,https://doi.org/10.3390/ijms232113316
3575,AP03576,"Spampcin(65-78) (Spa14; Synthetic, shortest fr...",truncation,RLRAPPPFHKRCVC,14,5,43%,2.76,Unknown,,Reference ID: Ref,"Anti-Gram+ & Gram-, Antifungal",,APD analysis reveals this sequence is most sim...,A Novel Antimicrobial Peptide Spampcin56-86 fr...,"Jiang M, Chen R, Zhang J, Chen F, Wang KJ. 2022",Int J Mol Sci. 2022 Nov 1;23(21):13316. doi: 1...,https://doi.org/10.3390/ijms232113316
3576,AP03577,"MDN-0066 (Leu-rich, natural AMPs, Gram-negativ...",Pseudomonas moraviensis HN2,LETLLSLI,8,-1,63%,-1.47,Unknown,,Reference ID: Ref,Anti-Gram+ & Gram-,,APD analysis reveals this sequence is most sim...,Genome mining and chemical characterization of...,Ma Z. 2023,J Antibiot (Tokyo). 2023 Jan 26. doi: 10.1038/...,https://doi.org/10.1038/s41429-023-00597-z
3577,AP03578,"MDN-0066-beta (Leu-rich, Gram-negative bacteri...",Pseudomonas moraviensis HN2,LETLLSLV,8,-1,63%,-1.36,Unknown,,Reference ID: Ref,Anti-Gram+ & Gram-,,APD analysis reveals this sequence is most sim...,Genome mining and chemical characterization of...,Ma Z. 2023,J Antibiot (Tokyo). 2023 Jan 26. doi: 10.1038/...,https://doi.org/10.1038/s41429-023-00597-z
