In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from datetime import datetime
import pubmed as pm
import pandas as pd
import polars as pl
import utils
import time
import os

In [7]:
path = utils.path()
log_path = f'{path.split("PDBs_releases")[0]}log'
url = 'http://webs.iiitd.edu.in/raghava/cancerppd/browse_peptide.php'

In [3]:
data = {
    'CancerPPD ID': [],
    'PMID': [],
    'YEAR': [],
    'SEQUENCE': [],
    'NAME': [],
    'LENGTH': [],
    'LINEAR/CYCLIC': [],
    'CHIRAL': [],
    'CHEM-MOD': [],
    'C-ter MOD': [],
    'N-ter MOD': [],
    'NATURE': [],
    'ORIGIN': [],
    'CELL LINE': [],
    'CANCER TYPE': [],
    'ASSAY': [],
    'ACTIVITY': [],
    'TESTING TIME': [],
    'TISSUE EFFECTED': [],
    'PATENTS': [],
    'SMILES': []
}

In [4]:
driver = webdriver.Chrome()

In [5]:
count_errors = []
error_line = ''
count = 1 # max=623
while True:
    driver.get(url)
    time.sleep(2)
    
    #Show all peptides
    select = Select(driver.find_element(By.XPATH, '/html/body/table[4]/tfoot/tr/td/select'))
    select.select_by_value('1000')
    
    #Show details from one peptide
    try:
        pep = driver.find_element(By.XPATH, f'/html/body/table[4]/tbody/tr[{count}]/td[1]/a')
        error_line = f'{count}, {pep.text}'
        pep.click()
        time.sleep(2)
    except:
        print('Finished')
        break
        
    #Capture data
    try:
        table = driver.find_element(By.XPATH, '/html/body/div[4]/div[1]/table/tbody')
        elements = table.find_elements(By.TAG_NAME, 'tr')
        for element in elements:
            props = element.find_elements(By.TAG_NAME, 'td')
            index = 0
            for key in data.keys():
                try:
                    data[key].append(props[index].text)
                    index += 1
                except:
                    data[key].append('Not Found')
    except:
        count_errors.append(error_line)
    finally:
        count += 1

Finished


In [7]:
# Create DataFrame
df = pd.DataFrame(data)

In [8]:
# Get DOI from PubMed
doi_list = []

#for pmid in data['PMID']:
for pmid in df['PMID']:
    if pmid != 'None':
        if len(pmid.split(',')) > 1:
            [pmid1, pmid2] = pmid.split(', ')
            doi1 = pm.get_doi_from_pmid(driver, pmid1)
            doi2 = pm.get_doi_from_pmid(driver, pmid2)
            doi_list.append([doi1, doi2])
        else:
            doi = pm.get_doi_from_pmid(driver, pmid)
            doi_list.append(doi)
    else:
        doi_list.append('Not Found')
    
df['DOI'] = doi_list

In [9]:
driver.close()

In [23]:
#Write log file with failed peptides download 
dt = datetime.now()
with open(f'{log_path}\\CancerPPD_errors_{dt.strftime("%Y_%m_%d_%H_%M")}.log', 'a', encoding='utf8') as file:
    print('failed peptides download:', len(count_errors))
    for value in count_errors:
        file.write(value)
        print(value.split(', '))

failed peptides download: 0


In [42]:
# Save csv file
utils.pd_to_csv(df, path, 'CancerPPD')

Save files in:
D:\WilliamJSS\Projects\LBQC\LBQC-PDB\PDBs_releases/CancerPPD/CancerPPD_2023_03_14.csv
D:\WilliamJSS\Projects\LBQC\LBQC-PDB\PDBs_releases/CancerPPD.csv


In [41]:
df

Unnamed: 0,CancerPPD ID,PMID,YEAR,SEQUENCE,NAME,LENGTH,LINEAR/CYCLIC,CHIRAL,CHEM-MOD,C-ter MOD,...,ORIGIN,CELL LINE,CANCER TYPE,ASSAY,ACTIVITY,TESTING TIME,TISSUE EFFECTED,PATENTS,SMILES,DOI
0,No data available in table,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,...,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found
1,No data available in table,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,...,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found
2,No data available in table,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,...,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found
3,4644,8445676,1993,2-[[2-(dimethylamino)-3-methylbutanoyl]amino]-...,Dolastatin 10,Not Available,Not Available,Not Available,Not Available,Not Available,...,Dolabella auricularia,DB,Lymphoma Cancer,Cell Viability assay,IC50=.0013-.013 nM,Not Available,Blood,,,https://doi.org/10.1093/jnci/85.6.483
4,4645,8445676,1993,2-[[2-(dimethylamino)-3-methylbutanoyl]amino]-...,Dolastatin 10,Not Available,Not Available,Not Available,Not Available,Not Available,...,Dolabella auricularia,HT,Lymphoma Cancer,Cell Viability assay,IC50=.0013-.013 nM,Not Available,Blood,,,https://doi.org/10.1093/jnci/85.6.483
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5283,4223,16091933,2005,YSL,Tyroserleutide,3,Linear,L,,Free,...,Synthetic Peptide,BEL-7402,Liver Cancer,MTT/MTS assay,39.0% cytotoxic at 0.1 µg/ml,96-h,Liver,,N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CO)C(=O)N[C@@H]...,https://doi.org/10.1007/s00262-005-0024-7
5284,4224,16091933,2005,YSL,Tyroserleutide,3,Linear,L,,Free,...,Synthetic Peptide,BEL-7402,Liver Cancer,MTT/MTS assay,20.8% cytotoxic at 1 µg/ml,96-h,Liver,,N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CO)C(=O)N[C@@H]...,https://doi.org/10.1007/s00262-005-0024-7
5285,4225,16091933,2005,YSL,Tyroserleutide,3,Linear,L,,Free,...,Synthetic Peptide,BEL-7402,Liver Cancer,MTT/MTS assay,33.8% cytotoxic at 10 µg/ml,96-h,Liver,,N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CO)C(=O)N[C@@H]...,https://doi.org/10.1007/s00262-005-0024-7
5286,4226,16091933,2005,YSL,Tyroserleutide,3,Linear,L,,Free,...,Synthetic Peptide,BEL-7402,Liver Cancer,MTT/MTS assay,35.6% cytotoxic at 100 µg/ml,96-h,Liver,,N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CO)C(=O)N[C@@H]...,https://doi.org/10.1007/s00262-005-0024-7
