### Import Package:

In [1]:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import pandas as pd
from collections import Counter
import re
import os

### Import Data

In [27]:
# List every entry of the PPsample folder, which stores the protein sequence files.
# os.path.join builds the same ".\PPsample" path on Windows but also works on
# systems whose separator is "/", where the hard-coded backslash would not resolve.
files = os.listdir(os.path.join(".", "PPsample"))
files

['samplingPorphyra dentata',
 'samplingPorphyra dentata.csv',
 'samplingporphyra haitanensis',
 'samplingporphyra haitanensis.csv',
 'samplingPorphyra perforata',
 'samplingPorphyra perforata.csv',
 'samplingPorphyra purpurea',
 'samplingPorphyra purpurea.csv',
 'samplingPorphyra schizophylla.csv',
 'samplingPorphyra umbilicalis',
 'samplingPorphyra umbilicalis.csv']

### Extract the sequence from a csv file:

In [28]:
# Work on the first CSV file of the PPsample folder.
# `files` can also contain entries without a ".csv" extension, so picking
# files[0] blindly may hand pandas something that is not a CSV file. Take the
# first name ending in ".csv" and fall back to files[0] only when there is none.
csv_names = [name for name in files if name.lower().endswith(".csv")]
first_csv = csv_names[0] if csv_names else files[0]
protein_csv_location = os.path.join(".", "PPsample", first_csv)
protein_data = pd.read_csv(protein_csv_location)
data1 = list(protein_data["Sequence"])  # values of the "Sequence" column
data2 = list(protein_data["Length"])  # values of the "Length" column

### Function section:

In [4]:
def openbrower():
    """
    Start a Firefox WebDriver session used for all later page requests.

    The session is given an implicit wait of 10 (seconds in Selenium's API)
    for element lookups and a window size of 300 x 6600.
    return driver, the configured WebDriver
    """
    browser = webdriver.Firefox()
    browser.implicitly_wait(10)
    browser.set_window_size(300, 6600)
    return browser

def seqtolink(driver,seq,enzyme="&enz1=13&enz2=12&enz3=11"):
    """
    Build the BIOPEP report URL for one sequence and load it in the browser.

    driver: object whose get(url) method opens the page
    seq: protein sequence text placed after "txt_seq=" in the query string
    enzyme: query-string fragment selecting the enzymes; the default uses
            pepsin (No. 13), trypsin (No. 12) and chymotrypsin A (No. 11)
    return None
    """
    report_url = "".join((
        "https://biochemia.uwm.edu.pl/biopep/report_cutting_for_seq.php?txt_seq_e=&txt_seq=",
        seq,
        enzyme,
        "&but_report.x=169&but_report.y=10&prot=&e2=&e3=&ktory=&e1=eee",
    ))
    driver.get(report_url)

def getdata(driver,seqid,content):
    """
    Read the hydrolysis fragments from the current report page.

    driver: WebDriver currently showing the enzyme-action report page
    seqid: index of the protein sequence, written next to every fragment
    content: "peptide,seqid" rows accumulated from earlier sequences
    return (content, hpep): content extended with one row per fragment of this
    sequence, and hpep, the comma-separated fragments of this sequence only
    """
    hpep_xpath = "/html/body/div/table[1]/tbody"
    # The page separates fragments with " - "; turn that into commas.
    hpep = driver.find_element(By.XPATH,hpep_xpath).text.replace(" - ",",")
    hpep = hpep.replace("Results of enzyme action\n","")
    # Write one complete "peptide,seqid" row per fragment. Replacing each
    # comma with ",seqid\n" (the previous approach) left the LAST fragment
    # without a seqid, producing a malformed row for every sequence.
    # Empty fragments carry no peptide and are skipped.
    rows = [fragment+","+str(seqid)+"\n" for fragment in hpep.split(",") if fragment]
    content += "".join(rows)
    return content,hpep

In [5]:
def statpep(string,num,content):
    """
    Count how often each peptide occurs among the fragments of one protein.

    string: rows accumulated so far
    num: index of the protein sequence, written as the last field of each row
    content: comma-separated peptide fragments
    return string extended with one "peptide,frequency,num" line per distinct
    peptide, in order of first appearance
    """
    tally = Counter(content.split(","))
    rows = [
        peptide + "," + str(count) + "," + str(num) + "\n"
        for peptide, count in tally.items()
    ]
    return string + "".join(rows)

In [6]:
def intoperportyPage(driver):
    """
    Click the 8th input of the form in the second cell of the page's second
    table, then pause 3 seconds.

    Presumably this opens the page with the hydrolysis parameters (DHt and
    others) -- confirm against the site.
    return None
    """
    button = driver.find_element(
        By.XPATH, "/html/body/div/table[2]/tbody/tr/td[2]/form/input[8]"
    )
    button.click()
    time.sleep(3)  # fixed pause after the click before the next lookup

def _GetDHt(driver):
    """
    Read the text of the first table inside the page's form and remove the
    "DHt [%] \n" label from it.
    return the remaining text
    """
    table_body = driver.find_element(By.XPATH, "/html/body/form/table[1]/tbody")
    return table_body.text.replace("DHt [%] \n", "")

def _recordDHt(string,seqid,data):
    string += data+","+str(seqid)+"\n"
    return string

def _GetParameter(driver):
    """
    Collect the text of every row (<tr>) of the second table inside the
    page's form.
    return a list with one string per row
    """
    second_table = driver.find_element(By.XPATH, "/html/body/form/table[2]")
    return [row.text for row in second_table.find_elements(By.TAG_NAME, 'tr')]

def intoperportyPage2(driver):
    """
    Click the 9th input of the form in the first cell of the page's second
    table, then pause 2 seconds.

    Presumably this opens the page listing the active peptides released by
    the enzymes -- confirm against the site.
    return None
    """
    button = driver.find_element(
        By.XPATH, "/html/body/div/table[2]/tbody/tr/td[1]/form/input[9]"
    )
    button.click()
    time.sleep(2)  # fixed pause after the click before the next lookup

def getaop(driver):
    """
    Collect the text of every row (<tr>) of the table inside the page's form.
    return a list with one string per row
    """
    table_body = driver.find_element(By.XPATH, "/html/body/form/table/tbody")
    return [row.text for row in table_body.find_elements(By.TAG_NAME, 'tr')]

def foundParameter(clist):
    """
    Look up the antioxidative entry among the table rows of one protein.

    clist: list of row texts
    return the first row containing "antioxidative", with every double space
    turned into a comma and its first two characters dropped; when no row
    matches, return the placeholder "antioxidative,0,0,0,0"
    """
    first_hit = next(
        (row for row in clist if re.search("antioxidative", row)),
        None,
    )
    if first_hit is None:
        return "antioxidative,0,0,0,0"
    # The slice removes the two leading characters of the row (presumably a
    # row marker in front of the activity name -- confirm against the page).
    return first_hit.replace("  ", ",")[2:]

def foundParameter2(clist,string,seqid):
    """
    Record the distinct antioxidative entries found for one protein.

    clist: list of row texts whose fields are separated by double spaces
    string: rows accumulated so far
    seqid: index of the protein sequence, written as the last field
    return string extended with one "value,seqid" line per distinct third
    field of the rows containing "antioxidative" (presumably the peptide
    sequence -- confirm against the page), or a single "None,seqid" line
    when no row matches

    Raises IndexError when a matching row has fewer than three fields.
    """
    pattern = "antioxidative"
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # output is the same on every run. Going through set() made the row order
    # depend on string hashing, which changes between interpreter runs.
    distinct = dict.fromkeys(
        line.split("  ")[2] for line in clist if re.search(pattern, line)
    )
    if not distinct:
        return string + "None" + "," + str(seqid) + "\n"
    return string + "".join(value + "," + str(seqid) + "\n" for value in distinct)

### Main section:

In [30]:
pepFragmentcontent=""  #collect enzyme action peptide fragments ("peptide,seqid" rows)
freqstring = ""  #frequency of each peptide within a sequence
DHTstring = ""  #record DHt
otherparameters = ""  #record the other parameters such as AE and W
aopstring = ""  #record the AOP found in a sequence after enzyme action
badpep = ""  #record the sequences which can't be analysed


driver = openbrower() #open the browser
try:
    for i, seq in enumerate(data1):
        # pandas loads an empty cell as NaN (a float). len() on it raised
        # TypeError, and the error handler then crashed as well while
        # concatenating str + float. Skip anything that is not a non-empty
        # string, which is what the emptiness check was meant to do.
        if not isinstance(seq, str) or not seq:
            continue
        try:
            seqtolink(driver,seq) #go to the page showing the enzyme action result
            pepFragmentcontent,content = getdata(driver,i,pepFragmentcontent) #collect the enzyme action fragments

            freqstring = statpep(freqstring,i,content) #get each peptide frequency

            intoperportyPage(driver) #into new page

            prodata = _GetDHt(driver)
            DHTstring = _recordDHt(DHTstring,i,prodata)

            oplist = _GetParameter(driver)  #get other parameters such as AE, W
            otherparameters += foundParameter(oplist)+","+str(i)+"\n"

            seqtolink(driver,seq) #reload the report page before the second lookup

            intoperportyPage2(driver)
            aoplist = getaop(driver)
            aopstring = foundParameter2(aoplist,aopstring,i)
            if i%100 == 0:
                print("Luckly! Hydrolysis information for "+str(i)+" sequences has been obtained.")
        except Exception as e:
            # Best effort: report the failure, remember the sequence and go on.
            print(e)
            print("Note! "+str(i)+" is Error!")
            # NOTE(review): pepid is i+1 here while the other outputs use i as
            # seqid; kept as is because downstream use is not visible here.
            badpep += str(i+1)+","+seq+"\n"
finally:
    # Always shut the browser down, even when the run is interrupted.
    driver.quit()

# (file name, header line, accumulated rows), written in this order.
# The with-statement closes each file, so no explicit close() is needed.
outputs = [
    ("DHT.csv", "DHT,seqid\n", DHTstring),
    ("OtherP.csv", "Activity,AE,W,BE,V,seqid\n", otherparameters),
    ("AOP.csv", "AOP,seqid\n", aopstring),
    ("enzymeP.csv", "peptide,seqid\n", pepFragmentcontent),
    ("enzymePstat.csv", "peptide,freq,seqid\n", freqstring),
    ("badpep.csv", "pepid,sequence\n", badpep),
]
for filename, header, body in outputs:
    with open(filename,"w") as f:
        f.write(header)
        f.write(body)

0


In [20]:
# Write every accumulated result string to its CSV file again: a header line
# followed by the rows collected by the main loop. Each with-statement closes
# its file on exit.
with open("DHT.csv","w") as out:
    out.writelines(("DHT,seqid\n", DHTstring))

with open("OtherP.csv","w") as out:
    out.writelines(("Activity,AE,W,BE,V,seqid\n", otherparameters))

with open("AOP.csv","w") as out:
    out.writelines(("AOP,seqid\n", aopstring))

with open("enzymeP.csv","w") as out:
    out.writelines(("peptide,seqid\n", pepFragmentcontent))

with open("enzymePstat.csv","w") as out:
    out.writelines(("peptide,freq,seqid\n", freqstring))

with open("badpep.csv","w") as out:
    out.writelines(("pepid,sequence\n", badpep))
    