Imports and function definitions

In [12]:
import pandas as pd
import concurrent.futures
import re
import requests
from zeep import Client
import hashlib
import numpy as np
from Bio import Entrez
import statistics

# WSDL URL of the BRENDA SOAP service; used to build the zeep client below.
wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
# SHA-256 hex digest of the account password; passed as the second argument of
# every SOAP call made by the Get* functions.
# NOTE(review): the plaintext password is hard-coded in this file — consider
# loading it from the environment or a secrets store instead.
password = hashlib.sha256("anh2cuchit".encode("utf-8")).hexdigest()
# NOTE(review): "\@" is not a valid Python escape, so the address contains a
# literal backslash — possibly an export artifact; confirm the intended value.
email = "phi.nguyenphinguyen\@hcmut.edu.vn"
client = Client(wsdl)

# EC number queried by every Get* function and read as a global by MakeDDF.
ec_num = "3.2.1.23"


def PubIDfromLitID(ec_num, ec_lit):
    """Look up the PubMed ID linked from a BRENDA literature page.

    Requests the BRENDA ``literature.php`` page for ``ec_num`` and the first
    element of ``ec_lit``, then extracts the first PubMed URL found in the
    returned HTML.

    Args:
        ec_num: EC number inserted into the ``e=`` query parameter.
        ec_lit: Indexable collection of literature references; only element
            0 is used (inserted into the ``r=`` query parameter).

    Returns:
        The PubMed ID as a string of digits, or ``float("nan")`` when
        ``ec_lit`` is ``None``/empty or the page contains no PubMed link.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    # Guard against records without any literature reference; indexing
    # ec_lit[0] unconditionally would raise on an empty collection.
    if ec_lit is None or len(ec_lit) == 0:
        return float("nan")

    lit_payload = f"https://www.brenda-enzymes.org/literature.php?e={ec_num}&r={ec_lit[0]}"
    # A timeout keeps a single stalled connection from blocking the whole run.
    lit_respond = requests.get(lit_payload, timeout=30).text

    # Only the first PubMed link on the page is needed.
    match = re.search(r"https:\/\/pubmed\.ncbi\.nlm\.nih\.gov\/([0-9]+)", lit_respond)
    if match is None:
        return float("nan")
    return match.group(1)

def MakeDDF(data):  # Create dataframe from extract list of dictionary
    """Turn a list of record mappings into a DataFrame.

    Column names come from the keys of the first record; every record is
    indexed by those same keys. The "ecNumber" column is dropped, and each
    "literature" entry is replaced by the value PubIDfromLitID returns for it
    (using the module-level ``ec_num``).

    Args:
        data: Non-empty sequence of records supporting iteration over keys
            and ``record[key]`` lookup.

    Returns:
        The resulting pandas DataFrame.
    """
    column_names = list(data[0])

    # One tuple per record, with values ordered like the column names.
    rows = [tuple(record[name] for name in column_names) for record in data]

    frame = pd.DataFrame(rows, columns=column_names)
    frame = frame.drop("ecNumber", axis=1)
    frame["literature"] = frame["literature"].apply(
        lambda lit: PubIDfromLitID(ec_num, lit)
    )
    return frame

def RepuScore_html(pub_id):  # Get reputation score from html
    """Return the "cited in" count PubMed reports for ``pub_id``.

    Requests the PubMed "cited in" listing for the given ID and reads the
    first ``<span class="value">N</span>`` element from the HTML.

    Args:
        pub_id: PubMed ID, or NaN/None when no ID is known.

    Returns:
        The parsed count as an ``int``; 0 when ``pub_id`` is missing or the
        page contains no matching element.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    # No PubMed ID means nothing to look up: skip the HTTP round trip
    # instead of querying PubMed with "nan" in the URL.
    if pd.isna(pub_id):
        return 0

    payload = f"https://pubmed.ncbi.nlm.nih.gov/?linkname=pubmed_pubmed_citedin&from_uid={pub_id}"
    # A timeout keeps a single stalled connection from blocking the whole run.
    html_doc = requests.get(payload, timeout=30).text

    match = re.search(r"<span class=\"value\">(\d+)</span>", html_doc)
    return int(match.group(1)) if match else 0

def GetPhStability():  ## Get pH stability
    """Fetch pH-stability records for the module-level ``ec_num``.

    Calls the SOAP operation ``getPhStability`` with the module-level
    ``email``/``password`` credentials, retrying until a call succeeds, and
    converts the returned records with ``MakeDDF``.

    Returns:
        The DataFrame produced by ``MakeDDF``.
    """
    phsta_parameters = (email, password, f"ecNumber*{ec_num}",
                        "phStability*", "phStabilityMaximum*",
                        "commentary*", "organism*", "literature*")

    while True:
        try:
            result_phsta = client.service.getPhStability(*phsta_parameters)
        except Exception:
            # Retry on any service/transport error. Catching Exception (not a
            # bare except) lets KeyboardInterrupt/SystemExit propagate, so the
            # retry loop can still be interrupted.
            continue
        break

    ph_sta = MakeDDF(result_phsta)
    print("Get pH stability done!")
    return ph_sta

def GetPhRange():  ## Get pH range
    """Fetch pH-range records for the module-level ``ec_num``.

    Calls the SOAP operation ``getPhRange`` with the module-level
    ``email``/``password`` credentials, retrying until a call succeeds, and
    converts the returned records with ``MakeDDF``.

    Returns:
        The DataFrame produced by ``MakeDDF``.
    """
    phra_parameters = (email, password, f"ecNumber*{ec_num}",
                       "phRange*", "phRangeMaximum*",
                       "commentary*", "organism*", "literature*")

    while True:
        try:
            result_phra = client.service.getPhRange(*phra_parameters)
        except Exception:
            # Retry on any service/transport error. Catching Exception (not a
            # bare except) lets KeyboardInterrupt/SystemExit propagate, so the
            # retry loop can still be interrupted.
            continue
        break

    ph_range = MakeDDF(result_phra)
    print("Get pH range done!")
    return ph_range

def GetPhOptimal():  ## Get pH optimal
    """Fetch pH-optimum records for the module-level ``ec_num``.

    Calls the SOAP operation ``getPhOptimum`` with the module-level
    ``email``/``password`` credentials, retrying until a call succeeds, and
    converts the returned records with ``MakeDDF``.

    Returns:
        The DataFrame produced by ``MakeDDF``.
    """
    phop_parameters = (email, password, f"ecNumber*{ec_num}",
                       "phOptimum*", "phOptimumMaximum*",
                       "commentary*", "organism*", "literature*")

    while True:
        try:
            result_phop = client.service.getPhOptimum(*phop_parameters)
        except Exception:
            # Retry on any service/transport error. Catching Exception (not a
            # bare except) lets KeyboardInterrupt/SystemExit propagate, so the
            # retry loop can still be interrupted.
            continue
        break

    ph_opt = MakeDDF(result_phop)
    print("Get pH optimal done!")
    return ph_opt

def GetTemperatureOptimum():  ## Get Temperature Optimum
    """Fetch temperature-optimum records for the module-level ``ec_num``.

    Calls the SOAP operation ``getTemperatureOptimum`` with the module-level
    ``email``/``password`` credentials, retrying until a call succeeds, and
    converts the returned records with ``MakeDDF``.

    Returns:
        The DataFrame produced by ``MakeDDF``.
    """
    temop_parameters = (email, password, f"ecNumber*{ec_num}",
                        "temperatureOptimum*", "temperatureOptimumMaximum*",
                        "commentary*", "organism*", "literature*")

    while True:
        try:
            result_temop = client.service.getTemperatureOptimum(*temop_parameters)
        except Exception:
            # Retry on any service/transport error. Catching Exception (not a
            # bare except) lets KeyboardInterrupt/SystemExit propagate, so the
            # retry loop can still be interrupted.
            continue
        break

    tem_opt = MakeDDF(result_temop)
    print("Get Temperature Optimum done!")
    return tem_opt

def GetTemperatureRange():  ## Get Temperature Range
    """Fetch temperature-range records for the module-level ``ec_num``.

    Calls the SOAP operation ``getTemperatureRange`` with the module-level
    ``email``/``password`` credentials, retrying until a call succeeds, and
    converts the returned records with ``MakeDDF``.

    Returns:
        The DataFrame produced by ``MakeDDF``.
    """
    temran_parameters = (email, password, f"ecNumber*{ec_num}",
                         "temperatureRange*", "temperatureRangeMaximum*",
                         "commentary*", "organism*", "literature*")

    while True:
        try:
            result_temran = client.service.getTemperatureRange(*temran_parameters)
        except Exception:
            # Retry on any service/transport error. Catching Exception (not a
            # bare except) lets KeyboardInterrupt/SystemExit propagate, so the
            # retry loop can still be interrupted.
            continue
        break

    tem_range = MakeDDF(result_temran)
    print("Get Temperature Range done!")
    return tem_range

def GetTemperatureStability():  ## Get Temperature Stability
    """Fetch temperature-stability records for the module-level ``ec_num``.

    Calls the SOAP operation ``getTemperatureStability`` with the
    module-level ``email``/``password`` credentials, retrying until a call
    succeeds, and converts the returned records with ``MakeDDF``.

    Returns:
        The DataFrame produced by ``MakeDDF``.
    """
    temsta_parameters = (email, password, f"ecNumber*{ec_num}",
                         "temperatureStability*", "temperatureStabilityMaximum*",
                         "commentary*", "organism*", "literature*")

    while True:
        try:
            result_temsta = client.service.getTemperatureStability(*temsta_parameters)
        except Exception:
            # Retry on any service/transport error. Catching Exception (not a
            # bare except) lets KeyboardInterrupt/SystemExit propagate, so the
            # retry loop can still be interrupted.
            continue
        break

    tem_sta = MakeDDF(result_temsta)
    print("Get Temperature Stability done!")
    return tem_sta

def ClearData(ph_range):  # Replace -999 and add reputation score
    """Return a cleaned copy of ``ph_range`` with a "reputation" column.

    Every column other than "organism", "literature" and "commentary" is
    coerced to numeric (unparseable values become NaN), the value -999 is
    replaced by NaN, and a "reputation" column is added by calling
    ``RepuScore_html`` on each "literature" entry.

    Args:
        ph_range: DataFrame containing at least a "literature" column.

    Returns:
        A new DataFrame; the input frame is left unmodified.
    """
    # Work on a copy so the numeric coercion below does not alter the
    # caller's DataFrame in place.
    cleaned = ph_range.copy()

    text_columns = ("organism", "literature", "commentary")
    value_columns = [col for col in cleaned.columns if col not in text_columns]
    cleaned[value_columns] = cleaned[value_columns].apply(pd.to_numeric, errors="coerce")

    # Replace the -999 placeholder with NaN.
    cleaned = cleaned.replace(-999, np.nan)
    # Find the reputation score for each literature entry.
    cleaned["reputation"] = cleaned["literature"].apply(lambda x: RepuScore_html(x))
    return cleaned

def GroupByOrg(ph_range):  # Group data by organism
    """Aggregate a cleaned table per organism.

    Runs ``ClearData`` on the input, discards the "literature" and
    "commentary" columns, then builds one row per organism holding the mean
    of each remaining value column and the sum of "reputation".

    Args:
        ph_range: Raw DataFrame as accepted by ``ClearData``.

    Returns:
        DataFrame indexed by organism: mean value columns followed by the
        summed "reputation" column.
    """
    cleaned = ClearData(ph_range)
    per_row = cleaned.drop(columns=["literature", "commentary"], errors="ignore")

    # Value columns are averaged per organism; reputation is summed.
    mean_by_org = (
        per_row.drop(columns=["reputation"], errors="ignore")
        .groupby("organism")
        .mean()
    )
    repu_by_org = per_row[["reputation", "organism"]].groupby("organism").sum()

    return pd.merge(
        mean_by_org,
        repu_by_org,
        how="outer",
        left_index=True,
        right_index=True,
    )

def ProcessInfor(ph_range, ph_opt, ph_sta):
    """Combine three raw tables into one per-organism summary.

    Each input is aggregated with ``GroupByOrg`` on its own worker thread.
    The value columns of the three results are outer-merged on the organism
    index, and a single "reputation" column holding the row-wise sum of the
    three per-table reputation columns is appended.

    Args:
        ph_range: Raw range table.
        ph_opt: Raw optimum table.
        ph_sta: Raw stability table.

    Returns:
        DataFrame indexed by organism with the merged value columns and the
        total "reputation".
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
        pending = [
            executor.submit(GroupByOrg, frame)
            for frame in (ph_range, ph_sta, ph_opt)
        ]
    # The pool has shut down here, so every future is already finished.
    by_range, by_sta, by_opt = (job.result() for job in pending)

    on_index = dict(how="outer", left_index=True, right_index=True)

    # Total reputation: optimum + stability first, then add range.
    repu_opt_sta = pd.merge(
        by_opt["reputation"], by_sta["reputation"], **on_index
    ).sum(axis=1)
    repu_opt_sta.name = "reputation"

    repu_total = pd.merge(
        repu_opt_sta, by_range["reputation"], **on_index
    ).sum(axis=1)
    repu_total.name = "reputation"

    # Value columns only (everything except "reputation"), merged in the
    # order range, stability, optimum.
    values_range, values_sta, values_opt = (
        grouped.drop(columns=["reputation"], errors="ignore")
        for grouped in (by_range, by_sta, by_opt)
    )
    merged_values = pd.merge(values_range, values_sta, **on_index)
    merged_values = pd.merge(merged_values, values_opt, **on_index)

    return pd.merge(merged_values, repu_total, **on_index)

def CompleteData(ph_data, tem_data):
    """Join the pH and temperature summaries and rank rows by reputation.

    The two frames are outer-merged on their index; overlapping column names
    receive the suffixes "_ph" and "_tem". A "sum_reputation" column is
    added as the row-wise sum of "reputation_ph" and "reputation_tem", and
    rows are sorted in descending order by "sum_reputation",
    "reputation_ph", then "reputation_tem".

    Args:
        ph_data: DataFrame with a "reputation" column.
        tem_data: DataFrame with a "reputation" column.

    Returns:
        The merged, sorted DataFrame.
    """
    combined = pd.merge(
        ph_data,
        tem_data,
        how="outer",
        left_index=True,
        right_index=True,
        suffixes=("_ph", "_tem"),
    )
    reputation_cols = ["reputation_ph", "reputation_tem"]
    combined["sum_reputation"] = combined[reputation_cols].sum(axis=1)

    sort_keys = ["sum_reputation"] + reputation_cols
    return combined.sort_values(by=sort_keys, ascending=False)

Get data

In [2]:
# Fetch all six BRENDA tables concurrently, one worker thread per Get* call.
with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
	thread1 = executor.submit(GetPhStability)
	thread2 = executor.submit(GetPhRange)
	thread3 = executor.submit(GetPhOptimal)


	thread4 = executor.submit(GetTemperatureOptimum)
	thread5 = executor.submit(GetTemperatureRange)
	thread6 = executor.submit(GetTemperatureStability)


# Leaving the with-block waits for every submitted task; .result() returns the
# task's DataFrame or re-raises the exception raised inside the worker.
ph_sta = thread1.result()
ph_range = thread2.result()
ph_opt = thread3.result()

tem_opt = thread4.result()
tem_range = thread5.result()
tem_sta = thread6.result()

Get pH range done!
Get Temperature Range done!
Get pH stability done!
Get Temperature Optimum done!
Get Temperature Stability done!
Get pH optimal done!


In [3]:
# Preview the first rows of the raw pH-range table.
ph_range.head()

Unnamed: 0,literature,phRange,phRangeMaximum,commentary,organism
0,17459724.0,-999.0,,"pH activity profile of the recombinant enzyme,...",Arthrobacter psychrolactophilus
1,19453169.0,0.5,8.0,,Bispora sp.
2,,2.2,4.6,about 50% of maximal activity at pH 2.2 and at...,Marchantia polymorpha
3,,2.3,6.0,"pH 2.3: about 30% of maximal activity, pH 6.0:...",Papiliotrema laurentii
4,236999.0,2.5,6.0,"pH 2.5: about 50% of maximal activity, pH 6.0:...",Aspergillus oryzae


In [13]:
# Coerce value columns to numeric, replace -999 with NaN and add "reputation".
clear_ph_range = ClearData(ph_range)

In [15]:
# Preview the first rows of the cleaned table.
clear_ph_range.head()

Unnamed: 0,literature,phRange,phRangeMaximum,commentary,organism,reputation
0,17459724.0,,,"pH activity profile of the recombinant enzyme,...",Arthrobacter psychrolactophilus,10
1,19453169.0,0.5,8.0,,Bispora sp.,7
2,,2.2,4.6,about 50% of maximal activity at pH 2.2 and at...,Marchantia polymorpha,0
3,,2.3,6.0,"pH 2.3: about 30% of maximal activity, pH 6.0:...",Papiliotrema laurentii,0
4,236999.0,2.5,6.0,"pH 2.5: about 50% of maximal activity, pH 6.0:...",Aspergillus oryzae,14


In [39]:
# Keep every column except "literature" and "commentary" (same step as in GroupByOrg).
extr_ph_range = clear_ph_range.loc[:, ~clear_ph_range.columns.isin(["literature","commentary"])]
extr_ph_range.head()

Unnamed: 0,phRange,phRangeMaximum,organism,reputation
0,,,Arthrobacter psychrolactophilus,10
1,0.5,8.0,Bispora sp.,7
2,2.2,4.6,Marchantia polymorpha,0
3,2.3,6.0,Papiliotrema laurentii,0
4,2.5,6.0,Aspergillus oryzae,14


In [66]:
# All column names except "organism".
collay = [i for i in extr_ph_range.columns.values if i != 'organism']
# NOTE(review): convert_dtypes() returns a new object and its result is
# discarded here, so extr_ph_range itself is not changed by this line.
extr_ph_range.loc[:,collay].convert_dtypes()
extr_ph_range.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   phRange         48 non-null     float64
 1   phRangeMaximum  48 non-null     float64
 2   organism        49 non-null     object 
 3   reputation      49 non-null     float64
dtypes: float64(3), object(1)
memory usage: 1.7+ KB


In [85]:
# Drop the "reputation" column and group the remaining columns by organism.
# NOTE(review): no aggregation is applied, so tem1 is a GroupBy object rather
# than a DataFrame; GroupByOrg() applies .mean() at this step.
tem1 = extr_ph_range.loc[:,~extr_ph_range.columns.isin(["reputation"])].groupby(["organism"])
tem1.head()

Unnamed: 0,phRange,phRangeMaximum,organism
0,,,Arthrobacter psychrolactophilus
1,0.5,8.0,Bispora sp.
2,2.2,4.6,Marchantia polymorpha
3,2.3,6.0,Papiliotrema laurentii
4,2.5,6.0,Aspergillus oryzae
5,2.5,5.0,Oryza sativa
6,3.0,6.0,Prunus persica
7,3.0,7.0,Tausonia pullulans
8,3.0,8.5,Trichoderma reesei
9,3.0,6.5,Aspergillus oryzae


In [17]:
# Sum "reputation" per organism, then outer-merge with tem1 on the index.
# NOTE(review): the cell that defines tem1 in this file leaves it as an
# un-aggregated GroupBy object — confirm tem1 holds the intended frame here.
tem2 = extr_ph_range.loc[:,["reputation","organism"]].groupby("organism").sum()
total = pd.merge(tem1, tem2, how="outer", left_index=True, right_index=True)
total

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0_level_0,reputation
organism,Unnamed: 1_level_1
Achatina achatina,5
Arthrobacter psychrolactophilus,20
Arthrobacter sp.,27
Aspergillus oryzae,28
Bacillus licheniformis,0
Bacillus sp. (in: Bacteria),0
Bacteroides polypragmatus,0
Bispora sp.,7
Caldicellulosiruptor saccharolyticus,0
Escherichia coli,0


In [None]:
# Same steps as the first part of ProcessInfor(), run at notebook level:
# aggregate the three pH tables per organism on separate worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    thread1 = executor.submit(GroupByOrg,ph_range)
    thread2 = executor.submit(GroupByOrg,ph_sta)
    thread3 = executor.submit(GroupByOrg,ph_opt)

# .result() returns each worker's DataFrame or re-raises its exception.
extr_ph_range = thread1.result()
extr_ph_sta = thread2.result()
extr_ph_opt = thread3.result()

In [11]:
# Same merging steps as the second part of ProcessInfor(), run at notebook level.
# Row-wise sum of the optimum and stability reputation columns.
total_repu1 = pd.merge(extr_ph_opt["reputation"], extr_ph_sta["reputation"],how="outer", left_index=True, right_index=True).sum(axis=1)
total_repu1.name = "reputation"

# Add the range reputation to get the total per organism.
total_repu2 = pd.merge(total_repu1, extr_ph_range["reputation"],how="outer", left_index=True, right_index=True).sum(axis=1)
total_repu2.name = "reputation"

# Outer-merge the non-reputation columns of the three tables on the index.
data1 = pd.merge(extr_ph_range.iloc[:,~extr_ph_range.columns.isin(["reputation"])],
     extr_ph_sta.iloc[:,~extr_ph_sta.columns.isin(["reputation"])],
     how="outer", left_index=True, right_index=True)
data2 = pd.merge(data1,
         extr_ph_opt.iloc[:,~extr_ph_opt.columns.isin(["reputation"])],
         how="outer", left_index=True, right_index=True)

# Attach the total reputation column to the merged value columns.
total_data = pd.merge(data2, total_repu2, how="outer", left_index=True, right_index=True)

  result = self.fn(*self.args, **self.kwargs)


KeyboardInterrupt: 