In [190]:
import pandas as pd
import time
from urllib.request import urlopen
from bs4 import BeautifulSoup
import os
import numpy as np


In [179]:

class Alkamid():
	"""Crawl the Alkamid results table and expose it as dicts and DataFrames.

	Instantiating the class immediately crawls ``self.url`` (network I/O),
	following the pagination links, and fills:

	* ``chemical_details``    -- chemical name -> ``{'trivial_name': set,
	  'formula': str, 'molecular_weight': str}``
	* ``chemicals_in_plants`` -- plant origin -> set of chemical names
	* ``details``             -- ``[chemical_details, chemicals_in_plants]``
	* ``df_chemical_details``, ``df_chemicals_in_plants``, ``df_details``
	  -- tabular views of the two dictionaries and their inner join
	* ``stat_unique_plants``, ``stat_unique_chemicals``, ``stat_unique_pairs``,
	  ``mean_standard_deviation`` (a ``(mean, std)`` tuple of chemicals per
	  plant), ``longest_chemical_name`` and the ``statistics`` dict that
	  gathers all of them

	A table cell containing only ``'-'`` is treated as missing and stored as
	``None``.

	Args:
		save_to_json: when truthy, write ``df_chemical_details`` and
			``df_chemicals_in_plants`` as JSON files into the current
			working directory after the crawl.
	"""

	def __init__(self, save_to_json=False):
		self.save_to_json = save_to_json
		self.url ='http://alkamid.ugent.be/alkamidresults.php'

		# raw crawl results
		self.chemical_details = {}
		self.chemicals_in_plants = {}
		self.details = []

		# tabular views, built once the crawl has finished
		self.df_details = pd.DataFrame()
		self.df_chemical_details = pd.DataFrame()
		self.df_chemicals_in_plants = pd.DataFrame()

		# statistics of the crawl, filled in by _finalize()
		self.statistics = {}
		self.stat_unique_plants = 0
		self.stat_unique_chemicals = 0
		self.stat_unique_pairs = 0
		self.mean_standard_deviation = 0
		self.longest_chemical_name = ''

		self.crawl()

		self.details = [self.chemical_details, self.chemicals_in_plants]

		if self.save_to_json:
			self._save_json()

	def crawl(self, query=''):
		"""Download and parse result pages starting at ``self.url + query``.

		After each page, the ``href`` of the first ``.pagenumber.unselected a``
		link is used as the next query. The crawl stops when a page has no
		such link or when the link points at a query that was already
		fetched, so a pagination cycle cannot loop forever. When the crawl
		stops, the statistics and DataFrames are rebuilt.

		Args:
			query: suffix appended to ``self.url`` for the first request.

		Raises:
			urllib.error.URLError: if a page cannot be fetched.
		"""
		visited = set()
		while query not in visited:
			visited.add(query)

			# The context manager closes the HTTP response once it is parsed.
			with urlopen(self.url + query) as response:
				soup = BeautifulSoup(response, 'lxml')

			# The first <tr> is skipped (presumably the table header); slicing
			# instead of `del` keeps a page without any rows from raising.
			for row in soup.find_all('tr')[1:]:
				cells = [row.select_one('td:nth-of-type(%d)' % position) for position in range(3, 8)]
				if any(cell is None for cell in cells):
					# not a data row (fewer than seven cells)
					continue
				self._record_row(*(cell.text for cell in cells))

			links = soup.select('.pagenumber.unselected a')
			next_page = links[0].get('href') if links else None
			if not next_page:
				break
			query = next_page

		self._finalize()

	def _record_row(self, chemical_name, trivial_name, formula, origin, molecular_weight):
		"""Store the text of one table row in the two result dictionaries."""
		# '-' marks a missing value in the name and origin columns
		if trivial_name == '-':
			trivial_name = None
		if origin == '-':
			origin = None
		if chemical_name == '-':
			chemical_name = None

		# A later row with the same chemical name replaces the earlier entry.
		self.chemical_details[chemical_name] = {
			'trivial_name' : {trivial_name},
			'formula' : formula,
			'molecular_weight' : molecular_weight
		}

		if chemical_name and len(self.longest_chemical_name) < len(chemical_name):
			self.longest_chemical_name = chemical_name

		# Every origin gets an entry, even if it never receives a named chemical.
		chemicals = self.chemicals_in_plants.setdefault(origin, set())
		if chemical_name:
			chemicals.add(chemical_name)

	def _finalize(self):
		"""Derive the statistics and the DataFrames from the crawled dictionaries."""
		named_plants = {plant: chemicals for plant, chemicals in self.chemicals_in_plants.items() if plant}
		self.stat_unique_plants = len(named_plants)
		self.stat_unique_chemicals = len({name for name in self.chemical_details if name})
		self.stat_unique_pairs = sum(len(chemicals) for chemicals in named_plants.values())

		# Chemicals per named plant; std is the sample standard deviation
		# (pandas default, ddof=1) and is reported as 0.0 below two plants.
		per_plant = pd.Series([len(chemicals) for chemicals in named_plants.values()], dtype='float64')
		mean_per_plant = float(per_plant.mean()) if len(per_plant) else 0.0
		std_per_plant = float(per_plant.std()) if len(per_plant) > 1 else 0.0
		self.mean_standard_deviation = (mean_per_plant, std_per_plant)

		self.statistics = {
			'unique_plants': self.stat_unique_plants,
			'unique_chemicals': self.stat_unique_chemicals,
			'unique_pairs': self.stat_unique_pairs,
			'mean_chemicals_per_plant': mean_per_plant,
			'std_chemicals_per_plant': std_per_plant,
			'longest_chemical_name': self.longest_chemical_name,
		}

		# Chemical details: one row per chemical. Building from records with
		# explicit columns also works when nothing was crawled; dropna()
		# removes the entry stored under the None (unnamed chemical) key.
		detail_columns = ['chemical', 'trivial_name', 'formula', 'molecular_weight']
		records = [
			{
				'chemical': name,
				'trivial_name': list(info['trivial_name']),
				'formula': info['formula'],
				'molecular_weight': info['molecular_weight'],
			}
			for name, info in self.chemical_details.items()
		]
		self.df_chemical_details = pd.DataFrame(records, columns=detail_columns).dropna()

		# Chemicals in plants: one row per (origin, chemical) pair. An origin
		# with an empty set explodes to a NaN chemical and is dropped.
		self.df_chemicals_in_plants = (
			pd.DataFrame(list(self.chemicals_in_plants.items()), columns=['origin', 'chemical'])
			.assign(chemical=lambda frame: frame['chemical'].apply(list))
			.explode('chemical')
			.dropna(subset=['chemical'])
		)

		# join the two frames on the chemical name
		self.df_details = pd.merge(self.df_chemicals_in_plants, self.df_chemical_details, on='chemical', how='inner')

	def _save_json(self):
		"""Write both DataFrames as ``<timestamp>_*.json`` in the working directory."""
		print('saving data to json files')
		path = os.getcwd()
		print ("The current working directory is %s" % path)

		# One timestamp for both files so the pair always shares a prefix.
		stamp = str(int(time.time()))
		exports = (
			('_chemical_details.json', self.df_chemical_details),
			('_chemicals_in_plants.json', self.df_chemicals_in_plants),
		)
		for suffix, frame in exports:
			with open(os.path.join(path, stamp + suffix), 'w', encoding='utf-8') as out_file:
				out_file.write(frame.to_json(orient='table'))




In [211]:
# from alkamid import Alkamid

# Crawl once and keep the instance: every Alkamid() call downloads all result
# pages again, so each attribute is read from this single object.
# Use Alkamid(save_to_json=True) to also write the JSON exports to the
# current working directory.
alkamid = Alkamid()

# Raw dictionaries; later cells build their DataFrames from these two names.
chemical_details = alkamid.chemical_details
chemicals_in_plants = alkamid.chemicals_in_plants
print(chemicals_in_plants)

# Also available on the same instance: alkamid.details,
# alkamid.df_chemical_details, alkamid.df_chemicals_in_plants.

{'Asteraceae': {'Deca-2E,6Z-diene-8,9-dihydroxic acid isobutylamide', 'Nona-2Z-ene-6,8-diynoic acid phenylethylamide', 'Hepta-2E,4E,6E-triene-thiophenic acid piperideide', 'Deca-2E,6Z,8E-trienoic acid isobutylamide', 'Deca-2E,6Z,8E-triene-10-isobutylcarboxylic acid isobutylamide', 'Deca-2E-ene-4,6,8-triynoic acid isobutylamide', 'Hexadeca-2Z,6Z,8Z,12E-tetraenoic-10-ynoic acid 2,3-didehydropyrrolidide', 'Hepta-2E,4E,6E-triene-thiophenic acid isobutylamide', 'Deca-2E,8Z-diene-4,6-diynoic acid isobutylamide', 'Tetradeca-2E,4E,8Z,10E-tetraene-7-hydroxic acid isobutylamide', 'Deca-2E,4E-dienoic acid piperidide', 'Deca-2E,4E,6Z,8E-tetraenoic acid isobutylamide', 'Undeca-7Z,9E-diene-2-hydroxic acid isobutylamide', 'Undeca-2E,4E-dienoic acid isobutylamide', 'Undeca-2E,4Z-diene-7,9-diynoic acid isobutylamide', 'Tetradeca-2E,4E-dienoic acid piperidide', 'Dodeca-2E,4E-dienoic acid 2-methylbutylamide', 'Deca-2E,4E-dienoic acid phenylethylamide', 'Hexadeca-2E,7Z-diene-10-ynoic acid 2,3-didehydropyr

In [185]:
# Build the chemical-details frame (indexed by chemical name) and the
# plant -> list-of-chemicals frame from two raw dictionaries.
# NOTE(review): `d1` and `d2` are not assigned in any visible cell --
# presumably the chemical_details and chemicals_in_plants dicts; confirm,
# otherwise this cell fails on a fresh kernel.
df_chemical_details = pd.DataFrame(d1)
df_chemical_details = df_chemical_details.transpose()
df_chemical_details['trivial_name'] = df_chemical_details['trivial_name'].apply(list)
df_chemical_details.index.name = 'chemical'
df_chemicals_in_plants = pd.DataFrame(d2.items(), columns = [ 'origin', 'chemical'])
df_chemicals_in_plants['chemical'] = df_chemicals_in_plants['chemical'].apply(list)

Unnamed: 0,origin,chemical
0,Asteraceae,"Deca-2E,6Z-diene-8,9-dihydroxic acid isobutyla..."
0,Asteraceae,"Nona-2Z-ene-6,8-diynoic acid phenylethylamide"
0,Asteraceae,"Hepta-2E,4E,6E-triene-thiophenic acid piperideide"
0,Asteraceae,"Deca-2E,6Z,8E-trienoic acid isobutylamide"
0,Asteraceae,"Deca-2E,6Z,8E-triene-10-isobutylcarboxylic aci..."
...,...,...
21,Lauraceae,N-benzoyl tyramide
21,Lauraceae,N-2-hydroxybenzoyl 4-methoxy phenylethylamide
21,Lauraceae,N-benzoyl 4-methoxy phenylethylamide
21,Lauraceae,N-2-hydroxy-6-methoxybenzoyl 4-methoxy phenyle...


In [None]:
# Chemical details: one row per chemical, with the name as a regular column.
# The trivial-name sets become lists and incomplete rows are discarded.
df_chemical_details = (
    pd.DataFrame(chemical_details)
    .transpose()
    .assign(trivial_name=lambda frame: frame['trivial_name'].apply(list))
    .rename_axis('chemical')
    .reset_index()
    .dropna()
)

# Chemicals in plants: one row per (origin, chemical) pair; an origin whose
# chemical list is empty yields a NaN chemical and is dropped.
df_chemicals_in_plants = (
    pd.DataFrame(chemicals_in_plants.items(), columns=['origin', 'chemical'])
    .assign(chemical=lambda frame: frame['chemical'].apply(list))
    .explode('chemical')
    .dropna(subset=['chemical'])
)

# Inner join of both frames on the chemical name.
df_details = df_chemicals_in_plants.merge(df_chemical_details, on='chemical', how='inner')

In [162]:
# duplicate_chemical_details = df_chemical_details[df_chemical_details.duplicated('chemical')]
# duplicate_chemical_details
# df_chemical_details #376
# df_chemicals_in_plants #22 explode 383 and dropna all 374 drop na 375


In [169]:
# Turn the index into a regular column, then discard rows with missing
# values. Exploration notes recorded 376 rows before and 375 after
# (presumably row counts -- TODO confirm).
df_chemical_details = df_chemical_details.reset_index().dropna()
df_chemical_details


Unnamed: 0,chemical,trivial_name,formula,molecular_weight
0,"Deca-2E,6Z,8E-trienoic acid isobutylamide","[Spilanthol, affinin]",C14H23NO,221.34
1,"Deca-2E,6Z,8E-trienoic acid 2-methylbutylamide",[Homospilanthol],C15H25NO,235.37
2,"Dodeca-2E,4E,8Z,10Z-tetraenoic acid isobutylamide",[None],C16H25NO,247.38
3,"Dodeca-2E,4E,8Z,10E-tetraenoic acid isobutylamide",[None],C16H25NO,247.38
4,"Undeca-2E-ene-8,10-diynoic acid 2-methylbutyla...",[None],C16H23NO,245.36
...,...,...,...,...
371,"Nona-2,3-dihydroxy-6,8-diynoic acid phenylethy...",[None],C17H19NO3,285.34
372,"Dodeca-2E,7E,9E-triene-6-hydroxy-11-hydroic ac...",[Zanthoxylumamide A],C16H25NO3,281.4
373,"Dodeca-2E,7E,9E-triene-6-hydro-11-hydroxic aci...",[Zanthoxylumamide B],C16H25NO3,279.38
374,"Dodeca-2E,7E,9E-triene-6,11-dihydroxic acid is...",[Zanthoxylumamide C],C16H27NO3,281.4


In [170]:
# Display the plant/chemical frame. The trailing note appears to record row
# counts at successive processing stages -- TODO confirm.
df_chemicals_in_plants #22 explode 383 and dropna all 374 drop na 375


Unnamed: 0,origin,chemical
0,Asteraceae,"[Deca-2E,6Z-diene-8,9-dihydroxic acid isobutyl..."
1,Rutaceae,"[N-[2-methoxy-2-(3,4-methoxyphenyl)ethyl]benza..."
2,Synthetic,"[Tetradeca-2E,4E,8E,10E-tetraenoic acid isobut..."
3,Aristolochiaceae,"[Dodeca-2E,4Z,8Z,10Z-tetraenoic acid isobutyla..."
4,Piperaceae,"[Tri-2Z-ene-3-(3,4,5-trimethoxy)phenyl acid 3,..."
5,Menispermacea,"[Octa-2E,4E-dienoic acid isobutylamide]"
6,Convolvulaceae,"[Tri-2Z-ene-3-(4-hydroxy)phenyl acid 2,3-dihyd..."
7,Brassicaceae,"[Hexadeca acid phenylmethylamide, Pentadecanoi..."
8,Fabaceae,[]
9,Solanaceae,"[Octa-7 methylic acid 4-hydroxy,5-methoxy phen..."


In [171]:
# One row per (origin, chemical) pair; rows whose chemical is missing after
# the explode (empty lists) are removed.
df_chemicals_in_plants = (
    df_chemicals_in_plants
    .explode('chemical')
    .dropna(subset=['chemical'])
)
df_chemicals_in_plants

Unnamed: 0,origin,chemical
0,Asteraceae,"Deca-2E,6Z-diene-8,9-dihydroxic acid isobutyla..."
0,Asteraceae,"Nona-2Z-ene-6,8-diynoic acid phenylethylamide"
0,Asteraceae,"Hepta-2E,4E,6E-triene-thiophenic acid piperideide"
0,Asteraceae,"Deca-2E,6Z,8E-trienoic acid isobutylamide"
0,Asteraceae,"Deca-2E,6Z,8E-triene-10-isobutylcarboxylic aci..."
...,...,...
21,Lauraceae,N-benzoyl tyramide
21,Lauraceae,N-2-hydroxybenzoyl 4-methoxy phenylethylamide
21,Lauraceae,N-benzoyl 4-methoxy phenylethylamide
21,Lauraceae,N-2-hydroxy-6-methoxybenzoyl 4-methoxy phenyle...


In [172]:
# Attach the per-chemical details to every (origin, chemical) row; the inner
# join keeps only chemicals present in both frames.
df_inner = df_chemicals_in_plants.merge(df_chemical_details, on='chemical', how='inner')
df_inner

Unnamed: 0,origin,chemical,trivial_name,formula,molecular_weight
0,Asteraceae,"Deca-2E,6Z-diene-8,9-dihydroxic acid isobutyla...",[None],C14H25NO3,255.36
1,Asteraceae,"Nona-2Z-ene-6,8-diynoic acid phenylethylamide",[None],C17H17NO,251.33
2,Asteraceae,"Hepta-2E,4E,6E-triene-thiophenic acid piperideide",[None],C16H17NOS,271.38
3,Asteraceae,"Deca-2E,6Z,8E-trienoic acid isobutylamide","[Spilanthol, affinin]",C14H23NO,221.34
4,Asteraceae,"Deca-2E,6Z,8E-triene-10-isobutylcarboxylic aci...",[10-hydroxy spilanthol isovalerate],C19H31NO3,321.46
...,...,...,...,...,...
370,Lauraceae,N-benzoyl tyramide,[None],C15H15NO2,241.29
371,Lauraceae,N-2-hydroxybenzoyl 4-methoxy phenylethylamide,[None],C16H17NO3,271.32
372,Lauraceae,N-benzoyl 4-methoxy phenylethylamide,[None],C16H17NO2,255.32
373,Lauraceae,N-2-hydroxy-6-methoxybenzoyl 4-methoxy phenyle...,[None],C17H19NO4,301.34


In [173]:
# Distinct chemical names in the plant frame, in order of first appearance.
unique_chemical_in_plants = pd.unique(df_chemicals_in_plants['chemical'])
unique_chemical_in_plants

array(['Deca-2E,6Z-diene-8,9-dihydroxic acid isobutylamide',
       'Nona-2Z-ene-6,8-diynoic acid phenylethylamide',
       'Hepta-2E,4E,6E-triene-thiophenic acid piperideide',
       'Deca-2E,6Z,8E-trienoic acid isobutylamide',
       'Deca-2E,6Z,8E-triene-10-isobutylcarboxylic acid isobutylamide',
       'Deca-2E-ene-4,6,8-triynoic acid isobutylamide',
       'Hexadeca-2Z,6Z,8Z,12E-tetraenoic-10-ynoic acid 2,3-didehydropyrrolidide',
       'Hepta-2E,4E,6E-triene-thiophenic acid isobutylamide',
       'Deca-2E,8Z-diene-4,6-diynoic acid isobutylamide',
       'Tetradeca-2E,4E,8Z,10E-tetraene-7-hydroxic acid isobutylamide',
       'Deca-2E,4E-dienoic acid piperidide',
       'Deca-2E,4E,6Z,8E-tetraenoic acid isobutylamide',
       'Undeca-7Z,9E-diene-2-hydroxic acid isobutylamide',
       'Undeca-2E,4E-dienoic acid isobutylamide',
       'Undeca-2E,4Z-diene-7,9-diynoic acid isobutylamide',
       'Tetradeca-2E,4E-dienoic acid piperidide',
       'Dodeca-2E,4E-dienoic acid 2-methylbutyla

In [174]:
# Number of distinct chemical names found in the plant frame.
len(unique_chemical_in_plants)

375

In [188]:
# NOTE(review): `details` is not assigned in any visible cell; the recorded
# output looks like the merged plant/chemical frame (cf. df_details and
# df_inner) -- confirm which variable was meant.
details

Unnamed: 0,origin,chemical,trivial_name,formula,molecular_weight
0,Asteraceae,"Deca-2E,6Z-diene-8,9-dihydroxic acid isobutyla...",[None],C14H25NO3,255.36
1,Asteraceae,"Nona-2Z-ene-6,8-diynoic acid phenylethylamide",[None],C17H17NO,251.33
2,Asteraceae,"Hepta-2E,4E,6E-triene-thiophenic acid piperideide",[None],C16H17NOS,271.38
3,Asteraceae,"Deca-2E,6Z,8E-trienoic acid isobutylamide","[Spilanthol, affinin]",C14H23NO,221.34
4,Asteraceae,"Deca-2E,6Z,8E-triene-10-isobutylcarboxylic aci...",[10-hydroxy spilanthol isovalerate],C19H31NO3,321.46
...,...,...,...,...,...
370,Lauraceae,N-benzoyl tyramide,[None],C15H15NO2,241.29
371,Lauraceae,N-2-hydroxybenzoyl 4-methoxy phenylethylamide,[None],C16H17NO3,271.32
372,Lauraceae,N-benzoyl 4-methoxy phenylethylamide,[None],C16H17NO2,255.32
373,Lauraceae,N-2-hydroxy-6-methoxybenzoyl 4-methoxy phenyle...,[None],C17H19NO4,301.34


In [187]:
# Display df_chemicals_in_plants as it currently stands in the kernel.
df_chemicals_in_plants

Unnamed: 0,origin,chemical
0,Asteraceae,"Deca-2E,6Z-diene-8,9-dihydroxic acid isobutyla..."
0,Asteraceae,"Nona-2Z-ene-6,8-diynoic acid phenylethylamide"
0,Asteraceae,"Hepta-2E,4E,6E-triene-thiophenic acid piperideide"
0,Asteraceae,"Deca-2E,6Z,8E-trienoic acid isobutylamide"
0,Asteraceae,"Deca-2E,6Z,8E-triene-10-isobutylcarboxylic aci..."
...,...,...
21,Lauraceae,N-benzoyl tyramide
21,Lauraceae,N-2-hydroxybenzoyl 4-methoxy phenylethylamide
21,Lauraceae,N-benzoyl 4-methoxy phenylethylamide
21,Lauraceae,N-2-hydroxy-6-methoxybenzoyl 4-methoxy phenyle...


In [192]:
# Flatten both columns into a single 1-D array and de-duplicate it.
# NOTE(review): the result mixes plant names and chemical names, so its
# length is not the number of unique (origin, chemical) pairs -- confirm
# this is the intended statistic.
column_values = df_chemicals_in_plants[["origin", "chemical"]].to_numpy().ravel()
unique_values = pd.unique(column_values)
unique_values


array(['Asteraceae', 'Deca-2E,6Z-diene-8,9-dihydroxic acid isobutylamide',
       'Nona-2Z-ene-6,8-diynoic acid phenylethylamide',
       'Hepta-2E,4E,6E-triene-thiophenic acid piperideide',
       'Deca-2E,6Z,8E-trienoic acid isobutylamide',
       'Deca-2E,6Z,8E-triene-10-isobutylcarboxylic acid isobutylamide',
       'Deca-2E-ene-4,6,8-triynoic acid isobutylamide',
       'Hexadeca-2Z,6Z,8Z,12E-tetraenoic-10-ynoic acid 2,3-didehydropyrrolidide',
       'Hepta-2E,4E,6E-triene-thiophenic acid isobutylamide',
       'Deca-2E,8Z-diene-4,6-diynoic acid isobutylamide',
       'Tetradeca-2E,4E,8Z,10E-tetraene-7-hydroxic acid isobutylamide',
       'Deca-2E,4E-dienoic acid piperidide',
       'Deca-2E,4E,6Z,8E-tetraenoic acid isobutylamide',
       'Undeca-7Z,9E-diene-2-hydroxic acid isobutylamide',
       'Undeca-2E,4E-dienoic acid isobutylamide',
       'Undeca-2E,4Z-diene-7,9-diynoic acid isobutylamide',
       'Tetradeca-2E,4E-dienoic acid piperidide',
       'Dodeca-2E,4E-dienoic acid 

In [193]:
# Number of distinct values across the origin and chemical columns combined.
len(unique_values)

389

In [195]:
# Keep the first occurrence of every (origin, chemical) combination.
dupli = df_chemicals_in_plants.drop_duplicates(
    subset=["origin", "chemical"],
    keep="first",
)
dupli

Unnamed: 0,origin,chemical
0,Asteraceae,"Deca-2E,6Z-diene-8,9-dihydroxic acid isobutyla..."
0,Asteraceae,"Nona-2Z-ene-6,8-diynoic acid phenylethylamide"
0,Asteraceae,"Hepta-2E,4E,6E-triene-thiophenic acid piperideide"
0,Asteraceae,"Deca-2E,6Z,8E-trienoic acid isobutylamide"
0,Asteraceae,"Deca-2E,6Z,8E-triene-10-isobutylcarboxylic aci..."
...,...,...
21,Lauraceae,N-benzoyl tyramide
21,Lauraceae,N-2-hydroxybenzoyl 4-methoxy phenylethylamide
21,Lauraceae,N-benzoyl 4-methoxy phenylethylamide
21,Lauraceae,N-2-hydroxy-6-methoxybenzoyl 4-methoxy phenyle...


In [224]:

# One row per plant origin; each set of chemicals is converted to a list so
# it can be exploded afterwards.
df_chemicals_in_plants = (
    pd.DataFrame(chemicals_in_plants.items(), columns=['origin', 'chemical'])
    .assign(chemical=lambda frame: frame['chemical'].apply(list))
)
df_chemicals_in_plants

Unnamed: 0,origin,chemical
0,Asteraceae,"[Deca-2E,6Z-diene-8,9-dihydroxic acid isobutyl..."
1,Rutaceae,"[N-[2-methoxy-2-(3,4-methoxyphenyl)ethyl]benza..."
2,Synthetic,"[Tetradeca-2E,4E,8E,10E-tetraenoic acid isobut..."
3,Aristolochiaceae,"[Dodeca-2E,4Z,8Z,10Z-tetraenoic acid isobutyla..."
4,Piperaceae,"[Tri-2Z-ene-3-(3,4,5-trimethoxy)phenyl acid 3,..."
5,Menispermacea,"[Octa-2E,4E-dienoic acid isobutylamide]"
6,Convolvulaceae,"[Tri-2Z-ene-3-(4-hydroxy)phenyl acid 2,3-dihyd..."
7,Brassicaceae,"[Hexadeca acid phenylmethylamide, Pentadecanoi..."
8,Fabaceae,[]
9,Solanaceae,"[Octa-7 methylic acid 4-hydroxy,5-methoxy phen..."


In [225]:

# Expand each list into one row per chemical and drop the rows left without
# a chemical.
df_chemicals_in_plants = (
    df_chemicals_in_plants
    .explode('chemical')
    .dropna(subset=['chemical'])
)

df_chemicals_in_plants

Unnamed: 0,origin,chemical
0,Asteraceae,"Deca-2E,6Z-diene-8,9-dihydroxic acid isobutyla..."
0,Asteraceae,"Nona-2Z-ene-6,8-diynoic acid phenylethylamide"
0,Asteraceae,"Hepta-2E,4E,6E-triene-thiophenic acid piperideide"
0,Asteraceae,"Deca-2E,6Z,8E-trienoic acid isobutylamide"
0,Asteraceae,"Deca-2E,6Z,8E-triene-10-isobutylcarboxylic aci..."
...,...,...
21,Lauraceae,N-benzoyl tyramide
21,Lauraceae,N-2-hydroxybenzoyl 4-methoxy phenylethylamide
21,Lauraceae,N-benzoyl 4-methoxy phenylethylamide
21,Lauraceae,N-2-hydroxy-6-methoxybenzoyl 4-methoxy phenyle...


In [244]:
# Number of rows per origin in the plant/chemical frame.
cip_values = df_chemicals_in_plants['origin'].value_counts()


In [245]:
# Standard deviation of the per-origin counts (pandas default, ddof=1).
# NOTE(review): this rebinds cip_values from a Series to a scalar, so the
# cell cannot be re-run without first re-running the value_counts cell.
cip_values = cip_values.std()


In [246]:
# Display the current value of cip_values.
cip_values

49.831639624763575