In [87]:
import pandas as pd
import numpy as np
import pickle
import tqdm
import camelot
from tqdm import tqdm
from itertools import product
import pdfplumber
import pubchempy as pcp
tqdm.pandas()
import cirpy
import rdkit
from rdkit import Chem

In [2]:
df_yaws = pd.read_table('Yaws Collection.tsv')
df_yaws['Pc'] = df_yaws['Pc']/100000

In [3]:
df_passut=   pd.read_table('PassutDanner1973.tsv')
df_passut['Pc']  = df_passut['Pc']/100000

In [4]:
merged = pd.concat([df_yaws, df_passut], axis = 0).reset_index(drop = True)

In [5]:
df_PSRK = pd.read_table('Appendix to PSRK Revision 4.tsv')
df_PSRK['Pc']  = df_PSRK['Pc']/100000

In [6]:
merged = pd.concat([merged, df_PSRK], axis = 0).reset_index(drop = True)

In [7]:
df_CRC = pd.read_table('CRCCriticalOrganics.tsv')
df_CRC.drop(columns = ['Tc_error', 'Pc_error', 'Vc_error'], inplace = True)
df_CRC['Pc']  = df_CRC['Pc']/100000

In [8]:
merged = pd.concat([merged, df_CRC], axis = 0).reset_index(drop = True)

In [9]:
df_DIPP = pd.read_table('DIPPRPinaMartines.tsv')
df_DIPP['Pc'] = df_DIPP['Pc']/100000

In [10]:
merged = pd.concat([merged, df_DIPP], axis = 0).reset_index(drop = True)

In [11]:
df_IUPAC = pd.read_table('IUPACOrganicCriticalProps.tsv')
df_IUPAC.drop(columns = ['MW', 'Reference'], inplace = True)
df_IUPAC['Pc'] = df_IUPAC['Pc']/100000

In [12]:
merged = pd.concat([merged, df_IUPAC], axis = 0).reset_index(drop = True)

Here we transform our CAS and name to SMILES

In [43]:
def CAS_to_smiles(cas):
    try:
        comp = pcp.get_compounds(cas, 'name')[0]
        smi = comp.canonical_smiles
        mol = Chem.MolFromSmiles(smi)
        can_smi = Chem.MolToSmiles(mol)
        return can_smi
    except:
        return np.NaN
    

In [85]:
def name_to_smiles(name):
    try:
        comp = pcp.get_compounds(name, 'name')[0]
        smi = comp.canonical_smiles
        mol = Chem.MolFromSmiles(smi)
        can_smi = Chem.MolToSmiles(mol)
        return can_smi
    except:
        return np.NaN

In [21]:
def caconicalize_SMILES(smi):
    mol = Chem.MolFromSmiles(smi)
    can_smi = Chem.MolToSmiles(mol)
    return can_smi

In [50]:
merged['SMILES'] = merged['CAS'].progress_apply(CAS_to_smiles)

 33%|████████████████████████▍                                                  | 3986/12206 [46:58<1:30:12,  1.52it/s][21:38:41] Explicit valence for atom # 1 Cl, 7, is greater than permitted
 34%|█████████████████████████▌                                                 | 4159/12206 [48:53<1:23:41,  1.60it/s][21:40:36] Explicit valence for atom # 1 Br, 3, is greater than permitted
 34%|█████████████████████████▌                                                 | 4167/12206 [48:59<1:32:45,  1.44it/s][21:40:42] Explicit valence for atom # 1 Br, 5, is greater than permitted
 34%|█████████████████████████▋                                                 | 4189/12206 [49:12<1:24:49,  1.58it/s][21:40:55] Explicit valence for atom # 1 Cl, 3, is greater than permitted
 37%|███████████████████████████▌                                               | 4478/12206 [52:35<1:34:16,  1.37it/s][21:44:18] Explicit valence for atom # 1 Cl, 5, is greater than permitted
 39%|█████████████████████████████ 

In [67]:
merged_non_valid = merged[merged['SMILES'].isnull()]

In [71]:
merged_valid = merged.dropna(axis = 0, subset = 'SMILES')

In [87]:
merged_non_valid['SMILES'] = merged_non_valid['Chemical'].progress_apply(name_to_smiles)

 11%|████████▌                                                                      | 127/1170 [01:22<10:29,  1.66it/s][23:24:39] Explicit valence for atom # 1 Cl, 7, is greater than permitted
 14%|███████████                                                                    | 163/1170 [01:44<10:07,  1.66it/s][23:25:01] Explicit valence for atom # 1 Br, 3, is greater than permitted
 14%|███████████                                                                    | 164/1170 [01:44<10:07,  1.66it/s][23:25:01] Explicit valence for atom # 1 Br, 5, is greater than permitted
 14%|███████████▏                                                                   | 165/1170 [01:45<10:29,  1.60it/s][23:25:02] Explicit valence for atom # 1 Cl, 3, is greater than permitted
 17%|█████████████▎                                                                 | 198/1170 [02:06<09:44,  1.66it/s][23:25:23] Explicit valence for atom # 1 Cl, 5, is greater than permitted
 19%|███████████████               

In [96]:
merged_non_valid = merged_non_valid.dropna(axis = 0, subset = 'SMILES')

In [106]:
final_table = pd.concat([merged_valid, merged_non_valid], axis = 0)
final_table.drop(columns = ['Zc'], inplace = True)
final_table.to_csv('from_chemicals_datasets.csv')


In [108]:
final_table.isnull().sum()

CAS            0
Chemical    1785
Tc             8
Pc           295
Vc           607
omega       4557
SMILES         0
dtype: int64

In [32]:
final_table = pd.read_csv('from_chemicals_datasets.csv', index_col=0).reset_index(drop = True).drop(columns = ['CAS', 'Chemical'])

In [26]:
article_ds = pd.read_csv('./ci3c00546_si_002/CritProp_SI/all_data/experimental_data/critprop_data_only_smiles_mean_value_expt.csv')
article_ds = article_ds.rename(columns = {'smiles':'SMILES', 'Tc (K)':'Tc', 'Pc (bar)':'Pc', 'omega (-)':'omega'}).drop(columns = ['rhoc (mol/L)', 'Tb (K)', 'Tm (K)', 'dHvap (kJ/mol)', 'dHfus (kJ/mol)'])

In [28]:
article_ds['SMILES'] = article_ds['SMILES'].progress_apply(caconicalize_SMILES)

100%|████████████████████████████████████████████████████████████████████████████| 5680/5680 [00:00<00:00, 7237.86it/s]


In [35]:
concatenated_table = pd.concat([final_table, article_ds])

In [80]:
grouped_table = concatenated_table.groupby(by = 'SMILES').agg({'Tc':'mean', 
                                               'Pc':'mean',
                                               'omega':'mean'}).reset_index()

In [81]:
grouped_table['mol'] =  grouped_table['SMILES'].progress_apply(Chem.MolFromSmiles)

100%|███████████████████████████████████████████████████████████████████████████| 7533/7533 [00:00<00:00, 12000.14it/s]


In [83]:
grouped_table['Is_organic'] = grouped_table['mol'].progress_apply(lambda x: x.HasSubstructMatch(Chem.MolFromSmarts('[C,c]')))
grouped_table = grouped_table[grouped_table['Is_organic']]

100%|███████████████████████████████████████████████████████████████████████████| 7533/7533 [00:00<00:00, 57280.39it/s]


In [84]:
grouped_table['num_fragments'] = grouped_table['mol'].progress_apply(lambda x: len(Chem.GetMolFrags(x)) < 2)
grouped_table = grouped_table[grouped_table['num_fragments']]

100%|██████████████████████████████████████████████████████████████████████████| 7115/7115 [00:00<00:00, 171389.28it/s]


In [86]:
grouped_table.to_csv('grouped_table.csv')