In [1]:
import numpy as np
import pandas as pd
import json
import ast

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

import pyrfume

from rdkit import Chem
from rdkit.Chem import AllChem, PandasTools, Descriptors, rdMolDescriptors
from mordred import Calculator, descriptors


In [2]:
arc_sparce = pyrfume.load_data('arctander_1960/behavior_1_sparse.csv')
arc_stim = pyrfume.load_data('arctander_1960/stimuli.csv')
arc_mol = pyrfume.load_data('arctander_1960/molecules.csv')

In [3]:
arc_sparce[arc_sparce['Labels'].isna()]

Unnamed: 0_level_0,Labels
Stimulus,Unnamed: 1_level_1
1,
27,
42,
44,
50,
...,...
3053,
3065,
3066,
3071,


In [4]:
merged_df = pd.merge(arc_stim, arc_mol, left_on='new_CID', right_index=True, how='left')[arc_sparce['Labels'].isna()]
merged_df

Unnamed: 0_level_0,ChemicalName,CAS,new_CID,MolecularWeight,IsomericSMILES,IUPACName,name
Stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,ABIETIC ACID,514-10-3,10569,302.50,CC(C)C1=CC2=CC[C@@H]3[C@@]([C@H]2CC1)(CCC[C@@]...,"(1R,4aR,4bR,10aR)-1,4a-dimethyl-7-propan-2-yl-...",abietic acid
27,ACETOQUINALDINE,,228524,185.22,CC(=O)CC1=NC2=CC=CC=C2C=C1,1-quinolin-2-ylpropan-2-one,1-(quinolin-2-yl)propan-2-one
42,ACETYL MESITYLENE,198-67-8,15461,162.23,CC1=CC(=C(C(=C1)C)C(=O)C)C,"1-(2,4,6-trimethylphenyl)ethanone",1667-01-2
44,"3-ACETYL-6-METHYL-2,4 -PYRANDIONE",520-45-6,122903,168.15,CC1=CC(=O)C(C(=O)O1)C(=O)C,"3-acetyl-6-methylpyran-2,4-dione",dehydroacetic acid
50,ACETYLVANILLIN DIMETHYLACETAL,,11989360,240.25,CC(=O)OC1=C(C=C(C=C1)C(OC)OC)OC,[4-(dimethoxymethyl)-2-methoxyphenyl] acetate,schembl4964245
...,...,...,...,...,...,...,...
3053,iso-VALERALDEHYDE,,11552,86.13,CC(C)CC=O,3-methylbutanal,3-methylbutanal
3065,VALINE,,6287,117.15,CC(C)[C@@H](C(=O)O)N,(2S)-2-amino-3-methylbutanoic acid,l-valine
3066,VANILLIC ACID,,8468,168.15,COC1=C(C=CC(=C1)C(=O)O)O,4-hydroxy-3-methoxybenzoic acid,vanillic acid
3071,VAN I LLIN TRIACETATE,,22035534,332.30,CC(=O)O.CC(=O)O.CC(=O)O.COC1=C(C=CC(=C1)C=O)O,acetic acid;4-hydroxy-3-methoxybenzaldehyde,acetic acid;4-hydroxy-3-methoxybenzaldehyde


In [5]:
merged_df[['new_CID', 'IsomericSMILES']]

Unnamed: 0_level_0,new_CID,IsomericSMILES
Stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1
1,10569,CC(C)C1=CC2=CC[C@@H]3[C@@]([C@H]2CC1)(CCC[C@@]...
27,228524,CC(=O)CC1=NC2=CC=CC=C2C=C1
42,15461,CC1=CC(=C(C(=C1)C)C(=O)C)C
44,122903,CC1=CC(=O)C(C(=O)O1)C(=O)C
50,11989360,CC(=O)OC1=C(C=C(C=C1)C(OC)OC)OC
...,...,...
3053,11552,CC(C)CC=O
3065,6287,CC(C)[C@@H](C(=O)O)N
3066,8468,COC1=C(C=CC(=C1)C(=O)O)O
3071,22035534,CC(=O)O.CC(=O)O.CC(=O)O.COC1=C(C=CC(=C1)C=O)O


In [6]:
arc_mol = merged_df[['IsomericSMILES']]
arc_mol.columns = ['SMILES']
# arc_mol['SMILES'] = arc_mol['IsomericSMILES']
# arc_mol = arc_mol[['SMILES']]
arc_mol.index = merged_df['new_CID']
arc_mol = arc_mol.sort_index().dropna()
arc_mol['SMILES']

new_CID
176                                    CC(=O)O
196                         C(CCC(=O)O)CC(=O)O
235                 C(C(C1C(=O)C(=C(O1)O)O)O)O
235                 C(C(C1C(=O)C(=C(O1)O)O)O)O
243                        C1=CC=C(C=C1)C(=O)O
                           ...                
87101585                CC(=O)C(C(CO)O)OC(=O)C
129279387       CC(C)OC(=O)C(C)(C)CC1=CC=CC=C1
130024617            CC1=CC(=C(C(C1)(C)C)C=O)C
131751246        CC(C)/C=C/1\C2CC=CC=C2C(=O)O1
152190363    CC(C1=CC=CC=C1)C(=O)OC2=CC=CC=C2N
Name: SMILES, Length: 244, dtype: object

In [7]:
PandasTools.AddMoleculeColumnToFrame(arc_mol, smilesCol='SMILES')

In [8]:
# Write each molecule to an MDL Mol file
for i, (cid, mol) in enumerate(arc_mol['ROMol'].items()):
    mol_file_name = f'../data/molecules/-1/{cid}.mol'
    # Write mol to file
    with open(mol_file_name, 'w') as mol_file:
        mol_file.write(Chem.MolToMolBlock(mol))

In [9]:
# import os

# def concat_csv_files(output_file, input_files):
#     # Read each CSV file into a DataFrame and concatenate them
#     dfs = [pd.read_csv(f'../data/{file}') for file in input_files]
#     combined_df = pd.concat(dfs, ignore_index=True)

#     # Write the combined DataFrame to a new CSV file
#     combined_df.to_csv(output_file, index=False)

# # Example usage
# output_file = 'PaDEL_output.csv'
# input_files = ['PaDEL_output0.csv', 'PaDEL_output1.csv', 'PaDEL_output2.csv']

# if os.path.exists(output_file):
#     os.remove(output_file)

# concat_csv_files(output_file, input_files)


In [10]:
arc_PaDEL = pd.read_csv('../data/PaDEL_output-1.csv')
arc_PaDEL

Unnamed: 0,Name,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,AUTOGEN_80799,0,-5.1261,26.276901,79.7593,72.756478,0,0,71,25,...,4.990842,48.648662,1.945946,5.474819,5.474819,0.000000,2498.0,23.0,10.948,96.0
1,AUTOGEN_10759,0,-0.9176,0.841990,4.9415,17.257965,6,6,14,9,...,8.644925,17.528614,1.947624,4.815165,4.815165,0.000000,88.0,9.0,2.185,40.0
2,AUTOGEN_5281517,0,4.3258,18.712546,72.4120,42.403032,0,0,39,15,...,5.235585,28.309024,1.887268,0.000000,0.000000,0.000000,484.0,14.0,5.992,60.0
3,AUTOGEN_168696,0,1.4448,2.087447,64.9559,40.111446,0,0,37,15,...,5.572083,29.008255,1.933884,2.456459,2.456459,0.000000,374.0,21.0,4.401,72.0
4,AUTOGEN_7170,0,0.0326,0.001063,26.1774,29.767516,6,6,26,14,...,7.387640,27.349234,1.953517,7.849139,7.849139,0.000000,336.0,16.0,3.456,62.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,AUTOGEN_62842,0,2.5266,6.383708,47.3706,29.070688,0,0,27,11,...,5.634078,20.254935,1.841358,2.400011,2.400011,0.000000,180.0,10.0,2.121,46.0
235,AUTOGEN_311,3,-1.3950,1.946025,35.6239,21.508344,0,0,21,13,...,9.144143,23.918451,1.839881,16.754366,16.754366,0.000000,246.0,16.0,-2.247,58.0
236,AUTOGEN_12463483,0,0.5048,0.254823,27.6443,15.274344,0,0,13,5,...,6.771898,8.849874,1.769975,2.362437,0.000000,0.000000,20.0,2.0,1.407,14.0
237,AUTOGEN_8110,0,-1.3721,1.882658,23.6888,16.703137,0,0,16,7,...,6.191776,12.833789,1.833398,5.218211,2.401269,2.816942,56.0,4.0,2.904,22.0


In [11]:
arc_PaDEL['CID'] = arc_PaDEL['Name'].str.split('_').str[1]
arc_PaDEL = arc_PaDEL.drop(columns=['Name']).set_index('CID')
arc_PaDEL.index = arc_PaDEL.index.astype(int)
arc_PaDEL

Unnamed: 0_level_0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
CID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
80799,0,-5.1261,26.276901,79.7593,72.756478,0,0,71,25,46,...,4.990842,48.648662,1.945946,5.474819,5.474819,0.000000,2498.0,23.0,10.948,96.0
10759,0,-0.9176,0.841990,4.9415,17.257965,6,6,14,9,5,...,8.644925,17.528614,1.947624,4.815165,4.815165,0.000000,88.0,9.0,2.185,40.0
5281517,0,4.3258,18.712546,72.4120,42.403032,0,0,39,15,24,...,5.235585,28.309024,1.887268,0.000000,0.000000,0.000000,484.0,14.0,5.992,60.0
168696,0,1.4448,2.087447,64.9559,40.111446,0,0,37,15,22,...,5.572083,29.008255,1.933884,2.456459,2.456459,0.000000,374.0,21.0,4.401,72.0
7170,0,0.0326,0.001063,26.1774,29.767516,6,6,26,14,12,...,7.387640,27.349234,1.953517,7.849139,7.849139,0.000000,336.0,16.0,3.456,62.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62842,0,2.5266,6.383708,47.3706,29.070688,0,0,27,11,16,...,5.634078,20.254935,1.841358,2.400011,2.400011,0.000000,180.0,10.0,2.121,46.0
311,3,-1.3950,1.946025,35.6239,21.508344,0,0,21,13,8,...,9.144143,23.918451,1.839881,16.754366,16.754366,0.000000,246.0,16.0,-2.247,58.0
12463483,0,0.5048,0.254823,27.6443,15.274344,0,0,13,5,8,...,6.771898,8.849874,1.769975,2.362437,0.000000,0.000000,20.0,2.0,1.407,14.0
8110,0,-1.3721,1.882658,23.6888,16.703137,0,0,16,7,9,...,6.191776,12.833789,1.833398,5.218211,2.401269,2.816942,56.0,4.0,2.904,22.0


In [12]:
arc_des_dataset = pd.read_csv('../data/arc_final_des.csv', index_col=0)
arc_des_dataset

Unnamed: 0,Floral,Fruity,Herbal,Green,Woody,Sweet,Balsamic,Earth,Spicy,Chemical,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
107,0,0,0,0,0,1,1,0,0,0,...,0.870601,-0.485781,0.428391,-0.097812,0.001562,-0.242563,-0.427532,-0.596113,-0.024422,-0.443478
126,0,0,0,0,1,0,1,0,0,0,...,1.750794,-1.013401,0.300976,-0.033139,0.067627,-0.242563,-0.633394,-0.737723,-1.165779,-0.836798
174,0,0,0,0,0,0,0,1,0,0,...,0.032445,-2.425877,-3.359749,-0.152376,-0.054176,-0.242563,-0.829452,-1.870605,-2.584962,-2.311751
177,0,0,0,0,0,1,0,0,0,0,...,0.108736,-2.684755,-4.617587,-0.998939,-0.918961,-0.242563,-0.844156,-2.012215,-1.772042,-2.508411
179,0,0,0,0,0,1,0,0,0,0,...,0.108736,-1.942763,-2.790262,-0.142069,-0.043648,-0.242563,-0.782888,-1.445774,-2.119984,-1.721770
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140998267,1,0,1,0,0,1,0,0,0,0,...,0.052427,0.621329,-0.971686,0.947798,1.069679,-0.242563,0.344448,0.678379,-0.439317,0.638154
141266555,0,1,1,0,0,0,0,0,0,0,...,-0.872139,0.654758,-0.722589,0.139547,0.244031,-0.242563,0.410618,0.395159,0.916077,0.441493
142523980,1,0,0,0,1,0,1,0,0,0,...,-1.057710,0.027658,0.386557,-0.922173,-0.840543,-0.242563,-0.167755,0.111938,0.624544,0.048173
144116082,1,0,0,1,1,0,0,0,0,0,...,-0.893362,0.236738,-0.035581,-0.909362,-0.827456,-0.242563,-0.104036,0.961600,0.232845,0.539824


In [13]:
arc_des = arc_PaDEL[arc_des_dataset.columns[12:]]
arc_des

Unnamed: 0_level_0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
CID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
80799,0,-5.1261,26.276901,79.7593,72.756478,0,0,71,25,46,...,4.990842,48.648662,1.945946,5.474819,5.474819,0.000000,2498.0,23.0,10.948,96.0
10759,0,-0.9176,0.841990,4.9415,17.257965,6,6,14,9,5,...,8.644925,17.528614,1.947624,4.815165,4.815165,0.000000,88.0,9.0,2.185,40.0
5281517,0,4.3258,18.712546,72.4120,42.403032,0,0,39,15,24,...,5.235585,28.309024,1.887268,0.000000,0.000000,0.000000,484.0,14.0,5.992,60.0
168696,0,1.4448,2.087447,64.9559,40.111446,0,0,37,15,22,...,5.572083,29.008255,1.933884,2.456459,2.456459,0.000000,374.0,21.0,4.401,72.0
7170,0,0.0326,0.001063,26.1774,29.767516,6,6,26,14,12,...,7.387640,27.349234,1.953517,7.849139,7.849139,0.000000,336.0,16.0,3.456,62.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62842,0,2.5266,6.383708,47.3706,29.070688,0,0,27,11,16,...,5.634078,20.254935,1.841358,2.400011,2.400011,0.000000,180.0,10.0,2.121,46.0
311,3,-1.3950,1.946025,35.6239,21.508344,0,0,21,13,8,...,9.144143,23.918451,1.839881,16.754366,16.754366,0.000000,246.0,16.0,-2.247,58.0
12463483,0,0.5048,0.254823,27.6443,15.274344,0,0,13,5,8,...,6.771898,8.849874,1.769975,2.362437,0.000000,0.000000,20.0,2.0,1.407,14.0
8110,0,-1.3721,1.882658,23.6888,16.703137,0,0,16,7,9,...,6.191776,12.833789,1.833398,5.218211,2.401269,2.816942,56.0,4.0,2.904,22.0


In [14]:
for i, col in enumerate(arc_des_dataset.columns[:12]):
    arc_des.insert(i, col, 0)

In [15]:
arc_des

Unnamed: 0_level_0,Floral,Fruity,Herbal,Green,Woody,Sweet,Balsamic,Earth,Spicy,Chemical,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
CID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
80799,0,0,0,0,0,0,0,0,0,0,...,4.990842,48.648662,1.945946,5.474819,5.474819,0.000000,2498.0,23.0,10.948,96.0
10759,0,0,0,0,0,0,0,0,0,0,...,8.644925,17.528614,1.947624,4.815165,4.815165,0.000000,88.0,9.0,2.185,40.0
5281517,0,0,0,0,0,0,0,0,0,0,...,5.235585,28.309024,1.887268,0.000000,0.000000,0.000000,484.0,14.0,5.992,60.0
168696,0,0,0,0,0,0,0,0,0,0,...,5.572083,29.008255,1.933884,2.456459,2.456459,0.000000,374.0,21.0,4.401,72.0
7170,0,0,0,0,0,0,0,0,0,0,...,7.387640,27.349234,1.953517,7.849139,7.849139,0.000000,336.0,16.0,3.456,62.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62842,0,0,0,0,0,0,0,0,0,0,...,5.634078,20.254935,1.841358,2.400011,2.400011,0.000000,180.0,10.0,2.121,46.0
311,0,0,0,0,0,0,0,0,0,0,...,9.144143,23.918451,1.839881,16.754366,16.754366,0.000000,246.0,16.0,-2.247,58.0
12463483,0,0,0,0,0,0,0,0,0,0,...,6.771898,8.849874,1.769975,2.362437,0.000000,0.000000,20.0,2.0,1.407,14.0
8110,0,0,0,0,0,0,0,0,0,0,...,6.191776,12.833789,1.833398,5.218211,2.401269,2.816942,56.0,4.0,2.904,22.0


In [16]:
def normalize_column(col):
    return (col - col.mean()) / col.std()

In [17]:
comb_arc_des = pd.concat([arc_des_dataset, arc_des], axis=0).sort_index()
comb_arc_des[comb_arc_des.columns[12:]] = comb_arc_des[comb_arc_des.columns[12:]].apply(normalize_column, axis=0)
comb_arc_des = comb_arc_des.fillna(0)
comb_arc_des.min().min()

  sqr = _ensure_numeric((avg - values) ** 2)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


-40.560580888228934

In [18]:
comb_arc_des.to_csv('../data/arc_final_des_w_odourless.csv')