#Imports

In [1]:
!pip install rdkit transformers



In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import BRICS
from rdkit.Chem.MolStandardize import rdMolStandardize
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull
from transformers import RobertaTokenizer, RobertaModel, AutoTokenizer, AutoModel
import torch
import seaborn as sns

#Importing data

In [3]:
df = pd.read_csv("/content/olfactionbase_odors_odorant_data_with_smiles.csv")

In [4]:
df

Unnamed: 0,Sr. No,Primary Odor,Sub-odor,CAS-Id,Chemical name,SMILES
0,1,Alliaceous,Alliaceous,74-93-1,Methanethiol,CS
1,2,Alliaceous,Alliaceous,2075-08-01 00:00:00,Ethanethiol,CCS
2,3,Alliaceous,Alliaceous,75-33-2,2-Propanethiol,CC(C)S
3,4,Alliaceous,Alliaceous,100-53-8,Benzyl mercaptan,C1=CC=C(C=C1)CS
4,5,Alliaceous,Alliaceous,107-03-9,1-Propanethiol,CCCS
...,...,...,...,...,...,...
43340,43341,Woody/ Resinous,Woody,1323-75-7,Santalyl phenylacetate,CC(=CCCC1(C2CCC(C2)C1=C)C)COC(=O)CC3=CC=CC=C3
43341,43342,Woody/ Resinous,Woody,94248-21-2,"(1S,7S)-5,9-Dimethyltricyclo[5.2.1.02,6]dec-4-...",
43342,43343,Ylang,Yoghurt,2096-04-08 00:00:00,"2,3-Heptanedione",CCCCC(=O)C(=O)C
43343,43344,Ylang,Yoghurt,616-09-1,Propyl lactate,CCCOC(=O)C(C)O


##Pre-processing

In [5]:
df = df[['Primary Odor', 'SMILES']].drop_duplicates().dropna().reset_index(drop=True)

In [6]:
df

Unnamed: 0,Primary Odor,SMILES
0,Alliaceous,CS
1,Alliaceous,CCS
2,Alliaceous,CC(C)S
3,Alliaceous,C1=CC=C(C=C1)CS
4,Alliaceous,CCCS
...,...,...
30813,Woody/ Resinous,CC1=C(/C(=N/O)/C(CC1)C(C)C)O
30814,Woody/ Resinous,CC1C(C2CCC3CC(OC3C2C1(C)C)(C)C)(C)C
30815,Woody/ Resinous,CC(=CCCC1(C2CCC(C2)C1=C)C)COC(=O)CC3=CC=CC=C3
30816,Ylang,CCCCC(=O)C(=O)C


In [7]:
def standardize_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    # Standardize the molecule
    normalizer = rdMolStandardize.Normalizer()
    mol = normalizer.normalize(mol)
    return Chem.MolToSmiles(mol)

In [8]:
df['SMILES'] = df['SMILES'].apply(standardize_smiles)

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
[10:56:55] Initializing Normalizer
[10:56:55] Running Normalizer
[10:56:55] Initializing Normalizer
[10:56:55] Running Normalizer
[10:56:55] Initializing Normalizer
[10:56:55] Running Normalizer
[10:56:55] Initializing Normalizer
[10:56:55] Running Normalizer
[10:56:55] Initializing Normalizer
[10:56:55] Running Normalizer
[10:56:55] Initializing Normalizer
[10:56:55] Running Normalizer
[10:56:55] Initializing Normalizer
[10:56:55] Running Normalizer
[10:56:55] Initializing Normalizer
[10:56:55] Running Normalizer
[10:56:55] Initializing Normalizer
[10:56:55] Running Normalizer
[10:56:55] Initializing Normalizer
[10:56:55] Running Normalizer
[10:56:55] Initializing Normalizer
[10:56:55] Running Normalizer
[10:56:55] Initializing Normalizer
[10:56:55] Running Normalizer
[10:56:55] Initializing Normalizer
[10:56:55] Running Normalizer
[10:56:55] Initializing Normalizer
[10:56:55] Running Normalizer
[10:56:55] Initia

In [9]:
df

Unnamed: 0,Primary Odor,SMILES
0,Alliaceous,CS
1,Alliaceous,CCS
2,Alliaceous,CC(C)S
3,Alliaceous,SCc1ccccc1
4,Alliaceous,CCCS
...,...,...
30813,Woody/ Resinous,CC1=C(O)/C(=N/O)C(C(C)C)CC1
30814,Woody/ Resinous,CC1C(C)(C)C2CCC3CC(C)(C)OC3C2C1(C)C
30815,Woody/ Resinous,C=C1C2CCC(C2)C1(C)CCC=C(C)COC(=O)Cc1ccccc1
30816,Ylang,CCCCC(=O)C(C)=O


In [10]:
#Generating InChIKey
from rdkit import Chem

def smiles_to_inchikey(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        return Chem.MolToInchiKey(mol) if mol else None
    except:
        return None

# Apply the function to generate the InChIKey for each SMILES
df["InChIKey"] = df["SMILES"].apply(smiles_to_inchikey)

In [11]:
df

Unnamed: 0,Primary Odor,SMILES,InChIKey
0,Alliaceous,CS,LSDPWZHWYPCBBB-UHFFFAOYSA-N
1,Alliaceous,CCS,DNJIEGIFACGWOD-UHFFFAOYSA-N
2,Alliaceous,CC(C)S,KJRCEJOSASVSRA-UHFFFAOYSA-N
3,Alliaceous,SCc1ccccc1,UENWRTRMUIOCKN-UHFFFAOYSA-N
4,Alliaceous,CCCS,SUVIGLJNEAMWEG-UHFFFAOYSA-N
...,...,...,...
30813,Woody/ Resinous,CC1=C(O)/C(=N/O)C(C(C)C)CC1,QMTPEVQEOGPPGW-PKNBQFBNSA-N
30814,Woody/ Resinous,CC1C(C)(C)C2CCC3CC(C)(C)OC3C2C1(C)C,HUYXPANWJVOYCI-UHFFFAOYSA-N
30815,Woody/ Resinous,C=C1C2CCC(C2)C1(C)CCC=C(C)COC(=O)Cc1ccccc1,FIZFZQIBGCHOJY-UHFFFAOYSA-N
30816,Ylang,CCCCC(=O)C(C)=O,FJPGAMCQJNLTJC-UHFFFAOYSA-N


In [12]:
#Total ammount of InChIKeys
len(df["InChIKey"].value_counts())

3738

In [13]:
#Checking if there were any problems generating the InChIKeys
has_none = df["InChIKey"].isnull().any()

# Display result
if has_none:
    print("There are None values in the InChIKey column.")
else:
    print("No None values in the InChIKey column.")

No None values in the InChIKey column.


##Extracting families based on odors

In [14]:
odor_categories = {
    "Human Perception": [
        "Chemical/ Hydrocarbon", "Fragrant", "Lemon", "Minty", "Non-citrus fruity",
        "Popcorn", "Sharp/ Pungent", "Sickening", "Sweet", "Woody/ Resinous"
    ],
    "Fragrance": [
        "Aromatic", "Citrus", "Dry woods", "Floral", "Fruity", "Green",
        "Mossy woods", "Oriental", "Soft floral", "Soft oriental", "Water", "Woody", "Woody oriental"
    ],
    "Drinking Water": [
        "Chemical/ Hydrocarbon", "Chlorinous/ Ozonous", "Earthy/ Musty/ Moldy", "Fishy/ Rancid",
        "Fragrant", "Grassy/ Woody", "Marshy/ Septic/ Sulphurous", "Medicinal/ Phenolic"
    ],
    "Compost": [
        "Earthy/ Musty/ Moldy", "Fecal/ Sewery", "Fishy/ Ammonia", "Fragrant/ Fruity",
        "Grassy/ Woody", "Putrid/ Dead Animal", "Rancid", "Solvent/ Hydrocarbon",
        "Sulphur/ Cabbage/ Garlic", "Sweet", "Terpenes/ Pine/ Lemon"
    ],
    "City Smell": [
        "Beverage", "Building materials & construction", "Cleaning materials", "Food",
        "Industrial odours", "Nature", "Non-food items", "People & animals",
        "Synthetic", "Tobacco smoke", "Traffic emissions", "Waste"
    ],
    "Wine": [
        "Woody", "Caramel", "Chemical/ Hydrocarbon", "Earthy", "Floral",
        "Fruity", "Herbaceous", "Microbiological", "Nutty", "Pungent", "Spices"
    ],
    "Waste Water": [
        "Solvent/ Hydrocarbon", "Sulphur/ Cabbage/ Garlic", "Ammonia", "Chlorinous",
        "Earthy/ Musty/ Moldy", "Fecal/ Sewery", "Fragrant/ Fruity",
        "Grassy/ Woody", "Medicinal/ Alcohol", "Nose Feel", "Rancid/ Putrid"
    ],
    "Urban Odor": [
        "Sweet", "Terpenes/ Pine/ Lemon", "Bakery", "Earthy/ Musty/ Moldy",
        "Fecal/ Sewery", "Fragrant/ Fruity", "Fuel/ Gas Station/ Solvent", "Rancid",
        "Restaurant", "Sulphur/ Cabbage/ Garlic"
    ],
    "Perfumes": [
        "Nauseating", "Alliaceous", "Ambrosial", "Aromatic", "Foul", "Fragrant", "Hircine"
    ]
}

In [15]:
def determine_categories(odor):
    categories = [category for category, odors in odor_categories.items() if odor in odors]
    return categories if categories else ["Other"]

# Apply the function to the dataset
df['Category'] = df['Primary Odor'].apply(determine_categories)

In [16]:
df = df[~df['Category'].apply(lambda x: x == ['Other'])].reset_index(drop=True)

In [17]:
df

Unnamed: 0,Primary Odor,SMILES,InChIKey,Category
0,Alliaceous,CS,LSDPWZHWYPCBBB-UHFFFAOYSA-N,[Perfumes]
1,Alliaceous,CCS,DNJIEGIFACGWOD-UHFFFAOYSA-N,[Perfumes]
2,Alliaceous,CC(C)S,KJRCEJOSASVSRA-UHFFFAOYSA-N,[Perfumes]
3,Alliaceous,SCc1ccccc1,UENWRTRMUIOCKN-UHFFFAOYSA-N,[Perfumes]
4,Alliaceous,CCCS,SUVIGLJNEAMWEG-UHFFFAOYSA-N,[Perfumes]
...,...,...,...,...
23511,Woody/ Resinous,CC1=CCC(C(C)(C)OC(=O)c2ccccc2)CC1,SNCWRHHQNIWULG-UHFFFAOYSA-N,[Human Perception]
23512,Woody/ Resinous,CC1CC2C(C/C1=C/CC#N)C2(C)C,WHPKEUKLQUSZBC-YHYXMXQVSA-N,[Human Perception]
23513,Woody/ Resinous,CC1=C(O)/C(=N/O)C(C(C)C)CC1,QMTPEVQEOGPPGW-PKNBQFBNSA-N,[Human Perception]
23514,Woody/ Resinous,CC1C(C)(C)C2CCC3CC(C)(C)OC3C2C1(C)C,HUYXPANWJVOYCI-UHFFFAOYSA-N,[Human Perception]


In [18]:
df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
[Human Perception],3757
[City Smell],3729
"[Fragrance, Wine]",3018
[Wine],2168
"[Human Perception, Compost, Urban Odor]",1981
"[Compost, Waste Water, Urban Odor]",1942
"[Human Perception, Drinking Water, Perfumes]",1941
[Fragrance],1848
"[Drinking Water, Compost, Waste Water]",664
"[Fragrance, Perfumes]",600


In [19]:
grouped_df = df.groupby('InChIKey').agg({
    'SMILES': list,
    'Primary Odor': list,
    'Category': list
}).reset_index()

grouped_df['Category'] = grouped_df['Category'].apply(lambda x: list(set([item for sublist in x for item in sublist])))

In [20]:
grouped_df

Unnamed: 0,InChIKey,SMILES,Primary Odor,Category
0,AALXTPRRKXUUOM-UHFFFAOYSA-N,"[CC1(C)COC(C)(c2ccccc2)OC1, CC1(C)COC(C)(c2ccc...","[Fragrant, Grassy/ Woody, Minty, Sweet, Woody,...","[Urban Odor, Wine, Drinking Water, Perfumes, C..."
1,AANIECMHWBXFCP-UHFFFAOYSA-N,"[CCC1NC(CC)SC(CC)S1, CCC1NC(CC)SC(CC)S1, CCC1N...","[Alliaceous, Bakery, Food, Nature, Sickening]","[Perfumes, City Smell, Urban Odor, Human Perce..."
2,AANMVENRNJYEMK-UHFFFAOYSA-N,"[CC(C)C1C=CC(=O)CC1, CC(C)C1C=CC(=O)CC1, CC(C)...","[Aromatic, Food, Fragrant, Grassy/ Woody, Herb...","[Wine, Drinking Water, Perfumes, Compost, Frag..."
3,AAPBYIVJOWCMGH-HWKANZROSA-N,"[C/C=C/SSCCC, C/C=C/SSCCC]","[Nature, Sickening]","[City Smell, Human Perception]"
4,AAQDYYFAFXGBFZ-UHFFFAOYSA-N,"[CC(=O)OCC1CCCO1, CC(=O)OCC1CCCO1, CC(=O)OCC1C...","[Beverage, Caramel, Fragrant, Fragrant/ Fruity...","[Urban Odor, Wine, Drinking Water, Perfumes, C..."
...,...,...,...,...
3649,ZYXGECMFJMLZNA-SOFGYWHQSA-N,"[O=C1CCCCCCCCC/C=C/CCCO1, O=C1CCCCCCCCC/C=C/CC...","[Ambrosial, Fragrant, Fruity, Non-food items, ...","[Urban Odor, Wine, Drinking Water, Perfumes, C..."
3650,ZYXNLVMBIHVDRH-UHFFFAOYSA-N,"[CC(=O)CC(=O)OCC(C)C, CC(=O)CC(=O)OCC(C)C, CC(...","[Beverage, Fragrant/ Fruity, Nature, Sweet]","[Urban Odor, Compost, City Smell, Human Percep..."
3651,ZZFNQLFGNVPSOG-VURMDHGXSA-N,"[CC/C=C\COC(=O)CCCCC, CC/C=C\COC(=O)CCCCC, CC/...","[Citrus, Fruity, Non-citrus fruity]","[Human Perception, Wine, Fragrance]"
3652,ZZLCFHIKESPLTH-UHFFFAOYSA-N,"[Cc1ccc(-c2ccccc2)cc1, Cc1ccc(-c2ccccc2)cc1, C...","[Aromatic, Caramel, Floral, Food, Fragrant, He...","[Urban Odor, Wine, Drinking Water, Perfumes, C..."


In [21]:
def longest_smile(smiles_list):
    return max(smiles_list, key=len)

# Apply the function to select the longest SMILES for each InChIKey group
grouped_df['SMILES'] = grouped_df['SMILES'].apply(longest_smile)

In [22]:
grouped_df

Unnamed: 0,InChIKey,SMILES,Primary Odor,Category
0,AALXTPRRKXUUOM-UHFFFAOYSA-N,CC1(C)COC(C)(c2ccccc2)OC1,"[Fragrant, Grassy/ Woody, Minty, Sweet, Woody,...","[Urban Odor, Wine, Drinking Water, Perfumes, C..."
1,AANIECMHWBXFCP-UHFFFAOYSA-N,CCC1NC(CC)SC(CC)S1,"[Alliaceous, Bakery, Food, Nature, Sickening]","[Perfumes, City Smell, Urban Odor, Human Perce..."
2,AANMVENRNJYEMK-UHFFFAOYSA-N,CC(C)C1C=CC(=O)CC1,"[Aromatic, Food, Fragrant, Grassy/ Woody, Herb...","[Wine, Drinking Water, Perfumes, Compost, Frag..."
3,AAPBYIVJOWCMGH-HWKANZROSA-N,C/C=C/SSCCC,"[Nature, Sickening]","[City Smell, Human Perception]"
4,AAQDYYFAFXGBFZ-UHFFFAOYSA-N,CC(=O)OCC1CCCO1,"[Beverage, Caramel, Fragrant, Fragrant/ Fruity...","[Urban Odor, Wine, Drinking Water, Perfumes, C..."
...,...,...,...,...
3649,ZYXGECMFJMLZNA-SOFGYWHQSA-N,O=C1CCCCCCCCC/C=C/CCCO1,"[Ambrosial, Fragrant, Fruity, Non-food items, ...","[Urban Odor, Wine, Drinking Water, Perfumes, C..."
3650,ZYXNLVMBIHVDRH-UHFFFAOYSA-N,CC(=O)CC(=O)OCC(C)C,"[Beverage, Fragrant/ Fruity, Nature, Sweet]","[Urban Odor, Compost, City Smell, Human Percep..."
3651,ZZFNQLFGNVPSOG-VURMDHGXSA-N,CC/C=C\COC(=O)CCCCC,"[Citrus, Fruity, Non-citrus fruity]","[Human Perception, Wine, Fragrance]"
3652,ZZLCFHIKESPLTH-UHFFFAOYSA-N,Cc1ccc(-c2ccccc2)cc1,"[Aromatic, Caramel, Floral, Food, Fragrant, He...","[Urban Odor, Wine, Drinking Water, Perfumes, C..."


In [23]:
rows_with_large_categories = grouped_df[grouped_df['Category'].apply(len) > 8]
count_large_categories = len(rows_with_large_categories)
indexes_large_categories = rows_with_large_categories.index.tolist()

In [24]:
count_large_categories

1113

##Separating families

In [25]:
df_fragrance = grouped_df[grouped_df['Category'].apply(lambda x: 'Fragrance' in x)]
df_wine = grouped_df[grouped_df['Category'].apply(lambda x: 'Wine' in x)]
df_perfumes = grouped_df[grouped_df['Category'].apply(lambda x: 'Perfumes' in x)]
df_urban_odor = grouped_df[grouped_df['Category'].apply(lambda x: 'Urban Odor' in x)]
df_city_smell = grouped_df[grouped_df['Category'].apply(lambda x: 'City Smell' in x)]
df_compost = grouped_df[grouped_df['Category'].apply(lambda x: 'Compost' in x)]
df_waste_water = grouped_df[grouped_df['Category'].apply(lambda x: 'Waste Water' in x)]
df_human_perception = grouped_df[grouped_df['Category'].apply(lambda x: 'Human Perception' in x)]
df_drinking_water = grouped_df[grouped_df['Category'].apply(lambda x: 'Drinking Water' in x)]

In [26]:
df_compost

Unnamed: 0,InChIKey,SMILES,Primary Odor,Category
0,AALXTPRRKXUUOM-UHFFFAOYSA-N,CC1(C)COC(C)(c2ccccc2)OC1,"[Fragrant, Grassy/ Woody, Minty, Sweet, Woody,...","[Urban Odor, Wine, Drinking Water, Perfumes, C..."
2,AANMVENRNJYEMK-UHFFFAOYSA-N,CC(C)C1C=CC(=O)CC1,"[Aromatic, Food, Fragrant, Grassy/ Woody, Herb...","[Wine, Drinking Water, Perfumes, Compost, Frag..."
4,AAQDYYFAFXGBFZ-UHFFFAOYSA-N,CC(=O)OCC1CCCO1,"[Beverage, Caramel, Fragrant, Fragrant/ Fruity...","[Urban Odor, Wine, Drinking Water, Perfumes, C..."
5,AAYIYWGTGAUKQY-WAYWQWQTSA-N,CCC/C=C\CCOC(C)=O,"[Food, Fragrant/ Fruity, Fruity, Green, Non-ci...","[Urban Odor, Wine, Compost, Fragrance, City Sm..."
6,ABDKAPXRBAPSQN-UHFFFAOYSA-N,COc1ccccc1OC,"[Fragrant, Medicinal/ Phenolic, Nutty, Sharp/ ...","[Urban Odor, Wine, Drinking Water, Perfumes, C..."
...,...,...,...,...
3647,ZYTMANIQRDEHIO-KXUCPTDWSA-N,C=C(C)[C@@H]1CC[C@@H](C)C[C@H]1O,"[Aromatic, Fragrant, Grassy/ Woody, Herbaceous...","[Urban Odor, Wine, Drinking Water, Perfumes, C..."
3648,ZYTMANIQRDEHIO-UHFFFAOYSA-N,C=C(C)C1CCC(C)CC1O,"[Aromatic, Fragrant, Fragrant/ Fruity, Grassy/...","[Urban Odor, Wine, Drinking Water, Perfumes, C..."
3649,ZYXGECMFJMLZNA-SOFGYWHQSA-N,O=C1CCCCCCCCC/C=C/CCCO1,"[Ambrosial, Fragrant, Fruity, Non-food items, ...","[Urban Odor, Wine, Drinking Water, Perfumes, C..."
3650,ZYXNLVMBIHVDRH-UHFFFAOYSA-N,CC(=O)CC(=O)OCC(C)C,"[Beverage, Fragrant/ Fruity, Nature, Sweet]","[Urban Odor, Compost, City Smell, Human Percep..."


In [27]:
df_fragrance['InChIKey'].value_counts()

Unnamed: 0_level_0,count
InChIKey,Unnamed: 1_level_1
ZZLCFHIKESPLTH-UHFFFAOYSA-N,1
AALXTPRRKXUUOM-UHFFFAOYSA-N,1
AANMVENRNJYEMK-UHFFFAOYSA-N,1
AAYIYWGTGAUKQY-WAYWQWQTSA-N,1
ABDKAPXRBAPSQN-UHFFFAOYSA-N,1
...,...
AEVBKBWVXISVBJ-UHFFFAOYSA-N,1
AEJRTNBCFUOSEM-UHFFFAOYSA-N,1
AEGTXRAMXBTODF-UHFFFAOYSA-N,1
ADNADZOSMJDVIS-UHFFFAOYSA-N,1


#Dimensionality

In [28]:
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd

##Obtianing the embeddings

In [29]:
model = AutoModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")

# Function to convert SMILES to embeddings
def smiles_to_embedding(smiles_list):
    embeddings = []
    with torch.no_grad():
        for smiles in smiles_list:
            inputs = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True, max_length=512)
            outputs = model(**inputs)
            embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
            embeddings.append(embedding)
    return np.array(embeddings)

colors = [
    "blue", "green", "red", "purple", "orange", "brown",
    "pink", "gray", "olive", "cyan"
]

def plot_tsne(tsne_data, title, color):
    plt.figure(figsize=(8, 6))
    plt.scatter(tsne_data[:, 0], tsne_data[:, 1], alpha=0.7, color=color, label=title)
    plt.title(f"Chemical Space of {title}")
    plt.xlabel("t-SNE Dimension 1")
    plt.ylabel("t-SNE Dimension 2")

    # Add legend
    plt.legend(loc="upper right")

    # Save the plot as PNG
    filename = f"{title.replace(' ', '_').lower()}.png"
    plt.savefig(filename, format="png")
    plt.close()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [30]:
dataframes = [
    (df_fragrance, "Fragrance"),
    (df_wine, "Wine"),
    (df_perfumes, "Perfumes"),
    (df_urban_odor, "Urban Odor"),
    (df_city_smell, "City Smell"),
    (df_compost, "Compost"),
    (df_waste_water, "Waste Water"),
    (df_human_perception, "Human Perception"),
    (df_drinking_water, "Drinking Water")
]

In [31]:
for (df_tsne, name), color in zip(dataframes, colors):
    if len(df_tsne) > 0:  # Ensure there is data to process
        # Generate embeddings
        embeddings = smiles_to_embedding(df_tsne['SMILES'].tolist())

        # Apply t-SNE for dimensionality reduction
        tsne_result = TSNE(n_components=2, random_state=42).fit_transform(embeddings)

        # Plot and save the chemical space with a specific color and legend
        plot_tsne(tsne_result, name, color)

##Analysis per category and odors

In [32]:
df

Unnamed: 0,Primary Odor,SMILES,InChIKey,Category
0,Alliaceous,CS,LSDPWZHWYPCBBB-UHFFFAOYSA-N,[Perfumes]
1,Alliaceous,CCS,DNJIEGIFACGWOD-UHFFFAOYSA-N,[Perfumes]
2,Alliaceous,CC(C)S,KJRCEJOSASVSRA-UHFFFAOYSA-N,[Perfumes]
3,Alliaceous,SCc1ccccc1,UENWRTRMUIOCKN-UHFFFAOYSA-N,[Perfumes]
4,Alliaceous,CCCS,SUVIGLJNEAMWEG-UHFFFAOYSA-N,[Perfumes]
...,...,...,...,...
23511,Woody/ Resinous,CC1=CCC(C(C)(C)OC(=O)c2ccccc2)CC1,SNCWRHHQNIWULG-UHFFFAOYSA-N,[Human Perception]
23512,Woody/ Resinous,CC1CC2C(C/C1=C/CC#N)C2(C)C,WHPKEUKLQUSZBC-YHYXMXQVSA-N,[Human Perception]
23513,Woody/ Resinous,CC1=C(O)/C(=N/O)C(C(C)C)CC1,QMTPEVQEOGPPGW-PKNBQFBNSA-N,[Human Perception]
23514,Woody/ Resinous,CC1C(C)(C)C2CCC3CC(C)(C)OC3C2C1(C)C,HUYXPANWJVOYCI-UHFFFAOYSA-N,[Human Perception]


In [33]:
categories = [
    "Fragrance",
    "Wine",
    "Perfumes",
    "Urban Odor",
    "City Smell",
    "Compost",
    "Waste Water",
    "Human Perception",
    "Drinking Water"
]

# Create separate DataFrames for each category
category_subsamples = {category: df[df['Category'].apply(lambda x: category in x)] for category in categories}

In [34]:
df_fragrance = category_subsamples["Fragrance"]
df_wine = category_subsamples["Wine"]
df_perfumes = category_subsamples["Perfumes"]
df_urban_odor = category_subsamples["Urban Odor"]
df_city_smell = category_subsamples["City Smell"]
df_compost = category_subsamples["Compost"]
df_waste_water = category_subsamples["Waste Water"]
df_human_perception = category_subsamples["Human Perception"]
df_drinking_water = category_subsamples["Drinking Water"]

In [35]:
df_fragrance

Unnamed: 0,Primary Odor,SMILES,InChIKey,Category
302,Aromatic,c1ccccc1,UHOVQNZJYSORNB-UHFFFAOYSA-N,"[Fragrance, Perfumes]"
303,Aromatic,COC(=O)c1ccccc1O,OSWPMRLSEDHDFF-UHFFFAOYSA-N,"[Fragrance, Perfumes]"
304,Aromatic,C=C(C)C(=O)OC,VVQNEPGJFQJSBK-UHFFFAOYSA-N,"[Fragrance, Perfumes]"
305,Aromatic,CCC(=O)c1ccccc1,KRIOVPPHQSLHCZ-UHFFFAOYSA-N,"[Fragrance, Perfumes]"
306,Aromatic,CC(=O)OCc1ccc2c(c1)OCO2,PFWYHTORQZAGCA-UHFFFAOYSA-N,"[Fragrance, Perfumes]"
...,...,...,...,...
22216,Woody oriental,CC1=CCC(/C=C/C(C)(C)C(C)O)C1(C)C,QZFSNJAQFWEXEA-MDZDMXLPSA-N,[Fragrance]
22217,Woody oriental,C=C(CC(C)CO)C1CC=C(C)C1(C)C,DGHXZJZALPECTJ-UHFFFAOYSA-N,[Fragrance]
22218,Woody oriental,CC1C2CCC(C3(O)CCCCC3)(C2)C1(C)C,JMLPIRYUSKIOGR-UHFFFAOYSA-N,[Fragrance]
22219,Woody oriental,CC1CC2C(C/C1=C/CC#N)C2(C)C,WHPKEUKLQUSZBC-YHYXMXQVSA-N,[Fragrance]


In [None]:
import matplotlib.pyplot as plt
import torch
import numpy as np
from sklearn.manifold import TSNE
from transformers import AutoModel, AutoTokenizer
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap

# Load the model and tokenizer
model = AutoModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")

# Function to convert SMILES to embeddings
def smiles_to_embedding(smiles_list):
    embeddings = []
    with torch.no_grad():
        for smiles in smiles_list:
            inputs = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True, max_length=512)
            outputs = model(**inputs)
            embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
            embeddings.append(embedding)
    return np.array(embeddings)

# List of DataFrames and category names
dataframes = [
    (df_fragrance, "Fragrance"),
    (df_wine, "Wine"),
    (df_perfumes, "Perfumes"),
    (df_urban_odor, "Urban Odor"),
    (df_city_smell, "City Smell"),
    (df_compost, "Compost"),
    (df_waste_water, "Waste Water"),
    (df_human_perception, "Human Perception"),
    (df_drinking_water, "Drinking Water")
]

# Loop over each category, process SMILES, apply t-SNE, and save the plot
for df, category_name in dataframes:
    smiles_list = df['SMILES'].tolist()
    primary_odors = df['Primary Odor'].tolist()

    # Generate embeddings
    embeddings = smiles_to_embedding(smiles_list)

    # Apply t-SNE
    tsne = TSNE(n_components=2, random_state=42)
    tsne_results = tsne.fit_transform(embeddings)

    # Map each unique primary odor to a color
    unique_odors = list(set(primary_odors))
    color_map = cm.get_cmap("tab20", len(unique_odors))
    odor_to_color = {odor: color_map(i) for i, odor in enumerate(unique_odors)}
    colors = [odor_to_color[odor] for odor in primary_odors]

    # Plot and save the results
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(tsne_results[:, 0], tsne_results[:, 1], c=colors, label=primary_odors)

    # Create a custom legend to show the color for each primary odor
    handles = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=odor_to_color[odor], markersize=8)
               for odor in unique_odors]
    plt.legend(handles, unique_odors, title="Primary Odor", bbox_to_anchor=(1.05, 1), loc='upper left', prop={'size': 12})

    plt.title(category_name)
    plt.xlabel("t-SNE Dimension 1")
    plt.ylabel("t-SNE Dimension 2")
    plt.savefig(f"{category_name}.png", bbox_inches='tight')  # Save as PNG with category name
    plt.close()


  color_map = cm.get_cmap("tab20", len(unique_odors))
  color_map = cm.get_cmap("tab20", len(unique_odors))
  color_map = cm.get_cmap("tab20", len(unique_odors))
  color_map = cm.get_cmap("tab20", len(unique_odors))
  color_map = cm.get_cmap("tab20", len(unique_odors))
  color_map = cm.get_cmap("tab20", len(unique_odors))
  color_map = cm.get_cmap("tab20", len(unique_odors))


In [None]:
for df, category_name in dataframes:
  print("category", df.value_counts())