In [15]:
# Import required libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import seaborn as sns

from sentence_transformers import SentenceTransformer, util

# Apply seaborn's 'notebook' context and 'white' style for any figures drawn later.
# NOTE(review): no plotting call appears in the cells shown here.
sns.set_context('notebook')
sns.set_style('white')

In [2]:
# Load the perfume catalogue from "final_perfume_data.csv" into a pandas dataframe.
# NOTE(review): "unicode_escape" is an unusual codec for a CSV; presumably it was chosen to get
# past a decoding error. Confirm the file's real encoding — a later cell has to rename a
# mangled "Name" header, which suggests a byte-order mark is being decoded as text.
df = pd.read_csv("final_perfume_data.csv", encoding="unicode_escape")
# Preview the first rows
df.head()

Unnamed: 0,Name,Brand,Description,Notes,Image URL
0,Tihota Eau de Parfum,Indult,"Rapa Nui for sugar, Tihota is, quite simply, ...","Vanilla bean, musks",https://static.luckyscent.com/images/products/...
1,Sola Parfum,Di Ser,A tribute to the expanse of space extending f...,"Lavender, Yuzu, Lemongrass, Magnolia, Geraniu...",https://static.luckyscent.com/images/products/...
2,Kagiroi Parfum,Di Ser,An aromatic ode to the ancient beauty of Japa...,"Green yuzu, green shikuwasa, sansho seed, cor...",https://static.luckyscent.com/images/products/...
3,Velvet Fantasy Eau de Parfum,Montale,Velvet Fantasy is a solar fragrance where cit...,"tangerine, pink pepper, black coffee, leat...",https://static.luckyscent.com/images/products/...
4,A Blvd. Called Sunset Eau de Parfum,A Lab on Fire,There's no way A Lab On Fire could relocate t...,"Bergamot, almond, violet, jasmine, leather, s...",https://static.luckyscent.com/images/products/...


In [3]:
# Peek at the first ten raw entries of the notes column
df['Notes'].head(10).tolist()

[' Vanilla bean, musks',
 ' Lavender, Yuzu, Lemongrass, Magnolia, Geranium, Jasmine, Frankincense, Myrrh',
 ' Green yuzu, green shikuwasa, sansho seed, coriander, ylang-ylang, shiso, rosewood, vetiver, hinoki, cypriol, patchouli, agarwood',
 ' tangerine,  pink pepper,  black coffee,  leather,  violet,  jasmine,  lily of the valley,  heliotrope powder,  vanilla,  amber, sandalwood,  toffee,  musk,  oakmoss',
 ' Bergamot, almond, violet, jasmine, leather, sandalwood, vanilla, tonka',
 ' Orange flower, neroli, honeysuckle, warm milk, pastry, salicylates, sandalwood, vanilla bean, heliotrope',
 ' Timur JE, Soap Foam Accord (Aldehydes & Musk), Pink Pepper, Jasmine e-pure, Rose Superessence, Lily-of-the-valley Accord, Patchouli, Moss Absolute, Sandalwood Accord & Orcanox',
 ' Tobacco, hay, elemi, copaiba, olibanum, nutmeg, black pepper, castoreum, atlas cedar, oakmoss, cognac, ambroxan, norlimbanol, cinnamon, cumin seed, ash',
 ' Saffron, champaca, fir balsam, beeswax, amber, damask rose, ro

In [4]:
# Build the working frame in one chain:
#   1. rename the "ï»¿Name" header to "Name" (a no-op if that header is absent),
#   2. prefix each perfume name with its brand ("Brand - Name"),
#   3. drop the columns that are no longer needed.
df = (
    df.rename(columns={"ï»¿Name": "Name"})
      .assign(Name=lambda frame: frame['Brand'] + " - " + frame['Name'])
      .drop(columns=['Description', 'Image URL', 'Brand'])
)
df.head()

Unnamed: 0,Name,Notes
0,Indult - Tihota Eau de Parfum,"Vanilla bean, musks"
1,Di Ser - Sola Parfum,"Lavender, Yuzu, Lemongrass, Magnolia, Geraniu..."
2,Di Ser - Kagiroi Parfum,"Green yuzu, green shikuwasa, sansho seed, cor..."
3,Montale - Velvet Fantasy Eau de Parfum,"tangerine, pink pepper, black coffee, leat..."
4,A Lab on Fire - A Blvd. Called Sunset Eau de P...,"Bergamot, almond, violet, jasmine, leather, s..."


In [5]:
# Count how many perfumes have no notes recorded
df['Notes'].isna().sum()

80

In [6]:
# Discard every row that has a missing value in any column (this is what removes the
# perfumes without notes), renumber the index from 0, and show the resulting shape
df = df.dropna().reset_index(drop=True)
df.shape

(2111, 2)

In [7]:
# Name fragments used to exclude listings from the dataset (presumably non-standard
# formats such as oils, travel sizes, or hair/body/hand products — confirm intent).
# Matching is a case-insensitive substring test, so "Oil" already covers "Perfume Oil".
words = ["Perfume Oil", "Extrait", "Travel", "Hair", "Body", "Hand", "Intense", "Intensivo", "Oil"]

# Flag every perfume whose name contains any of the fragments. A boolean mask is used instead
# of collecting enumerate() positions and handing them to df.drop(): drop() interprets its
# argument as index *labels*, and positions only equal labels on a freshly reset 0..n-1 index.
lowered_words = [word.lower() for word in words]
is_excluded = df.Name.str.lower().apply(
    lambda name: any(word in name for word in lowered_words)
).astype(bool)  # keeps the mask boolean even when the frame is empty

# Keep the remaining rows, renumber the index from 0, and show the new shape
df = df[~is_excluded].reset_index(drop=True)
df.shape

(1612, 2)

In [8]:
# Make sure every notes entry is a plain string, then pull the column out as a Python list
df['Notes'] = df['Notes'].map(str)
notes = df['Notes'].tolist()
len(notes)

1612

In [9]:
# Fetch the pre-trained 'all-MiniLM-L6-v2' sentence-transformer and embed every notes
# string, 64 strings per batch, with a progress bar
model = SentenceTransformer('all-MiniLM-L6-v2')
note_embeddings = model.encode(
    notes,
    batch_size=64,
    show_progress_bar=True,
)

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

In [10]:
# Report the dimensions of the embedding matrix, then the leading 50 components of the
# first row (the embedding of "Vanilla bean, musks")
first_embedding = note_embeddings[0]
print(note_embeddings.shape)
print(first_embedding[:50])

(1612, 384)
[-0.00921524 -0.00588333  0.07965603  0.01108184  0.0920339  -0.06118878
  0.06980839  0.04193391  0.01312284 -0.00108816  0.06511044 -0.08008223
  0.01984981 -0.13829458 -0.02251236 -0.00748578  0.12410549  0.06448846
  0.00684044 -0.01724513  0.04117164  0.01706347  0.00854874  0.06719033
 -0.05611396  0.01932547  0.02498643 -0.02417995 -0.03028038 -0.12058594
 -0.01737692  0.03183731  0.01314924  0.02147919 -0.10208161  0.0320526
 -0.01535685 -0.02603945  0.06542445 -0.01513855  0.01592387 -0.06788438
  0.01501888 -0.01736868 -0.06789885 -0.01907123  0.01838118 -0.0566425
 -0.00130135 -0.01895572]


In [11]:
# Compute the cosine similarity of every note embedding against every other one.
# The result is square (one row and one column per perfume): entry [i][j] compares
# perfume i with perfume j.
cosine_scores = util.cos_sim(note_embeddings, note_embeddings)
cosine_scores.shape

torch.Size([1612, 1612])

In [12]:
# Collect every unordered pair of perfumes (i < j) together with its similarity score.
# NOTE(review): `pairs` is not read by any later cell shown here.
n_perfumes = len(cosine_scores)
pairs = []
for i in range(n_perfumes - 1):
    row_scores = cosine_scores[i]  # look up row i once rather than once per j
    pairs.extend(
        {"index": [i, j], "score": row_scores[j]}
        for j in range(i + 1, n_perfumes)
    )

In [13]:
# Define a function to get perfume recommendations based on user input
def get_recommendation(user_perfume, user_notes, top_n=5):
    """Print and return the perfumes whose notes are most similar to ``user_notes``.

    Relies on the notebook-level ``model`` (encoder), ``note_embeddings`` (one embedding
    per row of ``df``) and ``df`` (its ``Name`` column supplies the perfume names).

    Args:
        user_perfume: Name of the perfume the user already has. Used in the printed
            heading, and any dataset entry with exactly this name is excluded from
            the results.
        user_notes: Free-text description of the notes to match against.
        top_n: Maximum number of recommendations to produce (default 5).

    Returns:
        A DataFrame with columns ``Perfume`` and ``Score``, sorted by descending
        score and holding at most ``top_n`` rows.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Encode user notes
    user_embeddings = model.encode([user_notes], show_progress_bar=True)

    # Similarity of the single query against every perfume: one row, one column per perfume
    cosine_scores = util.cos_sim(user_embeddings, note_embeddings)

    # Pair each perfume name with its score by position; plain Python lists are used so
    # that no index alignment or tensor-specific handling is involved
    recommendations = pd.DataFrame({
        'Perfume': df['Name'].to_list(),
        'Score': cosine_scores[0].tolist(),
    })

    # Remove the user's own perfume BEFORE ranking, so it can never occupy a slot.
    # (Previously this filter was applied to a frame that was then discarded, and the
    # printed ranking was taken from a separate, unfiltered list.)
    recommendations = recommendations[recommendations['Perfume'] != user_perfume]

    # Highest score first; a stable sort keeps dataset order among equal scores
    recommendations = (
        recommendations
        .sort_values(by='Score', ascending=False, kind='stable')
        .head(top_n)
        .reset_index(drop=True)
    )

    # Print the ranked recommendations
    print(f"Recommended for {user_perfume}:")
    for rank, row in enumerate(recommendations.itertuples(index=False), start=1):
        print(f" {rank}. {row.Perfume} (Score: {row.Score:.2f})")

    # Hand the ranking back so callers can use it, not just read the printout
    return recommendations

In [14]:
# Test the recommendation function with sample input
# NOTE(review): 'Rhuburb' below looks like a misspelling of 'Rhubarb'. It is left untouched
# because the string is embedded as-is and correcting it would change the scores.
user_perfume = 'Jo Malone - English Pear & Freesia'
user_notes = 'Pear, Melon, Freesia, Rose, Musk, Patchouli, Rhuburb, Amber'
# The ranked list is printed by get_recommendation itself
recommendations = get_recommendation(user_perfume, user_notes)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Recommended for Jo Malone - English Pear & Freesia:
 1. Alexandre. J - Silver Ombre Eau de Parfum (Score: 0.80)
 2. Montale - Starry Nights Eau de Parfum (Score: 0.79)
 3. BDK Parfums - Bouquet de Hongrie Eau de Parfum (Score: 0.79)
 4. Jovoy Paris - Psychedelique Eau de Parfum (Score: 0.79)
 5. L'Artisan Parfumeur - Champ de Fleurs Eau de Cologne (Score: 0.78)


In [18]:
import pickle

# Bundle everything a later session needs to serve recommendations into one dictionary:
#   'model'           - the SentenceTransformer model used to encode notes
#   'df'              - the dataframe with perfume names and notes
#   'note_embeddings' - the embeddings of the notes, one per dataframe row
model_dict = dict(model=model, df=df, note_embeddings=note_embeddings)

In [20]:
# Open "FYF_model.pkl" in binary write mode (the file is created, or overwritten if present)
# NOTE: only unpickle this file from a trusted source — loading a pickle can run arbitrary code.
with open('FYF_model.pkl', 'wb') as f:
    # Serialise the whole model_dict (model, dataframe and embeddings) into that file
    pickle.dump(model_dict, f)