In [1]:
# Import required libraries and dependencies
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import ast

In [2]:
#more libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
#import dependency for SQLite database
import sqlite3

# Connect to the SQLite database
conn = sqlite3.connect("../Resources/perfume.db")

# Create a cursor object to execute SQL queries
cur = conn.cursor()

In [4]:
# Fetch all rows from the 'Review_Included' table
cur.execute("SELECT * FROM Reviews_Included")

# Fetch all rows from the 'Review_Included' table
rows = cur.fetchall()

# Close the cursor and connection
cur.close()
conn.close()

In [5]:
# Convert fetched data into a pandas DataFrame
# Get column names
columns = [col[0] for col in cur.description]
perfume_df = pd.DataFrame(rows, columns=columns)

# Display the first few rows of the DataFrame
perfume_df.head()

Unnamed: 0,name,company,image,for_gender,rating,number_votes,main accords,description,top notes,middle notes,base notes,longevity,sillage,gender_vote,price value
0,Angels' Share,By Kilian,https://fimgs.net/mdimg/perfume/375x500.62615.jpg,for women and men,4.31,682.0,"{'woody': 100.0, 'sweet': 92.6987, 'warm spicy...",Angels' Share by By Kilian is a Oriental Vanil...,['Cognac'],"['Cinnamon', 'Tonka Bean', 'Oak']","['Praline', 'Vanilla', 'Sandalwood']","{'very weak': 21, 'weak': 17, 'moderate': 107,...","{'intimate': 40, 'moderate': 187, 'strong': 15...","{'female': 40, 'more female': 39, 'unisex': 22...","{'way overpriced': 64, 'overpriced': 143, 'ok'..."
1,My Way,Giorgio Armani,https://fimgs.net/mdimg/perfume/375x500.62036.jpg,for women,3.57,1471.0,"{'white floral': 100.0, 'citrus': 60.4322, 'tu...",My Way by Giorgio Armani is a Floral fragrance...,"['Orange Blossom', 'Bergamot']","['Tuberose', 'Indian Jasmine']","['White Musk', 'Madagascar Vanilla', 'Virginia...","{'very weak': 49, 'weak': 84, 'moderate': 200,...","{'intimate': 127, 'moderate': 322, 'strong': 2...","{'female': 349, 'more female': 21, 'unisex': 4...","{'way overpriced': 38, 'overpriced': 121, 'ok'..."
2,Libre Intense,Yves Saint Laurent,https://fimgs.net/mdimg/perfume/375x500.62318.jpg,for women,4.02,858.0,"{'vanilla': 100.0, 'aromatic': 71.4216, 'sweet...",Libre Intense by Yves Saint Laurent is a Orien...,"['Lavender', 'Mandarin Orange', 'Bergamot']","['Lavender', 'Tunisian Orange Blossom', 'Jasmi...","['Madagascar Vanilla', 'Tonka Bean', 'Ambergri...","{'very weak': 24, 'weak': 12, 'moderate': 103,...","{'intimate': 39, 'moderate': 155, 'strong': 23...","{'female': 162, 'more female': 91, 'unisex': 7...","{'way overpriced': 11, 'overpriced': 59, 'ok':..."
3,Dior Homme 2020,Christian Dior,https://fimgs.net/mdimg/perfume/375x500.58714.jpg,for men,3.42,1402.0,"{'woody': 100.0, 'musky': 72.7229, 'amber': 53...",Dior Homme 2020 by Christian Dior is a Woody f...,"['Bergamot', 'Pink Pepper', 'elemi']","['Cashmere Wood', 'Atlas Cedar', 'Patchouli']","['Iso E Super', 'Haitian Vetiver', 'White Musk']","{'very weak': 125, 'weak': 83, 'moderate': 174...","{'intimate': 214, 'moderate': 370, 'strong': 1...","{'female': 3, 'more female': 1, 'unisex': 17, ...","{'way overpriced': 31, 'overpriced': 59, 'ok':..."
4,Acqua di Giò Profondo,Giorgio Armani,https://fimgs.net/mdimg/perfume/375x500.59532.jpg,for men,4.03,869.0,"{'aromatic': 100.0, 'marine': 93.2493, 'citrus...",Acqua di Giò Profondo by Giorgio Armani is a A...,"['Sea Notes', 'Aquozone', 'Bergamot', 'Green M...","['Rosemary', 'Cypress', 'Lavender', 'Mastic or...","['Mineral notes', 'Musk', 'Patchouli', 'Amber']","{'very weak': 59, 'weak': 66, 'moderate': 188,...","{'intimate': 115, 'moderate': 333, 'strong': 1...","{'female': 3, 'more female': 0, 'unisex': 5, '...","{'way overpriced': 32, 'overpriced': 84, 'ok':..."


In [6]:
# Remove unnecessary columns (at this point we have decided to use the main accords in lieu of the top, middle, and base notes for accuracy)
columns_to_remove = ['image', 'for_gender', 'description', 'gender_vote', 'price value', 'top notes', 'middle notes', 'base notes']

# Remove the specified columns
perfume_df = perfume_df.drop(columns=columns_to_remove)

# Rename the column 'main accords' to 'main_accords' in perfume_df
perfume_df.rename(columns={'main accords': 'main_accords'}, inplace=True)

# Display the DataFrame after dropping the specified columns
perfume_df.head()

Unnamed: 0,name,company,rating,number_votes,main_accords,longevity,sillage
0,Angels' Share,By Kilian,4.31,682.0,"{'woody': 100.0, 'sweet': 92.6987, 'warm spicy...","{'very weak': 21, 'weak': 17, 'moderate': 107,...","{'intimate': 40, 'moderate': 187, 'strong': 15..."
1,My Way,Giorgio Armani,3.57,1471.0,"{'white floral': 100.0, 'citrus': 60.4322, 'tu...","{'very weak': 49, 'weak': 84, 'moderate': 200,...","{'intimate': 127, 'moderate': 322, 'strong': 2..."
2,Libre Intense,Yves Saint Laurent,4.02,858.0,"{'vanilla': 100.0, 'aromatic': 71.4216, 'sweet...","{'very weak': 24, 'weak': 12, 'moderate': 103,...","{'intimate': 39, 'moderate': 155, 'strong': 23..."
3,Dior Homme 2020,Christian Dior,3.42,1402.0,"{'woody': 100.0, 'musky': 72.7229, 'amber': 53...","{'very weak': 125, 'weak': 83, 'moderate': 174...","{'intimate': 214, 'moderate': 370, 'strong': 1..."
4,Acqua di Giò Profondo,Giorgio Armani,4.03,869.0,"{'aromatic': 100.0, 'marine': 93.2493, 'citrus...","{'very weak': 59, 'weak': 66, 'moderate': 188,...","{'intimate': 115, 'moderate': 333, 'strong': 1..."


In [7]:
# Define the threshold for removing rows based on rating
rating_threshold = 3.59

# Filter the DataFrame to exclude rows with ratings at or below the threshold
perfume_df = perfume_df[perfume_df['rating'] > rating_threshold]

# Display the DataFrame after removing rows
print(perfume_df)

                           name             company  rating  number_votes  \
0                 Angels' Share           By Kilian    4.31         682.0   
2                 Libre Intense  Yves Saint Laurent    4.02         858.0   
4         Acqua di Giò Profondo      Giorgio Armani    4.03         869.0   
5             Le Male Le Parfum  Jean Paul Gaultier    4.26         690.0   
6    Beau De Jour Eau de Parfum            Tom Ford    4.25         729.0   
..                          ...                 ...     ...           ...   
473                 La Capitale             Xerjoff    4.13         187.0   
474                  DKNY Women         Donna Karan    3.87        2285.0   
475                       Toy 2            Moschino    3.80         940.0   
476                       Derby            Guerlain    4.38         389.0   
477                       Aimer     Pascal Morabito    3.71           7.0   

                                          main_accords  \
0    {'woody': 10

In [8]:
# Define the threshold for removing rows based on number of votes
votes_threshold = 100

# Filter the DataFrame to exclude rows with number of votes less than the threshold
perfume_df = perfume_df[perfume_df['number_votes'] >= votes_threshold]

# Display the DataFrame after removing rows
print(perfume_df)

                           name             company  rating  number_votes  \
0                 Angels' Share           By Kilian    4.31         682.0   
2                 Libre Intense  Yves Saint Laurent    4.02         858.0   
4         Acqua di Giò Profondo      Giorgio Armani    4.03         869.0   
5             Le Male Le Parfum  Jean Paul Gaultier    4.26         690.0   
6    Beau De Jour Eau de Parfum            Tom Ford    4.25         729.0   
..                          ...                 ...     ...           ...   
472                Laudano Nero     Tiziana Terenzi    4.13        1693.0   
473                 La Capitale             Xerjoff    4.13         187.0   
474                  DKNY Women         Donna Karan    3.87        2285.0   
475                       Toy 2            Moschino    3.80         940.0   
476                       Derby            Guerlain    4.38         389.0   

                                          main_accords  \
0    {'woody': 10

In [9]:
# Rename the DataFrame to reflect its cleaned state
# Make a copy to preserve the original DataFrame
cleaned_perfume_df = perfume_df.copy()
cleaned_perfume_df

Unnamed: 0,name,company,rating,number_votes,main_accords,longevity,sillage
0,Angels' Share,By Kilian,4.31,682.0,"{'woody': 100.0, 'sweet': 92.6987, 'warm spicy...","{'very weak': 21, 'weak': 17, 'moderate': 107,...","{'intimate': 40, 'moderate': 187, 'strong': 15..."
2,Libre Intense,Yves Saint Laurent,4.02,858.0,"{'vanilla': 100.0, 'aromatic': 71.4216, 'sweet...","{'very weak': 24, 'weak': 12, 'moderate': 103,...","{'intimate': 39, 'moderate': 155, 'strong': 23..."
4,Acqua di Giò Profondo,Giorgio Armani,4.03,869.0,"{'aromatic': 100.0, 'marine': 93.2493, 'citrus...","{'very weak': 59, 'weak': 66, 'moderate': 188,...","{'intimate': 115, 'moderate': 333, 'strong': 1..."
5,Le Male Le Parfum,Jean Paul Gaultier,4.26,690.0,"{'warm spicy': 100.0, 'vanilla': 72.2173, 'aro...","{'very weak': 18, 'weak': 31, 'moderate': 129,...","{'intimate': 80, 'moderate': 262, 'strong': 17..."
6,Beau De Jour Eau de Parfum,Tom Ford,4.25,729.0,"{'aromatic': 100.0, 'fresh spicy': 94.7562, 'l...","{'very weak': 30, 'weak': 19, 'moderate': 88, ...","{'intimate': 45, 'moderate': 190, 'strong': 19..."
...,...,...,...,...,...,...,...
472,Laudano Nero,Tiziana Terenzi,4.13,1693.0,"{'woody': 100.0, 'amber': 93.699, 'smoky': 68....","{'very weak': 41, 'weak': 38, 'moderate': 84, ...","{'intimate': 94, 'moderate': 245, 'strong': 42..."
473,La Capitale,Xerjoff,4.13,187.0,"{'fruity': 100.0, 'sweet': 92.341, 'amber': 85...","{'very weak': 5, 'weak': 9, 'moderate': 20, 'l...","{'intimate': 22, 'moderate': 36, 'strong': 53,..."
474,DKNY Women,Donna Karan,3.87,2285.0,"{'citrus': 100.0, 'aromatic': 97.6724, 'green'...","{'very weak': 23, 'weak': 44, 'moderate': 190,...","{'intimate': 71, 'moderate': 244, 'strong': 21..."
475,Toy 2,Moschino,3.80,940.0,"{'fruity': 100.0, 'floral': 73.9229, 'fresh': ...","{'very weak': 54, 'weak': 78, 'moderate': 147,...","{'intimate': 135, 'moderate': 200, 'strong': 1..."


In [10]:
print(cleaned_perfume_df.columns)

Index(['name', 'company', 'rating', 'number_votes', 'main_accords',
       'longevity', 'sillage'],
      dtype='object')


In [11]:
# Iterate over each row and extract the keys from the dictionary.
cleaned_perfume_df['main_accords'] = cleaned_perfume_df['main_accords'].apply(lambda x: ast.literal_eval(x))
cleaned_perfume_df['main_accords'] = cleaned_perfume_df['main_accords'].apply(lambda x: ' '.join(x.keys()))

# Printing the first few rows to verify the changes
cleaned_perfume_df.head()

Unnamed: 0,name,company,rating,number_votes,main_accords,longevity,sillage
0,Angels' Share,By Kilian,4.31,682.0,woody sweet warm spicy vanilla cinnamon amber ...,"{'very weak': 21, 'weak': 17, 'moderate': 107,...","{'intimate': 40, 'moderate': 187, 'strong': 15..."
2,Libre Intense,Yves Saint Laurent,4.02,858.0,vanilla aromatic sweet white floral lavender a...,"{'very weak': 24, 'weak': 12, 'moderate': 103,...","{'intimate': 39, 'moderate': 155, 'strong': 23..."
4,Acqua di Giò Profondo,Giorgio Armani,4.03,869.0,aromatic marine citrus fresh spicy woody fresh...,"{'very weak': 59, 'weak': 66, 'moderate': 188,...","{'intimate': 115, 'moderate': 333, 'strong': 1..."
5,Le Male Le Parfum,Jean Paul Gaultier,4.26,690.0,warm spicy vanilla aromatic lavender amber pow...,"{'very weak': 18, 'weak': 31, 'moderate': 129,...","{'intimate': 80, 'moderate': 262, 'strong': 17..."
6,Beau De Jour Eau de Parfum,Tom Ford,4.25,729.0,aromatic fresh spicy lavender woody patchouli ...,"{'very weak': 30, 'weak': 19, 'moderate': 88, ...","{'intimate': 45, 'moderate': 190, 'strong': 19..."


In [12]:
# Define mapping from categories to numeric values
longevity_mapping = {'very weak': 1, 'weak': 2, 'moderate': 3, 'long lasting': 4, 'eternal': 5}

# Convert longevity entries
def convert_longevity(longevity_entry):
    longevity_dict = ast.literal_eval(longevity_entry)
    numeric_longevity = {longevity_mapping[key]: value for key, value in longevity_dict.items()}
    max_category = max(numeric_longevity, key=numeric_longevity.get)
    return max_category

# Apply conversion to create a Series of numeric longevity values
numeric_longevity_series = cleaned_perfume_df['longevity'].apply(convert_longevity)

# Rename the column to 'longevity_rating'
numeric_longevity_series.name = 'longevity_rating'

# Concatenate the new Series to the original DataFrame
cleaned_perfume_df = pd.concat([cleaned_perfume_df, numeric_longevity_series], axis=1)

# Drop the original 'longevity' column
cleaned_perfume_df.drop(columns='longevity', inplace=True)

# Display the result
cleaned_perfume_df

Unnamed: 0,name,company,rating,number_votes,main_accords,sillage,longevity_rating
0,Angels' Share,By Kilian,4.31,682.0,woody sweet warm spicy vanilla cinnamon amber ...,"{'intimate': 40, 'moderate': 187, 'strong': 15...",4
2,Libre Intense,Yves Saint Laurent,4.02,858.0,vanilla aromatic sweet white floral lavender a...,"{'intimate': 39, 'moderate': 155, 'strong': 23...",4
4,Acqua di Giò Profondo,Giorgio Armani,4.03,869.0,aromatic marine citrus fresh spicy woody fresh...,"{'intimate': 115, 'moderate': 333, 'strong': 1...",4
5,Le Male Le Parfum,Jean Paul Gaultier,4.26,690.0,warm spicy vanilla aromatic lavender amber pow...,"{'intimate': 80, 'moderate': 262, 'strong': 17...",4
6,Beau De Jour Eau de Parfum,Tom Ford,4.25,729.0,aromatic fresh spicy lavender woody patchouli ...,"{'intimate': 45, 'moderate': 190, 'strong': 19...",4
...,...,...,...,...,...,...,...
472,Laudano Nero,Tiziana Terenzi,4.13,1693.0,woody amber smoky aromatic warm spicy sweet fr...,"{'intimate': 94, 'moderate': 245, 'strong': 42...",5
473,La Capitale,Xerjoff,4.13,187.0,fruity sweet amber warm spicy leather caramel ...,"{'intimate': 22, 'moderate': 36, 'strong': 53,...",5
474,DKNY Women,Donna Karan,3.87,2285.0,citrus aromatic green fresh spicy leather frui...,"{'intimate': 71, 'moderate': 244, 'strong': 21...",4
475,Toy 2,Moschino,3.80,940.0,fruity floral fresh sweet citrus musky powdery...,"{'intimate': 135, 'moderate': 200, 'strong': 1...",3


In [13]:
# Define mapping from categories to numeric values
sillage_mapping = {'intimate': 1, 'moderate': 2, 'strong': 3, 'enormous': 4}

# Convert sillage entries
def convert_sillage(sillage_entry):
    sillage_dict = ast.literal_eval(sillage_entry)
    numeric_sillage = {sillage_mapping[key]: value for key, value in sillage_dict.items()}
    max_category = max(numeric_sillage, key=numeric_sillage.get)
    return max_category

# Apply conversion to create a Series of numeric sillage values
numeric_sillage_series = cleaned_perfume_df['sillage'].apply(convert_sillage)

# Rename the column to 'sillage_rating'
numeric_sillage_series.name = 'sillage_rating'

# Concatenate the new Series to the original DataFrame
cleaned_perfume_df = pd.concat([cleaned_perfume_df, numeric_sillage_series], axis=1)

# Drop the original 'sillage' column
cleaned_perfume_df.drop(columns='sillage', inplace=True)

# Combine 'name' and 'company' columns into a new column 'perfume'
cleaned_perfume_df['perfume'] = cleaned_perfume_df['name'] + ' by ' + cleaned_perfume_df['company']

# Drop the 'name' and 'company' columns
cleaned_perfume_df.drop(['name', 'company'], axis=1, inplace=True)

# Display the result
cleaned_perfume_df

Unnamed: 0,rating,number_votes,main_accords,longevity_rating,sillage_rating,perfume
0,4.31,682.0,woody sweet warm spicy vanilla cinnamon amber ...,4,2,Angels' Share by By Kilian
2,4.02,858.0,vanilla aromatic sweet white floral lavender a...,4,3,Libre Intense by Yves Saint Laurent
4,4.03,869.0,aromatic marine citrus fresh spicy woody fresh...,4,2,Acqua di Giò Profondo by Giorgio Armani
5,4.26,690.0,warm spicy vanilla aromatic lavender amber pow...,4,2,Le Male Le Parfum by Jean Paul Gaultier
6,4.25,729.0,aromatic fresh spicy lavender woody patchouli ...,4,3,Beau De Jour Eau de Parfum by Tom Ford
...,...,...,...,...,...,...
472,4.13,1693.0,woody amber smoky aromatic warm spicy sweet fr...,5,4,Laudano Nero by Tiziana Terenzi
473,4.13,187.0,fruity sweet amber warm spicy leather caramel ...,5,4,La Capitale by Xerjoff
474,3.87,2285.0,citrus aromatic green fresh spicy leather frui...,4,2,DKNY Women by Donna Karan
475,3.80,940.0,fruity floral fresh sweet citrus musky powdery...,3,2,Toy 2 by Moschino


In [14]:
# Get unique scents from main_accords column
unique_scents = set()
for accords in cleaned_perfume_df['main_accords']:
    unique_scents.update(accords.split())

# Create new columns for each scent
for scent in unique_scents:
    cleaned_perfume_df[scent] = 0

# Iterate over rows and set scent columns
for idx, row in cleaned_perfume_df.iterrows():
    accords = row['main_accords'].split()
    for scent in accords:
        cleaned_perfume_df.at[idx, scent] = 1

# Dropping the original 'main_accords' column if needed
cleaned_perfume_df.drop('main_accords', axis=1, inplace=True)

# Preview Dataframe
cleaned_perfume_df

Unnamed: 0,rating,number_votes,longevity_rating,sillage_rating,perfume,tropical,animalic,conifer,sweet,honey,...,herbal,oud,soft,marine,iris,balsamic,savory,aromatic,violet,citrus
0,4.31,682.0,4,2,Angels' Share by By Kilian,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,4.02,858.0,4,3,Libre Intense by Yves Saint Laurent,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
4,4.03,869.0,4,2,Acqua di Giò Profondo by Giorgio Armani,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
5,4.26,690.0,4,2,Le Male Le Parfum by Jean Paul Gaultier,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
6,4.25,729.0,4,3,Beau De Jour Eau de Parfum by Tom Ford,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
472,4.13,1693.0,5,4,Laudano Nero by Tiziana Terenzi,0,0,0,1,0,...,0,1,0,0,0,1,0,1,0,0
473,4.13,187.0,5,4,La Capitale by Xerjoff,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
474,3.87,2285.0,4,2,DKNY Women by Donna Karan,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
475,3.80,940.0,3,2,Toy 2 by Moschino,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1


Test and Train the model

In [15]:
# We must train a different model for each scent
# Get a list of all scent columns
scent_columns = cleaned_perfume_df.columns[5:]

# Iterate over each scent column and train a model
for scent in scent_columns:
    # Split features (X) and target variable (y) for the current scent
    X = cleaned_perfume_df.drop(['rating', 'number_votes', 'perfume', scent], axis=1)
    y = cleaned_perfume_df[scent]
    
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train Decision Tree Classifier
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    # Predict on test set
    y_pred = clf.predict(X_test)

    # Evaluate model
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy for scent", scent, ":", accuracy)

Accuracy for scent tropical : 0.8076923076923077
Accuracy for scent animalic : 0.8589743589743589
Accuracy for scent conifer : 0.9743589743589743
Accuracy for scent sweet : 0.7307692307692307
Accuracy for scent honey : 0.9487179487179487
Accuracy for scent coconut : 0.8974358974358975
Accuracy for scent rum : 0.9615384615384616
Accuracy for scent soapy : 0.9871794871794872
Accuracy for scent vodka : 0.9871794871794872
Accuracy for scent mineral : 1.0
Accuracy for scent fresh : 0.5641025641025641
Accuracy for scent anis : 1.0
Accuracy for scent nutty : 0.9615384615384616
Accuracy for scent smoky : 0.9615384615384616
Accuracy for scent camphor : 1.0
Accuracy for scent woody : 0.717948717948718
Accuracy for scent cinnamon : 0.9230769230769231
Accuracy for scent caramel : 0.9102564102564102
Accuracy for scent metallic : 1.0
Accuracy for scent salty : 0.9615384615384616
Accuracy for scent earthy : 0.8205128205128205
Accuracy for scent amber : 0.7564102564102564
Accuracy for scent coffee : 0

In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Iterate over each scent column and evaluate the model
for scent in scent_columns:
    # Split features (X) and target variable (y) for the current scent
    X = cleaned_perfume_df.drop(['rating', 'number_votes', 'perfume', scent], axis=1)
    y = cleaned_perfume_df[scent]
    
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train Decision Tree Classifier
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    # Predict on test set
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("Metrics for scent", scent, ":")
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)

Metrics for scent tropical :
Precision: 0.23076923076923078
Recall: 0.42857142857142855
F1-score: 0.3
Metrics for scent animalic :
Precision: 0.5714285714285714
Recall: 0.3076923076923077
F1-score: 0.4
Metrics for scent conifer :
Precision: 0.0
Recall: 0.0
F1-score: 0.0
Metrics for scent sweet :
Precision: 0.7959183673469388
Recall: 0.7959183673469388
F1-score: 0.7959183673469388
Metrics for scent honey :
Precision: 0.14285714285714285
Recall: 0.5
F1-score: 0.22222222222222224
Metrics for scent coconut :
Precision: 0.14285714285714285
Recall: 0.25
F1-score: 0.18181818181818182
Metrics for scent rum :
Precision: 0.0
Recall: 0.0
F1-score: 0.0
Metrics for scent soapy :
Precision: 0.6666666666666666
Recall: 1.0
F1-score: 0.8
Metrics for scent vodka :
Precision: 0.0
Recall: 0.0
F1-score: 0.0
Metrics for scent mineral :
Precision: 0.0
Recall: 0.0
F1-score: 0.0
Metrics for scent fresh :
Precision: 0.5555555555555556
Recall: 0.5405405405405406
F1-score: 0.547945205479452
Metrics for scent anis

Model Integration

In [17]:
from ipywidgets import interact, widgets

# Assuming cleaned_perfume_df is your dataframe containing perfume data
perfume_list = cleaned_perfume_df['perfume'].unique()  # Get a list of all perfumes

# Create a dropdown widget for selecting perfumes
perfume_dropdown = widgets.Dropdown(options=perfume_list, description='Choose a perfume:')

# Interact function to link the dropdown selection with the prediction function
@interact(perfume_name=perfume_dropdown)
def select_perfume(perfume_name):
    # Retrieve features for the selected perfume
    perfume_features = cleaned_perfume_df[cleaned_perfume_df['perfume'] == perfume_name].drop(columns=['rating', 'number_votes', 'perfume'])
    
    # Present questions to the user
    print(f"Let's see if you'll like {perfume_name}!")
    longevity_pref = int(input("How long do you like your perfume to last? (Rate from 1 to 5): "))
    sillage_pref = int(input("How strong do you like your perfume? (Rate from 1 to 4): "))
    fresh_pref = input("Do you like fresh scents? (yes/no): ")

    # Assuming the model already knows if the perfume has a scent and its sillage
    # You can directly use the perfume_features and model to predict the user preference
    
    # Determine if the user will like the perfume based on their responses
    predicted_rating = model.predict([[longevity_pref, sillage_pref] + [1 if fresh_pref.lower() == 'yes' else 0]])

    if predicted_rating == 1:
        print(f"Based on your preferences, you might like {perfume_name}!")
    else:
        print(f"Based on your preferences, you might not like {perfume_name}.")

How long do you like your perfume to last? (Rate from 1 to 5): 2
How strong do you like your perfume? (Rate from 1 to 4): 4
Do you like fresh scents? (yes/no): yes


interactive(children=(Dropdown(description='Choose a perfume:', options=("Angels' Share by By Kilian", 'Libre …

In [None]:
from ipywidgets import interact, widgets

# Get a list of all perfumes from the dataframe
perfume_list = cleaned_perfume_df['perfume'].unique()

# Define a function to present questions and predict user preference
def predict_preference(perfume_name):
    # Retrieve features for the selected perfume
    perfume_features = cleaned_perfume_df[cleaned_perfume_df['perfume'] == perfume_name].drop(columns=['rating', 'number_votes', 'longevity_rating', 'sillage_rating', 'perfume'])
    
    # Present questions to the user
    print(f"Let's see if you'll like {perfume_name}!")
    longevity_pref = int(input("How long do you like your perfume to last? (Rate from 1 to 5): "))
    sillage_pref = int(input("How strong do you like your perfume? (Rate from 1 to 4): "))
    fresh_pref = input("Do you like fresh scents? (yes/no): ")

    # Determine if the user will like the perfume based on their responses
    predicted_rating = model.predict([[longevity_pref, sillage_pref] + [1 if fresh_pref.lower() == 'yes' else 0]])

    if predicted_rating == 1:
        print(f"Based on your preferences, you might like {perfume_name}!")
    else:
        print(f"Based on your preferences, you might not like {perfume_name}.")

# Create a dropdown widget for selecting perfumes
perfume_dropdown = widgets.Dropdown(options=perfume_list, description='Choose a perfume:')

# Interact function to link the dropdown selection with the prediction function
interact(predict_preference, perfume_name=perfume_dropdown);

Decision Tree Machine Model Creation

In [None]:
# Import dependencies
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

# Use CountVectorizer to convert 'perfume' and 'main_accords' into numerical features
vectorizer = CountVectorizer()
perfume_features = vectorizer.fit_transform(cleaned_perfume_df['perfume'])
main_accords_features = vectorizer.transform(cleaned_perfume_df['main_accords'])

# Combine the numerical features with other numerical features
X = pd.concat([pd.DataFrame(perfume_features.toarray()), pd.DataFrame(main_accords_features.toarray()), 
               cleaned_perfume_df[['rating', 'number_votes', 'longevity_rating', 'sillage_rating']]], axis=1)

# Define the target variable
y = cleaned_perfume_df['perfume']

In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Model Evaluation
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# ChatBot

In [None]:
# Import necessary libraries
from sklearn.neighbors import KNeighborsClassifier

# Define the main function for perfume recommendation
def recommend_perfume(decision_tree_data, cleaned_perfume_df_melted_sorted):
    # Data preparation
    X, y, clf = prepare_data_and_classifier(decision_tree_data)
    
    # Handle user input
    user_input = get_user_input(cleaned_perfume_df_melted_sorted)
    
    # Make recommendation
    make_recommendation(user_input, clf)

In [None]:
# Define helper function for data preparation
def prepare_data_and_classifier(decision_tree_data):
    
    # One-hot encode the 'scent' column
    encoded_columns = pd.get_dummies(decision_tree_data[['scent']])
    X = pd.concat([encoded_columns, decision_tree_data[['scent_strength']]], axis=1)
    y = decision_tree_data.index
    
    # Split data into training and testing sets
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.6, random_state=22)
    
    # Train K Nearest Neighbors classifier
    clf = KNeighborsClassifier()
    clf.fit(X_train, y_train)
    return X, y, clf

In [None]:
# Define helper function for handling user input
def get_user_input(cleaned_perfume_df_melted_sorted):
    
    # Create user input DataFrame
    user_input = pd.DataFrame(columns=cleaned_perfume_df_melted_sorted.columns)
    
    # Ask user for scent allergies
    print("Do you have any scent allergies? (yes/no)")
    scent_allergies = input().lower()
    if scent_allergies == 'yes':
        # Assuming user provides the scent they are allergic to
        print("Which scent are you allergic to?")
        allergic_scent = input()
        # Remove perfumes containing allergic scent
        cleaned_perfume_df_melted_sorted = cleaned_perfume_df_melted_sorted[
            ~cleaned_perfume_df_melted_sorted['scent'].str.contains(allergic_scent, case=False)
        ]

    # Ask user for strength preference of main note
    print("How strong do you prefer the main note to be? (1-100)")
    scent_strength = float(input())
    user_input['scent_strength'] = scent_strength

    # Ask user for longevity preference
    print("How long would you like your perfume to last? (1-5)")
    longevity = float(input())
    user_input['longevity_rating'] = longevity
    
    # Ask user for sillage preference
    print("How strong do you prefer the sillage? (1-4)")
    sillage = float(input())
    user_input['sillage_rating'] = sillage

    # Ask user for scent preference
    print("Do you prefer a fruity or floral scent?")
    scent_preference = input().lower()
    if scent_preference == 'fruity':
        user_input['scent_fruity'] = 1
    elif scent_preference == 'floral':
        user_input['scent_floral'] = 1
        
    # Ask user for scent preference
    print("Do you prefer a fresh or spicy scent?")
    scent_preference = input().lower()
    if scent_preference == 'fresh':
        user_input['scent_fresh'] = 1
    elif scent_preference == 'spicy':
        user_input['scent_spicy'] = 1      
     
    # Ask user for scent preference
    print("Do you prefer a musky or aromatic scent?")
    scent_preference = input().lower()
    if scent_preference == 'musky':
        user_input['scent_musky'] = 1
    elif scent_preference == 'aromatic':
        user_input['scent_aromatic'] = 1
        
    return user_input
# Define helper function for making recommendation
def make_recommendation(user_input, clf):
    # Fill missing values with 0
    user_input.fillna(0, inplace=True)
    # Make prediction
    prediction = clf.predict(user_input)
    print("Recommended perfume:", prediction)
# Call the main function to recommend perfumes based on user preferences
recommend_perfume(decision_tree_data, cleaned_perfume_df_melted_sorted)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def recommend_perfume(decision_tree_data, cleaned_perfume_df_melted_sorted):
    # One-hot encode the 'scent' and 'sillage' columns
    encoded_columns = pd.get_dummies(decision_tree_data[['scent', 'sillage']])

    # Add the 'scent_strength' column
    X = pd.concat([encoded_columns, decision_tree_data[['scent_strength']]], axis=1)

    # Determine your X and Y values
    y = decision_tree_data.index

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=22)

    # Train K Nearest Neighbors classifier
    clf = KNeighborsClassifier()
    clf.fit(X_train, y_train)

    # Create user input DataFrame
    user_input = pd.DataFrame(columns=X_train.columns)
    
    # Ask user for scent allergies
    print("Do you have any scent allergies? (yes/no)")
    scent_allergies = input().lower()
    if scent_allergies == 'yes':
        # Assuming user provides the scent they are allergic to
        print("Which scent are you allergic to?")
        allergic_scent = input()
        # Remove perfumes containing allergic scent
        cleaned_perfume_df_melted_sorted = cleaned_perfume_df_melted_sorted[
            ~cleaned_perfume_df_melted_sorted['scent'].str.contains(allergic_scent, case=False)
        ]

    # Ask user for strength preference
    print("How strong do you prefer your perfume to be? (1-100)")
    scent_strength = float(input())
    user_input['scent_strength'] = scent_strength

    # Ask user for longevity preference
    print("How long would you like your perfume to last? (1-1000)")
    sillage = float(input())
    sillage_column_name = 'sillage_{}'.format(sillage)
    
    # Check if the sillage column exists, if not, add it with default value 0
    if sillage_column_name not in user_input.columns:
        user_input[sillage_column_name] = 0
    user_input[sillage_column_name] = 1  # Update the value based on user input

    # Ask user for scent preference
    print("Do you prefer a fruity or floral scent?")
    scent_preference = input().lower()
    if scent_preference == 'fruity':
        user_input['scent_fruity'] = 1
    elif scent_preference == 'floral':
        user_input['scent_floral'] = 1
        
    # Ask user for scent preference
    print("Do you prefer a fresh or spicy scent?")
    scent_preference = input().lower()
    if scent_preference == 'fresh':
        user_input['scent_fresh'] = 1
    elif scent_preference == 'spicy':
        user_input['scent_spicy'] = 1      
     
    # Ask user for scent preference
    print("Do you prefer a musky or aromatic scent?")
    scent_preference = input().lower()
    if scent_preference == 'musky':
        user_input['scent_musky'] = 1
    elif scent_preference == 'aromatic':
        user_input['scent_aromatic'] = 1  
        
        
    # Debugging: Print user input values
    print("User input values:", user_input)

    # Fill missing values with 0
    user_input.fillna(0, inplace=True)

    # Make prediction
    prediction = clf.predict(user_input)
    print("Recommended perfume:", prediction)

# Call the function to recommend perfumes based on user preferences
recommend_perfume(decision_tree_data, cleaned_perfume_df_melted_sorted)


# steps required for a simple chatbot

## Train a Model:
Prepare your dataset: Ensure your dataset is structured properly with features and labels. You'll need a dataset where each row represents a perfume, and the columns represent features such as scent, strength, longevity, etc. The label would typically be the type of perfume or some indication of its characteristics.

Split your dataset: Split your dataset into training and testing sets. The training set is used to train the model, while the testing set is used to evaluate its performance.

Choose a model: Select a machine learning algorithm suitable for your task. For recommendation systems, algorithms like decision trees, random forests, or even more advanced methods like neural networks can be used.

Train the model: Fit the chosen model to your training data. This involves finding the optimal parameters of the model that minimize the difference between predicted and actual values.

Evaluate the model: Assess the performance of your trained model using the testing set. Common metrics for evaluation include accuracy, precision, recall, and F1-score.

## Implement the recommend_perfume Function:

Load the trained model: In the recommend_perfume function, load the trained model that you obtained from the training step. This could involve using libraries like scikit-learn for simpler models or TensorFlow/Keras for more complex models.

Process user input: Receive the user input in the recommend_perfume function and format it appropriately to match the input requirements of your trained model.

Make recommendations: Call the make_recommendation function you've defined, passing in the user input and the loaded model. 
This function will use the trained model to predict the recommended perfume based on the user's preferences.

## Testing and Debugging:

Test your code with various user inputs to ensure it provides meaningful recommendations.

Debug any errors or unexpected behavior that arise during testing.

Refine your model or code as necessary based on the testing results.

In [None]:
# This sample has a section I don't have in mine and might be worth testing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load your perfume dataset (replace 'perfume_data.csv' with your actual dataset)
perfume_data = pd.read_csv('perfume_data.csv')

# Split the dataset into features (X) and labels (y)
X = perfume_data.drop(columns=['perfume_type'])  # Features
y = perfume_data['perfume_type']  # Labels

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a KNN classifier
clf = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors as needed
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Define the recommend_perfume function
def recommend_perfume(user_input, clf):
    # Make prediction
    prediction = clf.predict(user_input)
    print("Recommended perfume:", prediction)

# Now you can use the recommend_perfume function to make recommendations based on user input
# For example:
user_input = pd.DataFrame({
    'scent_strength': [80],
    'sillage_100': [1],
    'scent_fruity': [1],
    'scent_fresh': [1],
    'scent_musky': [0]
})
recommend_perfume(user_input, clf)

We use a KNeighborsClassifier instead of DecisionTreeClassifier.

We train the KNN classifier using the training data.

We evaluate the model's accuracy on the testing set.

We define the recommend_perfume function to make recommendations based on user input.