In [1]:
# Import required libraries and dependencies
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
#more libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
#import dependency for SQLite database
import sqlite3

# Connect to the SQLite database
conn = sqlite3.connect("../Resources/perfume.db")

# Create a cursor object to execute SQL queries
cur = conn.cursor()

In [4]:
# Fetch all rows from the 'Review_Included' table
cur.execute("SELECT * FROM Reviews_Included")

# Fetch all rows from the 'Review_Included' table
rows = cur.fetchall()

# Close the cursor and connection
cur.close()
conn.close()

In [5]:
# Convert fetched data into a pandas DataFrame
# Get column names
columns = [col[0] for col in cur.description]
perfume_df = pd.DataFrame(rows, columns=columns)

# Display the first few rows of the DataFrame
perfume_df.head()

Unnamed: 0,name,company,image,for_gender,rating,number_votes,main accords,description,top notes,middle notes,base notes,longevity,sillage,gender_vote,price value
0,Angels' Share,By Kilian,https://fimgs.net/mdimg/perfume/375x500.62615.jpg,for women and men,4.31,682.0,"{'woody': 100.0, 'sweet': 92.6987, 'warm spicy...",Angels' Share by By Kilian is a Oriental Vanil...,['Cognac'],"['Cinnamon', 'Tonka Bean', 'Oak']","['Praline', 'Vanilla', 'Sandalwood']","{'very weak': 21, 'weak': 17, 'moderate': 107,...","{'intimate': 40, 'moderate': 187, 'strong': 15...","{'female': 40, 'more female': 39, 'unisex': 22...","{'way overpriced': 64, 'overpriced': 143, 'ok'..."
1,My Way,Giorgio Armani,https://fimgs.net/mdimg/perfume/375x500.62036.jpg,for women,3.57,1471.0,"{'white floral': 100.0, 'citrus': 60.4322, 'tu...",My Way by Giorgio Armani is a Floral fragrance...,"['Orange Blossom', 'Bergamot']","['Tuberose', 'Indian Jasmine']","['White Musk', 'Madagascar Vanilla', 'Virginia...","{'very weak': 49, 'weak': 84, 'moderate': 200,...","{'intimate': 127, 'moderate': 322, 'strong': 2...","{'female': 349, 'more female': 21, 'unisex': 4...","{'way overpriced': 38, 'overpriced': 121, 'ok'..."
2,Libre Intense,Yves Saint Laurent,https://fimgs.net/mdimg/perfume/375x500.62318.jpg,for women,4.02,858.0,"{'vanilla': 100.0, 'aromatic': 71.4216, 'sweet...",Libre Intense by Yves Saint Laurent is a Orien...,"['Lavender', 'Mandarin Orange', 'Bergamot']","['Lavender', 'Tunisian Orange Blossom', 'Jasmi...","['Madagascar Vanilla', 'Tonka Bean', 'Ambergri...","{'very weak': 24, 'weak': 12, 'moderate': 103,...","{'intimate': 39, 'moderate': 155, 'strong': 23...","{'female': 162, 'more female': 91, 'unisex': 7...","{'way overpriced': 11, 'overpriced': 59, 'ok':..."
3,Dior Homme 2020,Christian Dior,https://fimgs.net/mdimg/perfume/375x500.58714.jpg,for men,3.42,1402.0,"{'woody': 100.0, 'musky': 72.7229, 'amber': 53...",Dior Homme 2020 by Christian Dior is a Woody f...,"['Bergamot', 'Pink Pepper', 'elemi']","['Cashmere Wood', 'Atlas Cedar', 'Patchouli']","['Iso E Super', 'Haitian Vetiver', 'White Musk']","{'very weak': 125, 'weak': 83, 'moderate': 174...","{'intimate': 214, 'moderate': 370, 'strong': 1...","{'female': 3, 'more female': 1, 'unisex': 17, ...","{'way overpriced': 31, 'overpriced': 59, 'ok':..."
4,Acqua di Giò Profondo,Giorgio Armani,https://fimgs.net/mdimg/perfume/375x500.59532.jpg,for men,4.03,869.0,"{'aromatic': 100.0, 'marine': 93.2493, 'citrus...",Acqua di Giò Profondo by Giorgio Armani is a A...,"['Sea Notes', 'Aquozone', 'Bergamot', 'Green M...","['Rosemary', 'Cypress', 'Lavender', 'Mastic or...","['Mineral notes', 'Musk', 'Patchouli', 'Amber']","{'very weak': 59, 'weak': 66, 'moderate': 188,...","{'intimate': 115, 'moderate': 333, 'strong': 1...","{'female': 3, 'more female': 0, 'unisex': 5, '...","{'way overpriced': 32, 'overpriced': 84, 'ok':..."


In [6]:
# Remove unnecessary columns (at this point we have decided to use the main accords in lieu of the top, middle, and base notes for accuracy)
columns_to_remove = ['image', 'for_gender', 'description', 'gender_vote', 'price value', 'top notes', 'middle notes', 'base notes']

# Remove the specified columns
perfume_df = perfume_df.drop(columns=columns_to_remove)

# Rename the column 'main accords' to 'main_accords' in perfume_df
perfume_df.rename(columns={'main accords': 'main_accords'}, inplace=True)

# Display the DataFrame after dropping the specified columns
perfume_df.head()

Unnamed: 0,name,company,rating,number_votes,main_accords,longevity,sillage
0,Angels' Share,By Kilian,4.31,682.0,"{'woody': 100.0, 'sweet': 92.6987, 'warm spicy...","{'very weak': 21, 'weak': 17, 'moderate': 107,...","{'intimate': 40, 'moderate': 187, 'strong': 15..."
1,My Way,Giorgio Armani,3.57,1471.0,"{'white floral': 100.0, 'citrus': 60.4322, 'tu...","{'very weak': 49, 'weak': 84, 'moderate': 200,...","{'intimate': 127, 'moderate': 322, 'strong': 2..."
2,Libre Intense,Yves Saint Laurent,4.02,858.0,"{'vanilla': 100.0, 'aromatic': 71.4216, 'sweet...","{'very weak': 24, 'weak': 12, 'moderate': 103,...","{'intimate': 39, 'moderate': 155, 'strong': 23..."
3,Dior Homme 2020,Christian Dior,3.42,1402.0,"{'woody': 100.0, 'musky': 72.7229, 'amber': 53...","{'very weak': 125, 'weak': 83, 'moderate': 174...","{'intimate': 214, 'moderate': 370, 'strong': 1..."
4,Acqua di Giò Profondo,Giorgio Armani,4.03,869.0,"{'aromatic': 100.0, 'marine': 93.2493, 'citrus...","{'very weak': 59, 'weak': 66, 'moderate': 188,...","{'intimate': 115, 'moderate': 333, 'strong': 1..."


In [7]:
# Define the threshold for removing rows based on rating
rating_threshold = 3.59

# Filter the DataFrame to exclude rows with ratings at or below the threshold
perfume_df = perfume_df[perfume_df['rating'] > rating_threshold]

# Display the DataFrame after removing rows
print(perfume_df)

                           name             company  rating  number_votes  \
0                 Angels' Share           By Kilian    4.31         682.0   
2                 Libre Intense  Yves Saint Laurent    4.02         858.0   
4         Acqua di Giò Profondo      Giorgio Armani    4.03         869.0   
5             Le Male Le Parfum  Jean Paul Gaultier    4.26         690.0   
6    Beau De Jour Eau de Parfum            Tom Ford    4.25         729.0   
..                          ...                 ...     ...           ...   
473                 La Capitale             Xerjoff    4.13         187.0   
474                  DKNY Women         Donna Karan    3.87        2285.0   
475                       Toy 2            Moschino    3.80         940.0   
476                       Derby            Guerlain    4.38         389.0   
477                       Aimer     Pascal Morabito    3.71           7.0   

                                          main_accords  \
0    {'woody': 10

In [8]:
# Define the threshold for removing rows based on number of votes
votes_threshold = 100

# Filter the DataFrame to exclude rows with number of votes less than the threshold
perfume_df = perfume_df[perfume_df['number_votes'] >= votes_threshold]

# Display the DataFrame after removing rows
print(perfume_df)

                           name             company  rating  number_votes  \
0                 Angels' Share           By Kilian    4.31         682.0   
2                 Libre Intense  Yves Saint Laurent    4.02         858.0   
4         Acqua di Giò Profondo      Giorgio Armani    4.03         869.0   
5             Le Male Le Parfum  Jean Paul Gaultier    4.26         690.0   
6    Beau De Jour Eau de Parfum            Tom Ford    4.25         729.0   
..                          ...                 ...     ...           ...   
472                Laudano Nero     Tiziana Terenzi    4.13        1693.0   
473                 La Capitale             Xerjoff    4.13         187.0   
474                  DKNY Women         Donna Karan    3.87        2285.0   
475                       Toy 2            Moschino    3.80         940.0   
476                       Derby            Guerlain    4.38         389.0   

                                          main_accords  \
0    {'woody': 10

In [9]:
# Rename the DataFrame to reflect its cleaned state
# Make a copy to preserve the original DataFrame
cleaned_perfume_df = perfume_df.copy()
cleaned_perfume_df

Unnamed: 0,name,company,rating,number_votes,main_accords,longevity,sillage
0,Angels' Share,By Kilian,4.31,682.0,"{'woody': 100.0, 'sweet': 92.6987, 'warm spicy...","{'very weak': 21, 'weak': 17, 'moderate': 107,...","{'intimate': 40, 'moderate': 187, 'strong': 15..."
2,Libre Intense,Yves Saint Laurent,4.02,858.0,"{'vanilla': 100.0, 'aromatic': 71.4216, 'sweet...","{'very weak': 24, 'weak': 12, 'moderate': 103,...","{'intimate': 39, 'moderate': 155, 'strong': 23..."
4,Acqua di Giò Profondo,Giorgio Armani,4.03,869.0,"{'aromatic': 100.0, 'marine': 93.2493, 'citrus...","{'very weak': 59, 'weak': 66, 'moderate': 188,...","{'intimate': 115, 'moderate': 333, 'strong': 1..."
5,Le Male Le Parfum,Jean Paul Gaultier,4.26,690.0,"{'warm spicy': 100.0, 'vanilla': 72.2173, 'aro...","{'very weak': 18, 'weak': 31, 'moderate': 129,...","{'intimate': 80, 'moderate': 262, 'strong': 17..."
6,Beau De Jour Eau de Parfum,Tom Ford,4.25,729.0,"{'aromatic': 100.0, 'fresh spicy': 94.7562, 'l...","{'very weak': 30, 'weak': 19, 'moderate': 88, ...","{'intimate': 45, 'moderate': 190, 'strong': 19..."
...,...,...,...,...,...,...,...
472,Laudano Nero,Tiziana Terenzi,4.13,1693.0,"{'woody': 100.0, 'amber': 93.699, 'smoky': 68....","{'very weak': 41, 'weak': 38, 'moderate': 84, ...","{'intimate': 94, 'moderate': 245, 'strong': 42..."
473,La Capitale,Xerjoff,4.13,187.0,"{'fruity': 100.0, 'sweet': 92.341, 'amber': 85...","{'very weak': 5, 'weak': 9, 'moderate': 20, 'l...","{'intimate': 22, 'moderate': 36, 'strong': 53,..."
474,DKNY Women,Donna Karan,3.87,2285.0,"{'citrus': 100.0, 'aromatic': 97.6724, 'green'...","{'very weak': 23, 'weak': 44, 'moderate': 190,...","{'intimate': 71, 'moderate': 244, 'strong': 21..."
475,Toy 2,Moschino,3.80,940.0,"{'fruity': 100.0, 'floral': 73.9229, 'fresh': ...","{'very weak': 54, 'weak': 78, 'moderate': 147,...","{'intimate': 135, 'moderate': 200, 'strong': 1..."


In [10]:
print(cleaned_perfume_df.columns)

Index(['name', 'company', 'rating', 'number_votes', 'main_accords',
       'longevity', 'sillage'],
      dtype='object')


In [11]:
import ast

#Working to create a new entry for each scent creating multiple entries for each perfume based on the number of scents in it
# Parse the string representations of dictionaries in 'main accords' into separate columns
cleaned_perfume_df = cleaned_perfume_df.copy()

# Parse the string representations of dictionaries into actual dictionaries
cleaned_perfume_df['main_accords'] = cleaned_perfume_df['main_accords'].apply(ast.literal_eval)

# Expand the 'main_accords' column into separate columns
main_accords_df = pd.json_normalize(cleaned_perfume_df['main_accords'])

# Concatenate the expanded 'main_accords' dataframe with the original dataframe
cleaned_perfume_df = pd.concat([cleaned_perfume_df, main_accords_df], axis=1)

# Drop the original 'main_accords' column
cleaned_perfume_df.drop(columns=['main_accords'], inplace=True)

# Using cleaned_perfume_df
cleaned_perfume_df = cleaned_perfume_df.copy()

# Create a dataframe with 'name' and 'company' columns, along with each scent in 'main_accords' as a separate row
scent_df = cleaned_perfume_df[['name', 'company', 'woody', 'sweet', 'warm spicy', 'vanilla', 'cinnamon', 'amber', 'powdery', 'lactonic', 'aromatic']].melt(id_vars=['name', 'company'], var_name='scent', value_name='scent_strength').dropna()

# preview df
scent_df.head()

Unnamed: 0,name,company,scent,scent_strength
0,Angels' Share,By Kilian,woody,100.0
1,Libre Intense,Yves Saint Laurent,woody,67.7451
2,Acqua di Giò Profondo,Giorgio Armani,woody,62.7869
3,Le Male Le Parfum,Jean Paul Gaultier,woody,60.2007
4,Beau De Jour Eau de Parfum,Tom Ford,woody,52.3889


In [12]:
# Create a copy of the dataframe
cleaned_perfume_df_copy = cleaned_perfume_df.copy()

# Melt the dataframe to have each scent as its own row
cleaned_perfume_df_melted = cleaned_perfume_df_copy.melt(id_vars=['name', 'company', 'rating', 'number_votes', 'longevity', 'sillage'], var_name='scent', value_name='scent_strength')

# Drop rows with NaN values in the 'scent_strength' column
cleaned_perfume_df_melted = cleaned_perfume_df_melted.dropna(subset=['scent_strength'])

# Reset the index
cleaned_perfume_df_melted.reset_index(drop=True, inplace=True)

# Display the resulting dataframe
cleaned_perfume_df_melted

Unnamed: 0,name,company,rating,number_votes,longevity,sillage,scent,scent_strength
0,Angels' Share,By Kilian,4.31,682.0,"{'very weak': 21, 'weak': 17, 'moderate': 107,...","{'intimate': 40, 'moderate': 187, 'strong': 15...",woody,100.0000
1,Libre Intense,Yves Saint Laurent,4.02,858.0,"{'very weak': 24, 'weak': 12, 'moderate': 103,...","{'intimate': 39, 'moderate': 155, 'strong': 23...",woody,67.7451
2,Acqua di Giò Profondo,Giorgio Armani,4.03,869.0,"{'very weak': 59, 'weak': 66, 'moderate': 188,...","{'intimate': 115, 'moderate': 333, 'strong': 1...",woody,62.7869
3,Le Male Le Parfum,Jean Paul Gaultier,4.26,690.0,"{'very weak': 18, 'weak': 31, 'moderate': 129,...","{'intimate': 80, 'moderate': 262, 'strong': 17...",woody,60.2007
4,Beau De Jour Eau de Parfum,Tom Ford,4.25,729.0,"{'very weak': 30, 'weak': 19, 'moderate': 88, ...","{'intimate': 45, 'moderate': 190, 'strong': 19...",woody,52.3889
...,...,...,...,...,...,...,...,...
3388,,,,,,,conifer,57.7621
3389,Chanel N°22,Chanel,4.26,1208.0,"{'very weak': 21, 'weak': 18, 'moderate': 72, ...","{'intimate': 45, 'moderate': 124, 'strong': 11...",Champagne,62.1377
3390,,,,,,,camphor,46.5055
3391,,,,,,,savory,84.0410


In [13]:
# Sort the dataframe by the 'name' column in alphabetical order
cleaned_perfume_df_melted_sorted = cleaned_perfume_df_melted.sort_values(by='name')

# Reset the index
cleaned_perfume_df_melted_sorted.reset_index(drop=True, inplace=True)

# Drop rows with NaN values in the 'name' column
cleaned_perfume_df_melted_sorted.dropna(subset=['name'], inplace=True)

# Display the resulting dataframe
cleaned_perfume_df_melted_sorted

Unnamed: 0,name,company,rating,number_votes,longevity,sillage,scent,scent_strength
0,02 L'Air du Desert Marocain,Tauer Perfumes,4.43,4568.0,"{'very weak': 104, 'weak': 78, 'moderate': 225...","{'intimate': 203, 'moderate': 641, 'strong': 9...",warm spicy,100.0000
1,02 L'Air du Desert Marocain,Tauer Perfumes,4.43,4568.0,"{'very weak': 104, 'weak': 78, 'moderate': 225...","{'intimate': 203, 'moderate': 641, 'strong': 9...",musky,45.1861
2,02 L'Air du Desert Marocain,Tauer Perfumes,4.43,4568.0,"{'very weak': 104, 'weak': 78, 'moderate': 225...","{'intimate': 203, 'moderate': 641, 'strong': 9...",green,70.2955
3,02 L'Air du Desert Marocain,Tauer Perfumes,4.43,4568.0,"{'very weak': 104, 'weak': 78, 'moderate': 225...","{'intimate': 203, 'moderate': 641, 'strong': 9...",woody,55.8338
4,02 L'Air du Desert Marocain,Tauer Perfumes,4.43,4568.0,"{'very weak': 104, 'weak': 78, 'moderate': 225...","{'intimate': 203, 'moderate': 641, 'strong': 9...",aromatic,67.0314
...,...,...,...,...,...,...,...,...
2650,Zahrat Hawai,Al-Rehab,3.81,212.0,"{'very weak': 7, 'weak': 11, 'moderate': 42, '...","{'intimate': 15, 'moderate': 63, 'strong': 28,...",aromatic,61.3975
2651,Zahrat Hawai,Al-Rehab,3.81,212.0,"{'very weak': 7, 'weak': 11, 'moderate': 42, '...","{'intimate': 15, 'moderate': 63, 'strong': 28,...",white floral,61.2162
2652,Zahrat Hawai,Al-Rehab,3.81,212.0,"{'very weak': 7, 'weak': 11, 'moderate': 42, '...","{'intimate': 15, 'moderate': 63, 'strong': 28,...",fresh spicy,47.5000
2653,Zahrat Hawai,Al-Rehab,3.81,212.0,"{'very weak': 7, 'weak': 11, 'moderate': 42, '...","{'intimate': 15, 'moderate': 63, 'strong': 28,...",floral,76.5721


In [14]:
# Combine 'name' and 'company' columns together
cleaned_perfume_df_melted_sorted['perfume'] = cleaned_perfume_df_melted_sorted['name'] + ' by ' + cleaned_perfume_df_melted_sorted['company']

# Drop the original 'name' and 'company' columns if needed
cleaned_perfume_df_melted_sorted.drop(columns=['name', 'company'], inplace=True)

# Sort the DataFrame by the new 'perfume' column
cleaned_perfume_df_melted_sorted.sort_values(by='perfume', inplace=True)

# Reset the index
cleaned_perfume_df_melted_sorted.reset_index(drop=True, inplace=True)

# Display the resulting DataFrame
cleaned_perfume_df_melted_sorted

Unnamed: 0,rating,number_votes,longevity,sillage,scent,scent_strength,perfume
0,4.43,4568.0,"{'very weak': 104, 'weak': 78, 'moderate': 225...","{'intimate': 203, 'moderate': 641, 'strong': 9...",warm spicy,100.0000,02 L'Air du Desert Marocain by Tauer Perfumes
1,4.43,4568.0,"{'very weak': 104, 'weak': 78, 'moderate': 225...","{'intimate': 203, 'moderate': 641, 'strong': 9...",musky,45.1861,02 L'Air du Desert Marocain by Tauer Perfumes
2,4.43,4568.0,"{'very weak': 104, 'weak': 78, 'moderate': 225...","{'intimate': 203, 'moderate': 641, 'strong': 9...",green,70.2955,02 L'Air du Desert Marocain by Tauer Perfumes
3,4.43,4568.0,"{'very weak': 104, 'weak': 78, 'moderate': 225...","{'intimate': 203, 'moderate': 641, 'strong': 9...",woody,55.8338,02 L'Air du Desert Marocain by Tauer Perfumes
4,4.43,4568.0,"{'very weak': 104, 'weak': 78, 'moderate': 225...","{'intimate': 203, 'moderate': 641, 'strong': 9...",aromatic,67.0314,02 L'Air du Desert Marocain by Tauer Perfumes
...,...,...,...,...,...,...,...
2650,3.81,212.0,"{'very weak': 7, 'weak': 11, 'moderate': 42, '...","{'intimate': 15, 'moderate': 63, 'strong': 28,...",woody,81.4752,Zahrat Hawai by Al-Rehab
2651,3.81,212.0,"{'very weak': 7, 'weak': 11, 'moderate': 42, '...","{'intimate': 15, 'moderate': 63, 'strong': 28,...",aromatic,61.3975,Zahrat Hawai by Al-Rehab
2652,3.81,212.0,"{'very weak': 7, 'weak': 11, 'moderate': 42, '...","{'intimate': 15, 'moderate': 63, 'strong': 28,...",white floral,61.2162,Zahrat Hawai by Al-Rehab
2653,3.81,212.0,"{'very weak': 7, 'weak': 11, 'moderate': 42, '...","{'intimate': 15, 'moderate': 63, 'strong': 28,...",fresh spicy,47.5000,Zahrat Hawai by Al-Rehab


In [15]:
# Define mapping from categories to numeric values
#category_mapping = {'very weak': 1, 'weak': 2, 'moderate': 3, 'long lasting': 4, 'eternal': 5}

# Convert longevity entries
#def convert_longevity(longevity_entry):
    #longevity_dict = ast.literal_eval(longevity_entry)
    #numeric_longevity = {category_mapping[key]: value for key, value in longevity_dict.items()}
    #max_category = max(numeric_longevity, key=numeric_longevity.get)
    #return {category: 1 if category == max_category else 0 for category in numeric_longevity}

# Apply conversion and create DataFrame of binary features
#binary_features = pd.DataFrame(list(cleaned_perfume_df_melted_sorted['longevity'].apply(convert_longevity)))

# Concatenate original DataFrame and binary features DataFrame
#cleaned_perfume_df_melted_sorted = pd.concat([cleaned_perfume_df_melted_sorted, binary_features], axis=1)

# Display the result
#cleaned_perfume_df_melted_sorted

# Define mapping from categories to numeric values
category_mapping = {'very weak': 1, 'weak': 2, 'moderate': 3, 'long lasting': 4, 'eternal': 5}

# Convert longevity entries
def convert_longevity(longevity_entry):
    longevity_dict = ast.literal_eval(longevity_entry)
    numeric_longevity = {category_mapping[key]: value for key, value in longevity_dict.items()}
    max_category = max(numeric_longevity, key=numeric_longevity.get)
    return max_category

# Apply conversion to create a Series of numeric longevity values
numeric_longevity_series = cleaned_perfume_df_melted_sorted['longevity'].apply(convert_longevity)

# Rename the column to 'longevity_rating'
numeric_longevity_series.name = 'longevity_rating'

# Concatenate the new Series to the original DataFrame
cleaned_perfume_df_melted_sorted = pd.concat([cleaned_perfume_df_melted_sorted, numeric_longevity_series], axis=1)

# Drop the original 'longevity' column
cleaned_perfume_df_melted_sorted.drop(columns='longevity', inplace=True)

# Display the result
cleaned_perfume_df_melted_sorted

Unnamed: 0,rating,number_votes,sillage,scent,scent_strength,perfume,longevity_rating
0,4.43,4568.0,"{'intimate': 203, 'moderate': 641, 'strong': 9...",warm spicy,100.0000,02 L'Air du Desert Marocain by Tauer Perfumes,5
1,4.43,4568.0,"{'intimate': 203, 'moderate': 641, 'strong': 9...",musky,45.1861,02 L'Air du Desert Marocain by Tauer Perfumes,5
2,4.43,4568.0,"{'intimate': 203, 'moderate': 641, 'strong': 9...",green,70.2955,02 L'Air du Desert Marocain by Tauer Perfumes,5
3,4.43,4568.0,"{'intimate': 203, 'moderate': 641, 'strong': 9...",woody,55.8338,02 L'Air du Desert Marocain by Tauer Perfumes,5
4,4.43,4568.0,"{'intimate': 203, 'moderate': 641, 'strong': 9...",aromatic,67.0314,02 L'Air du Desert Marocain by Tauer Perfumes,5
...,...,...,...,...,...,...,...
2650,3.81,212.0,"{'intimate': 15, 'moderate': 63, 'strong': 28,...",woody,81.4752,Zahrat Hawai by Al-Rehab,3
2651,3.81,212.0,"{'intimate': 15, 'moderate': 63, 'strong': 28,...",aromatic,61.3975,Zahrat Hawai by Al-Rehab,3
2652,3.81,212.0,"{'intimate': 15, 'moderate': 63, 'strong': 28,...",white floral,61.2162,Zahrat Hawai by Al-Rehab,3
2653,3.81,212.0,"{'intimate': 15, 'moderate': 63, 'strong': 28,...",fresh spicy,47.5000,Zahrat Hawai by Al-Rehab,3


In [16]:
# Define mapping from categories to numeric values
sillage_mapping = {'intimate': 1, 'moderate': 2, 'strong': 3, 'enormous': 4}

# Convert sillage entries
def convert_sillage(sillage_entry):
    sillage_dict = ast.literal_eval(sillage_entry)
    numeric_sillage = {sillage_mapping[key]: value for key, value in sillage_dict.items()}
    max_category = max(numeric_sillage, key=numeric_sillage.get)
    return max_category

# Apply conversion to create a Series of numeric sillage values
numeric_sillage_series = cleaned_perfume_df_melted_sorted['sillage'].apply(convert_sillage)

# Rename the column to 'sillage_rating'
numeric_sillage_series.name = 'sillage_rating'

# Concatenate the new Series to the original DataFrame
cleaned_perfume_df_melted_sorted = pd.concat([cleaned_perfume_df_melted_sorted, numeric_sillage_series], axis=1)

# Drop the original 'sillage' column
cleaned_perfume_df_melted_sorted.drop(columns='sillage', inplace=True)

# Display the result
cleaned_perfume_df_melted_sorted

Unnamed: 0,rating,number_votes,scent,scent_strength,perfume,longevity_rating,sillage_rating
0,4.43,4568.0,warm spicy,100.0000,02 L'Air du Desert Marocain by Tauer Perfumes,5,3
1,4.43,4568.0,musky,45.1861,02 L'Air du Desert Marocain by Tauer Perfumes,5,3
2,4.43,4568.0,green,70.2955,02 L'Air du Desert Marocain by Tauer Perfumes,5,3
3,4.43,4568.0,woody,55.8338,02 L'Air du Desert Marocain by Tauer Perfumes,5,3
4,4.43,4568.0,aromatic,67.0314,02 L'Air du Desert Marocain by Tauer Perfumes,5,3
...,...,...,...,...,...,...,...
2650,3.81,212.0,woody,81.4752,Zahrat Hawai by Al-Rehab,3,2
2651,3.81,212.0,aromatic,61.3975,Zahrat Hawai by Al-Rehab,3,2
2652,3.81,212.0,white floral,61.2162,Zahrat Hawai by Al-Rehab,3,2
2653,3.81,212.0,fresh spicy,47.5000,Zahrat Hawai by Al-Rehab,3,2


In [17]:
# Set 'perfume' column as the index
cleaned_perfume_df_melted_sorted.set_index('perfume', inplace=True)

# Display the resulting DataFrame
cleaned_perfume_df_melted_sorted

Unnamed: 0_level_0,rating,number_votes,scent,scent_strength,longevity_rating,sillage_rating
perfume,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
02 L'Air du Desert Marocain by Tauer Perfumes,4.43,4568.0,warm spicy,100.0000,5,3
02 L'Air du Desert Marocain by Tauer Perfumes,4.43,4568.0,musky,45.1861,5,3
02 L'Air du Desert Marocain by Tauer Perfumes,4.43,4568.0,green,70.2955,5,3
02 L'Air du Desert Marocain by Tauer Perfumes,4.43,4568.0,woody,55.8338,5,3
02 L'Air du Desert Marocain by Tauer Perfumes,4.43,4568.0,aromatic,67.0314,5,3
...,...,...,...,...,...,...
Zahrat Hawai by Al-Rehab,3.81,212.0,woody,81.4752,3,2
Zahrat Hawai by Al-Rehab,3.81,212.0,aromatic,61.3975,3,2
Zahrat Hawai by Al-Rehab,3.81,212.0,white floral,61.2162,3,2
Zahrat Hawai by Al-Rehab,3.81,212.0,fresh spicy,47.5000,3,2


In [18]:
# Rename the DataFrame to decision_tree_data
decision_tree_data = cleaned_perfume_df_melted_sorted.copy()

# Display the resulting DataFrame
decision_tree_data

Unnamed: 0_level_0,rating,number_votes,scent,scent_strength,longevity_rating,sillage_rating
perfume,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
02 L'Air du Desert Marocain by Tauer Perfumes,4.43,4568.0,warm spicy,100.0000,5,3
02 L'Air du Desert Marocain by Tauer Perfumes,4.43,4568.0,musky,45.1861,5,3
02 L'Air du Desert Marocain by Tauer Perfumes,4.43,4568.0,green,70.2955,5,3
02 L'Air du Desert Marocain by Tauer Perfumes,4.43,4568.0,woody,55.8338,5,3
02 L'Air du Desert Marocain by Tauer Perfumes,4.43,4568.0,aromatic,67.0314,5,3
...,...,...,...,...,...,...
Zahrat Hawai by Al-Rehab,3.81,212.0,woody,81.4752,3,2
Zahrat Hawai by Al-Rehab,3.81,212.0,aromatic,61.3975,3,2
Zahrat Hawai by Al-Rehab,3.81,212.0,white floral,61.2162,3,2
Zahrat Hawai by Al-Rehab,3.81,212.0,fresh spicy,47.5000,3,2


In [19]:
# Check column types
print(decision_tree_data.dtypes)

rating              float64
number_votes        float64
scent                object
scent_strength      float64
longevity_rating      int64
sillage_rating        int64
dtype: object


In [None]:
# Add the 'rating' and 'scent_strength' columns
X = pd.concat([encoded_columns,
               decision_tree_data[['scent_strength']]], axis=1)
# Determine your X and Y values
# Use the index as the target variable
y = decision_tree_data.index
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=22)
# Train decision tree
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
# Predict on testing set
y_pred = clf.predict(X_test)
# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# ChatBot

In [None]:
# Import necessary libraries
from sklearn.neighbors import KNeighborsClassifier
# Define the main function for perfume recommendation
def recommend_perfume(decision_tree_data, cleaned_perfume_df_melted_sorted):
    # Data preparation
    X, y, clf = prepare_data_and_classifier(decision_tree_data)
    # Handle user input
    user_input = get_user_input(cleaned_perfume_df_melted_sorted)
    # Make recommendation
    make_recommendation(user_input, clf)

In [None]:
# Define helper function for data preparation
def prepare_data_and_classifier(decision_tree_data):
    # One-hot encode the 'scent' and 'sillage' columns
    encoded_columns = pd.get_dummies(decision_tree_data[['scent', 'sillage']])
    X = pd.concat([encoded_columns, decision_tree_data[['scent_strength']]], axis=1)
    y = decision_tree_data.index
    # Split data into training and testing sets
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.6, random_state=22)
    # Train K Nearest Neighbors classifier
    clf = KNeighborsClassifier()
    clf.fit(X_train, y_train)
    return X, y, clf

In [None]:
# Define helper function for handling user input
def get_user_input(cleaned_perfume_df_melted_sorted):
    # Create user input DataFrame
    user_input = pd.DataFrame(columns=cleaned_perfume_df_melted_sorted.columns)
 # Ask user for scent allergies
    print("Do you have any scent allergies? (yes/no)")
    scent_allergies = input().lower()
    if scent_allergies == 'yes':
        # Assuming user provides the scent they are allergic to
        print("Which scent are you allergic to?")
        allergic_scent = input()
        # Remove perfumes containing allergic scent
        cleaned_perfume_df_melted_sorted = cleaned_perfume_df_melted_sorted[
            ~cleaned_perfume_df_melted_sorted['scent'].str.contains(allergic_scent, case=False)
        ]

    # Ask user for strength preference
    print("How strong do you prefer your perfume to be? (1-100)")
    scent_strength = float(input())
    user_input['scent_strength'] = scent_strength

    # Ask user for longevity preference
    print("How long would you like your perfume to last? (1-1000)")
    sillage = float(input())
    sillage_column_name = 'sillage_{}'.format(sillage)
    
    # Check if the sillage column exists, if not, add it with default value 0
    if sillage_column_name not in user_input.columns:
        user_input[sillage_column_name] = 0
    user_input[sillage_column_name] = 1  # Update the value based on user input

    # Ask user for scent preference
    print("Do you prefer a fruity or floral scent?")
    scent_preference = input().lower()
    if scent_preference == 'fruity':
        user_input['scent_fruity'] = 1
    elif scent_preference == 'floral':
        user_input['scent_floral'] = 1
        
    # Ask user for scent preference
    print("Do you prefer a fresh or spicy scent?")
    scent_preference = input().lower()
    if scent_preference == 'fresh':
        user_input['scent_fresh'] = 1
    elif scent_preference == 'spicy':
        user_input['scent_spicy'] = 1      
     
    # Ask user for scent preference
    print("Do you prefer a musky or aromatic scent?")
    scent_preference = input().lower()
    if scent_preference == 'musky':
        user_input['scent_musky'] = 1
    elif scent_preference == 'aromatic':
        user_input['scent_aromatic'] = 1
        
    return user_input
# Define helper function for making recommendation
def make_recommendation(user_input, clf):
    # Fill missing values with 0
    user_input.fillna(0, inplace=True)
    # Make prediction
    prediction = clf.predict(user_input)
    print("Recommended perfume:", prediction)
# Call the main function to recommend perfumes based on user preferences
recommend_perfume(decision_tree_data, cleaned_perfume_df_melted_sorted)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def recommend_perfume(decision_tree_data, cleaned_perfume_df_melted_sorted):
    # One-hot encode the 'scent' and 'sillage' columns
    encoded_columns = pd.get_dummies(decision_tree_data[['scent', 'sillage']])

    # Add the 'scent_strength' column
    X = pd.concat([encoded_columns, decision_tree_data[['scent_strength']]], axis=1)

    # Determine your X and Y values
    y = decision_tree_data.index

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=22)

    # Train K Nearest Neighbors classifier
    clf = KNeighborsClassifier()
    clf.fit(X_train, y_train)

    # Create user input DataFrame
    user_input = pd.DataFrame(columns=X_train.columns)
    
    # Ask user for scent allergies
    print("Do you have any scent allergies? (yes/no)")
    scent_allergies = input().lower()
    if scent_allergies == 'yes':
        # Assuming user provides the scent they are allergic to
        print("Which scent are you allergic to?")
        allergic_scent = input()
        # Remove perfumes containing allergic scent
        cleaned_perfume_df_melted_sorted = cleaned_perfume_df_melted_sorted[
            ~cleaned_perfume_df_melted_sorted['scent'].str.contains(allergic_scent, case=False)
        ]

    # Ask user for strength preference
    print("How strong do you prefer your perfume to be? (1-100)")
    scent_strength = float(input())
    user_input['scent_strength'] = scent_strength

    # Ask user for longevity preference
    print("How long would you like your perfume to last? (1-1000)")
    sillage = float(input())
    sillage_column_name = 'sillage_{}'.format(sillage)
    
    # Check if the sillage column exists, if not, add it with default value 0
    if sillage_column_name not in user_input.columns:
        user_input[sillage_column_name] = 0
    user_input[sillage_column_name] = 1  # Update the value based on user input

    # Ask user for scent preference
    print("Do you prefer a fruity or floral scent?")
    scent_preference = input().lower()
    if scent_preference == 'fruity':
        user_input['scent_fruity'] = 1
    elif scent_preference == 'floral':
        user_input['scent_floral'] = 1
        
    # Ask user for scent preference
    print("Do you prefer a fresh or spicy scent?")
    scent_preference = input().lower()
    if scent_preference == 'fresh':
        user_input['scent_fresh'] = 1
    elif scent_preference == 'spicy':
        user_input['scent_spicy'] = 1      
     
    # Ask user for scent preference
    print("Do you prefer a musky or aromatic scent?")
    scent_preference = input().lower()
    if scent_preference == 'musky':
        user_input['scent_musky'] = 1
    elif scent_preference == 'aromatic':
        user_input['scent_aromatic'] = 1  
        
        
    # Debugging: Print user input values
    print("User input values:", user_input)

    # Fill missing values with 0
    user_input.fillna(0, inplace=True)

    # Make prediction
    prediction = clf.predict(user_input)
    print("Recommended perfume:", prediction)

# Call the function to recommend perfumes based on user preferences
recommend_perfume(decision_tree_data, cleaned_perfume_df_melted_sorted)


# steps required for a simple chatbot

## Train a Model:
Prepare your dataset: Ensure your dataset is structured properly with features and labels. You'll need a dataset where each row represents a perfume, and the columns represent features such as scent, strength, longevity, etc. The label would typically be the type of perfume or some indication of its characteristics.

Split your dataset: Split your dataset into training and testing sets. The training set is used to train the model, while the testing set is used to evaluate its performance.

Choose a model: Select a machine learning algorithm suitable for your task. For recommendation systems, algorithms like decision trees, random forests, or even more advanced methods like neural networks can be used.

Train the model: Fit the chosen model to your training data. This involves finding the optimal parameters of the model that minimize the difference between predicted and actual values.

Evaluate the model: Assess the performance of your trained model using the testing set. Common metrics for evaluation include accuracy, precision, recall, and F1-score.

## Implement the recommend_perfume Function:

Load the trained model: In the recommend_perfume function, load the trained model that you obtained from the training step. This could involve using libraries like scikit-learn for simpler models or TensorFlow/Keras for more complex models.

Process user input: Receive the user input in the recommend_perfume function and format it appropriately to match the input requirements of your trained model.

Make recommendations: Call the make_recommendation function you've defined, passing in the user input and the loaded model. 
This function will use the trained model to predict the recommended perfume based on the user's preferences.

## Testing and Debugging:

Test your code with various user inputs to ensure it provides meaningful recommendations.

Debug any errors or unexpected behavior that arise during testing.

Refine your model or code as necessary based on the testing results.

In [None]:
# This sample has a section I don't have in mine and might be worth testing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load your perfume dataset (replace 'perfume_data.csv' with your actual dataset)
perfume_data = pd.read_csv('perfume_data.csv')

# Split the dataset into features (X) and labels (y)
X = perfume_data.drop(columns=['perfume_type'])  # Features
y = perfume_data['perfume_type']  # Labels

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a KNN classifier
clf = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors as needed
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Define the recommend_perfume function
def recommend_perfume(user_input, clf):
    # Make prediction
    prediction = clf.predict(user_input)
    print("Recommended perfume:", prediction)

# Now you can use the recommend_perfume function to make recommendations based on user input
# For example:
user_input = pd.DataFrame({
    'scent_strength': [80],
    'sillage_100': [1],
    'scent_fruity': [1],
    'scent_fresh': [1],
    'scent_musky': [0]
})
recommend_perfume(user_input, clf)

We use a KNeighborsClassifier instead of DecisionTreeClassifier.

We train the KNN classifier using the training data.

We evaluate the model's accuracy on the testing set.

We define the recommend_perfume function to make recommendations based on user input.