In [1]:
# Import required libraries and dependencies
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
#import dependency for SQLite database
import sqlite3

# Connect to the SQLite database
conn = sqlite3.connect("../Resources/perfume.db")

# Create a cursor object to execute SQL queries
cur = conn.cursor()

In [3]:
# Fetch all rows from the 'Review_Included' table
cur.execute("SELECT * FROM Reviews_Included")

# Fetch all rows from the 'Review_Included' table
rows = cur.fetchall()

# Close the cursor and connection
cur.close()
conn.close()

In [4]:
# Convert fetched data into a pandas DataFrame
# Get column names
columns = [col[0] for col in cur.description]
perfume_df = pd.DataFrame(rows, columns=columns)

# Display the first few rows of the DataFrame
perfume_df.head()

Unnamed: 0,name,company,image,for_gender,rating,number_votes,main accords,description,top notes,middle notes,base notes,longevity,sillage,gender_vote,price value
0,Angels' Share,By Kilian,https://fimgs.net/mdimg/perfume/375x500.62615.jpg,for women and men,4.31,682.0,"{'woody': 100.0, 'sweet': 92.6987, 'warm spicy...",Angels' Share by By Kilian is a Oriental Vanil...,['Cognac'],"['Cinnamon', 'Tonka Bean', 'Oak']","['Praline', 'Vanilla', 'Sandalwood']","{'very weak': 21, 'weak': 17, 'moderate': 107,...","{'intimate': 40, 'moderate': 187, 'strong': 15...","{'female': 40, 'more female': 39, 'unisex': 22...","{'way overpriced': 64, 'overpriced': 143, 'ok'..."
1,My Way,Giorgio Armani,https://fimgs.net/mdimg/perfume/375x500.62036.jpg,for women,3.57,1471.0,"{'white floral': 100.0, 'citrus': 60.4322, 'tu...",My Way by Giorgio Armani is a Floral fragrance...,"['Orange Blossom', 'Bergamot']","['Tuberose', 'Indian Jasmine']","['White Musk', 'Madagascar Vanilla', 'Virginia...","{'very weak': 49, 'weak': 84, 'moderate': 200,...","{'intimate': 127, 'moderate': 322, 'strong': 2...","{'female': 349, 'more female': 21, 'unisex': 4...","{'way overpriced': 38, 'overpriced': 121, 'ok'..."
2,Libre Intense,Yves Saint Laurent,https://fimgs.net/mdimg/perfume/375x500.62318.jpg,for women,4.02,858.0,"{'vanilla': 100.0, 'aromatic': 71.4216, 'sweet...",Libre Intense by Yves Saint Laurent is a Orien...,"['Lavender', 'Mandarin Orange', 'Bergamot']","['Lavender', 'Tunisian Orange Blossom', 'Jasmi...","['Madagascar Vanilla', 'Tonka Bean', 'Ambergri...","{'very weak': 24, 'weak': 12, 'moderate': 103,...","{'intimate': 39, 'moderate': 155, 'strong': 23...","{'female': 162, 'more female': 91, 'unisex': 7...","{'way overpriced': 11, 'overpriced': 59, 'ok':..."
3,Dior Homme 2020,Christian Dior,https://fimgs.net/mdimg/perfume/375x500.58714.jpg,for men,3.42,1402.0,"{'woody': 100.0, 'musky': 72.7229, 'amber': 53...",Dior Homme 2020 by Christian Dior is a Woody f...,"['Bergamot', 'Pink Pepper', 'elemi']","['Cashmere Wood', 'Atlas Cedar', 'Patchouli']","['Iso E Super', 'Haitian Vetiver', 'White Musk']","{'very weak': 125, 'weak': 83, 'moderate': 174...","{'intimate': 214, 'moderate': 370, 'strong': 1...","{'female': 3, 'more female': 1, 'unisex': 17, ...","{'way overpriced': 31, 'overpriced': 59, 'ok':..."
4,Acqua di Giò Profondo,Giorgio Armani,https://fimgs.net/mdimg/perfume/375x500.59532.jpg,for men,4.03,869.0,"{'aromatic': 100.0, 'marine': 93.2493, 'citrus...",Acqua di Giò Profondo by Giorgio Armani is a A...,"['Sea Notes', 'Aquozone', 'Bergamot', 'Green M...","['Rosemary', 'Cypress', 'Lavender', 'Mastic or...","['Mineral notes', 'Musk', 'Patchouli', 'Amber']","{'very weak': 59, 'weak': 66, 'moderate': 188,...","{'intimate': 115, 'moderate': 333, 'strong': 1...","{'female': 3, 'more female': 0, 'unisex': 5, '...","{'way overpriced': 32, 'overpriced': 84, 'ok':..."


In [5]:
# Remove unnecessary columns (at this point we have decided to use the main accords in lieu of the top, middle, and base notes for accuracy)
columns_to_remove = ['image', 'for_gender', 'description', 'gender_vote', 'price value', 'top notes', 'middle notes', 'base notes']

# Remove the specified columns
perfume_df = perfume_df.drop(columns=columns_to_remove)

# Rename the column 'main accords' to 'main_accords' in perfume_df
perfume_df.rename(columns={'main accords': 'main_accords'}, inplace=True)

# Display the DataFrame after dropping the specified columns
perfume_df.head()

Unnamed: 0,name,company,rating,number_votes,main_accords,longevity,sillage
0,Angels' Share,By Kilian,4.31,682.0,"{'woody': 100.0, 'sweet': 92.6987, 'warm spicy...","{'very weak': 21, 'weak': 17, 'moderate': 107,...","{'intimate': 40, 'moderate': 187, 'strong': 15..."
1,My Way,Giorgio Armani,3.57,1471.0,"{'white floral': 100.0, 'citrus': 60.4322, 'tu...","{'very weak': 49, 'weak': 84, 'moderate': 200,...","{'intimate': 127, 'moderate': 322, 'strong': 2..."
2,Libre Intense,Yves Saint Laurent,4.02,858.0,"{'vanilla': 100.0, 'aromatic': 71.4216, 'sweet...","{'very weak': 24, 'weak': 12, 'moderate': 103,...","{'intimate': 39, 'moderate': 155, 'strong': 23..."
3,Dior Homme 2020,Christian Dior,3.42,1402.0,"{'woody': 100.0, 'musky': 72.7229, 'amber': 53...","{'very weak': 125, 'weak': 83, 'moderate': 174...","{'intimate': 214, 'moderate': 370, 'strong': 1..."
4,Acqua di Giò Profondo,Giorgio Armani,4.03,869.0,"{'aromatic': 100.0, 'marine': 93.2493, 'citrus...","{'very weak': 59, 'weak': 66, 'moderate': 188,...","{'intimate': 115, 'moderate': 333, 'strong': 1..."


In [6]:
# Define the threshold for removing rows based on rating
rating_threshold = 3.59

# Filter the DataFrame to exclude rows with ratings at or below the threshold
perfume_df = perfume_df[perfume_df['rating'] > rating_threshold]

# Display the DataFrame after removing rows
print(perfume_df)

                           name             company  rating  number_votes  \
0                 Angels' Share           By Kilian    4.31         682.0   
2                 Libre Intense  Yves Saint Laurent    4.02         858.0   
4         Acqua di Giò Profondo      Giorgio Armani    4.03         869.0   
5             Le Male Le Parfum  Jean Paul Gaultier    4.26         690.0   
6    Beau De Jour Eau de Parfum            Tom Ford    4.25         729.0   
..                          ...                 ...     ...           ...   
473                 La Capitale             Xerjoff    4.13         187.0   
474                  DKNY Women         Donna Karan    3.87        2285.0   
475                       Toy 2            Moschino    3.80         940.0   
476                       Derby            Guerlain    4.38         389.0   
477                       Aimer     Pascal Morabito    3.71           7.0   

                                          main_accords  \
0    {'woody': 10

In [7]:
# Define the threshold for removing rows based on number of votes
votes_threshold = 100

# Filter the DataFrame to exclude rows with number of votes less than the threshold
perfume_df = perfume_df[perfume_df['number_votes'] >= votes_threshold]

# Display the DataFrame after removing rows
print(perfume_df)

                           name             company  rating  number_votes  \
0                 Angels' Share           By Kilian    4.31         682.0   
2                 Libre Intense  Yves Saint Laurent    4.02         858.0   
4         Acqua di Giò Profondo      Giorgio Armani    4.03         869.0   
5             Le Male Le Parfum  Jean Paul Gaultier    4.26         690.0   
6    Beau De Jour Eau de Parfum            Tom Ford    4.25         729.0   
..                          ...                 ...     ...           ...   
472                Laudano Nero     Tiziana Terenzi    4.13        1693.0   
473                 La Capitale             Xerjoff    4.13         187.0   
474                  DKNY Women         Donna Karan    3.87        2285.0   
475                       Toy 2            Moschino    3.80         940.0   
476                       Derby            Guerlain    4.38         389.0   

                                          main_accords  \
0    {'woody': 10

In [8]:
# Rename the DataFrame to reflect its cleaned state
# Make a copy to preserve the original DataFrame
cleaned_perfume_df = perfume_df.copy()
cleaned_perfume_df

Unnamed: 0,name,company,rating,number_votes,main_accords,longevity,sillage
0,Angels' Share,By Kilian,4.31,682.0,"{'woody': 100.0, 'sweet': 92.6987, 'warm spicy...","{'very weak': 21, 'weak': 17, 'moderate': 107,...","{'intimate': 40, 'moderate': 187, 'strong': 15..."
2,Libre Intense,Yves Saint Laurent,4.02,858.0,"{'vanilla': 100.0, 'aromatic': 71.4216, 'sweet...","{'very weak': 24, 'weak': 12, 'moderate': 103,...","{'intimate': 39, 'moderate': 155, 'strong': 23..."
4,Acqua di Giò Profondo,Giorgio Armani,4.03,869.0,"{'aromatic': 100.0, 'marine': 93.2493, 'citrus...","{'very weak': 59, 'weak': 66, 'moderate': 188,...","{'intimate': 115, 'moderate': 333, 'strong': 1..."
5,Le Male Le Parfum,Jean Paul Gaultier,4.26,690.0,"{'warm spicy': 100.0, 'vanilla': 72.2173, 'aro...","{'very weak': 18, 'weak': 31, 'moderate': 129,...","{'intimate': 80, 'moderate': 262, 'strong': 17..."
6,Beau De Jour Eau de Parfum,Tom Ford,4.25,729.0,"{'aromatic': 100.0, 'fresh spicy': 94.7562, 'l...","{'very weak': 30, 'weak': 19, 'moderate': 88, ...","{'intimate': 45, 'moderate': 190, 'strong': 19..."
...,...,...,...,...,...,...,...
472,Laudano Nero,Tiziana Terenzi,4.13,1693.0,"{'woody': 100.0, 'amber': 93.699, 'smoky': 68....","{'very weak': 41, 'weak': 38, 'moderate': 84, ...","{'intimate': 94, 'moderate': 245, 'strong': 42..."
473,La Capitale,Xerjoff,4.13,187.0,"{'fruity': 100.0, 'sweet': 92.341, 'amber': 85...","{'very weak': 5, 'weak': 9, 'moderate': 20, 'l...","{'intimate': 22, 'moderate': 36, 'strong': 53,..."
474,DKNY Women,Donna Karan,3.87,2285.0,"{'citrus': 100.0, 'aromatic': 97.6724, 'green'...","{'very weak': 23, 'weak': 44, 'moderate': 190,...","{'intimate': 71, 'moderate': 244, 'strong': 21..."
475,Toy 2,Moschino,3.80,940.0,"{'fruity': 100.0, 'floral': 73.9229, 'fresh': ...","{'very weak': 54, 'weak': 78, 'moderate': 147,...","{'intimate': 135, 'moderate': 200, 'strong': 1..."


In [9]:
print(cleaned_perfume_df.columns)

Index(['name', 'company', 'rating', 'number_votes', 'main_accords',
       'longevity', 'sillage'],
      dtype='object')


In [10]:
import ast

#Working to create a new entry for each scent creating multiple entries for each perfume based on the number of scents in it
# Parse the string representations of dictionaries in 'main accords' into separate columns
cleaned_perfume_df = cleaned_perfume_df.copy()

# Parse the string representations of dictionaries into actual dictionaries
cleaned_perfume_df['main_accords'] = cleaned_perfume_df['main_accords'].apply(ast.literal_eval)

# Expand the 'main_accords' column into separate columns
main_accords_df = pd.json_normalize(cleaned_perfume_df['main_accords'])

# Concatenate the expanded 'main_accords' dataframe with the original dataframe
cleaned_perfume_df = pd.concat([cleaned_perfume_df, main_accords_df], axis=1)

# Drop the original 'main_accords' column
cleaned_perfume_df.drop(columns=['main_accords'], inplace=True)

# Using cleaned_perfume_df
cleaned_perfume_df = cleaned_perfume_df.copy()

# Create a dataframe with 'name' and 'company' columns, along with each scent in 'main_accords' as a separate row
scent_df = cleaned_perfume_df[['name', 'company', 'woody', 'sweet', 'warm spicy', 'vanilla', 'cinnamon', 'amber', 'powdery', 'lactonic', 'aromatic']].melt(id_vars=['name', 'company'], var_name='scent', value_name='scent_strength').dropna()

# preview df
scent_df.head()

Unnamed: 0,name,company,scent,scent_strength
0,Angels' Share,By Kilian,woody,100.0
1,Libre Intense,Yves Saint Laurent,woody,67.7451
2,Acqua di Giò Profondo,Giorgio Armani,woody,62.7869
3,Le Male Le Parfum,Jean Paul Gaultier,woody,60.2007
4,Beau De Jour Eau de Parfum,Tom Ford,woody,52.3889


In [14]:
# Create a copy of the dataframe
cleaned_perfume_df_copy = cleaned_perfume_df.copy()

# Melt the dataframe to have each scent as its own row
cleaned_perfume_df_melted = cleaned_perfume_df_copy.melt(id_vars=['name', 'company', 'rating', 'number_votes', 'longevity', 'sillage'], var_name='scent', value_name='scent_strength')

# Drop rows with NaN values in the 'scent_strength' column
cleaned_perfume_df_melted = cleaned_perfume_df_melted.dropna(subset=['scent_strength'])

# Reset the index
cleaned_perfume_df_melted.reset_index(drop=True, inplace=True)

# Display the resulting dataframe
cleaned_perfume_df_melted

Unnamed: 0,name,company,rating,number_votes,longevity,sillage,scent,scent_strength
0,Angels' Share,By Kilian,4.31,682.0,"{'very weak': 21, 'weak': 17, 'moderate': 107,...","{'intimate': 40, 'moderate': 187, 'strong': 15...",woody,100.0000
1,Libre Intense,Yves Saint Laurent,4.02,858.0,"{'very weak': 24, 'weak': 12, 'moderate': 103,...","{'intimate': 39, 'moderate': 155, 'strong': 23...",woody,67.7451
2,Acqua di Giò Profondo,Giorgio Armani,4.03,869.0,"{'very weak': 59, 'weak': 66, 'moderate': 188,...","{'intimate': 115, 'moderate': 333, 'strong': 1...",woody,62.7869
3,Le Male Le Parfum,Jean Paul Gaultier,4.26,690.0,"{'very weak': 18, 'weak': 31, 'moderate': 129,...","{'intimate': 80, 'moderate': 262, 'strong': 17...",woody,60.2007
4,Beau De Jour Eau de Parfum,Tom Ford,4.25,729.0,"{'very weak': 30, 'weak': 19, 'moderate': 88, ...","{'intimate': 45, 'moderate': 190, 'strong': 19...",woody,52.3889
...,...,...,...,...,...,...,...,...
3388,,,,,,,conifer,57.7621
3389,Chanel N°22,Chanel,4.26,1208.0,"{'very weak': 21, 'weak': 18, 'moderate': 72, ...","{'intimate': 45, 'moderate': 124, 'strong': 11...",Champagne,62.1377
3390,,,,,,,camphor,46.5055
3391,,,,,,,savory,84.0410


In [20]:
# Sort the dataframe by the 'name' column in alphabetical order
cleaned_perfume_df_melted_sorted = cleaned_perfume_df_melted.sort_values(by='name')

# Reset the index
cleaned_perfume_df_melted_sorted.reset_index(drop=True, inplace=True)

# Drop rows with NaN values in the 'name' column
cleaned_perfume_df_melted_sorted.dropna(subset=['name'], inplace=True)

# Display the resulting dataframe
cleaned_perfume_df_melted_sorted

Unnamed: 0,name,company,rating,number_votes,longevity,sillage,scent,scent_strength
0,02 L'Air du Desert Marocain,Tauer Perfumes,4.43,4568.0,"{'very weak': 104, 'weak': 78, 'moderate': 225...","{'intimate': 203, 'moderate': 641, 'strong': 9...",warm spicy,100.0000
1,02 L'Air du Desert Marocain,Tauer Perfumes,4.43,4568.0,"{'very weak': 104, 'weak': 78, 'moderate': 225...","{'intimate': 203, 'moderate': 641, 'strong': 9...",musky,45.1861
2,02 L'Air du Desert Marocain,Tauer Perfumes,4.43,4568.0,"{'very weak': 104, 'weak': 78, 'moderate': 225...","{'intimate': 203, 'moderate': 641, 'strong': 9...",green,70.2955
3,02 L'Air du Desert Marocain,Tauer Perfumes,4.43,4568.0,"{'very weak': 104, 'weak': 78, 'moderate': 225...","{'intimate': 203, 'moderate': 641, 'strong': 9...",woody,55.8338
4,02 L'Air du Desert Marocain,Tauer Perfumes,4.43,4568.0,"{'very weak': 104, 'weak': 78, 'moderate': 225...","{'intimate': 203, 'moderate': 641, 'strong': 9...",aromatic,67.0314
...,...,...,...,...,...,...,...,...
2650,Zahrat Hawai,Al-Rehab,3.81,212.0,"{'very weak': 7, 'weak': 11, 'moderate': 42, '...","{'intimate': 15, 'moderate': 63, 'strong': 28,...",aromatic,61.3975
2651,Zahrat Hawai,Al-Rehab,3.81,212.0,"{'very weak': 7, 'weak': 11, 'moderate': 42, '...","{'intimate': 15, 'moderate': 63, 'strong': 28,...",white floral,61.2162
2652,Zahrat Hawai,Al-Rehab,3.81,212.0,"{'very weak': 7, 'weak': 11, 'moderate': 42, '...","{'intimate': 15, 'moderate': 63, 'strong': 28,...",fresh spicy,47.5000
2653,Zahrat Hawai,Al-Rehab,3.81,212.0,"{'very weak': 7, 'weak': 11, 'moderate': 42, '...","{'intimate': 15, 'moderate': 63, 'strong': 28,...",floral,76.5721


In [None]:
# Import specific dependencies for this block
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Feature scaling (Min-Max scaling)
scaler = MinMaxScaler()
accords_columns = [col for col in cleaned_perfume_df.columns if col not in ['rating', 'number_votes']]
accords_scaled = scaler.fit_transform(cleaned_perfume_df[accords_columns])
cleaned_perfume_df[accords_columns] = accords_scaled

# Split the data into training and testing sets
#These are the features
X = cleaned_perfume_df.drop(['rating', 'number_votes'], axis=1)

# Target variable
y = cleaned_perfume_df['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the cleaned DataFrame
cleaned_perfume_df.head()

In [None]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the database

# Review the scaled data

In [None]:
# Create a DataFrame with the scaled data

# Copy the names from the original data

# Set the identifier column as index

# Display sample data

Find the best value for k using the original data

In [None]:
# Create a list with the number of k-values from 1 to 11

In [None]:
# Create an empty list to store the inertia values

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `market_scaled`
# 3. Append the model.inertia_ to the inertia list

In [None]:
# Create a dictionary with the data to plot the Elbow curve

# Create a DataFrame with the data to plot the Elbow curve

In [None]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k using hvplot

# Show the line chart

The best value for k is:

Cluster scent notes with K-means using the original data

In [None]:
# Initialize the K-Means model using the best value for k


In [None]:
# Fit the K-Means model using the scaled data

In [None]:
# Predict the clusters to group the notes using the scaled data

# Print the resulting array of cluster values.

In [None]:
# Create a copy of the DataFrame

In [None]:
# Add a new column to the DataFrame with the predicted clusters

# Display sample data

In [None]:
# Create a scatter plot using hvPlot by setting 
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`. 
# Color the graph points with the labels found using K-Means and 
# add the note name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.

# Show scatterplot

Optimize Clusters with Principal Component Analysis.

In [None]:
# Create a PCA model instance and set `n_components=3`.

In [None]:
# Use the PCA model with `fit_transform` to reduce to 
# three principal components.

# View the first five rows of the DataFrame.

In [None]:
# Retrieve the explained variance to determine how much information 
# can be attributed to each principal component.

In [None]:
# Retrieve the explained variance ratios

# Calculate the total explained variance

# Print the total explained variance

What is the total explained variance of the # principal components?

In [None]:
# Creating a DataFrame with the PCA data

# Copy the note names from the original data

# Set the name column as index

# Display sample data

Find the best value for k using the PCA Data

In [None]:
# Create a list with the number of k-values from 1 to 11

In [None]:
# Create an empty list to store the inertia values

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_pca`
# 3. Append the model.inertia_ to the inertia list

In [None]:
# Create a dictionary with the data to plot the Elbow curve

# Create a DataFrame with the data to plot the Elbow curve

In [None]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.

The best value for k when using the PCA data is:

Does the best k value change between the PCA Data and the Original Data?

Cluster Notes with K-means using the PCA Data

In [None]:
# Initialize the K-Means model using the best value for k

In [None]:
# Fit the K-Means model using the PCA data

In [None]:
# Predict the clusters to group the notes using the PCA data

# Print the resulting array of cluster values.

In [None]:
# Create a copy of the DataFrame with the PCA data

# Add a new column to the DataFrame with the predicted clusters

# Display sample data

In [None]:
# Create a scatter plot using hvPlot by setting 
# `x="PC1"` and `y="PC2"`. 
# Color the graph points with the labels found using K-Means and 
# add the note name in the `hover_cols` parameter to identify 
# the note represented by each data point.

# Show the scatter plot

Visualize and Compare the Results

In [None]:
# Composite plot to contrast the Elbow curves
# Arrange plots side by side for comparison

# Show the composite plot

In [None]:
# Composite plot to contrast the clusters
# Arrange plots side by side for comparison

# Show the scatter plot comparison