In [86]:
# Import Libraries
import os
import re

import numpy as np
import pandas as pd
from bokeh.models import ColorBar, ColumnDataSource, HoverTool, LinearColorMapper
from bokeh.palettes import RdYlGn
from bokeh.plotting import figure, output_notebook, show
from bokeh.transform import linear_cmap
from sklearn.manifold import TSNE


In [88]:
# Load the dataset
# The CSV location can be overridden through the COSMETICS_CSV environment
# variable; the fallback is the original author's local path, which only
# exists on that machine.
DATA_PATH = os.environ.get(
    'COSMETICS_CSV',
    r'C:\Users\bnith\Downloads\dataset\datasets\cosmetics.csv',
)
df = pd.read_csv(DATA_PATH)

# Display a sample of five random rows from the dataset
# (random_state is fixed so the preview is the same on every re-run)
display(df.sample(5, random_state=42))

# Display counts of each product type in the 'Label' column
print("Product Type Counts:")
print(df['Label'].value_counts())



Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
1407,Sun protect,COOLA,Classic Face SPF 30 - Cucumber,32,4.4,"Caprylohydroxamic Acid, Caprylyl Glycol, Carth...",1,1,1,0,1
119,Moisturizer,BIOSSANCE,Squalane + Omega Repair Cream,58,4.7,"Water, Squalane, Glycerin, Caprylic/Capric Tri...",1,1,1,0,1
1373,Sun protect,AMOREPACIFIC,Natural Protector Broad Spectrum SPF 35 Sunscreen,65,4.0,Visit the AMOREPACIFIC boutique,1,1,1,1,1
54,Moisturizer,TARTE,BB Tinted Treatment 12-Hour Primer Broad Spect...,37,4.3,"Cyclopentasiloxane, Isododecane, Mica, Polysil...",1,1,1,1,0
208,Moisturizer,SK-II,Cellumination Cream EX,160,4.3,"Water, Galactomyces Ferment Filtrate (Pitera),...",1,1,1,1,1


Product Type Counts:
Label
Moisturizer    298
Cleanser       281
Face Mask      266
Treatment      248
Eye cream      209
Sun protect    170
Name: count, dtype: int64


In [90]:
# Clean the raw frame in a single pipeline:
#   1. remove duplicate rows
#   2. drop rows missing any of the essential columns
#   3. normalise 'Ingredients': trim, lowercase, strip HTML tags if present
#   4. normalise 'Label' (product category): trim, title-case
#   5. convert 'Price' to numeric; values that cannot be parsed become NaN
#   6. drop rows whose price is missing after the conversion

html_tag = re.compile(r'<.*?>')

df = (
    df.drop_duplicates()
      .dropna(subset=['Ingredients', 'Label', 'Price'])
      .assign(
          Ingredients=lambda frame: (
              frame['Ingredients']
              .str.strip()
              .str.lower()
              .apply(lambda text: html_tag.sub('', text))
          ),
          Label=lambda frame: frame['Label'].str.strip().str.title(),
          Price=lambda frame: pd.to_numeric(frame['Price'], errors='coerce'),
      )
      .dropna(subset=['Price'])
)

print(df)

            Label                          Brand  \
0     Moisturizer                         LA MER   
1     Moisturizer                          SK-II   
2     Moisturizer                 DRUNK ELEPHANT   
3     Moisturizer                         LA MER   
4     Moisturizer                   IT COSMETICS   
...           ...                            ...   
1467  Sun Protect                         KORRES   
1468  Sun Protect                KATE SOMERVILLE   
1469  Sun Protect                  VITA LIBERATA   
1470  Sun Protect  ST. TROPEZ TANNING ESSENTIALS   
1471  Sun Protect                     DERMAFLASH   

                                                   Name  Price  Rank  \
0                                       Crème de la Mer    175   4.1   
1                              Facial Treatment Essence    179   4.1   
2                            Protini™ Polypeptide Cream     68   4.4   
3                           The Moisturizing Soft Cream    175   3.8   
4         Your 

In [100]:
#  Filter the data for moisturizers suitable for normal skin

# Keep only rows whose 'Label' is 'Moisturizer'
moisturizers = df[df['Label'] == 'Moisturizer']

# Of those, keep products flagged for normal skin ('Normal' column == 1) and
# renumber the rows 0..n-1, since later cells look products up by row position
moisturizers_Normal = moisturizers[moisturizers['Normal'] == 1].reset_index(drop=True)

# Display the filtered data shape
print("Filtered dataset shape:", moisturizers_Normal.shape)


Filtered dataset shape: (204, 11)


In [108]:
# Tokenize the ingredients and create a bag of words

# corpus: one token list per product, in row order.
# Each ingredient string is lowercased and split on ', '.
corpus = [
    ingredient_text.lower().split(', ')
    for ingredient_text in moisturizers_Normal['Ingredients']
]

# ingredient_idx: ingredient name -> column position, numbered in order of
# first appearance across the corpus
ingredient_idx = {}
for product_tokens in corpus:
    for name in product_tokens:
        # setdefault leaves known ingredients untouched; a new one receives the
        # next free position (the dictionary size before insertion)
        ingredient_idx.setdefault(name, len(ingredient_idx))

#  Check the index of a specific ingredient
print("The index for decyl oleate is", ingredient_idx['decyl oleate'])


The index for decyl oleate is 25


In [110]:
# Allocate the Document-Term Matrix (DTM)

# M = number of products (documents), N = number of unique ingredients (terms)
M, N = len(moisturizers_Normal), len(ingredient_idx)

# Start from an all-zero M x N matrix
A = np.zeros(shape=(M, N))

# Display the shape of the matrix
print("Document-Term Matrix shape (M x N):", A.shape)


Document-Term Matrix shape (M x N): (204, 2315)


In [112]:
# Define the oh_encoder function
def oh_encoder(tokens):
    """Encode a list of ingredient tokens as a binary vector.

    Relies on two module-level names: ``N`` (the vector length) and
    ``ingredient_idx`` (a mapping from ingredient to position).

    Args:
        tokens: iterable of ingredient strings.

    Returns:
        A NumPy array of length ``N`` holding 1 at the position of every
        token found in ``ingredient_idx`` and 0 elsewhere. Tokens that are
        not in the mapping are ignored.
    """
    encoded = np.zeros(N)

    # Look every token up once; a miss yields None
    positions = (ingredient_idx.get(token) for token in tokens)

    for position in positions:
        if position is None:
            # Unknown ingredient: leave the vector unchanged
            continue
        encoded[position] = 1

    return encoded


In [114]:
# Fill matrix A: row k receives the binary encoding of the k-th product

# enumerate supplies the row number alongside each product's token list
for row_no, product_tokens in enumerate(corpus):
    A[row_no, :] = oh_encoder(product_tokens)



In [118]:
# Dimension reduction with t-SNE

# Project the Document-Term Matrix A down to two dimensions.
# random_state=42 fixes the random seed passed to TSNE.
model = TSNE(n_components=2, learning_rate=200, random_state=42)
tsne_features = model.fit_transform(A)

# Store the two embedding coordinates as columns 'X' and 'Y' of moisturizers_Normal
moisturizers_Normal['X'], moisturizers_Normal['Y'] = tsne_features[:, 0], tsne_features[:, 1]

# Print the frame, which now includes the t-SNE coordinates
print(moisturizers_Normal)

           Label            Brand  \
0    Moisturizer           LA MER   
1    Moisturizer            SK-II   
2    Moisturizer   DRUNK ELEPHANT   
3    Moisturizer           LA MER   
4    Moisturizer     IT COSMETICS   
..           ...              ...   
199  Moisturizer         SHISEIDO   
200  Moisturizer    SATURDAY SKIN   
201  Moisturizer           LA MER   
202  Moisturizer  KATE SOMERVILLE   
203  Moisturizer            GO-TO   

                                                  Name  Price  Rank  \
0                                      Crème de la Mer    175   4.1   
1                             Facial Treatment Essence    179   4.1   
2                           Protini™ Polypeptide Cream     68   4.4   
3                          The Moisturizing Soft Cream    175   3.8   
4        Your Skin But Better™ CC+™ Cream with SPF 50+     38   4.1   
..                                                 ...    ...   ...   
199  White Lucent All Day Brightener Broad Spectrum...    

In [122]:
# Wrap the moisturizers_Normal DataFrame so glyphs can refer to its columns by name
source = ColumnDataSource(moisturizers_Normal)

# The cheapest and most expensive products set the two ends of the colour scale
price_low = moisturizers_Normal['Price'].min()
price_high = moisturizers_Normal['Price'].max()

# Map Price linearly onto the 11-colour RdYlGn palette.
# NOTE(review): the original comment states green = low price, red = high
# price; confirm against the palette's ordering.
color_mapper = linear_cmap(field_name='Price', palette=RdYlGn[11], low=price_low, high=price_high)

# Create the figure with a title and axis labels
plot = figure(title="t-SNE Scatter Plot of Moisturizers for Normal Skin",
              width=500, height=400,
              x_axis_label='T-SNE 1',
              y_axis_label='T-SNE 2')

# One marker per product at its t-SNE coordinates, coloured by price
plot.scatter(x='X', y='Y', source=source, size=10, color=color_mapper, alpha=0.8)


In [124]:
# Rows of the hover tooltip as (label, value template) pairs
tooltip_rows = [
    ('Item', '@Name'),
    ('Brand', '@Brand'),
    ('Price', '$@Price'),
    ('Rank', '@Rank'),
]

# Build the hover tool and attach it to the plot
hover = HoverTool(tooltips=tooltip_rows)
plot.add_tools(hover)

# Colour bar to the right of the plot, sharing the transform used to colour the markers
color_bar = ColorBar(color_mapper=color_mapper['transform'], label_standoff=12, location=(0,0))
plot.add_layout(color_bar, 'right')



In [126]:
# Display the plot inline in the notebook.
# output_notebook() must be called before show(); without it Bokeh writes the
# plot to a standalone HTML file instead of rendering it in the cell output.
output_notebook()
show(plot)


In [None]:
#  Print the ingredients for two similar products

# Names of the two products to compare
name_1 = "Color Control Cushion Compact Broad Spectrum SPF 50+"
name_2 = "BB Cushion Hydra Radiance SPF 50"

# Look each product up by exact name in the filtered frame
cosmetic_1 = moisturizers_Normal[moisturizers_Normal['Name'] == name_1]
cosmetic_2 = moisturizers_Normal[moisturizers_Normal['Name'] == name_2]

# Display the data and ingredients of each product
for product_name, match in ((name_1, cosmetic_1), (name_2, cosmetic_2)):
    display(match)
    if match.empty:
        # The name is not in the filtered frame; report it instead of
        # failing with an IndexError on the empty selection
        print(f"No product named {product_name!r} in the filtered dataset")
    else:
        print(f"Ingredients of {product_name}:\n", match['Ingredients'].iloc[0])
