In [1]:
# Import libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\My
[nltk_data]     Account\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the dataset from Google Drive
file_path = 'data/product_info.csv'
data = pd.read_csv(file_path)

In [3]:
data.columns

Index(['product_id', 'product_name', 'brand_id', 'brand_name', 'loves_count',
       'rating', 'reviews', 'size', 'variation_type', 'variation_value',
       'variation_desc', 'ingredients', 'price_usd', 'value_price_usd',
       'sale_price_usd', 'limited_edition', 'new', 'online_only',
       'out_of_stock', 'sephora_exclusive', 'highlights', 'primary_category',
       'secondary_category', 'tertiary_category', 'child_count',
       'child_max_price', 'child_min_price'],
      dtype='object')

In [4]:
data['tertiary_category'].value_counts()

tertiary_category
Perfume                      568
Moisturizers                 386
Face Serums                  379
Rollerballs & Travel Size    287
Hair Styling Products        255
                            ... 
Under-Eye Concealer            3
Sunscreen                      2
Hair Thinning & Hair Loss      2
Damaged Hair                   1
Manicure & Pedicure Tools      1
Name: count, Length: 118, dtype: int64

In [5]:
data['primary_category'].value_counts()

primary_category
Skincare           2420
Makeup             2369
Hair               1464
Fragrance          1432
Bath & Body         405
Mini Size           288
Men                  60
Tools & Brushes      52
Gifts                 4
Name: count, dtype: int64

In [6]:
# Filter relevant columns and categories
filtered_data = data[data['primary_category'].isin(['Skincare', 'Makeup', 'Hair', 'Fragrance'])]
filtered_data = filtered_data[['ingredients', 'primary_category']]

# Drop rows with missing ingredients
filtered_data = filtered_data.dropna(subset=['ingredients'])

In [7]:
from nltk.stem import WordNetLemmatizer
import re

# Initialize tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Preprocessing function
def preprocess_ingredients(text):
    # Lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize and remove stopwords
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Apply preprocessing
filtered_data['ingredients_cleaned'] = filtered_data['ingredients'].apply(preprocess_ingredients)


In [8]:
import dask.dataframe as dd

# Convert the filtered dataset to a Dask DataFrame
dask_data = dd.from_pandas(filtered_data, npartitions=4)

# Example pipeline: Preprocess and split into features and labels
dask_data['ingredients_cleaned'] = dask_data['ingredients'].map(preprocess_ingredients, meta=('ingredients_cleaned', 'str'))

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # Use top 5000 features

# Fit and transform the text data
X = tfidf.fit_transform(filtered_data['ingredients_cleaned'])
y = filtered_data['primary_category']

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

   Fragrance       0.97      0.97      0.97       262
        Hair       0.96      0.87      0.91       246
      Makeup       0.94      0.91      0.92       411
    Skincare       0.88      0.95      0.91       451

    accuracy                           0.93      1370
   macro avg       0.94      0.92      0.93      1370
weighted avg       0.93      0.93      0.93      1370



In [11]:
# Example: Predict using new ingredient data
new_data = ["WATER/EAU, SODIUM LAURYLSULFATE, SODIUM LAURETH SULFATE, COCAMIDOPROPYL BE- TAINE, GLYCOL DISTEARATE, DIMETHICONE, SODIUM CITRATE, COCAMIDE MEA, SODIUM XYLENESULFONATE, FRAGRANCE/ PARFUM, CITRIC ACID, SODIUM BENZOATE, POLYQUATERNI- UM-76, SODIUM CHLORIDE, TETRASODIUM EDTA, PANTHE- NOL, PANTHENYL ETHYL ETHER, METHYLCHLOROISOTHIAZOLIN- ONE, METHYLISOTHIAZOLINONE"]
new_data_preprocessed = [preprocess_ingredients(item) for item in new_data]
new_data_tfidf = tfidf.transform(new_data_preprocessed)
predictions = model.predict(new_data_tfidf)
print(predictions)

['Hair']
