In [1]:
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

ings = pd.read_csv('data/ingredients_dataset.csv')

# Coerce the 'ingredient' column to strings. Missing values are replaced with an
# empty string first: a bare astype(str) would turn NaN into the literal text
# "nan", which would then be tokenized and learned as if it were an ingredient.
ings['ingredient'] = ings['ingredient'].fillna('').astype(str)

# Tokenize the ingredients (simple_preprocess yields a list of word tokens per row)
ings['tokenized_ingredients'] = ings['ingredient'].apply(simple_preprocess)

# Train Word2Vec on the tokenized ingredients.
# Rows that produced no tokens (e.g. originally missing values) carry no
# training signal and are left out of the corpus.
# NOTE(review): with workers > 1 training is not bit-for-bit reproducible
# between runs; use workers=1 if exact reproducibility matters.
word2vec_model = Word2Vec(
    sentences=[tokens for tokens in ings['tokenized_ingredients'] if tokens],
    vector_size=100,       # Dimensionality of the embeddings
    window=5,              # Context window size
    min_count=1,           # Keep every token, even ones that occur once
    workers=4,             # Number of worker threads used for training
    sg=1,                  # Use Skip-Gram model
    epochs=10              # Number of iterations over the corpus
)

In [None]:
# Save the trained Word2Vec model to disk; a later cell reloads it from this
# same path with Word2Vec.load.
word2vec_model.save("data/model/ingredient_word2vec.model")

In [None]:
import pandas as pd

# Load the raw product table from CSV into `data`; the following cells narrow
# it to the needed columns and clean it.
data = pd.read_csv('data/product_info.csv')


In [None]:
# Keep only the columns this notebook uses.
# .copy() makes the result an independent frame: later cells modify `data`
# (row drops and .loc assignments), and doing that on a column slice of the
# loaded frame triggers pandas' SettingWithCopyWarning.
required_columns = [
    'product_id', 'brand_name', 'product_name', 'ingredients',
    'primary_category', 'secondary_category', 'tertiary_category',
]
data = data[required_columns].copy()

# Checking for missing values (per-column count of nulls, shown as cell output)
data.isnull().sum()

In [None]:
# Drop every row that is missing either the ingredient text or the tertiary
# category. One non-inplace call replaces the two inplace calls: the result is
# the same set of rows, without mutating a sliced frame in place.
data = data.dropna(subset=['ingredients', 'tertiary_category'])

In [None]:
# Counting duplicates.
# keep=False flags every row that has at least one identical copy, so the
# number printed counts all members of each duplicate group (not just the
# extra copies). This cell only reports; it does not remove any rows.
duplicate_count = data.duplicated(keep=False).sum()
print(f"Number of duplicate rows: {duplicate_count}")

In [None]:
# Removing products in the "Mini Size" primary category.
# A boolean filter keeps every row whose primary category is anything else
# (including missing); unlike dropping by index label it cannot remove
# unrelated rows that happen to share an index label, and it avoids an
# in-place mutation. .copy() keeps the result an independent frame for the
# .loc assignments in the following cells.
data = data[data['primary_category'] != "Mini Size"].copy()

In [None]:
# Rules for reassigning primary_category, applied strictly in the order listed
# (a later rule sees the result of every earlier one).
# Each entry is (column to test, value to match, new primary_category).
primary_category_rules = [
    # Matched on the tertiary category
    ('tertiary_category', "Beauty Supplements", "Other"),
    ('tertiary_category', "Holistic Wellness", "Other"),
    ('tertiary_category', "Makeup Removers", "Makeup"),
    ('tertiary_category', "Teeth Whitening", "Other"),
    ('tertiary_category', "Brush Cleaners", "Other"),
    ('tertiary_category', "Brush Sets", "Other"),
    ('tertiary_category', "Eye Brushes", "Other"),
    ('tertiary_category', "Face Brushes", "Other"),
    ('tertiary_category', "Sponges & Applicators", "Other"),
    ('tertiary_category', "Accessories", "Other"),
    ('tertiary_category', "Hair Supplements", "Other"),
    ('tertiary_category', "Candles", "Other"),
    ('tertiary_category', "Diffusers", "Other"),
    ('tertiary_category', "For Body", "Skincare"),
    ('tertiary_category', "For Face", "Skincare"),
    # Matched on the primary category itself: fold these into "Other"
    ('primary_category', "Bath & Body", "Other"),
    ('primary_category', "Tools & Brushes", "Other"),
    ('primary_category', "Men", "Other"),
    # Matched on the tertiary category again; these run after the fold above
    # and therefore override it for the matching rows
    ('tertiary_category', "Body Sunscreen", "Skincare"),
    ('tertiary_category', "Blotting Papers", "Other"),
]
for match_column, match_value, new_primary in primary_category_rules:
    data.loc[data[match_column] == match_value, 'primary_category'] = new_primary

# Rename gift-set tertiary categories to their base category name
tertiary_category_renames = [
    ("Cologne Gift Sets", "Cologne"),
    ("Perfume Gift Sets", "Perfume"),
]
for old_tertiary, new_tertiary in tertiary_category_renames:
    data.loc[data['tertiary_category'] == old_tertiary, 'tertiary_category'] = new_tertiary

In [None]:
# Changing the tertiary category name: rows labelled "BB & CC Creams" are
# relabelled with the singular form "BB & CC Cream".
data.loc[data['tertiary_category'] == "BB & CC Creams", 'tertiary_category'] = "BB & CC Cream"

In [None]:
# Rows whose primary category is 'Fragrance'
fragrance_data = data.loc[data['primary_category'] == 'Fragrance']

# Number of fragrance products in each tertiary category, as a two-column
# frame (tertiary_category, count)
tertiary_counts = (
    fragrance_data
    .groupby('tertiary_category')
    .size()
    .reset_index(name='count')
)

# Show the per-category counts
print(tertiary_counts)

In [None]:
# Number of rows whose primary category is missing
data['primary_category'].isna().sum()

In [None]:
# Checking how many products fall under each primary category value
# (shown as the cell output).
data['primary_category'].value_counts()

Cleaning the Ingredients column

In [None]:
# Clean the ingredients column in one chained pass:
#   1. trim leading/trailing whitespace
#   2. strip any run of non-word characters at the very start or very end
data['ingredients'] = (
    data['ingredients']
    .str.strip()
    .str.replace(r'^[^\w]+|[^\w]+$', '', regex=True)
)

Attaching detection columns to the dataset

In [None]:
import re

def normalize_ingredient(ingredient):
    """
    Return a canonical form of an ingredient name.

    The name is lowercased, every run of '-', '.', ',' or '/' is replaced by a
    single space, runs of whitespace are collapsed to one space, and
    leading/trailing whitespace is removed.
    """
    lowered = ingredient.lower()
    separators_spaced = re.sub(r'[-.,/]+', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', separators_spaced)
    return collapsed.strip()


def _load_ingredient_list(path):
    """
    Read a text file with one ingredient name per line and return the
    normalized names as a list.

    Lines that are blank after normalization are skipped; otherwise an empty
    string would sit in the reference list and match every empty token that a
    stray comma in a product's ingredient text produces.
    """
    with open(path) as f:
        normalized = (normalize_ingredient(line.strip()) for line in f)
        return [name for name in normalized if name]


# Load ingredient lists
concern_chems = _load_ingredient_list("data/concern_chems.txt")
red_list = _load_ingredient_list("data/red_list.txt")
the_gens = _load_ingredient_list("data/the_gens.txt")

# Ensure the ingredients column is in a string format
data['ingredients'] = data['ingredients'].astype(str)

# Tokenize each ingredient list in the dataset by splitting on commas
data['ingredients_list'] = data['ingredients'].apply(
    lambda x: [normalize_ingredient(ingredient) for ingredient in x.split(',')])

# (detected-column, count-column, reference list) for each category
detection_specs = [
    ('concerning_chems_detected', 'concerning_chems_count', concern_chems),
    ('red_list_chems_detected', 'red_list_chems_count', red_list),
    ('allergens_detected', 'allergens_count', the_gens),
]

# Create new columns to store detected ingredients from each category.
# Membership is tested against a frozenset so each lookup is constant-time
# instead of a scan of the whole reference list.
for detected_col, _count_col, reference in detection_specs:
    lookup = frozenset(reference)
    data[detected_col] = data['ingredients_list'].apply(
        lambda ingredients, lookup=lookup: [
            ingredient for ingredient in ingredients if ingredient in lookup])

# Count detected ingredients for each category (added after all the detected
# columns so the column order of the frame stays the same as before)
for detected_col, count_col, _reference in detection_specs:
    data[count_col] = data[detected_col].apply(len)

# View the results
data[['product_name', 'concerning_chems_detected', 'concerning_chems_count', 'red_list_chems_detected',
      'red_list_chems_count', 'allergens_detected', 'allergens_count']].head()

In [None]:
from gensim.models import Word2Vec

# Load the trained Word2Vec model from the path it was saved to earlier in
# this notebook, rebinding `word2vec_model` to the loaded instance.
word2vec_model = Word2Vec.load("data/model/ingredient_word2vec.model")

In [None]:
import numpy as np
from gensim.utils import simple_preprocess
import json

def get_sentence_vector(model, sentence):
    """
    Compute the average Word2Vec vector for a list of words or phrases.

    Every item is first looked up in ``model.wv`` exactly as given. The model
    in this notebook is trained on single-word tokens, while callers pass
    comma-split ingredient phrases such as "sodium hyaluronate"; such a phrase
    is never a vocabulary key. When the exact lookup misses, the item is
    therefore broken into its alphabetic word tokens and every token found in
    the vocabulary contributes its vector instead.

    Parameters
    ----------
    model : object with a ``wv`` mapping (supports ``in`` and ``[]``) and a
        ``vector_size`` attribute, e.g. a trained gensim Word2Vec model.
    sentence : iterable of str
        Words or multi-word phrases to embed.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of all matched vectors, or a zero vector of length
        ``model.vector_size`` when nothing matched.
    """
    vectors = []
    for item in sentence:
        if item in model.wv:
            vectors.append(model.wv[item])
        elif isinstance(item, str):
            # Exact key not in the vocabulary: fall back to the item's
            # individual alphabetic tokens.
            vectors.extend(
                model.wv[token]
                for token in re.findall(r"[^\W\d_]+", item.lower())
                if token in model.wv
            )
    if vectors:
        return np.mean(vectors, axis=0)  # Take the average of the matched vectors
    return np.zeros(model.vector_size)  # Return a zero vector if nothing matched

def _embedding_json(cell):
    """
    Embed one cell with the Word2Vec model and return the vector as a JSON
    string. A list is used as the token list directly; any other value is
    converted to text and tokenized with simple_preprocess.
    """
    tokens = cell if isinstance(cell, list) else simple_preprocess(str(cell))
    vector = get_sentence_vector(word2vec_model, tokens)
    return json.dumps(vector.tolist())


# (source column, embedding column to create), processed in this order
embedding_columns = (
    ("ingredients_list", "ingredients_embedding"),
    ("concerning_chems_detected", "concern_chems_embedding"),
    ("red_list_chems_detected", "red_list_embedding"),
    ("allergens_detected", "the_gens_embedding"),
)

# Generate a JSON-encoded average Word2Vec embedding for each source column
for source_column, embedding_column in embedding_columns:
    data[embedding_column] = data[source_column].apply(_embedding_json)

In [None]:
# Drop a column named "BB & CC Creams" if one exists.
# Nothing in this notebook creates such a column ("BB & CC Creams" appears only
# as a tertiary_category value, renamed in an earlier cell), so a plain drop
# raises KeyError and stops Restart-and-Run-All. errors="ignore" turns the
# missing-column case into a no-op.
data = data.drop(columns=["BB & CC Creams"], errors="ignore")

In [None]:
# Save the data with embeddings to CSV. The embedding columns hold JSON
# strings, and index=False leaves the row index out of the file.
data.to_csv("data/new_product_info3.csv", index=False)

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# Extract features (ingredient embeddings) and labels (primary category).
# The embedding column created earlier is named 'ingredients_embedding', and
# each of its cells is a JSON-encoded list of floats (written with json.dumps),
# so every cell is decoded before the rows are stacked into a 2-D numeric matrix.
X = np.array(data['ingredients_embedding'].apply(json.loads).tolist())
y = data['primary_category']

# Try different K values (number of stratified folds used for evaluation)
k_values = [3, 5, 10, 15]
scores = {}

for k in k_values:
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    model = RandomForestClassifier(random_state=42)

    # Perform Cross-Validation and record the mean accuracy across the k folds
    cv_scores = cross_val_score(model, X, y, cv=skf, scoring='accuracy')
    scores[k] = np.mean(cv_scores)

# Find the K whose cross-validation gave the highest mean accuracy
best_k = max(scores, key=scores.get)
best_accuracy = scores[best_k]

print(f"Best K: {best_k} with Accuracy: {best_accuracy:.4f}")

# Splitter for the best K. NOTE: K only affects how accuracy was estimated
# above; the fit below uses all of X and y and does not use this splitter.
final_skf = StratifiedKFold(n_splits=best_k, shuffle=True, random_state=42)
final_model = RandomForestClassifier(random_state=42)

# Perform final training on the full dataset
final_model.fit(X, y)

print("Final model trained with K =", best_k)

In [None]:
import joblib

# `classifier` is a second name for the same fitted model object
classifier = final_model

# Save the model to a file with joblib; the call is the last expression of the
# cell, so its return value is shown as the cell output.
joblib.dump(final_model, "data/model/new_random_forest.pkl")

In [None]:
import joblib

# Load the saved model from the path it was dumped to in the previous cell.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code,
# so only load model files that come from a trusted source.
final_model = joblib.load("data/model/new_random_forest.pkl")

# `classifier` is a second name for the same loaded model object
classifier = final_model