In [1]:
import pandas as pd

data = pd.read_csv('data/new_product_info.csv')

------------------------------------------------------------------------------------

Predicting Category using Word2Vec model

In [2]:
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

# Tokenize the ingredients
data['tokenized_ingredients'] = data['ingredients'].apply(simple_preprocess)

# Train Word2Vec on the tokenized ingredients
word2vec_model = Word2Vec(
    sentences=data['tokenized_ingredients'],
    vector_size=100,       # Dimensionality of the embeddings
    window=5,              # Context window size
    min_count=1,           # Include all ingredients, even rare ones
    workers=4,             # Use 4 CPU cores for training
    sg=1,                  # Use Skip-Gram model
    epochs=10              # Number of iterations over the corpus
)

In [3]:
import numpy as np

def get_sentence_vector(model, sentence):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)  # Take the average of word vectors
    else:
        return np.zeros(model.vector_size)  # Return a zero vector if no words match

# Generate embeddings for each product's ingredients
data['ingredient_embeddings'] = data['tokenized_ingredients'].apply(
    lambda x: get_sentence_vector(word2vec_model, x)
)


In [4]:
from sklearn.model_selection import train_test_split

# Extract features (ingredient embeddings) and labels (primary category)
X = np.vstack(data['ingredient_embeddings'])
y = data['primary_category']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train a Random Forest classifier
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Evaluate the model
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

   Fragrance       0.98      0.98      0.98       172
        Hair       0.96      0.85      0.91       158
      Makeup       0.95      0.92      0.94       276
    Skincare       0.88      0.95      0.92       310

    accuracy                           0.93       916
   macro avg       0.94      0.93      0.93       916
weighted avg       0.93      0.93      0.93       916



In [34]:
# Example ingredient list for prediction
new_ingredients = r"Water/EAU, Glyceryl Stearate, Ammonium Acrylates Copolymer, Disteardimonium Hectorite, Propylene Glycol, Stearic Acid, Alcohol Denat., Copernicia Cerifera (Carnauba) Wax/Cire De Carnauba, Triethanolamine, Polyethylene, Acrylates Copolymer, Polyvinyl Alcoho, Lecithin, Propylene Carbonate, Synthetic Wax, Oleic Acid, Benzyl Alcohol, Nylon-6, Ascorbyl Palmitate, Tocopherol, Glycerin, Panthenol, Simethicone, Xanthan Gum, Ilica Ethylparaben, Sodium Laureth Sulfate, Phenoxyethanol, Methylparaben, Propylparaben, Trisodium EDTA, Titanium Dioxide, Ultramarines, Black 2, Iron Oxides."

# Tokenize and generate embeddings
new_tokens = simple_preprocess(new_ingredients)
new_embedding = get_sentence_vector(word2vec_model, new_tokens).reshape(1, -1)

# Predict the category
predicted_category = classifier.predict(new_embedding)
print("Predicted Primary Category:", predicted_category[0])


Predicted Primary Category: Makeup


Neural Network

In [16]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import numpy as np

# Ensure data is numeric
X_train = np.array(X_train, dtype=np.float32)
X_test = np.array(X_test, dtype=np.float32)

# Encode labels
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

# Define the model
model = Sequential([
    Input(shape=(X_train.shape[1],)),  # Correct way to define input shape
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(len(set(y_train)), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)  # Convert softmax output to class labels

# Print classification report
print(classification_report(y_test, y_pred_classes))


Epoch 1/20
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.6741 - loss: 0.9547 - val_accuracy: 0.8843 - val_loss: 0.3678
Epoch 2/20
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8783 - loss: 0.3568 - val_accuracy: 0.9028 - val_loss: 0.3113
Epoch 3/20
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8916 - loss: 0.3168 - val_accuracy: 0.9039 - val_loss: 0.2967
Epoch 4/20
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9005 - loss: 0.2850 - val_accuracy: 0.9061 - val_loss: 0.3057
Epoch 5/20
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9033 - loss: 0.2864 - val_accuracy: 0.9083 - val_loss: 0.2914
Epoch 6/20
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9162 - loss: 0.2685 - val_accuracy: 0.9094 - val_loss: 0.2733
Epoch 7/20
[1m115/115[0m 

In [35]:
import numpy as np

# List of category names in the same order as the model's output
category_names = ["Fragrance", "Hair", "Makeup", "Skincare"]  # Replace with your actual categories

# Example ingredient list for prediction
new_ingredients = r"Water/EAU, Glyceryl Stearate, Ammonium Acrylates Copolymer, Disteardimonium Hectorite, Propylene Glycol, Stearic Acid, Alcohol Denat., Copernicia Cerifera (Carnauba) Wax/Cire De Carnauba, Triethanolamine, Polyethylene, Acrylates Copolymer, Polyvinyl Alcoho, Lecithin, Propylene Carbonate, Synthetic Wax, Oleic Acid, Benzyl Alcohol, Nylon-6, Ascorbyl Palmitate, Tocopherol, Glycerin, Panthenol, Simethicone, Xanthan Gum, Ilica Ethylparaben, Sodium Laureth Sulfate, Phenoxyethanol, Methylparaben, Propylparaben, Trisodium EDTA, Titanium Dioxide, Ultramarines, Black 2, Iron Oxides."

# Tokenize and generate embeddings
new_tokens = simple_preprocess(new_ingredients)
new_embedding = get_sentence_vector(word2vec_model, new_tokens).reshape(1, -1)

# Predict category probabilities
predicted_probs = model.predict(new_embedding)

# Get the index of the highest probability
predicted_index = np.argmax(predicted_probs)

# Get the corresponding category name
predicted_category_name = category_names[predicted_index]

print("Predicted Primary Category:", predicted_category_name)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Predicted Primary Category: Makeup


Reinforcement Model

In [9]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from collections import deque
from sklearn.preprocessing import LabelEncoder
import random

# Define the RL Environment
class CategoryClassificationEnv(gym.Env):
    def __init__(self, X_train, y_train, num_classes):
        super(CategoryClassificationEnv, self).__init__()
        self.X_train = X_train
        self.y_train = y_train
        self.num_classes = num_classes
        self.current_index = 0  # Track which sample we're on
        self.state = self.X_train[self.current_index]  # First ingredient list vector

    def step(self, action):
        """Take a step in the environment based on action (category prediction)."""
        correct_category = self.y_train[self.current_index]
        reward = 1 if action == correct_category else -1  # Reward function
        self.current_index += 1  # Move to next sample
        done = self.current_index >= len(self.X_train)  # End episode after dataset iteration
        self.state = self.X_train[self.current_index] if not done else None  # Update state
        return self.state, reward, done, {}

    def reset(self):
        """Reset environment to start training again."""
        self.current_index = 0
        self.state = self.X_train[self.current_index]
        return self.state

# Convert labels to numeric values
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)

# Create RL Environment
env = CategoryClassificationEnv(X_train, y_train_encoded, num_classes=len(set(y_train_encoded)))

# Define the DQN Model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),  # Define input shape here
    Dense(64, activation='relu'),
    Dense(len(set(y_train)), activation='softmax')
])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss="mse")

# Training the DQN
gamma = 0.95  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_min = 0.01
epsilon_decay = 0.995
memory = deque(maxlen=2000)  # Experience replay memory
batch_size = 32

for episode in range(100):  # Train for 100 episodes
    state = env.reset()
    done = False
    while not done:
        if np.random.rand() <= epsilon:
            action = np.random.choice(env.num_classes)  # Random action (exploration)
        else:
            q_values = model.predict(state.reshape(1, -1))
            action = np.argmax(q_values)  # Choose best category

        next_state, reward, done, _ = env.step(action)
        memory.append((state, action, reward, next_state, done))
        state = next_state

    if len(memory) > batch_size:
        minibatch = random.sample(memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward if done else reward + gamma * np.max(model.predict(next_state.reshape(1, -1)))
            target_q_values = model.predict(state.reshape(1, -1))
            target_q_values[0][action] = target
            model.fit(state.reshape(1, -1), target_q_values, epochs=1, verbose=0)

    if epsilon > epsilon_min:
        epsilon *= epsilon_decay  # Reduce exploration rate

# Evaluate on test set
for i in range(len(X_test)):
    q_values = model.predict(X_test[i].reshape(1, -1))
    predicted_category = np.argmax(q_values)
    print(f"Predicted: {encoder.inverse_transform([predicted_category])}, Actual: {encoder.inverse_transform([y_test_encoded[i]])}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 671ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 247ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 151ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 234ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

KeyboardInterrupt: 

------------------------------------------------------------------------------------

Predicting Category using Fasttext model

In [11]:
from gensim.models import FastText

# Train FastText on the tokenized ingredients
fasttext_model = FastText(
    sentences=data['tokenized_ingredients'],  # Tokenized ingredients
    vector_size=100,                          # Dimensionality of word embeddings
    window=5,                                 # Context window size
    min_count=1,                              # Include all words, even rare ones
    workers=4,                                # Use 4 CPU cores for training
    sg=1,                                     # Use Skip-Gram (sg=1); CBOW if sg=0
    epochs=10                                 # Number of training iterations
)


In [12]:
# Generate embeddings for each product's ingredients
data['ingredient_embeddings'] = data['tokenized_ingredients'].apply(
    lambda x: get_sentence_vector(fasttext_model, x)
)

# Extract features (ingredient embeddings) and labels (primary category)
X = np.vstack(data['ingredient_embeddings'])
y = data['primary_category']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Train a Random Forest classifier
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Evaluate the model
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

   Fragrance       0.98      0.97      0.98       262
        Hair       0.96      0.86      0.91       246
      Makeup       0.95      0.89      0.92       411
    Skincare       0.85      0.96      0.90       451

    accuracy                           0.92      1370
   macro avg       0.94      0.92      0.93      1370
weighted avg       0.93      0.92      0.92      1370



------------------------------------------------------------------------------------

Predicting Category using BERT

In [14]:
from transformers import BertTokenizer

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [13]:
# Tokenize the ingredients column
data['tokenized_ingredients'] = data['ingredients'].apply(
    lambda x: tokenizer(
        text=x,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )
)


KeyboardInterrupt: 

In [8]:
import torch
from transformers import BertModel

# Load pre-trained BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Generate embeddings for each ingredient list
def get_bert_embedding(text, tokenizer, model):
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the [CLS] token embedding
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

# Apply to the dataset
data['bert_embeddings'] = data['ingredients'].apply(
    lambda x: get_bert_embedding(x, tokenizer, bert_model)
)


KeyboardInterrupt: 

In [15]:
import torch
from transformers import BertModel

# Load pre-trained BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Generate embeddings for each ingredient list
def get_bert_embedding(text, tokenizer, model):
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the [CLS] token embedding
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

In [16]:
from sklearn.model_selection import train_test_split
import numpy as np
import json

# Loading the bert embeddings
data = pd.read_csv("data/bert_embeddings.csv")
data['bert_embeddings'] = data['bert_embeddings'].apply(lambda x: np.array(json.loads(x)))


# Extract features and labels
X = np.vstack(data['bert_embeddings'])
y = data['primary_category']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train a Random Forest classifier
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Evaluate the model
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

   Fragrance       0.98      0.96      0.97       262
        Hair       0.86      0.53      0.66       246
      Makeup       0.85      0.82      0.83       411
    Skincare       0.70      0.87      0.78       451

    accuracy                           0.81      1370
   macro avg       0.85      0.80      0.81      1370
weighted avg       0.83      0.81      0.81      1370



In [18]:
# Example ingredient list
new_ingredients = "aqua, glycerin, cetyl alcohol, fragrance"

# Generate embedding for the new ingredient list
new_embedding = get_bert_embedding(new_ingredients, tokenizer, bert_model).reshape(1, -1)

# Predict the category
predicted_category = classifier.predict(new_embedding)
print("Predicted Primary Category:", predicted_category[0])

Predicted Primary Category: Makeup


In [8]:
import json

# Convert embeddings to a string (JSON format) before saving
data['bert_embeddings'] = data['bert_embeddings'].apply(lambda x: json.dumps(x.tolist()))

# Save the DataFrame to a CSV file
data.to_csv("data/bert_embeddings.csv", index=False)

In [None]:
# Loading the bert embeddings
data = pd.read_csv("data/bert_embeddings.csv")
data['bert_embeddings'] = data['bert_embeddings'].apply(lambda x: np.array(json.loads(x)))


------------------------------------------------------------------------------------

Recommendation Part

In [16]:
from sentence_transformers import SentenceTransformer

# Load Sentence-BERT model
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Example input ingredients
input_ingredients = ["Sodium Chloride, Glycerin, Helianthus Annuus (Sunflower) Seed , Fragrance/Parfum, Glyceryl Stearate, Kaolin, Sodium Stearoyl Lactylate, Cocos Nucifera (Coconut) , Water/Aqua/Eau, Butyrospermum Parkii (Shea) Butter, Carthamus Tinctorius (Safflower) Seed , Glyceryl Stearate SE, Tocopherol, Caprylic/Capric Triglyceride, Rosa Canina Fruit , Melia Azadirachta Leaf Extract, Melia Azadirachta Flower Extract, Corallina Officinalis Extract, Himanthalia Elongata Extract, Raphanus Sativus (Radish) Root Extract, Maltodextrin, Coccinia Indica Fruit Extract, Gardenia Florida Fruit Extract, Solanum Melongena (Eggplant) Fruit Extract, Aloe Barbadensis Flower Extract, Simmondsia Chinensis (Jojoba) Seed , Curcuma Longa (Turmeric) Root Extract, Ocimum Basilicum (Basil) Flower/Leaf Extract, Ocimum Sanctum Leaf Extract."]

# Preprocess the input ingredients
processed_input = " ".join([ingredient.lower().strip() for ingredient in input_ingredients])

# Compute the embedding for the input
input_embedding = model.encode(processed_input, convert_to_tensor=True)


In [17]:
# Figuring out the category using the pre-trained model

cat_pred_ingredients = "Sodium Chloride, Glycerin, Helianthus Annuus (Sunflower) Seed , Fragrance/Parfum, Glyceryl Stearate, Kaolin, Sodium Stearoyl Lactylate, Cocos Nucifera (Coconut) , Water/Aqua/Eau, Butyrospermum Parkii (Shea) Butter, Carthamus Tinctorius (Safflower) Seed , Glyceryl Stearate SE, Tocopherol, Caprylic/Capric Triglyceride, Rosa Canina Fruit , Melia Azadirachta Leaf Extract, Melia Azadirachta Flower Extract, Corallina Officinalis Extract, Himanthalia Elongata Extract, Raphanus Sativus (Radish) Root Extract, Maltodextrin, Coccinia Indica Fruit Extract, Gardenia Florida Fruit Extract, Solanum Melongena (Eggplant) Fruit Extract, Aloe Barbadensis Flower Extract, Simmondsia Chinensis (Jojoba) Seed , Curcuma Longa (Turmeric) Root Extract, Ocimum Basilicum (Basil) Flower/Leaf Extract, Ocimum Sanctum Leaf Extract."

# Tokenize and generate embeddings
new_tokens = simple_preprocess(cat_pred_ingredients)
new_embedding = get_sentence_vector(word2vec_model, new_tokens).reshape(1, -1)

# Predict the category
predicted_category = classifier.predict(new_embedding)
print("Predicted Primary Category:", predicted_category[0])

Predicted Primary Category: Skincare


In [18]:
import json
import torch
from torch.nn.functional import cosine_similarity

def compute_similarity(input_emb, product_emb):
    return cosine_similarity(input_emb, product_emb, dim=0).item()

# Load the CSV files
ingredients_data = pd.read_csv("data/ingredients_embedding.csv")
concern_chems_data = pd.read_csv("data/concern_chems_embedding.csv")
red_list_data = pd.read_csv("data/red_list_embedding.csv")
the_gens_data = pd.read_csv("data/the_gens_embedding.csv")

# Convert JSON string embeddings back to tensors
ingredients_data["ingredients_embedding"] = ingredients_data["ingredients_embedding"].apply(lambda x: torch.tensor(json.loads(x)))
concern_chems_data["concern_chems_embedding"] = concern_chems_data["concern_chems_embedding"].apply(lambda x: torch.tensor(json.loads(x)))
red_list_data["red_list_embedding"] = red_list_data["red_list_embedding"].apply(lambda x: torch.tensor(json.loads(x)))
the_gens_data["the_gens_embedding"] = the_gens_data["the_gens_embedding"].apply(lambda x: torch.tensor(json.loads(x)))

# Input primary category (assume we determine this beforehand)
input_primary_category = predicted_category[0] # Replace this with the actual category from the input

# Filter dataset for the same primary category
filtered_data = data[data["primary_category"] == input_primary_category].copy()

# Calculate similarity scores only for filtered products
filtered_data["ingredient_similarity"] = ingredients_data["ingredients_embedding"].apply(
    lambda x: compute_similarity(input_embedding, x)
)
filtered_data["concern_chems_similarity"] = concern_chems_data["concern_chems_embedding"].apply(
    lambda x: compute_similarity(input_embedding, x)
)
filtered_data["red_list_similarity"] = red_list_data["red_list_embedding"].apply(
    lambda x: compute_similarity(input_embedding, x)
)
filtered_data["the_gens_similarity"] = the_gens_data["the_gens_embedding"].apply(
    lambda x: compute_similarity(input_embedding, x)
)

# Combine scores: prioritize high ingredient similarity and low harmful similarity
filtered_data["final_score"] = (
    filtered_data["ingredient_similarity"] - filtered_data["concern_chems_similarity"] - filtered_data["red_list_similarity"] - filtered_data["the_gens_similarity"]
)

In [19]:
# Sort by the final score (descending)
recommended_products = filtered_data.sort_values(by="final_score", ascending=False)

# Get the top product
top_product = recommended_products.iloc[0]
print("Recommended Product:")
print(f"Name: {top_product['product_name']}")
# print(f"Ingredients: {top_product['ingredients']}")
print(f"Detected Harmful Ingredients: {top_product['harmful_detected']}")
print(f"Similarity Score: {top_product['final_score']}")


Recommended Product:
Name: Coconut Milk Bath Soak


KeyError: 'harmful_detected'