In [1]:
import pandas as pd

data = pd.read_csv('data/new_product_info.csv')

------------------------------------------------------------------------------------

Predicting Category using Word2Vec model

In [2]:
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

# Tokenize the ingredients
data['tokenized_ingredients'] = data['ingredients'].apply(simple_preprocess)

# Train Word2Vec on the tokenized ingredients
word2vec_model = Word2Vec(
    sentences=data['tokenized_ingredients'],
    vector_size=100,       # Dimensionality of the embeddings
    window=5,              # Context window size
    min_count=1,           # Include all ingredients, even rare ones
    workers=4,             # Use 4 CPU cores for training
    sg=1,                  # Use Skip-Gram model
    epochs=10              # Number of iterations over the corpus
)

In [3]:
import numpy as np

def get_sentence_vector(model, sentence):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)  # Take the average of word vectors
    else:
        return np.zeros(model.vector_size)  # Return a zero vector if no words match

# Generate embeddings for each product's ingredients
data['ingredient_embeddings'] = data['tokenized_ingredients'].apply(
    lambda x: get_sentence_vector(word2vec_model, x)
)


In [4]:
from sklearn.model_selection import train_test_split

# Extract features (ingredient embeddings) and labels (primary category)
X = np.vstack(data['ingredient_embeddings'])
y = data['primary_category']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train a Random Forest classifier
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Evaluate the model
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

   Fragrance       0.98      0.98      0.98       172
        Hair       0.95      0.86      0.90       158
      Makeup       0.95      0.92      0.94       276
    Skincare       0.88      0.95      0.91       310

    accuracy                           0.93       916
   macro avg       0.94      0.93      0.93       916
weighted avg       0.93      0.93      0.93       916



In [6]:
# Example ingredient list for prediction
new_ingredients = "aqua, glycerin, cetyl alcohol, fragrance"

# Tokenize and generate embeddings
new_tokens = simple_preprocess(new_ingredients)
new_embedding = get_sentence_vector(word2vec_model, new_tokens).reshape(1, -1)

# Predict the category
predicted_category = classifier.predict(new_embedding)
print("Predicted Primary Category:", predicted_category[0])


Predicted Primary Category: Hair


------------------------------------------------------------------------------------

Predicting Category using Word2Vec model

In [11]:
from gensim.models import FastText

# Train FastText on the tokenized ingredients
fasttext_model = FastText(
    sentences=data['tokenized_ingredients'],  # Tokenized ingredients
    vector_size=100,                          # Dimensionality of word embeddings
    window=5,                                 # Context window size
    min_count=1,                              # Include all words, even rare ones
    workers=4,                                # Use 4 CPU cores for training
    sg=1,                                     # Use Skip-Gram (sg=1); CBOW if sg=0
    epochs=10                                 # Number of training iterations
)


In [12]:
# Generate embeddings for each product's ingredients
data['ingredient_embeddings'] = data['tokenized_ingredients'].apply(
    lambda x: get_sentence_vector(fasttext_model, x)
)

# Extract features (ingredient embeddings) and labels (primary category)
X = np.vstack(data['ingredient_embeddings'])
y = data['primary_category']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Train a Random Forest classifier
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Evaluate the model
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

   Fragrance       0.98      0.97      0.98       262
        Hair       0.96      0.86      0.91       246
      Makeup       0.95      0.89      0.92       411
    Skincare       0.85      0.96      0.90       451

    accuracy                           0.92      1370
   macro avg       0.94      0.92      0.93      1370
weighted avg       0.93      0.92      0.92      1370



------------------------------------------------------------------------------------

Predicting Category using BERT

In [14]:
from transformers import BertTokenizer

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [13]:
# Tokenize the ingredients column
data['tokenized_ingredients'] = data['ingredients'].apply(
    lambda x: tokenizer(
        text=x,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )
)


KeyboardInterrupt: 

In [8]:
import torch
from transformers import BertModel

# Load pre-trained BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Generate embeddings for each ingredient list
def get_bert_embedding(text, tokenizer, model):
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the [CLS] token embedding
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

# Apply to the dataset
data['bert_embeddings'] = data['ingredients'].apply(
    lambda x: get_bert_embedding(x, tokenizer, bert_model)
)


KeyboardInterrupt: 

In [15]:
import torch
from transformers import BertModel

# Load pre-trained BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Generate embeddings for each ingredient list
def get_bert_embedding(text, tokenizer, model):
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the [CLS] token embedding
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

In [16]:
from sklearn.model_selection import train_test_split
import numpy as np
import json

# Loading the bert embeddings
data = pd.read_csv("data/bert_embeddings.csv")
data['bert_embeddings'] = data['bert_embeddings'].apply(lambda x: np.array(json.loads(x)))


# Extract features and labels
X = np.vstack(data['bert_embeddings'])
y = data['primary_category']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train a Random Forest classifier
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Evaluate the model
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

   Fragrance       0.98      0.96      0.97       262
        Hair       0.86      0.53      0.66       246
      Makeup       0.85      0.82      0.83       411
    Skincare       0.70      0.87      0.78       451

    accuracy                           0.81      1370
   macro avg       0.85      0.80      0.81      1370
weighted avg       0.83      0.81      0.81      1370



In [18]:
# Example ingredient list
new_ingredients = "aqua, glycerin, cetyl alcohol, fragrance"

# Generate embedding for the new ingredient list
new_embedding = get_bert_embedding(new_ingredients, tokenizer, bert_model).reshape(1, -1)

# Predict the category
predicted_category = classifier.predict(new_embedding)
print("Predicted Primary Category:", predicted_category[0])

Predicted Primary Category: Makeup


In [8]:
import json

# Convert embeddings to a string (JSON format) before saving
data['bert_embeddings'] = data['bert_embeddings'].apply(lambda x: json.dumps(x.tolist()))

# Save the DataFrame to a CSV file
data.to_csv("data/bert_embeddings.csv", index=False)

In [None]:
# Loading the bert embeddings
data = pd.read_csv("data/bert_embeddings.csv")
data['bert_embeddings'] = data['bert_embeddings'].apply(lambda x: np.array(json.loads(x)))


------------------------------------------------------------------------------------

Recommendation Part

In [12]:
from sentence_transformers import SentenceTransformer

# Load Sentence-BERT model
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Example input ingredients
input_ingredients = ["WaterAquaEau , Glycerin , Caprylyl Methicone , Butylene Glycol , Alcohol Denat. , Dimethicone , Isododecane , Peg-10 Dimethicone , Dipropylene Glycol , Prunus Amygdalus Dulcis (Sweet Almond) Seed Extract , Cucumis Melo (Melon) Fruit Extract , Persea Gratissima (Avocado) Oil , Saccharomyces Lysate Extract , Acetyl Glucosamine , Porphyridium Cruentum Extract , Dipeptide Diaminobutyroyl Benzylamide Diacetate , Acetyl Hexapeptide-8 , Palmitoyl Tetrapeptide-7 , Acetyl Octapeptide-3 , Cholesterol , Decarboxy Carnosine Hcl , Acetyl Carnitine Hcl , Caffeine , Creatine , Sigesbeckia Orientalis (St. Paul'S Wort) Extract , Palmitoyl Tripeptide-1 , Polygonum Cuspidatum Root Extract , Centella Asiatica (Hydrocotyl) Extract , Glycine Soja (Soybean) Protein , Ergothioneine , Propylene Glycol Dicaprylate , Peg-150 , Lauryl Peg-9 Polydimethylsiloxyethyl Dimethicone , Jojoba Esters , Whey ProteinLactis ProteinProtine Du Petit-Lait , Adenosine Phosphate , Tocopheryl Acetate , Lactic Acid , Yeast ExtractFaexExtrait De Levu Linoleic Acid , Sodium Hyaluronate , Phytantriol , Glycine Soja (Soybean) Seed Extract , Disteardimonium Hectorite , Propylene Carbonate , Methanediol , Hydroxypropyl Methylcellulose , Pullulan , Polyquaternium-51 , Sodium Hydroxide , Calcium Chloride , Carbomer , Sea Whip Extract , Polysorbate 20 , Caprylyl Glycol , Tetrahexyldecyl Ascorbate , Citric Acid , Potassium Sulfate , Sodium Hexametaphosphate , Sodium Citrate , Potassium Sorbate , Sodium Benzoate , Phenoxyethanol"]

# Preprocess the input ingredients
processed_input = " ".join([ingredient.lower().strip() for ingredient in input_ingredients])

# Compute the embedding for the input
input_embedding = model.encode(processed_input, convert_to_tensor=True)


In [13]:
# Figuring out the category using the pre-trained model

cat_pred_ingredients = "WaterAquaEau , Glycerin , Caprylyl Methicone , Butylene Glycol , Alcohol Denat. , Dimethicone , Isododecane , Peg-10 Dimethicone , Dipropylene Glycol , Prunus Amygdalus Dulcis (Sweet Almond) Seed Extract , Cucumis Melo (Melon) Fruit Extract , Persea Gratissima (Avocado) Oil , Saccharomyces Lysate Extract , Acetyl Glucosamine , Porphyridium Cruentum Extract , Dipeptide Diaminobutyroyl Benzylamide Diacetate , Acetyl Hexapeptide-8 , Palmitoyl Tetrapeptide-7 , Acetyl Octapeptide-3 , Cholesterol , Decarboxy Carnosine Hcl , Acetyl Carnitine Hcl , Caffeine , Creatine , Sigesbeckia Orientalis (St. Paul'S Wort) Extract , Palmitoyl Tripeptide-1 , Polygonum Cuspidatum Root Extract , Centella Asiatica (Hydrocotyl) Extract , Glycine Soja (Soybean) Protein , Ergothioneine , Propylene Glycol Dicaprylate , Peg-150 , Lauryl Peg-9 Polydimethylsiloxyethyl Dimethicone , Jojoba Esters , Whey ProteinLactis ProteinProtine Du Petit-Lait , Adenosine Phosphate , Tocopheryl Acetate , Lactic Acid , Yeast ExtractFaexExtrait De Levu Linoleic Acid , Sodium Hyaluronate , Phytantriol , Glycine Soja (Soybean) Seed Extract , Disteardimonium Hectorite , Propylene Carbonate , Methanediol , Hydroxypropyl Methylcellulose , Pullulan , Polyquaternium-51 , Sodium Hydroxide , Calcium Chloride , Carbomer , Sea Whip Extract , Polysorbate 20 , Caprylyl Glycol , Tetrahexyldecyl Ascorbate , Citric Acid , Potassium Sulfate , Sodium Hexametaphosphate , Sodium Citrate , Potassium Sorbate , Sodium Benzoate , Phenoxyethanol"

# Tokenize and generate embeddings
new_tokens = simple_preprocess(cat_pred_ingredients)
new_embedding = get_sentence_vector(word2vec_model, new_tokens).reshape(1, -1)

# Predict the category
predicted_category = classifier.predict(new_embedding)
print("Predicted Primary Category:", predicted_category[0])

Predicted Primary Category: Skincare


In [14]:
import json
import torch
from torch.nn.functional import cosine_similarity

def compute_similarity(input_emb, product_emb):
    return cosine_similarity(input_emb, product_emb, dim=0).item()

# Load the CSV files
ingredients_data = pd.read_csv("data/ingredients_embedding.csv")
concern_chems_data = pd.read_csv("data/concern_chems_embedding.csv")
red_list_data = pd.read_csv("data/red_list_embedding.csv")
the_gens_data = pd.read_csv("data/the_gens_embedding.csv")

# Convert JSON string embeddings back to tensors
ingredients_data["ingredients_embedding"] = ingredients_data["ingredients_embedding"].apply(lambda x: torch.tensor(json.loads(x)))
concern_chems_data["concern_chems_embedding"] = concern_chems_data["concern_chems_embedding"].apply(lambda x: torch.tensor(json.loads(x)))
red_list_data["red_list_embedding"] = red_list_data["red_list_embedding"].apply(lambda x: torch.tensor(json.loads(x)))
the_gens_data["the_gens_embedding"] = the_gens_data["the_gens_embedding"].apply(lambda x: torch.tensor(json.loads(x)))

# Input primary category (assume we determine this beforehand)
input_primary_category = predicted_category[0] # Replace this with the actual category from the input

# Filter dataset for the same primary category
filtered_data = data[data["primary_category"] == input_primary_category].copy()

# Calculate similarity scores only for filtered products
filtered_data["ingredient_similarity"] = ingredients_data["ingredients_embedding"].apply(
    lambda x: compute_similarity(input_embedding, x)
)
filtered_data["concern_chems_similarity"] = concern_chems_data["concern_chems_embedding"].apply(
    lambda x: compute_similarity(input_embedding, x)
)
filtered_data["red_list_similarity"] = red_list_data["red_list_embedding"].apply(
    lambda x: compute_similarity(input_embedding, x)
)
filtered_data["the_gens_similarity"] = the_gens_data["the_gens_embedding"].apply(
    lambda x: compute_similarity(input_embedding, x)
)

# Combine scores: prioritize high ingredient similarity and low harmful similarity
filtered_data["final_score"] = (
    filtered_data["ingredient_similarity"] - filtered_data["concern_chems_similarity"] - filtered_data["red_list_similarity"] - filtered_data["the_gens_similarity"]
)

In [15]:
# Sort by the final score (descending)
recommended_products = filtered_data.sort_values(by="final_score", ascending=False)

# Get the top product
top_product = recommended_products.iloc[0]
print("Recommended Product:")
print(f"Name: {top_product['product_name']}")
# print(f"Ingredients: {top_product['ingredients']}")
print(f"Detected Harmful Ingredients: {top_product['harmful_detected']}")
print(f"Similarity Score: {top_product['final_score']}")


Recommended Product:
Name: Makeup Setting Spray Organic Sunscreen SPF 30


KeyError: 'harmful_detected'