In [3]:
from gensim.models import FastText


In [4]:
import gensim
print(gensim.__version__)  # Outputs the installed Gensim version


4.3.3


In [1]:
import pandas as pd

data = pd.read_csv('data/new_product_info.csv')

Predicting Category using Word2Vec model

In [6]:
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

# Tokenize the ingredients
data['tokenized_ingredients'] = data['ingredients'].apply(simple_preprocess)

# Train Word2Vec on the tokenized ingredients
word2vec_model = Word2Vec(
    sentences=data['tokenized_ingredients'],
    vector_size=100,       # Dimensionality of the embeddings
    window=5,              # Context window size
    min_count=1,           # Include all ingredients, even rare ones
    workers=4,             # Use 4 CPU cores for training
    sg=1,                  # Use Skip-Gram model
    epochs=10              # Number of iterations over the corpus
)

In [7]:
import numpy as np

def get_sentence_vector(model, sentence):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)  # Take the average of word vectors
    else:
        return np.zeros(model.vector_size)  # Return a zero vector if no words match

# Generate embeddings for each product's ingredients
data['ingredient_embeddings'] = data['tokenized_ingredients'].apply(
    lambda x: get_sentence_vector(word2vec_model, x)
)


In [8]:
from sklearn.model_selection import train_test_split

# Extract features (ingredient embeddings) and labels (primary category)
X = np.vstack(data['ingredient_embeddings'])
y = data['primary_category']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train a Random Forest classifier
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Evaluate the model
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

   Fragrance       0.99      0.97      0.98       262
        Hair       0.95      0.88      0.91       246
      Makeup       0.95      0.89      0.92       411
    Skincare       0.86      0.96      0.91       451

    accuracy                           0.93      1370
   macro avg       0.94      0.92      0.93      1370
weighted avg       0.93      0.93      0.93      1370



In [10]:
# Example ingredient list for prediction
new_ingredients = "aqua, glycerin, cetyl alcohol, fragrance"

# Tokenize and generate embeddings
new_tokens = simple_preprocess(new_ingredients)
new_embedding = get_sentence_vector(word2vec_model, new_tokens).reshape(1, -1)

# Predict the category
predicted_category = classifier.predict(new_embedding)
print("Predicted Primary Category:", predicted_category[0])


Predicted Primary Category: Hair


Predicting Category using Word2Vec model

In [11]:
from gensim.models import FastText

# Train FastText on the tokenized ingredients
fasttext_model = FastText(
    sentences=data['tokenized_ingredients'],  # Tokenized ingredients
    vector_size=100,                          # Dimensionality of word embeddings
    window=5,                                 # Context window size
    min_count=1,                              # Include all words, even rare ones
    workers=4,                                # Use 4 CPU cores for training
    sg=1,                                     # Use Skip-Gram (sg=1); CBOW if sg=0
    epochs=10                                 # Number of training iterations
)


In [12]:
# Generate embeddings for each product's ingredients
data['ingredient_embeddings'] = data['tokenized_ingredients'].apply(
    lambda x: get_sentence_vector(fasttext_model, x)
)

# Extract features (ingredient embeddings) and labels (primary category)
X = np.vstack(data['ingredient_embeddings'])
y = data['primary_category']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Train a Random Forest classifier
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Evaluate the model
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

   Fragrance       0.98      0.97      0.98       262
        Hair       0.96      0.86      0.91       246
      Makeup       0.95      0.89      0.92       411
    Skincare       0.85      0.96      0.90       451

    accuracy                           0.92      1370
   macro avg       0.94      0.92      0.93      1370
weighted avg       0.93      0.92      0.92      1370



Predicting Category using BERT

In [2]:
from transformers import BertTokenizer

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the ingredients column
data['tokenized_ingredients'] = data['ingredients'].apply(
    lambda x: tokenizer(
        text=x,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )
)


In [3]:
import torch
from transformers import BertModel

# Load pre-trained BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Generate embeddings for each ingredient list
def get_bert_embedding(text, tokenizer, model):
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the [CLS] token embedding
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

# Apply to the dataset
data['bert_embeddings'] = data['ingredients'].apply(
    lambda x: get_bert_embedding(x, tokenizer, bert_model)
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


KeyboardInterrupt: 