In [1]:
import os
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained tokenizer and encoder from the same checkpoint name so
# that both are built from the same 'bert-base-uncased' release.
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
model = BertModel.from_pretrained(pretrained_name)

In [3]:
def generate_embedding(text):
    """Return a BERT embedding for ``text`` as a NumPy array.

    The text is encoded with the module-level ``tokenizer`` (PyTorch tensors,
    truncated to at most 128 tokens) and fed to the module-level ``model``
    with gradient tracking disabled. The result is the last hidden state at
    sequence position 0 (the slot BERT tokenizers fill with ``[CLS]``), with
    the leading batch axis kept.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=128,
    )
    # Inference only: skip autograd bookkeeping so .numpy() is allowed.
    with torch.no_grad():
        output = model(**encoded)
        first_token_state = output.last_hidden_state[:, 0, :]
        result = first_token_state.numpy()
    return result


In [4]:
def generate_and_save_embeddings(csv_path=None,
                                 output_path='offer_embeddings.npy',
                                 offer_column='OFFER'):
    """Generate an embedding for every offer and save them to a ``.npy`` file.

    Args:
        csv_path: Path of the CSV file holding the offers. When ``None``
            (the default) ``Datasets/offer_retailer.csv`` is used, built with
            ``os.path.join`` so it works on every operating system.
        output_path: Destination passed to ``np.save``. Defaults to
            ``'offer_embeddings.npy'`` in the current working directory.
        offer_column: Name of the CSV column that contains the offer text.
            Defaults to ``'OFFER'``.

    Returns:
        A tuple ``(offers, offer_embeddings)``: the list of values read from
        ``offer_column`` and a list with the result of
        ``generate_embedding`` for each of them, in the same order.
    """
    if csv_path is None:
        # Default location of the offers dataset, relative to the working dir.
        csv_path = os.path.join("Datasets", "offer_retailer.csv")

    # Load the offers table.
    offer_df = pd.read_csv(csv_path)

    # Embed each offer one by one, preserving the row order of the CSV.
    offers = offer_df[offer_column].tolist()
    offer_embeddings = [generate_embedding(offer) for offer in offers]

    # Persist the embeddings; np.save converts the list into a single array.
    np.save(output_path, offer_embeddings)

    # Hand both back so callers can keep working without reloading the file.
    return offers, offer_embeddings

In [5]:
# Run the pipeline once: embed every offer, save the embeddings to disk, and
# keep the offers and their embeddings in memory for the cells below.
offers, offer_embeddings = generate_and_save_embeddings()

In [6]:
# Sanity check: print the first offer (its embedding is printed in the next cell).
print(offers[0])

Spend $50 on a Full-Priced new Club Membership


In [7]:
# Sanity check: print the embedding generated for the first offer.
print(offer_embeddings[0])

[[-1.06063433e-01  3.27015743e-02 -3.03485483e-01  3.03139925e-01
  -1.18893899e-01 -1.74904823e-01  5.98251522e-02  6.98660851e-01
  -1.34912193e-01 -4.18268412e-01 -1.36960417e-01 -2.81980764e-02
   1.92050353e-01  3.02797705e-01 -8.48396346e-02  3.34609300e-04
  -5.29448576e-02  5.39136887e-01 -2.27013156e-01  2.61990637e-01
  -3.93296748e-01 -7.50676095e-02 -2.82008406e-02 -1.20547205e-01
   3.41931432e-01  3.07851017e-01  3.24194580e-02  5.94556332e-02
  -1.45098493e-02  9.82262418e-02 -2.17515290e-01 -1.38206072e-02
   1.26884341e-01 -3.14507395e-01  2.76281357e-01 -6.76931262e-01
   4.11946744e-01 -1.29396260e-01  5.23486808e-02  1.44063503e-01
   8.00085738e-02  3.09434831e-01  5.30616701e-01  2.09648371e-01
   9.09574479e-02 -5.48754215e-01 -2.05205607e+00 -2.46750966e-01
   1.28829196e-01 -2.13193417e-01  2.74126887e-01 -1.34497315e-01
  -1.78375319e-01  2.31803641e-01  3.84227112e-02  3.74074727e-01
  -1.20478548e-01  4.19835001e-01  4.70038295e-01  1.09798357e-01
   3.15754