In [39]:
def augment_descriptions(data):
    augmented_descriptions = []

    
    for item in data:
        name = item.get('name', '').strip()
        color = item.get('color', '').strip()
        brand = item.get('brand', '').strip()
        description = item.get('description', '').strip()

        if name and color:
            new_description = f"{brand.capitalize()} {color.lower()} {name.lower()}."
            if description:
                new_description += f" {description}"
                item['description'] = new_description
                augmented_descriptions.append(new_description)

    return augmented_descriptions

In [2]:
%pip install torch
%pip install transformers

import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast, BertModel


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [40]:
import json

# Read the scraped product records. The encoding is given explicitly so the
# result does not depend on the platform's locale default.
with open('zarascraper/zara-data/unique-products.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Build one text document per product that has both a name and a color
# (this also rewrites each such item's 'description' in place).
docs = augment_descriptions(data)

# Load the pretrained encoder, configured to also return per-layer hidden states.
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
# Switch to evaluation mode; as the cell's last expression this also displays the model.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [41]:
# Tokenizer loaded from the same 'bert-base-uncased' checkpoint name as the model.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Encode every document in a single call, with padding and truncation enabled,
# returning PyTorch tensors.
tokenized_docs = tokenizer(
    docs,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

print(tokenized_docs)

{'input_ids': tensor([[  101, 23564,  2527,  ...,     0,     0,     0],
        [  101, 23564,  2527,  ...,     0,     0,     0],
        [  101, 23564,  2527,  ...,     0,     0,     0],
        ...,
        [  101, 23564,  2527,  ...,  4642,  1007,   102],
        [  101, 23564,  2527,  ...,     0,     0,     0],
        [  101, 23564,  2527,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [35]:
# Run the encoder over the whole tokenized batch without tracking gradients.
with torch.no_grad():
    outputs = model(**tokenized_docs)

# Keep the final-layer hidden state at sequence position 0 for each document
# (presumably the [CLS] token -- confirm against the tokenizer) as a NumPy array.
sentence_embeddings = outputs.last_hidden_state[:, 0].numpy()

sentence_embeddings

array([[-0.3373197 , -1.0613083 ,  0.11465828, ...,  0.49393478,
         0.00672049, -0.4190629 ],
       [-0.3566898 , -0.9231635 ,  0.11140573, ...,  0.4030354 ,
         0.04799193, -0.44828123],
       [-0.34996334, -1.038434  ,  0.08831158, ...,  0.28725287,
        -0.10124843, -0.42541602],
       ...,
       [-1.0120435 , -0.7619317 , -0.316899  , ...,  0.32375476,
        -0.14339298,  0.08462722],
       [-0.9960088 , -0.7691596 , -0.14577636, ...,  0.300434  ,
        -0.23361614,  0.07289792],
       [-0.30383855, -0.83472186,  0.07650954, ...,  0.15699425,
         0.14622018,  0.16990244]], dtype=float32)

In [43]:
import pandas as pd

# Wrap the embedding matrix in a DataFrame: one row per document,
# one column per embedding component.
df = pd.DataFrame(data=sentence_embeddings)

# Write to CSV, keeping the positional row index as the first column.
df.to_csv(path_or_buf='bert_embeddings.csv', index=True)