In [None]:
Large Language Model (LLM)

In [None]:
# 1. Install dependencies (if you haven’t already):
#    pip install transformers datasets

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset,ClassLabel
import pandas as pd


# Define your furniture descriptions and labels
furniture_data_train = {
    'text': [
        'Modern velvet sofa with gold legs',
        'Oak dining table seating six',
        'Ergonomic office chair with lumbar support',
        'Queen size platform bed frame',
        'Solid wood coffee table',
        'Mid-century modern accent chair',
        'Glass top bedside table',
        'Leather recliner sofa',
        'Adjustable height standing desk',
        'Upholstered dining room chair'
    ],
    'label': [
        'sofa',
        'table',
        'chair',
        'bed',
        'table',
        'chair',
        'table',
        'sofa',
        'desk',
        'chair'
    ]
}

furniture_data_test = {
    'text': [
        'Glass top bedside table',
        'Upholstered dining room chair'
    ],
    'label': [
        'table',
        'chair'
    ]
}


# Create a DataFrame
df_furniture = pd.DataFrame(furniture_data_train)
df_furniture_test = pd.DataFrame(furniture_data_test)

# (Optional) Save to CSV for later use
df_furniture.to_csv('furniture_dataset.csv', index=False)
df_furniture_test.to_csv('furniture_dataset_test.csv', index=False)

# 2. Load your dataset
#    Suppose train.csv and test.csv each have "text","label"
dataset = load_dataset('csv', data_files={'train':'furniture_dataset.csv','test':'furniture_dataset_test.csv'})
# 3. Convert the 'label' column from strings → ClassLabel (ints)
dataset = dataset.class_encode_column('label')
label_feature = dataset['train'].features['label']
print(label_feature.names)

# 3. (Optional) If your labels are strings, convert them to ClassLabel
#    Here we assume label is already an int in [0..num_labels-1].
#    If not, you can do:
# labels = dataset['train'].unique('label')
# dataset = dataset.class_encode_column('label')

num_labels = len(dataset['train'].unique('label'))

# 4. Tokenize
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def tokenize_fn(examples):
    return tokenizer(examples['text'],
                     padding='max_length',
                     truncation=True,
                     max_length=128)
tokenized = dataset.map(tokenize_fn, batched=True)

# 5. Prepare for PyTorch
tokenized = tokenized.rename_column('label','labels')
tokenized.set_format(type='torch',
                     columns=['input_ids','attention_mask','labels'])

# 6. Load a pre‑trained BERT with a classification head
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels
)

# 7. Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    logging_dir='./logs',
    report_to= 'wandb'    # no toolkit
)

# 8. Create the Trainer and train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['test'],
    tokenizer=tokenizer
)
trainer.train()

# 9. Inference on new descriptions
def predict_label(text: str):
    inputs = tokenizer(text,
                       return_tensors='pt',
                       truncation=True,
                       padding=True)
    outputs = model(**inputs)
    pred_id = outputs.logits.argmax(dim=-1).item()
    # If you used ClassLabel you can call .int2str; otherwise map pred_id to your own list
    return label_feature.int2str(pred_id)

print(predict_label("Modern velvet sofa with gold legs"))


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Casting to class labels:   0%|          | 0/10 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/2 [00:00<?, ? examples/s]

['bed', 'chair', 'desk', 'sofa', 'table']


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


desk


In [None]:
label_feature.names

['bed', 'chair', 'desk', 'sofa', 'table']

## Bert for catelog prediction

In [2]:
# ------------------------------
# The code basically first applies a pretrained BERT model called 'bert-base-uncased', then uses the local data (texts and labels) to fine tune the base model.
# ------------------------------
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# ------------------------------
# 1. Prepare the Dataset
# ------------------------------
class ProductDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# ------------------------------
# 2. Define the Model
# ------------------------------
class BERTClassifier(nn.Module):
    def __init__(self, num_labels):
      # Here we fine tune the base BERT model called 'bert-base-uncased'.
      # During our training, we add a new classification layer (linear layer) and update the weights for this layer completely.
      # On the other hand, we only update the layers of BERT based model slightly. This is because base model is already smart enough, and we don't want to unlearn it.
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.pooler_output  # [CLS] token representation
        return self.classifier(self.dropout(cls_output))



In [3]:
texts = [
    "Modern velvet tufted loveseat with wooden legs",
    "High-back ergonomic office chair with mesh back",
    "Queen-size platform bed with storage drawers",
    "Round glass dining table with chrome base",
    "Rustic wooden coffee table with drawers",
    "Adjustable standing desk with memory presets",
    "L-shaped sectional sofa with reversible chaise",
    "Upholstered dining chair set of 2",
    "King platform bed frame with headboard",
    "Compact writing desk with metal frame",
    "Contemporary TV stand with open shelving",
    "Mid-century nightstand with walnut finish",
    "Fabric recliner chair with cup holders",
    "5-shelf bookcase with adjustable shelves",
    "Metal bunk bed with ladder and safety rails",
    "Leather office chair with tilt and lumbar support",
    "Foldable dining table for small spaces",
    "Tufted ottoman with storage compartment",
    "Swivel bar stools with backrest, set of 2",
    "Double dresser with 6 drawers and metal pulls"
]

labels = [
    "Loveseats",
    "Office Chairs",
    "Beds",
    "Dining Tables",
    "Coffee Tables",
    "Desks",
    "Sectionals",
    "Dining Chairs",
    "Beds",
    "Desks",
    "TV Stands",
    "Nightstands",
    "Recliners",
    "Bookcases",
    "Beds",
    "Office Chairs",
    "Dining Tables",
    "Ottomans",
    "Bar Stools",
    "Dressers"
]


In [4]:
# ------------------------------
# 3. Load and Encode Data
# ------------------------------

# Encode labels to integers
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)
num_classes = len(label_encoder.classes_)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 64

# Dataset and DataLoader
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels_encoded, test_size=0.2, random_state=42
)
train_dataset = ProductDataset(train_texts, train_labels, tokenizer, max_len)
val_dataset = ProductDataset(val_texts, val_labels, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2)

# ------------------------------
# 4. Train the Model
# ------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(num_labels=num_classes).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

epochs = 3
model.train()
for epoch in range(epochs):
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} Loss: {loss.item():.4f}")

# ------------------------------
# 5. Predict and Decode
# ------------------------------
model.eval() # I am using model for inference, not training
sample_text = "Tufted gray bed for living room"
encoded = tokenizer(sample_text, return_tensors='pt', padding='max_length', truncation=True, max_length=max_len).to(device)
# Encoded will look smth like below:
# encoded = {
#     'input_ids': tensor([[101, 17959, 3511, 6888, 2005,  living,  2733, 102, 0, 0]]),
#     'attention_mask': tensor([[1, 1, 1, 1,     1,       1,     1,   1, 0, 0]])
# }
with torch.no_grad(): # Disables gradient tracking bc we are not training
    logits = model(encoded['input_ids'], encoded['attention_mask'])
    # logits = tensor([[2.1, 0.5, -1.2, 1.4]])  # shape: [1, num_classes]. The logits represent the confidence scores.
    # torch.argmax([2.1, 0.5, -1.2, 1.4]) → 0, this means the predicted class is 0
    predicted_class = torch.argmax(logits, dim=1).item()

print("Predicted Category:", label_encoder.inverse_transform([predicted_class])[0]) # Return the original label

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1 Loss: 2.7335
Epoch 2 Loss: 2.4330
Epoch 3 Loss: 2.2883
Predicted Category: Beds


In [10]:
label_encoder.inverse_transform([predicted_class])[0]

np.str_('Beds')

In [8]:
# ------------------------------
# 5. Predict and Decode
# ------------------------------
model.eval()
sample_text = "Tufted gray dining chair for living room"
encoded = tokenizer(sample_text, return_tensors='pt', padding='max_length', truncation=True, max_length=max_len).to(device)
with torch.no_grad():
    logits = model(encoded['input_ids'], encoded['attention_mask'])
    predicted_class = torch.argmax(logits, dim=1).item()

print("Predicted Category:", label_encoder.inverse_transform([predicted_class])[0])

Predicted Category: Beds
