In [None]:
# %pip install torch transformers datasets

In [1]:
# Importing the necessary libraries
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# %pip install zstandard

In [2]:
# Open the ESCI dataset in streaming mode and peek at one training record.
# Anything that fails inside the block is reported and then re-raised, so the
# cell still stops with the original traceback.
try:
    dataset = load_dataset(
        "nixiesearch/amazon-esci",
        streaming=True,
    )
    print("Dataset loaded in streaming mode successfully.")
    print(next(iter(dataset["train"])))
except Exception as load_error:
    print(f"Error loading dataset in streaming mode: {load_error}")
    raise

Dataset loaded in streaming mode successfully.
{'query': 'kids probtiotic fiber', 'doc': 'Fiber Prebiotic Sugar-Free Gummies with Inulin, Digestive Support for Kids & Adults - Apple Flavor (60 Ct) Chewable Gummies: Our fiber gummies for kids and adults contain 3.35g of Inulin, used to improve gut health, relieve constipation, reverse appetite loss and alleviate an upset stomach.\nFiber Supplements: The easy way to take a dose of pure superfood supplement, our fiber supplement gummies are made for both adults and kids and have a delicious apple flavor with no artificial colors, flavors, or additives.\nFiber for Kids & Adults; BeLive’s Prebiotic Kids Fiber Gummies include Chicory Root & Inulin which are highly effective fibers to improve digestive function and helps in achieving a healthy weight. For the best result, take it with BeLive’s Probiotic Gummies!\nFiber Gummies for Adults Sugar Free: Vegan Friendly, BeLive does not use gelatin or any animal products in the making of our fiber 

In [3]:
# Report the size of every split (when it has one) and preview its first record.
for split, split_data in dataset.items():
    try:
        size = len(split_data)
    except TypeError:
        # len() raises TypeError for objects without __len__, which is how the
        # streaming splits behave here. Any other error (e.g. a network
        # failure) is left to propagate instead of being mislabeled.
        size = "Unable to determine (streaming dataset)"
    print(f"Split: {split}, Size: {size}")

    # Print first item in each split
    print(f"First item in {split} split:")
    print(next(iter(split_data)))
    print()

Split: train, Size: Unable to determine (streaming dataset)
First item in train split:
{'query': 'kids probtiotic fiber', 'doc': 'Fiber Prebiotic Sugar-Free Gummies with Inulin, Digestive Support for Kids & Adults - Apple Flavor (60 Ct) Chewable Gummies: Our fiber gummies for kids and adults contain 3.35g of Inulin, used to improve gut health, relieve constipation, reverse appetite loss and alleviate an upset stomach.\nFiber Supplements: The easy way to take a dose of pure superfood supplement, our fiber supplement gummies are made for both adults and kids and have a delicious apple flavor with no artificial colors, flavors, or additives.\nFiber for Kids & Adults; BeLive’s Prebiotic Kids Fiber Gummies include Chicory Root & Inulin which are highly effective fibers to improve digestive function and helps in achieving a healthy weight. For the best result, take it with BeLive’s Probiotic Gummies!\nFiber Gummies for Adults Sugar Free: Vegan Friendly, BeLive does not use gelatin or any ani

In [4]:
# Materialize the first 5 records of the train and test streams as plain lists.
train_samples = [record for record in dataset["train"].take(5)]
test_samples = [record for record in dataset["test"].take(5)]

print("Train samples:", len(train_samples))
print("Test samples:", len(test_samples))

# Show one record per split; sep="\n" puts the record on the line after its header.
print("\nFirst train sample:", train_samples[0], sep="\n")
print("\nFirst test sample:", test_samples[0], sep="\n")


Train samples: 5
Test samples: 5

First train sample:
{'query': 'kids probtiotic fiber', 'doc': 'Fiber Prebiotic Sugar-Free Gummies with Inulin, Digestive Support for Kids & Adults - Apple Flavor (60 Ct) Chewable Gummies: Our fiber gummies for kids and adults contain 3.35g of Inulin, used to improve gut health, relieve constipation, reverse appetite loss and alleviate an upset stomach.\nFiber Supplements: The easy way to take a dose of pure superfood supplement, our fiber supplement gummies are made for both adults and kids and have a delicious apple flavor with no artificial colors, flavors, or additives.\nFiber for Kids & Adults; BeLive’s Prebiotic Kids Fiber Gummies include Chicory Root & Inulin which are highly effective fibers to improve digestive function and helps in achieving a healthy weight. For the best result, take it with BeLive’s Probiotic Gummies!\nFiber Gummies for Adults Sugar Free: Vegan Friendly, BeLive does not use gelatin or any animal products in the making of our

In [5]:
# List the field names of the first record in each sampled split;
# sep="\n" puts the keys on the line after their header.
print("Keys in train samples:", train_samples[0].keys(), sep="\n")
print("\nKeys in test samples:", test_samples[0].keys(), sep="\n")


Keys in train samples:
dict_keys(['query', 'doc', 'neg', 'negscore'])

Keys in test samples:
dict_keys(['query', 'doc', 'neg', 'negscore'])


In [6]:
# Import necessary libraries
from datasets import Dataset
from transformers import AutoTokenizer
import pandas as pd


# Function to tokenize and prepare samples for training
def prepare_samples(examples):
    """Expand a batch of rows into tokenized (query, text) pairs with labels.

    Every row contributes one positive pair (its ``doc``, labelled 1.0) and one
    pair per entry of its ``neg`` list (labelled with the matching ``negscore``).
    All positives come first in the output, followed by the negatives in row order.

    Args:
        examples: Batch mapping with the columns "query", "doc", "neg" and
            "negscore"; each column is a sequence with one entry per row.

    Returns:
        The output of the module-level ``tokenizer`` for all pairs (padded and
        truncated to 512 tokens, PyTorch tensors) with an added "labels" tensor.
    """
    # (query, text, label) triples: positives first, then the flattened negatives.
    positive_triples = [
        (query, doc, 1.0)
        for query, doc in zip(examples["query"], examples["doc"])
    ]
    negative_triples = [
        (query, neg_text, neg_score)
        for query, neg_texts, neg_scores in zip(
            examples["query"], examples["neg"], examples["negscore"]
        )
        for neg_text, neg_score in zip(neg_texts, neg_scores)
    ]
    triples = positive_triples + negative_triples

    # Split the triples into the parallel lists the tokenizer call expects.
    pair_queries = [query for query, _, _ in triples]
    pair_texts = [text for _, text, _ in triples]
    pair_labels = [label for _, _, label in triples]

    # Encode each query together with its text as one sequence pair.
    encoded = tokenizer(
        pair_queries,
        pair_texts,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    # Attach the relevance targets alongside the encoded inputs.
    encoded["labels"] = torch.tensor(pair_labels)
    return encoded

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def _tokenize_split(raw_split):
    """Run ``prepare_samples`` over ``raw_split`` in batches of 32.

    The original columns are removed, so the result only holds what
    ``prepare_samples`` returns.
    """
    return raw_split.map(
        prepare_samples,
        batched=True,
        batch_size=32,
        remove_columns=raw_split.column_names,
    )


# Convert train_samples and test_samples to Dataset objects
train_dataset = Dataset.from_list(train_samples)
test_dataset = Dataset.from_list(test_samples)

# Tokenize both datasets with the shared helper
tokenized_train_dataset = _tokenize_split(train_dataset)
tokenized_test_dataset = _tokenize_split(test_dataset)

# Summarize the tokenized training set and show its first row.
print("Tokenized train dataset:", tokenized_train_dataset, sep="\n")
print("\nFirst tokenized train sample:", tokenized_train_dataset[0], sep="\n")



Map: 100%|██████████| 5/5 [00:00<00:00, 133.51 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 312.33 examples/s]

Tokenized train dataset:
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 56
})

First tokenized train sample:
{'input_ids': [101, 4268, 4013, 19279, 25185, 2594, 11917, 102, 11917, 3653, 26591, 5699, 1011, 2489, 16031, 28397, 2007, 1999, 18639, 1010, 17886, 3512, 2490, 2005, 4268, 1004, 6001, 1011, 6207, 14894, 1006, 3438, 14931, 1007, 21271, 3085, 16031, 28397, 1024, 2256, 11917, 16031, 28397, 2005, 4268, 1998, 6001, 5383, 1017, 1012, 3486, 2290, 1997, 1999, 18639, 1010, 2109, 2000, 5335, 9535, 2740, 1010, 15804, 9530, 16643, 24952, 2239, 1010, 7901, 18923, 3279, 1998, 24251, 2019, 6314, 4308, 1012, 11917, 25654, 1024, 1996, 3733, 2126, 2000, 2202, 1037, 13004, 1997, 5760, 3565, 14876, 7716, 12448, 1010, 2256, 11917, 12448, 16031, 28397, 2024, 2081, 2005, 2119, 6001, 1998, 4268, 1998, 2031, 1037, 12090, 6207, 14894, 2007, 2053, 7976, 6087, 1010, 26389, 1010, 2030, 29167, 2015, 1012, 11917, 2005, 4268, 1004, 6001, 1025, 19337, 3512, 15




Printing a decoded sample of the tokenized dataset (without tensors)
We observe that the decoded sample contains [CLS], [SEP], and [PAD] tokens. Each query–document pair is encoded as a single sequence, with a [SEP] token between the query and the document. The labels are the relevance scores.


In [7]:
# Decode one tokenized training row back to text so it can be read by eye.
import json

# Row to inspect. prepare_samples emits the positive pairs first and the
# training set was built from 5 source rows, so index 7 is a negative pair,
# not the first row. The caption below therefore reports the actual index.
sample_index = 7
print(f"\nTokenized train sample at index {sample_index} (without tensors):")
sample = tokenized_train_dataset[sample_index]
readable_sample = {
    # Token ids turned back into a string, special tokens included.
    "input_ids": tokenizer.decode(sample["input_ids"]),
    "labels": sample["labels"],
}
print(json.dumps(readable_sample, indent=2))



First tokenized train sample (without tensors):
{
  "input_ids": "[CLS] kids probtiotic fiber [SEP] pottywise liquid stool softener for kids - stool softener and liquidlaxative for kids - gentle constipation relief for kids, fiber for kids, kids stool softener < p > a backed up baby booty can wreak havoc on your day. looking for an easy & safe way to keep constipation away? our digestive support supplements safely stop constipation, eliminate gas, relieve tummy cramps & reduce bloating! < / p > < p > < / p > < p > 100 % absolutely no sugar, gluten, preservatives, etc. < / p > < p > < / p > < p > stop giving your kid a laxative with stool softener designed for adult tummies. sugar - free, gluten - free & blended using only the finest laxative herbs, these digestion drops provide natural constipation relief for kids & gassy babies. < / p > < p > < / p > < p > yummy bowel health formula children enjoy! don't worry about tricking your kid into taking his / her bowel movement supplement. o

In [None]:
# %pip install accelerate -U

Model Training and Evaluation

In the cell below, we perform the following steps:

1. Initialize the BERT model for sequence classification with a single output (regression).
2. Split the tokenized dataset into training and validation sets.
3. Define training arguments, including:
   - Output directory for results
   - Number of training epochs
   - Batch sizes for training and evaluation
   - Warmup steps and weight decay for optimization
   - Logging directory
   - Evaluation and saving strategies
4. Create a Trainer object with the model, training arguments, datasets, and tokenizer.
5. Start the training process.
6. Evaluate the trained model on the validation set.
7. Save the final model.

The loss function for this regression task is Mean Squared Error (MSE): the
average squared difference between the predicted relevance scores and the
target labels. Training minimizes this loss.

The use of a single output neuron (num_labels=1) in the model initialization
indicates that we're treating this as a regression problem, predicting a continuous
relevance score rather than discrete classes.


In [None]:
# # Initialize the model
# model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)

# # Split the tokenized datasets into train and validation sets
# tokenized_dataset = tokenized_train_dataset.train_test_split(test_size=0.2)

# print("Training dataset size:", len(tokenized_dataset["train"]))
# print("Validation dataset size:", len(tokenized_dataset["test"]))

# # Define training arguments
# training_args = TrainingArguments(
#     output_dir="./results",
#     num_train_epochs=3,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
# )

# # Create the Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["test"],
#     tokenizer=tokenizer,
# )

# # Start training
# print("Starting training...")
# trainer.train()

# # Evaluate the model
# print("Evaluating the model...")
# eval_results = trainer.evaluate()
# print(f"Evaluation results: {eval_results}")

# # Save the model
# print("Saving the model...")
# trainer.save_model("./final_model")
# print("Model saved successfully.")




In [None]:
# Install the required packages
# `%pip` installs into the environment of the running kernel. `-U` upgrades
# each package to the newest available release; no versions are pinned here,
# so the result depends on when the cell is run.
%pip install transformers[torch] -U
%pip install accelerate -U


### Training Run with Weights and Biases monitoring


In [None]:
# Install the Weights & Biases client (imported as `wandb` in the cells below).
%pip install wandb

In [8]:
import os

import wandb

# Credentials must never be stored in the notebook. The API key is expected in
# the WANDB_API_KEY environment variable (set outside this file); when it is
# absent, wandb.login() lets W&B use stored credentials or prompt for a key.
# NOTE: a key was previously hardcoded in this cell; treat it as leaked and
# revoke it in the W&B account settings.
if not os.environ.get("WANDB_API_KEY"):
    wandb.login()

# Now initialize wandb
wandb.init(project="amazon-esci-ranking", name="bert-regression")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mswayatta[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [9]:
# Initialize wandb
import wandb

# Start the run and record the hyperparameters of this experiment with it.
wandb.init(
    project="amazon-esci-ranking",
    name="bert-regression",
    config=dict(
        model="bert-base-uncased",
        epochs=3,
        batch_size=8,
        learning_rate=2e-5,
        weight_decay=0.01,
    ),
)

print("Weights & Biases initialized successfully.")


Weights & Biases initialized successfully.


In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np
# import wandb  # Disabled for now

# Initialize wandb
# wandb.init(
#     project="amazon-esci-ranking",
#     name="bert-regression",
#     config={
#         "model": "bert-base-uncased",
#         "epochs": 3,
#         "batch_size": 8,
#         "learning_rate": 2e-5,
#         "weight_decay": 0.01,
#     }
# )

# Load bert-base-uncased with a sequence-classification head that has one output.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=1,
)

# Rebind the dataset names to the tokenized datasets built earlier. These
# names also refer to the raw Dataset objects in the tokenization cell.
train_dataset, test_dataset = tokenized_train_dataset, tokenized_test_dataset

print("Training dataset size:", len(train_dataset))
print("Test dataset size:", len(test_dataset))

# Define custom metrics computation
def compute_metrics(eval_pred):
    """Compute MSE and RMSE between predictions and labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` may be an
            array-like of any shape holding one value per label (for example
            ``(N, 1)`` or ``(N,)``), or a tuple whose first element is that
            array-like. ``labels`` is an array-like with one value per example.

    Returns:
        dict: ``{"mse": float, "rmse": float}`` as plain Python floats.

    Raises:
        ValueError: If there are no examples, or if the number of predictions
            differs from the number of labels.
    """
    predictions, labels = eval_pred

    # Some models return extra outputs next to the scores; keep only the scores.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Flatten both sides so that (N, 1) predictions line up with (N,) labels
    # instead of broadcasting to an (N, N) difference matrix.
    predictions = np.asarray(predictions, dtype=np.float64).reshape(-1)
    labels = np.asarray(labels, dtype=np.float64).reshape(-1)

    if predictions.size == 0:
        raise ValueError("compute_metrics received no examples")
    if predictions.shape != labels.shape:
        raise ValueError(
            f"predictions and labels differ in length: "
            f"{predictions.size} vs {labels.size}"
        )

    mse = float(np.mean((predictions - labels) ** 2))
    rmse = float(np.sqrt(mse))
    return {
        "mse": mse,
        "rmse": rmse,
    }

# Define training arguments.
# - learning_rate is set explicitly to 2e-5, the value recorded in this
#   notebook's W&B run config, instead of silently using the library default.
# - warmup_ratio replaces the fixed warmup_steps=500: the tokenized training
#   set shown above has 56 rows, i.e. 7 steps per epoch at batch size 8 and
#   21 steps over 3 epochs, so a 500-step warmup would never finish and the
#   learning rate would stay near zero for the whole run.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="./logs",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # report_to="wandb",  # Uncomment to request wandb logging explicitly
)

# Create the Trainer with the custom regression metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # Reports MSE / RMSE at each evaluation
)

# Start training
print("Starting training...")
trainer.train()

# Evaluate the model on eval_dataset (the tokenized test samples)
print("Evaluating the model...")
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Log evaluation results to wandb
# wandb.log({"eval_results": eval_results})

# Save the model
print("Saving the model...")
trainer.save_model("./final_model")
print("Model saved successfully.")

# Finish the wandb run
# wandb.finish()

In [12]:
tokenized_train_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 56
})

In [14]:
# Sizes of whatever `train_dataset` / `test_dataset` are bound to right now.
# The tokenization cell binds them to the raw datasets and the training cell
# to the tokenized ones, so the numbers depend on which cell ran last.
n_train, n_test = len(train_dataset), len(test_dataset)
print("Training dataset size:", n_train)
print("Test dataset size:", n_test)

Training dataset size: 5
Test dataset size: 5
