In [1]:
! pip install transformers datasets evaluate

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m480.6/480.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K  

In [3]:
from datasets import load_dataset

Automotive = load_dataset(
    "json",
    data_files={
        "train": "Automotive.train.json",  # File with JSONL for training
        "test": "Automotive.test.json",  # File with JSONL for testing
    },
)
print(Automotive["train"][0])

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

{'overall': 1.0, 'verified': True, 'reviewTime': '09 2, 2014', 'reviewerID': 'A1X1OLX0FWVXSP', 'asin': 'B000HZRTDE', 'reviewerName': 'No One here but chickens', 'reviewText': 'Dirty and cheap. Looked like someone dropped this in the dirty and shoved it back in the product box, then shipped it to me. Just a cheap rumpled pad of paper inside.', 'summary': 'Dirty and cheap. Not impressed', 'unixReviewTime': 1409616000, 'style': None, 'vote': None, 'image': None}


In [4]:
def concat_summary_to_review(example):
    # Check if 'summary' exists and is not empty
    if "summary" in example and example["summary"]:
        # Concatenate summary to reviewText
        example["reviewText"] = f'{example["reviewText"]} Summary: {example["summary"]}.'
    return example


# Apply the function to the entire dataset
Automotive_with_summary = Automotive.map(concat_summary_to_review)

# Print an example to confirm the change
print(Automotive_with_summary["train"][0])


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'overall': 1.0, 'verified': True, 'reviewTime': '09 2, 2014', 'reviewerID': 'A1X1OLX0FWVXSP', 'asin': 'B000HZRTDE', 'reviewerName': 'No One here but chickens', 'reviewText': 'Dirty and cheap. Looked like someone dropped this in the dirty and shoved it back in the product box, then shipped it to me. Just a cheap rumpled pad of paper inside. Summary: Dirty and cheap. Not impressed.', 'summary': 'Dirty and cheap. Not impressed', 'unixReviewTime': 1409616000, 'style': None, 'vote': None, 'image': None}


In [5]:
def select_relevant_fields(example):
    return {"labels": example["overall"], "reviewText": example["reviewText"]}


# Apply the function to both train and test splits. we choose train arbitrarly since both train and test have the same column names
Automotive_filtered = Automotive_with_summary.map(
    select_relevant_fields, remove_columns=Automotive_with_summary["train"].column_names
)

# Print an example to confirm the change
print(Automotive_filtered["train"][0])

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'reviewText': 'Dirty and cheap. Looked like someone dropped this in the dirty and shoved it back in the product box, then shipped it to me. Just a cheap rumpled pad of paper inside. Summary: Dirty and cheap. Not impressed.', 'labels': 1.0}


In [6]:
from datasets import Features, Value


# Define a function to convert `labels` to zero-based integer
def convert_and_update_labels(example):
    example["labels"] = int(example["labels"]) - 1  # Convert to 0-based index
    return example


# Apply the function to both train and test splits
Automotive_labeled = Automotive_filtered.map(convert_and_update_labels)

# Define the updated schema with labels as int32
new_features = Features(
    {
        "labels": Value("int32"),  # Explicitly set labels as int32
        "reviewText": Value("string"),
    }
)

# Update the schema for both train and test splits
Automotive_int = Automotive_labeled.cast(new_features)

# Print an example to confirm the changes
print(Automotive_int["train"][0])
print(type(Automotive_int["train"][0]["labels"]))

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'labels': 0, 'reviewText': 'Dirty and cheap. Looked like someone dropped this in the dirty and shoved it back in the product box, then shipped it to me. Just a cheap rumpled pad of paper inside. Summary: Dirty and cheap. Not impressed.'}
<class 'int'>


In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
def process_row(example):
    # Replace missing or empty reviewText with "EMPTY"
    if not isinstance(example.get("reviewText"), str) or not example["reviewText"].strip():
        example["reviewText"] = "EMPTY"
    return example


def add_placeholder(dataset):
    for split in ["train", "test"]:  # Iterate over both train and test splits
        # Apply process_row to each row using map
        dataset[split] = dataset[split].map(process_row)
    return dataset


Automotive_placeholders = add_placeholder(Automotive_int)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [10]:
def preprocess_function(examples):
    return tokenizer(examples["reviewText"], truncation=True)

In [11]:
tokenized_Automotive = Automotive_placeholders.map(preprocess_function, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [12]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
import evaluate

accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [14]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [15]:
id2label = {0: "1", 1: "2", 2: "3", 3: "4", 4: "5"}
label2id = {"1": 0, "2": 1, "3": 2, "4": 3, "5": 4}

In [16]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=5, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Finding the best arguments, dont run this:

In [17]:
import itertools

# Define the grid of hyperparameters
learning_rates = [2e-5, 3e-5, 5e-5]
batch_sizes = [16, 32]
num_epochs = [2, 3,4, 5]
weight_decays = [0.01, 0.001]

# Generate all combinations of hyperparameters
param_combinations = list(itertools.product(learning_rates, batch_sizes, num_epochs, weight_decays))

# Variable to store the best results
best_accuracy = 0
best_params = {}


In [None]:
for lr, batch_size, epochs, wd in param_combinations:
    print(f"Testing: LR={lr}, Batch={batch_size}, Epochs={epochs}, WD={wd}")

    training_args = TrainingArguments(
        output_dir="./my_awesome_model",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=wd,  # Use dynamic weight decay
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=False,
        push_to_hub=False,
        report_to="none",
        seed=42,

    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_Automotive["train"],
        eval_dataset=tokenized_Automotive["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    results = trainer.evaluate()

    if results["eval_accuracy"] > best_accuracy:
        best_accuracy = results["eval_accuracy"]
        best_params = {"learning_rate": lr, "batch_size": batch_size, "num_epochs": epochs, "weight_decay": wd}
        print(f"New Best: {best_params} with Accuracy: {best_accuracy}")

# Print the best configuration
print(f"Best Combination: {best_params} with Accuracy: {best_accuracy}")


Testing: LR=2e-05, Batch=16, Epochs=2, WD=0.01


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.571,0.852868,0.6625
2,0.4267,0.935697,0.6645


Testing: LR=2e-05, Batch=16, Epochs=2, WD=0.001


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2931,1.158441,0.664
2,0.2196,1.250402,0.667


New Best: {'learning_rate': 2e-05, 'batch_size': 16, 'num_epochs': 2, 'weight_decay': 0.001} with Accuracy: 0.667
Testing: LR=2e-05, Batch=16, Epochs=3, WD=0.01


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.195,1.60426,0.656
2,0.1415,1.57445,0.6575
3,0.2446,1.575712,0.6635


Testing: LR=2e-05, Batch=16, Epochs=3, WD=0.001


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1118,2.035681,0.6555
2,0.0862,2.082576,0.6545
3,0.1064,2.2543,0.6585


Testing: LR=2e-05, Batch=16, Epochs=4, WD=0.01


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0775,2.402154,0.649
2,0.0609,2.707536,0.632
3,0.0837,2.658219,0.653
4,0.074,2.728163,0.6515


Testing: LR=2e-05, Batch=16, Epochs=4, WD=0.001


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.06,3.056432,0.6585
2,0.0534,3.018275,0.6505
3,0.0537,3.188373,0.651
4,0.0415,3.117178,0.654


Testing: LR=2e-05, Batch=16, Epochs=5, WD=0.01


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0782,3.341363,0.6505
2,0.0535,3.222531,0.658
3,0.0503,3.407278,0.652
4,0.0453,3.338265,0.659
5,0.0529,3.312733,0.656


Testing: LR=2e-05, Batch=16, Epochs=5, WD=0.001


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0327,3.89075,0.647
2,0.0409,3.469353,0.663
3,0.0576,3.424865,0.664
4,0.0189,3.544403,0.659
5,0.0278,3.551532,0.6555


Testing: LR=2e-05, Batch=32, Epochs=2, WD=0.01


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,3.18639,0.659
2,0.051600,3.319808,0.6565


Testing: LR=2e-05, Batch=32, Epochs=2, WD=0.001


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,3.669174,0.649
2,0.024500,3.605863,0.65


Testing: LR=2e-05, Batch=32, Epochs=3, WD=0.01


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,3.753266,0.6465
2,0.025300,3.665549,0.657
3,0.025300,3.507504,0.6595


Testing: LR=2e-05, Batch=32, Epochs=3, WD=0.001


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,3.928791,0.6465
2,0.024200,3.812475,0.648
3,0.024200,3.757676,0.6475


Testing: LR=2e-05, Batch=32, Epochs=4, WD=0.01


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,3.80045,0.644
2,0.022100,3.850408,0.6485
3,0.022100,3.805595,0.6525
4,0.019600,3.798429,0.651


Testing: LR=2e-05, Batch=32, Epochs=4, WD=0.001


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,4.143341,0.651
2,0.016700,4.194392,0.6445
3,0.016700,4.139433,0.65
4,0.011700,4.116211,0.6525


Testing: LR=2e-05, Batch=32, Epochs=5, WD=0.01


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,4.126676,0.6605


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,4.126676,0.6605
2,0.021300,4.13789,0.6495
3,0.021300,4.154499,0.656
4,0.009600,4.093779,0.6575


# A run with the best results only:

In [17]:
# Best hyperparameters
best_learning_rate = 2e-05
best_batch_size = 16
best_num_epochs = 2
best_weight_decay= 0.001


# Define training arguments with the best combination
training_args = TrainingArguments(
    output_dir="./final_model",  # Updated output directory
    learning_rate=best_learning_rate,
    per_device_train_batch_size=best_batch_size,
    per_device_eval_batch_size=best_batch_size,
    num_train_epochs=best_num_epochs,
    weight_decay=best_weight_decay,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=False,
    push_to_hub=False,
    report_to="none",
    seed=42,
)

# Create a Trainer instance with the best hyperparameters
tester = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_Automotive["train"],
    eval_dataset=tokenized_Automotive["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model with the best hyperparameters
tester.train()

# Evaluate the model and print results
final_results = tester.evaluate()
print(f"Final Test Accuracy with Best Params: {final_results['eval_accuracy']}")


  tester = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.028,0.796219,0.662


Epoch,Training Loss,Validation Loss,Accuracy
1,1.028,0.796219,0.662
2,0.7029,0.779356,0.675


Final Test Accuracy with Best Params: 0.675
