In [2]:
from datasets import load_dataset
dataset = load_dataset("imdb")
print(dataset["train"][0])  # View a sample


{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [3]:
def add_context(data):
    with_context = []
    for i in range(len(data) - 1):
        current = data[i]
        next_sentence = data[i + 1]["text"]
        with_context.append({
            "text": f"{current['text']} Context: {next_sentence}",
            "label": current["label"]
        })
    return with_context


In [4]:
contextual_train = add_context(dataset["train"])
contextual_test = add_context(dataset["test"])


In [5]:
from datasets import Dataset

# Convert list of dictionaries to dictionary of lists
contextual_train_dict = {
    "text": [entry["text"] for entry in contextual_train],
    "label": [entry["label"] for entry in contextual_train],
}
contextual_test_dict = {
    "text": [entry["text"] for entry in contextual_test],
    "label": [entry["label"] for entry in contextual_test],
}

# Create datasets
train_dataset = Dataset.from_dict(contextual_train_dict)
test_dataset = Dataset.from_dict(contextual_test_dict)


In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3).to("cuda")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def tokenize_function(example):
    return tokenizer(
        example["text"], 
        truncation=True, 
        padding="max_length", 
        max_length=128
    )


In [8]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(["text"])
test_dataset = test_dataset.remove_columns(["text"])


Map:   0%|          | 0/24999 [00:00<?, ? examples/s]

Map:   0%|          | 0/24999 [00:00<?, ? examples/s]

In [9]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def evaluate_model(model, dataset):
    model.eval()  # Set model to evaluation mode
    predictions, labels = [], []
    for sample in dataset:
        inputs = {
            key: torch.tensor(value).unsqueeze(0).to("cuda") 
            for key, value in sample.items() if key != "label"
        }
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predictions.append(torch.argmax(logits, dim=-1).cpu().numpy()[0])
            labels.append(sample["label"])
    predictions = np.array(predictions)
    labels = np.array(labels)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted"),
    }



In [10]:
baseline_metrics = evaluate_model(model, test_dataset)
print(f"Baseline Accuracy: {baseline_metrics['accuracy']}")
print(f"Baseline F1 Score: {baseline_metrics['f1']}")



Baseline Accuracy: 0.010440417616704669
Baseline F1 Score: 0.02009223987603546


In [11]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
)




In [12]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")
    return {"accuracy": accuracy, "f1": f1}


In [13]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3183,0.273818,0.883995,0.883848
2,0.253,0.415029,0.889596,0.88959
3,0.1667,0.562771,0.890556,0.890556


TrainOutput(global_step=9375, training_loss=0.23662614766438803, metrics={'train_runtime': 666.3297, 'train_samples_per_second': 112.552, 'train_steps_per_second': 14.07, 'total_flos': 4933179247336704.0, 'train_loss': 0.23662614766438803, 'epoch': 3.0})

In [14]:
trainer.save_model("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

In [16]:
from transformers import AutoModelForSequenceClassification

fine_tuned_model = AutoModelForSequenceClassification.from_pretrained("./fine_tuned_model").to("cuda")


In [17]:
def classify_with_context(text, context):
    input_text = f"{text} Context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to("cuda")
    outputs = fine_tuned_model(**inputs)
    logits = outputs.logits
    sentiment = torch.argmax(logits, dim=-1).item()
    labels = ["Negative", "Neutral", "Positive"]
    return labels[sentiment]

# Example Usage
text = "The movie was amazing!"
context = "However, the ending was predictable."
print(classify_with_context(text, context))


Neutral


In [18]:
from transformers import AutoModelForSequenceClassification

model_pretrained = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3).to("cuda")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
model_finetuned = AutoModelForSequenceClassification.from_pretrained("./fine_tuned_model").to("cuda")


In [20]:
import torch

def classify_with_model(model, text, context):
    input_text = f"{text} Context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to("cuda")
    outputs = model(**inputs)
    logits = outputs.logits
    sentiment = torch.argmax(logits, dim=-1).item()
    labels = ["Negative", "Neutral", "Positive"]
    return labels[sentiment]


In [21]:
# Example input
text = "The movie was great!"
context = "However, the ending was predictable."

# Pre-trained model prediction
pretrained_prediction = classify_with_model(model_pretrained, text, context)
print(f"Pre-trained Model Prediction: {pretrained_prediction}")

# Fine-tuned model prediction
finetuned_prediction = classify_with_model(model_finetuned, text, context)
print(f"Fine-Tuned Model Prediction: {finetuned_prediction}")


Pre-trained Model Prediction: Neutral
Fine-Tuned Model Prediction: Neutral


In [22]:
# Create a list of test examples
test_examples = [
    {"text": "The service was excellent.", "context": "The food was terrible."},
    {"text": "I enjoyed the experience.", "context": "It was my first visit."},
    {"text": "The product quality is amazing.", "context": "But the delivery was late."},
]

# Compare predictions for each example
for example in test_examples:
    text, context = example["text"], example["context"]
    pretrained_prediction = classify_with_model(model_pretrained, text, context)
    finetuned_prediction = classify_with_model(model_finetuned, text, context)
    print(f"Text: {text}")
    print(f"Context: {context}")
    print(f"Pre-trained Model Prediction: {pretrained_prediction}")
    print(f"Fine-Tuned Model Prediction: {finetuned_prediction}")
    print("-" * 50)


Text: The service was excellent.
Context: The food was terrible.
Pre-trained Model Prediction: Neutral
Fine-Tuned Model Prediction: Neutral
--------------------------------------------------
Text: I enjoyed the experience.
Context: It was my first visit.
Pre-trained Model Prediction: Neutral
Fine-Tuned Model Prediction: Neutral
--------------------------------------------------
Text: The product quality is amazing.
Context: But the delivery was late.
Pre-trained Model Prediction: Neutral
Fine-Tuned Model Prediction: Negative
--------------------------------------------------


In [23]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def evaluate_model_on_test(model, dataset):
    predictions, labels = [], []
    for sample in dataset:
        inputs = {
            key: torch.tensor(value).unsqueeze(0).to("cuda") 
            for key, value in sample.items() if key != "label"
        }
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predictions.append(torch.argmax(logits, dim=-1).cpu().numpy()[0])
            labels.append(sample["label"])
    predictions = np.array(predictions)
    labels = np.array(labels)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted"),
    }


In [24]:
metrics_pretrained = evaluate_model_on_test(model_pretrained, test_dataset)
metrics_finetuned = evaluate_model_on_test(model_finetuned, test_dataset)

print(f"Pre-trained Model Metrics: {metrics_pretrained}")
print(f"Fine-Tuned Model Metrics: {metrics_finetuned}")


Pre-trained Model Metrics: {'accuracy': 0.5539821592863714, 'f1': 0.5529134069708699}
Fine-Tuned Model Metrics: {'accuracy': 0.890555622224889, 'f1': 0.890555622224889}


In [25]:
from datasets import load_dataset

# Load Yelp Polarity dataset
dataset_yelp = load_dataset("yelp_polarity")
print(dataset_yelp["train"][0])  # Example from Yelp dataset

# Load Amazon Reviews dataset
dataset_amazon = load_dataset("amazon_polarity")
print(dataset_amazon["train"][0])  # Example from Amazon dataset


README.md:   0%|          | 0.00/8.93k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/256M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/560000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/38000 [00:00<?, ? examples/s]

{'text': "Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars.", 'label': 0}


README.md:   0%|          | 0.00/6.81k [00:00<?, ?B/s]

train-00000-of-00004.parquet:   0%|          | 0.00/260M [00:00<?, ?B/s]

train-00001-of-00004.parquet:   0%|          | 0.00/258M [00:00<?, ?B/s]

train-00002-of-00004.parquet:   0%|          | 0.00/255M [00:00<?, ?B/s]

train-00003-of-00004.parquet:   0%|          | 0.00/254M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3600000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/400000 [00:00<?, ? examples/s]

{'label': 1, 'title': 'Stuning even for the non-gamer', 'content': 'This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'}


In [26]:
def add_context(data):
    with_context = []
    for i in range(len(data) - 1):
        current = data[i]
        next_sentence = data[i + 1]["text"]
        with_context.append({
            "text": f"{current['text']} Context: {next_sentence}",
            "label": current["label"]
        })
    return with_context


In [35]:
# Process Yelp Polarity
contextual_yelp_train = add_context(dataset_yelp["train"])
contextual_yelp_test = add_context(dataset_yelp["test"])



In [36]:
def tokenize_function(example):
    return tokenizer(
        example["text"], 
        truncation=True, 
        padding="max_length", 
        max_length=128
    )


In [38]:
from datasets import Dataset

# Yelp Train/Test
contextual_yelp_train_dict = {
    "text": [entry["text"] for entry in contextual_yelp_train],
    "label": [entry["label"] for entry in contextual_yelp_train],
}
contextual_yelp_test_dict = {
    "text": [entry["text"] for entry in contextual_yelp_test],
    "label": [entry["label"] for entry in contextual_yelp_test],
}
yelp_train_dataset = Dataset.from_dict(contextual_yelp_train_dict)
yelp_test_dataset = Dataset.from_dict(contextual_yelp_test_dict)


In [39]:
yelp_train_dataset = yelp_train_dataset.map(tokenize_function, batched=True)
yelp_test_dataset = yelp_test_dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
yelp_train_dataset = yelp_train_dataset.remove_columns(["text"])
yelp_test_dataset = yelp_test_dataset.remove_columns(["text"])


Map:   0%|          | 0/559999 [00:00<?, ? examples/s]

Map:   0%|          | 0/37999 [00:00<?, ? examples/s]

In [40]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def evaluate_model_on_dataset(model, dataset):
    model.eval()  # Set the model to evaluation mode
    predictions, labels = [], []
    for sample in dataset:
        inputs = {
            key: torch.tensor(value).unsqueeze(0).to("cuda") 
            for key, value in sample.items() if key != "label"
        }
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predictions.append(torch.argmax(logits, dim=-1).cpu().numpy()[0])
            labels.append(sample["label"])
    predictions = np.array(predictions)
    labels = np.array(labels)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted"),
    }


In [41]:
pretrained_metrics_yelp = evaluate_model_on_dataset(model_pretrained, yelp_test_dataset)

print(f"Pre-trained Model on Yelp: {pretrained_metrics_yelp}")

Pre-trained Model on Yelp: {'accuracy': 0.5141187925998053, 'f1': 0.5138433388790341}


In [42]:
finetuned_metrics_yelp = evaluate_model_on_dataset(model_finetuned, yelp_test_dataset)

print(f"Fine-Tuned Model on Yelp: {finetuned_metrics_yelp}")


Fine-Tuned Model on Yelp: {'accuracy': 0.8138372062422695, 'f1': 0.812994723659259}


In [45]:
results = {
    "Dataset": ["Yelp"],
    "Pre-Trained Accuracy": [pretrained_metrics_yelp["accuracy"]],
    "Fine-Tuned Accuracy": [finetuned_metrics_yelp["accuracy"]],
    "Pre-Trained F1": [pretrained_metrics_yelp["f1"]],
    "Fine-Tuned F1": [finetuned_metrics_yelp["f1"]],
}

import pandas as pd
df_results = pd.DataFrame(results)
print(df_results)


  Dataset  Pre-Trained Accuracy  Fine-Tuned Accuracy  Pre-Trained F1  \
0    Yelp              0.514119             0.813837        0.513843   

   Fine-Tuned F1  
0       0.812995  
