In [None]:
#Experiment 1, rounds 1, 2,3,4 and 5
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


!pip install datasets
from datasets import load_dataset

import transformers
from transformers import TrainingArguments, Trainer
from transformers import get_scheduler
from transformers import AutoTokenizer, AutoModelForSequenceClassification

!pip install evaluate
import evaluate

!pip install torch
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW

# Mount Google Drive at /content/drive; the data paths used in the cells below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#load the data
df_data = pd.read_csv('/content/drive/MyDrive/Fewshotproject/Data/IMDB Dataset.csv')
#rename columns 'review', 'sentiment' as 'text', 'label'
df_data = df_data.rename(columns={'review': 'text', 'sentiment': 'label'})
print(df_data.columns)
# Encode the sentiment strings as integer class ids with one explicit mapping:
#   'positive' -> 0 (entailment), 'negative' -> 1 (contradiction)
# A single map() avoids the implicit object->int downcast that chained
# Series.replace(str, int) calls rely on (deprecated in recent pandas).
label_to_id = {'positive': 0, 'negative': 1}
# Fail early on any label outside the mapping instead of letting it slip
# through silently into the CSV files written later.
unexpected_labels = set(df_data['label'].unique()) - set(label_to_id)
if unexpected_labels:
    raise ValueError(f"Unexpected sentiment labels: {unexpected_labels}")
df_data['label'] = df_data['label'].map(label_to_id).astype('int64')
print(df_data.head(10))

Index(['text', 'label'], dtype='object')
                                                text  label
0  One of the other reviewers has mentioned that ...      0
1  A wonderful little production. <br /><br />The...      0
2  I thought this was a wonderful way to spend ti...      0
3  Basically there's a family where a little boy ...      1
4  Petter Mattei's "Love in the Time of Money" is...      0
5  Probably my all-time favorite movie, a story o...      0
6  I sure would like to see a resurrection of a u...      0
7  This show was an amazing, fresh & innovative i...      1
8  Encouraged by the positive comments about this...      1
9  If you like original gut wrenching laughter yo...      0


In [None]:
# Hold out half of the corpus as the test set (fixed seed for a repeatable split).
df_train, df_test = train_test_split(df_data, test_size=0.5, random_state=42)
# Carve a validation set (25% of the remaining half) out of the training portion.
df_train, df_val = train_test_split(df_train, test_size=0.25, random_state=42)
# Persist the test and validation splits as CSV files (test first, then validation).
for split_frame, split_path in (
    (df_test, '/content/drive/MyDrive/Fewshotproject/Data/test.csv'),
    (df_val, '/content/drive/MyDrive/Fewshotproject/Data/validation.csv'),
):
    split_frame.to_csv(split_path, index=False)

In [None]:
#get batch 1 of few-shot examples

# Split the training pool by class, then draw 8 reviews from each with seed 40.
positive_pool = df_train.loc[df_train['label'].eq(0)]
negative_pool = df_train.loc[df_train['label'].eq(1)]
positives_1 = positive_pool.sample(n=8, random_state=40)  # 8 'positive' examples
negatives_1 = negative_pool.sample(n=8, random_state=40)  # 8 'negative' examples

# Stack the two samples (negatives first) into one dataframe
few_shot_batch1 = pd.concat([negatives_1, positives_1], axis=0)
print(few_shot_batch1)
# Write the batch to CSV so that load_dataset can read it as a train split
batch1_csv = '/content/drive/MyDrive/Fewshotproject/Data/few_shot_batch1.csv'
few_shot_batch1.to_csv(batch1_csv, index=False)

                                                    text  label
15782  Greenaway's films pose as clever, erudite and ...      1
6841   I saw this movie a few days ago and gamely jum...      1
35830  That's the question you have to ask yourself w...      1
7882   Patrick Channing (Jeff Kober) is a disciple of...      1
37512  If "B" movies, tired and corny scripts, and go...      1
16865  I saw Brigadoon on TV last night (12 Sept 2009...      1
13662  I wish I would have read more reviews and more...      1
46768  Terrible direction from an awful script. Even ...      1
29422  Maybe the movie itself isn't one of the best J...      0
27079  This movie is directed by Renny Harlin the fin...      0
40486  I read thru most of the comments posted here &...      0
5067   Plot in a nutshell - Duchess (voice of Eva Gab...      0
46965  Spoiler Alert - although this is a plot almost...      0
5029   A few thoughts before I get to the heart of th...      0
14519  It's interesting how the train of

In [None]:
#get batch 2 of few-shot examples

# Split the training pool by class, then draw 8 reviews from each with seed 39.
# NOTE: the names positives_1 / negatives_1 are reused from the batch-1 cell and
# are overwritten here with this batch's samples.
positive_pool = df_train.loc[df_train['label'].eq(0)]
negative_pool = df_train.loc[df_train['label'].eq(1)]
positives_1 = positive_pool.sample(n=8, random_state=39)  # 8 'positive' examples
negatives_1 = negative_pool.sample(n=8, random_state=39)  # 8 'negative' examples

# Stack the two samples (negatives first) into one dataframe
few_shot_batch2 = pd.concat([negatives_1, positives_1], axis=0)
print(few_shot_batch2)
# Write the batch to CSV so that load_dataset can read it as a train split
batch2_csv = '/content/drive/MyDrive/Fewshotproject/Data/few_shot_batch2.csv'
few_shot_batch2.to_csv(batch2_csv, index=False)

                                                    text  label
1678   Give director Stanley Tong of Jackie Chan's Su...      1
4027   How does David Lynch do it? Unlike the legions...      1
16204  ...you know the rest. If you want a good zombi...      1
32257  There seems to have been some money behind thi...      1
11144  Besides the fact that this guy is a liar, he i...      1
20977  Want a great recipe for failure? Take a crappy...      1
14503  This is even worse than the original Game of D...      1
35205  The first few minutes showing the cold and cru...      1
11258  This must be the dumbest movie I've ever seen ...      0
23401  I just saw this episode this evening, on a rec...      0
21977  "Panic" is a captivating, blurred-genre film a...      0
15008  Due to this show getting cut early I never rea...      0
17992  This film is a very descent remake of the famo...      0
3055   Trio's vignettes were insightful and quite enj...      0
19385  My wife did not realize what a ge

In [None]:
#get batch 3 of few-shot examples

# Split the training pool by class, then draw 8 reviews from each with seed 38.
# NOTE: the names positives_1 / negatives_1 are reused from the batch-1 cell and
# are overwritten here with this batch's samples.
positive_pool = df_train.loc[df_train['label'].eq(0)]
negative_pool = df_train.loc[df_train['label'].eq(1)]
positives_1 = positive_pool.sample(n=8, random_state=38)  # 8 'positive' examples
negatives_1 = negative_pool.sample(n=8, random_state=38)  # 8 'negative' examples

# Stack the two samples (negatives first) into one dataframe
few_shot_batch3 = pd.concat([negatives_1, positives_1], axis=0)
print(few_shot_batch3)
# Write the batch to CSV so that load_dataset can read it as a train split
batch3_csv = '/content/drive/MyDrive/Fewshotproject/Data/few_shot_batch3.csv'
few_shot_batch3.to_csv(batch3_csv, index=False)

                                                    text  label
5293   I am a big fan of horror movies, and know a lo...      1
37924  1 hour and 40 minutes of talking--boring talki...      1
4881   Cage (1989) was another one of those low budge...      1
10685  Imagine an exploitive remake of The Defiant On...      1
21618  I think Micheal Ironsides acting career must b...      1
21040  The video quality is awful. The sound quality ...      1
18212  Aya! If you are looking for special effects th...      1
35628  I had to compare two versions of Hamlet for my...      1
46537  Brilliant execution in displaying once and for...      0
16877  Having never heard of this film until I saw th...      0
8163   I have to say that sometimes "looks" are all t...      0
3358   I'm just getting the chance to dig into past A...      0
27235  Since this is Black History Month and I'm revi...      0
29299  Yes it's a Fast Times wannabe, but it's still ...      0
49563  The great cinematic musicals were

In [None]:
#get batch 4 of few-shot examples

# Split the training pool by class, then draw 8 reviews from each with seed 37.
# NOTE: the names positives_1 / negatives_1 are reused from the batch-1 cell and
# are overwritten here with this batch's samples.
positive_pool = df_train.loc[df_train['label'].eq(0)]
negative_pool = df_train.loc[df_train['label'].eq(1)]
positives_1 = positive_pool.sample(n=8, random_state=37)  # 8 'positive' examples
negatives_1 = negative_pool.sample(n=8, random_state=37)  # 8 'negative' examples

# Stack the two samples (negatives first) into one dataframe
few_shot_batch4 = pd.concat([negatives_1, positives_1], axis=0)
print(few_shot_batch4)
# Write the batch to CSV so that load_dataset can read it as a train split
batch4_csv = '/content/drive/MyDrive/Fewshotproject/Data/few_shot_batch4.csv'
few_shot_batch4.to_csv(batch4_csv, index=False)

                                                    text  label
15987  That's a snippet of choice dialogue delivered ...      1
25752  I saw Roger Moore huffing it on the scenes tha...      1
49743  This has to be one of the worst films I have e...      1
48028  From the start, you know how this movie will e...      1
16559  Were it not for the fact that this came as a 2...      1
27917  I just wanted to say that I was very disappoin...      1
2812   This movie features an o.k. score and a not ba...      1
4427   He really lost the plot with this one! None of...      1
1779   This film was adapted from the well known sutr...      0
17594  The young lady's name is Bonnie (Polay). She's...      0
21700  Somewhere on IMDb there is a discussion about ...      0
46752  This was the best movie I've ever seen about B...      0
9280   Twisted Desire (1996) was a TV movie starring ...      0
46814  mahatma Gandhi, the father of the nation in hi...      0
7393   This movie is perfect for familie

In [None]:
#get batch 5 of few-shot examples

# Split the training pool by class, then draw 8 reviews from each with seed 36.
# NOTE: the names positives_1 / negatives_1 are reused from the batch-1 cell and
# are overwritten here with this batch's samples.
positive_pool = df_train.loc[df_train['label'].eq(0)]
negative_pool = df_train.loc[df_train['label'].eq(1)]
positives_1 = positive_pool.sample(n=8, random_state=36)  # 8 'positive' examples
negatives_1 = negative_pool.sample(n=8, random_state=36)  # 8 'negative' examples

# Stack the two samples (negatives first) into one dataframe
few_shot_batch5 = pd.concat([negatives_1, positives_1], axis=0)
print(few_shot_batch5)
# Write the batch to CSV so that load_dataset can read it as a train split
batch5_csv = '/content/drive/MyDrive/Fewshotproject/Data/few_shot_batch5.csv'
few_shot_batch5.to_csv(batch5_csv, index=False)

                                                    text  label
7243   This is, per se, an above average film but why...      1
23172  OK, I've now seen George Zucco in at least fou...      1
1130   I haven't had a chance to view the previous fi...      1
33105  I know if I was a low budget film maker I woul...      1
10207  Yes, this was pure unbelievable condescending ...      1
12852  The reason this is such a bad movie is because...      1
3857   Arthur Hunnicutt plays a very stereotypical ro...      1
16137  I love Memoirs of a Geisha so I read the book ...      1
37311  After a long wait, "Bedrooms and Hallways" mad...      0
37922  Yeah I watched this mini series with My Mom an...      0
4823   One of my favorite movies to date starts as an...      0
18450  This series could very well be the best Britco...      0
22995  *May Contain Spoilers*<br /><br />The first ti...      0
10541  I just saw this movie the other day when I ren...      0
48304  Though structured totally differe

In [None]:
# Load the tokenizer and the sequence-classification model from the pretrained checkpoint
from transformers import AutoModelForSequenceClassification, AutoTokenizer

checkpoint_name = "FacebookAI/roberta-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
# num_labels=2 does not match the checkpoint's 3-way classifier, so
# ignore_mismatched_sizes=True lets loading proceed with a newly initialised
# classification head (see the warning printed by this cell).
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=2,
    ignore_mismatched_sizes=True,
)

Some weights of the model checkpoint at FacebookAI/roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large-mnli and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instanti

In [None]:
#ROUND 1 OF EXPERIMENT
# Train split for this round is few_shot_batch1; the test and validation CSVs
# are the same files every round. Rounds 2-5 have their own loading cells below.
data_dir = "/content/drive/MyDrive/Fewshotproject/Data/"
data_files = dict(train='few_shot_batch1.csv', test='test.csv', validation='validation.csv')

dataset = load_dataset(data_dir, data_files=data_files)

In [None]:
#tokenize the text
def tokenize_function(examples):
    """Run the notebook-level `tokenizer` over the "text" field of `examples`.

    The tokenizer is called with max_length padding and truncation enabled, and
    its output is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

In [None]:
#tokenize the dataset
# Apply tokenize_function to `dataset` via map(); batched=True passes it batches of examples rather than single rows.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
#round 1
# Fine-tune `model` on this round's few-shot train split for one epoch, then
# report the mean training loss and the loss on a small validation subset.
from tqdm.auto import tqdm

torch.cuda.empty_cache()

# Drop the raw text column and rename "label" to "labels": each batch is passed
# to the model as keyword arguments, and the loss is read from its output.
tokenized_datasets = tokenized_dataset.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# Small validation subset taken after a seeded shuffle; capped at the split
# size so select() cannot index past the end of a shorter validation set.
validation_split = tokenized_datasets["validation"]
small_eval_dataset = validation_split.shuffle(seed=42).select(
    range(min(200, len(validation_split)))
)

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=8,
    num_workers=2,
    pin_memory=True,
)
# Evaluation uses a batch size of 16
eval_dataloader = DataLoader(small_eval_dataset, batch_size=16)

# Move the model to its device before building the optimizer over its parameters.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Create progress bar
progress_bar = tqdm(range(num_training_steps), desc="Training", leave=True)

model.train()
for epoch in range(num_epochs):
    # Update the description to include the current epoch
    progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs}")

    # Training loop
    total_train_loss = 0  # Sum of per-batch training losses for the epoch
    num_train_batches = 0  # Number of training batches seen

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Accumulate training loss
        total_train_loss += loss.item()
        num_train_batches += 1

        # Update the progress bar with the running mean loss
        progress_bar.update(1)
        progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches)

    # Evaluation loop
    model.eval()  # Set model to evaluation mode
    eval_loss_sum = 0.0
    num_eval_examples = 0

    with torch.no_grad():  # Disable gradient calculations
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            # Weight each batch's loss by its size so that a smaller final
            # batch does not count as much as a full one in the average.
            batch_size = batch["labels"].size(0)
            eval_loss_sum += outputs.loss.item() * batch_size
            num_eval_examples += batch_size

    eval_loss = eval_loss_sum / num_eval_examples  # Per-example average evaluation loss
    progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches, eval_loss=eval_loss)  # Update progress bar with both losses

    print(f"Epoch {epoch+1} | Training Loss: {total_train_loss / num_train_batches:.4f} | Evaluation Loss: {eval_loss:.4f}")

    model.train()  # Set model back to training mode


Training:   0%|          | 0/2 [00:00<?, ?it/s]

  self.pid = os.fork()


Epoch 1 | Training Loss: 0.6774 | Evaluation Loss: 0.7003


In [None]:
#get both accuracy and f1-macro on the full test set
import evaluate

# Batches over the complete test split
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=16)

# Metric objects that accumulate predictions batch by batch
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")

model.eval()  # Switch the model to evaluation mode

# Evaluation loop
for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit
    predictions = outputs.logits.argmax(dim=-1)

    # Feed the same predictions / true labels to both metrics
    for metric in (f1_metric, accuracy_metric):
        metric.add_batch(predictions=predictions, references=batch["labels"])

# Final scores; average="macro" requests the macro-averaged F1
f1_score = f1_metric.compute(average="macro")
accuracy = accuracy_metric.compute()

print(f"Macro-F1 Score: {f1_score['f1']:.4f}")
print(f"Accuracy: {accuracy['accuracy']:.4f}")


F1 Score: 0.4338
Accuracy: 0.5277


In [None]:
#ROUND 2 OF EXPERIMENT
# Re-load tokenizer and model from the pretrained checkpoint, replacing the
# objects used in the previous round.
checkpoint_name = "FacebookAI/roberta-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_name, num_labels=2, ignore_mismatched_sizes=True
)

# Train split for this round is few_shot_batch2; test/validation CSVs are unchanged
data_dir = "/content/drive/MyDrive/Fewshotproject/Data/"
data_files = dict(train='few_shot_batch2.csv', test='test.csv', validation='validation.csv')

dataset = load_dataset(data_dir, data_files=data_files)
#tokenize the dataset for round 2
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Some weights of the model checkpoint at FacebookAI/roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large-mnli and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instanti

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:

#round 2 train
# Fine-tune `model` on this round's few-shot train split for one epoch, then
# report the mean training loss and the loss on a small validation subset.
from tqdm.auto import tqdm

torch.cuda.empty_cache()

# Drop the raw text column and rename "label" to "labels": each batch is passed
# to the model as keyword arguments, and the loss is read from its output.
tokenized_datasets = tokenized_dataset.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# Small validation subset taken after a seeded shuffle; capped at the split
# size so select() cannot index past the end of a shorter validation set.
validation_split = tokenized_datasets["validation"]
small_eval_dataset = validation_split.shuffle(seed=42).select(
    range(min(200, len(validation_split)))
)

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=8,
    num_workers=2,
    pin_memory=True,
)
# Evaluation uses a batch size of 16
eval_dataloader = DataLoader(small_eval_dataset, batch_size=16)

# Move the model to its device before building the optimizer over its parameters.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Create progress bar
progress_bar = tqdm(range(num_training_steps), desc="Training", leave=True)

model.train()
for epoch in range(num_epochs):
    # Update the description to include the current epoch
    progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs}")

    # Training loop
    total_train_loss = 0  # Sum of per-batch training losses for the epoch
    num_train_batches = 0  # Number of training batches seen

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Accumulate training loss
        total_train_loss += loss.item()
        num_train_batches += 1

        # Update the progress bar with the running mean loss
        progress_bar.update(1)
        progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches)

    # Evaluation loop
    model.eval()  # Set model to evaluation mode
    eval_loss_sum = 0.0
    num_eval_examples = 0

    with torch.no_grad():  # Disable gradient calculations
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            # Weight each batch's loss by its size so that a smaller final
            # batch does not count as much as a full one in the average.
            batch_size = batch["labels"].size(0)
            eval_loss_sum += outputs.loss.item() * batch_size
            num_eval_examples += batch_size

    eval_loss = eval_loss_sum / num_eval_examples  # Per-example average evaluation loss
    progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches, eval_loss=eval_loss)  # Update progress bar with both losses

    print(f"Epoch {epoch+1} | Training Loss: {total_train_loss / num_train_batches:.4f} | Evaluation Loss: {eval_loss:.4f}")

    model.train()  # Set model back to training mode


Training:   0%|          | 0/2 [00:00<?, ?it/s]

  self.pid = os.fork()


Epoch 1 | Training Loss: 0.7289 | Evaluation Loss: 0.7050


In [None]:
#Round 2 evaluate
#get both accuracy and f1-macro on the full test set
import evaluate

# Batches over the complete test split
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=16)

# Metric objects that accumulate predictions batch by batch
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")

model.eval()  # Switch the model to evaluation mode

# Evaluation loop
for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit
    predictions = outputs.logits.argmax(dim=-1)

    # Feed the same predictions / true labels to both metrics
    for metric in (f1_metric, accuracy_metric):
        metric.add_batch(predictions=predictions, references=batch["labels"])

# Final scores; average="macro" requests the macro-averaged F1
f1_score = f1_metric.compute(average="macro")
accuracy = accuracy_metric.compute()

print(f"Macro-F1 Score: {f1_score['f1']:.4f}")
print(f"Accuracy: {accuracy['accuracy']:.4f}")

Macro-F1 Score: 0.4269
Accuracy: 0.5328


In [None]:
#ROUND 3 OF EXPERIMENT
# Re-load tokenizer and model from the pretrained checkpoint, replacing the
# objects used in the previous round.
checkpoint_name = "FacebookAI/roberta-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_name, num_labels=2, ignore_mismatched_sizes=True
)

# Train split for this round is few_shot_batch3; test/validation CSVs are unchanged
data_dir = "/content/drive/MyDrive/Fewshotproject/Data/"
data_files = dict(train='few_shot_batch3.csv', test='test.csv', validation='validation.csv')

dataset = load_dataset(data_dir, data_files=data_files)
#tokenize the dataset for round 3
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Some weights of the model checkpoint at FacebookAI/roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large-mnli and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instanti

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6250 [00:00<?, ? examples/s]

In [None]:
#Round 3 train
# Fine-tune `model` on this round's few-shot train split for one epoch, then
# report the mean training loss and the loss on a small validation subset.
from tqdm.auto import tqdm

torch.cuda.empty_cache()

# Drop the raw text column and rename "label" to "labels": each batch is passed
# to the model as keyword arguments, and the loss is read from its output.
tokenized_datasets = tokenized_dataset.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# Small validation subset taken after a seeded shuffle; capped at the split
# size so select() cannot index past the end of a shorter validation set.
validation_split = tokenized_datasets["validation"]
small_eval_dataset = validation_split.shuffle(seed=42).select(
    range(min(200, len(validation_split)))
)

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=8,
    num_workers=2,
    pin_memory=True,
)
# Evaluation uses a batch size of 16
eval_dataloader = DataLoader(small_eval_dataset, batch_size=16)

# Move the model to its device before building the optimizer over its parameters.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Create progress bar
progress_bar = tqdm(range(num_training_steps), desc="Training", leave=True)

model.train()
for epoch in range(num_epochs):
    # Update the description to include the current epoch
    progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs}")

    # Training loop
    total_train_loss = 0  # Sum of per-batch training losses for the epoch
    num_train_batches = 0  # Number of training batches seen

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Accumulate training loss
        total_train_loss += loss.item()
        num_train_batches += 1

        # Update the progress bar with the running mean loss
        progress_bar.update(1)
        progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches)

    # Evaluation loop
    model.eval()  # Set model to evaluation mode
    eval_loss_sum = 0.0
    num_eval_examples = 0

    with torch.no_grad():  # Disable gradient calculations
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            # Weight each batch's loss by its size so that a smaller final
            # batch does not count as much as a full one in the average.
            batch_size = batch["labels"].size(0)
            eval_loss_sum += outputs.loss.item() * batch_size
            num_eval_examples += batch_size

    eval_loss = eval_loss_sum / num_eval_examples  # Per-example average evaluation loss
    progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches, eval_loss=eval_loss)  # Update progress bar with both losses

    print(f"Epoch {epoch+1} | Training Loss: {total_train_loss / num_train_batches:.4f} | Evaluation Loss: {eval_loss:.4f}")

    model.train()  # Set model back to training mode


Training:   0%|          | 0/2 [00:00<?, ?it/s]

  self.pid = os.fork()


Epoch 1 | Training Loss: 0.7134 | Evaluation Loss: 0.6891


In [None]:
#Round 3 evaluate
#get both accuracy and f1-macro on the full test set
import evaluate

# Batches over the complete test split
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=16)

# Metric objects that accumulate predictions batch by batch
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")

model.eval()  # Switch the model to evaluation mode

# Evaluation loop
for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit
    predictions = outputs.logits.argmax(dim=-1)

    # Feed the same predictions / true labels to both metrics
    for metric in (f1_metric, accuracy_metric):
        metric.add_batch(predictions=predictions, references=batch["labels"])

# Final scores; average="macro" requests the macro-averaged F1
f1_score = f1_metric.compute(average="macro")
accuracy = accuracy_metric.compute()

print(f"Macro-F1 Score: {f1_score['f1']:.4f}")
print(f"Accuracy: {accuracy['accuracy']:.4f}")

Macro-F1 Score: 0.3368
Accuracy: 0.5013


In [None]:
#ROUND 4 OF EXPERIMENT
# Re-load tokenizer and model from the pretrained checkpoint, replacing the
# objects used in the previous round.
checkpoint_name = "FacebookAI/roberta-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_name, num_labels=2, ignore_mismatched_sizes=True
)

# Train split for this round is few_shot_batch4; test/validation CSVs are unchanged
data_dir = "/content/drive/MyDrive/Fewshotproject/Data/"
data_files = dict(train='few_shot_batch4.csv', test='test.csv', validation='validation.csv')

dataset = load_dataset(data_dir, data_files=data_files)
#tokenize the dataset for round 4
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Some weights of the model checkpoint at FacebookAI/roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large-mnli and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instanti

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6250 [00:00<?, ? examples/s]

In [None]:
#Round 4 train
# Fine-tune `model` on this round's few-shot train split for one epoch, then
# report the mean training loss and the loss on a small validation subset.
from tqdm.auto import tqdm

torch.cuda.empty_cache()

# Drop the raw text column and rename "label" to "labels": each batch is passed
# to the model as keyword arguments, and the loss is read from its output.
tokenized_datasets = tokenized_dataset.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# Small validation subset taken after a seeded shuffle; capped at the split
# size so select() cannot index past the end of a shorter validation set.
validation_split = tokenized_datasets["validation"]
small_eval_dataset = validation_split.shuffle(seed=42).select(
    range(min(200, len(validation_split)))
)

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=8,
    num_workers=2,
    pin_memory=True,
)
# Evaluation uses a batch size of 16
eval_dataloader = DataLoader(small_eval_dataset, batch_size=16)

# Move the model to its device before building the optimizer over its parameters.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Create progress bar
progress_bar = tqdm(range(num_training_steps), desc="Training", leave=True)

model.train()
for epoch in range(num_epochs):
    # Update the description to include the current epoch
    progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs}")

    # Training loop
    total_train_loss = 0  # Sum of per-batch training losses for the epoch
    num_train_batches = 0  # Number of training batches seen

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Accumulate training loss
        total_train_loss += loss.item()
        num_train_batches += 1

        # Update the progress bar with the running mean loss
        progress_bar.update(1)
        progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches)

    # Evaluation loop
    model.eval()  # Set model to evaluation mode
    eval_loss_sum = 0.0
    num_eval_examples = 0

    with torch.no_grad():  # Disable gradient calculations
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            # Weight each batch's loss by its size so that a smaller final
            # batch does not count as much as a full one in the average.
            batch_size = batch["labels"].size(0)
            eval_loss_sum += outputs.loss.item() * batch_size
            num_eval_examples += batch_size

    eval_loss = eval_loss_sum / num_eval_examples  # Per-example average evaluation loss
    progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches, eval_loss=eval_loss)  # Update progress bar with both losses

    print(f"Epoch {epoch+1} | Training Loss: {total_train_loss / num_train_batches:.4f} | Evaluation Loss: {eval_loss:.4f}")

    model.train()  # Set model back to training mode


Training:   0%|          | 0/2 [00:00<?, ?it/s]

  self.pid = os.fork()


Epoch 1 | Training Loss: 0.7906 | Evaluation Loss: 0.6712


In [None]:
#Round 4 evaluate
# Score the model on the complete test split and report accuracy plus macro-F1.
import evaluate

# Metric accumulators; predictions are streamed into them batch by batch.
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")

# Unshuffled pass over the whole test split, 16 examples at a time.
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=16)

model.eval()  # switch the model to evaluation mode

for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        # Predicted class = index of the largest logit.
        predictions = model(**batch).logits.argmax(dim=-1)

    # Feed the same predictions / gold labels to both metrics.
    for metric in (f1_metric, accuracy_metric):
        metric.add_batch(predictions=predictions, references=batch["labels"])

# Macro averaging: every class contributes equally to the F1 score.
f1_score = f1_metric.compute(average="macro")
accuracy = accuracy_metric.compute()

print(f"Macro-F1 Score: {f1_score['f1']:.4f}")
print(f"Accuracy: {accuracy['accuracy']:.4f}")

Macro-F1 Score: 0.4946
Accuracy: 0.5380


In [None]:
#ROUND 5 OF EXPERIMENT
# Start the round from a freshly loaded tokenizer and model.
round5_checkpoint = "FacebookAI/roberta-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(round5_checkpoint)
# NOTE(review): with num_labels=2 and ignore_mismatched_sizes=True the
# checkpoint's 3-way output layer does not fit, so (per the load-time warning)
# the classifier output projection is newly initialised -- confirm this is the
# intended setup for the experiment.
model = AutoModelForSequenceClassification.from_pretrained(
    round5_checkpoint,
    num_labels=2,
    ignore_mismatched_sizes=True,
)

# The training split for this round is few_shot_batch5; test and validation
# come from the shared CSV files in the same folder.
data_files = {
    'train': 'few_shot_batch5.csv',
    'test': 'test.csv',
    'validation': 'validation.csv',
}
dataset = load_dataset(
    "/content/drive/MyDrive/Fewshotproject/Data/",
    data_files=data_files,
)

# Tokenize every split for round 5.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Some weights of the model checkpoint at FacebookAI/roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large-mnli and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instanti

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6250 [00:00<?, ? examples/s]

In [None]:
#Round 5 train
torch.cuda.empty_cache()

# Drop the raw text column and rename the target column so each batch carries
# a `labels` entry; return torch tensors from the datasets.
tokenized_datasets = tokenized_dataset.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# Small validation subset: 200 examples picked with a fixed shuffle seed.
small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(200))
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=8,
    num_workers=2,
    pin_memory=True,
)
# Evaluation batch size was increased from 8 to 16.
eval_dataloader = DataLoader(small_eval_dataset, batch_size=16)

optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
# Linear learning-rate schedule with no warmup, stepped once per optimizer step.
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Training loop: run `num_epochs` passes over `train_dataloader`, then measure
# the loss on `eval_dataloader` at the end of every epoch.
from tqdm.auto import tqdm

# One tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps), desc="Training", leave=True)

model.train()
for epoch in range(num_epochs):
    # Show which epoch is currently running.
    progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs}")

    # Running totals for the mean training loss of this epoch.
    total_train_loss = 0
    num_train_batches = 0

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_train_loss += loss.item()
        num_train_batches += 1

        progress_bar.update(1)
        progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches)

    # Evaluation pass (no gradients needed).
    model.eval()
    eval_loss_sum = 0.0
    num_eval_examples = 0

    with torch.no_grad():
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            # outputs.loss is the mean over the batch; 200 examples at batch
            # size 16 leave a final batch of 8, so weight each batch by its
            # size instead of averaging the per-batch means.
            batch_size = batch["labels"].size(0)
            eval_loss_sum += outputs.loss.item() * batch_size
            num_eval_examples += batch_size

    eval_loss = eval_loss_sum / num_eval_examples  # per-example mean evaluation loss
    progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches, eval_loss=eval_loss)

    print(f"Epoch {epoch+1} | Training Loss: {total_train_loss / num_train_batches:.4f} | Evaluation Loss: {eval_loss:.4f}")

    model.train()  # back to training mode for the next epoch

# Release the bar once all epochs are done.
progress_bar.close()

Training:   0%|          | 0/2 [00:00<?, ?it/s]

  self.pid = os.fork()


Epoch 1 | Training Loss: 0.8127 | Evaluation Loss: 0.6889


In [None]:
#Round 5 evaluate
# Score the model on the complete test split and report accuracy plus macro-F1.
import evaluate

# Metric accumulators; predictions are streamed into them batch by batch.
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")

# Unshuffled pass over the whole test split, 16 examples at a time.
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=16)

model.eval()  # switch the model to evaluation mode

for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        # Predicted class = index of the largest logit.
        predictions = model(**batch).logits.argmax(dim=-1)

    # Feed the same predictions / gold labels to both metrics.
    for metric in (f1_metric, accuracy_metric):
        metric.add_batch(predictions=predictions, references=batch["labels"])

# Macro averaging: every class contributes equally to the F1 score.
f1_score = f1_metric.compute(average="macro")
accuracy = accuracy_metric.compute()

print(f"Macro-F1 Score: {f1_score['f1']:.4f}")
print(f"Accuracy: {accuracy['accuracy']:.4f}")

Macro-F1 Score: 0.3347
Accuracy: 0.5003


In [None]:
# NOTE(review): stray scratch calculation; nothing in the cells that follow
# appears to use its result -- consider deleting this cell.
60/10

6.0

In [None]:
#compute the average macro-f1 score across all 5 rounds of experiment
# Macro-F1 values copied from the per-round evaluate cells; the last two match
# the printed round 4 and round 5 results (presumably rounds 1-3 come first).
macro_f1_per_round = [0.4338, 0.4269, 0.3368, 0.4946, 0.3347]
# Mean over the rounds, rounded to three decimals before it is reported.
average_macro_f1 = round(sum(macro_f1_per_round) / len(macro_f1_per_round), 3)
print(average_macro_f1)

0.405


In [None]:
#compute the average accuracy score across all 5 rounds of experiment
# Accuracy values copied from the per-round evaluate cells; the last two match
# the printed round 4 and round 5 results (presumably rounds 1-3 come first).
accuracy_per_round = [0.5277, 0.5328, 0.5013, 0.5380, 0.5003]
# Round to three decimals *before* printing so the reported figure uses the
# same precision as the macro-F1 average.
average_accuracy = round(sum(accuracy_per_round) / len(accuracy_per_round), 3)
print(average_accuracy)

0.52002
