In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


!pip install datasets
from datasets import load_dataset

import transformers
from transformers import TrainingArguments, Trainer
from transformers import get_scheduler
from transformers import AutoTokenizer, AutoModelForSequenceClassification

!pip install evaluate
import evaluate

!pip install torch
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW

from google.colab import drive
drive.mount('/content/drive')

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2
Mounted at /content/drive


In [64]:
#load the data

df_train = pd.read_csv('/content/drive/MyDrive/Fewshotproject/SecondExperiment/Data/train.csv')
df_train['label description'] = 'The text has positive sentiment.' #add 'label description' column

df_test = pd.read_csv('/content/drive/MyDrive/Fewshotproject/SecondExperiment/Data/test.csv')
df_test['label description'] = 'The text has positive sentiment.' #add 'label description' column

df_dev = pd.read_csv('/content/drive/MyDrive/Fewshotproject/SecondExperiment/Data/dev.csv')
df_dev['label description'] = 'The text has positive sentiment.' #add 'label description' column

validation = df_dev[0:400] #get small validation set
#create new csv files
df_train.to_csv('/content/drive/MyDrive/Fewshotproject/SecondExperiment/Data/train.csv', index=False)
df_test.to_csv('/content/drive/MyDrive/Fewshotproject/SecondExperiment/Data/test.csv', index=False)
validation.to_csv('/content/drive/MyDrive/Fewshotproject/SecondExperiment/Data/validation.csv', index=False) #rename as validation.csv


print(df_train.columns)
print(df_test.columns)
print(df_dev.columns)
print(validation.shape)

Index(['label', 'text', 'label description'], dtype='object')
Index(['label', 'text', 'label description'], dtype='object')
Index(['label', 'text', 'label description'], dtype='object')
(400, 3)


In [65]:
#'label' column: 1 = positive sentiment, 0 = negative
#create a new column called 'label description': every cell is "The text has positive sentiment."


In [66]:
#First batch of training data
positives = df_train[df_train['label'] == 1].sample(n=8, random_state=40) #randomly select 8 'positive' examples

negatives = df_train[df_train['label'] == 0].sample(n=8, random_state=40) #randomly select 8 'negative' examples

batch1 = pd.concat([positives, negatives])

#convert the current batch to csv, so that it can be input into load_dataset
batch1.to_csv('/content/drive/MyDrive/Fewshotproject/SecondExperiment/Data/batch1.csv', index=False)
print(batch1.head(10))

      label                                               text  \
6139      1  Gute Nachrichten am Morgen: Momentan läuft es ...   
3051      1  Der Streik geht zu Ende, die Schlichtung begin...   
715       1  Re: Berliner Kurier Sind denn nur noch beklopp...   
5133      1  Re: Berliner Morgenpost Seit dem Ferien sind f...   
3407      1  Re: ULTRA Fanatics - Das Leben was wir wählten...   
5123      1  Ja in der S Bahn isses warm und trocken. Da is...   
4750      1  Freiburg- Stühlinger 1A Toplage schöne zentral...   
3823      1                      Da ist meine Bahn #zauberhaft   
4098      0  neues-deutschland.de Ramelow kritisiert langen...   
99        0  [DB Regio] 1. Akt. #München Hbf - #Garmisch-#P...   

                     label description  
6139  The text has positive sentiment.  
3051  The text has positive sentiment.  
715   The text has positive sentiment.  
5133  The text has positive sentiment.  
3407  The text has positive sentiment.  
5123  The text has positive

In [67]:
#Second batch of training data
positives = df_train[df_train['label'] == 1].sample(n=8, random_state=39) #randomly select 8 'positive' examples

negatives = df_train[df_train['label'] == 0].sample(n=8, random_state=39) #randomly select 8 'negative' examples

batch2 = pd.concat([positives, negatives])

#convert the current batch to csv, so that it can be input into load_dataset
batch2.to_csv('/content/drive/MyDrive/Fewshotproject/SecondExperiment/Data/batch2.csv', index=False)
print(batch2.head(10))

      label                                               text  \
2016      1  lvz.de Besserer S-Bahn-Takt – Züge pendeln öft...   
2381      1                  Hast wohl gutes WLAN in der Bahn.   
4954      1  +++ Bahn-Streik im Live-Ticker +++: Überrasche...   
3499      1  rbb Mediathek: Charité-Streik ausgesetzt, Konf...   
2041      1  "next stop: muenster." danke an die  fuer 15 s...   
1039      1  Verkehrsausschuss-Vorsitzender begrüßt Schlich...   
4422      1  Bahn-Chef Grube verspricht weniger Chaos am Ba...   
3560      1  In der ICE-Toilette sind jetzt Blumen im Fenst...   
522       0  Gepl. Abfahrt war 19:19 ab HB Hbf im ICE. ICE ...   
3276      0  Das Landesticket für Bus und #Bahn im Südweste...   

                     label description  
2016  The text has positive sentiment.  
2381  The text has positive sentiment.  
4954  The text has positive sentiment.  
3499  The text has positive sentiment.  
2041  The text has positive sentiment.  
1039  The text has positive

In [68]:
#Third batch of training data
positives = df_train[df_train['label'] == 1].sample(n=8, random_state=38) #randomly select 8 'positive' examples

negatives = df_train[df_train['label'] == 0].sample(n=8, random_state=38) #randomly select 8 'negative' examples

batch3 = pd.concat([positives, negatives])

#convert the current batch to csv, so that it can be input into load_dataset
batch3.to_csv('/content/drive/MyDrive/Fewshotproject/SecondExperiment/Data/batch3.csv', index=False)
print(batch3.head(10))

      label                                               text  \
2260      1  Re: DB Bahn Charly Kro Für Wunsch-Sitzplätze k...   
38        1  Emnid-Umfrage: Die Bahn ist kundenfreundlich, ...   
5105      1  77 Teams für Toleranz. Weber: "Bahn-Azubis mac...   
782       1  bahnstreik vorbei! glaub ich das nun? Einigung...   
1203      1  Brennerpass, das Nadelöhr Zielorte. Die Auswah...   
1043      1  Guten Morgen. Schnelltrasse statt Schnellstraß...   
2912      1             deswegen fahr ich lieber mit der bahn�   
1594      1  Betrunken Fahrrad geklaut Betrunken Fahrrad ge...   
2526      0  TS: S-Bahn-Ausfall? Die Berliner nehmen's spor...   
784       0  Die Bahn fährt nicht und vielleicht ist hier g...   

                     label description  
2260  The text has positive sentiment.  
38    The text has positive sentiment.  
5105  The text has positive sentiment.  
782   The text has positive sentiment.  
1203  The text has positive sentiment.  
1043  The text has positive

In [69]:
#Fourth batch of training data
positives = df_train[df_train['label'] == 1].sample(n=8, random_state=37) #randomly select 8 'positive' examples

negatives = df_train[df_train['label'] == 0].sample(n=8, random_state=37) #randomly select 8 'negative' examples

batch4 = pd.concat([positives, negatives])

#convert the current batch to csv, so that it can be input into load_dataset
batch4.to_csv('/content/drive/MyDrive/Fewshotproject/SecondExperiment/Data/batch4.csv', index=False)
print(batch4.head(10))

      label                                               text  \
6188      1  Großartig \o/ Dann nehm ich mein: "Ihr habt me...   
1249      1  Wiederentdeckt: Im Norden hat man Humor! (Re-P...   
579       1  JETZT EINZIEHEN: 3 Zimmer Wohnung mit wundersc...   
6162      1  Endlich Lösung gefunden 21.05.2015, 07:25 Uhr ...   
5314      1  #bahnstreik angeblich zu Ende. #GDL und Bahn s...   
1707      1  #focus Onlineportal soll WLAN entlasten - Revo...   
5752      1  Bahn fährt bald schneller und öfter von Deutsc...   
5997      1  Re: Teneriffa Umständlich von stuttgart aus :(...   
3781      0  Super Diskussion! Kinder unter 6 Jahren sind f...   
3381      0  #youknowyouregermanwhen die post streikt, die ...   

                     label description  
6188  The text has positive sentiment.  
1249  The text has positive sentiment.  
579   The text has positive sentiment.  
6162  The text has positive sentiment.  
5314  The text has positive sentiment.  
1707  The text has positive

In [70]:
#Fifth batch of training data
positives = df_train[df_train['label'] == 1].sample(n=8, random_state=36) #randomly select 8 'positive' examples

negatives = df_train[df_train['label'] == 0].sample(n=8, random_state=36) #randomly select 8 'negative' examples

batch5 = pd.concat([positives, negatives])

#convert the current batch to csv, so that it can be input into load_dataset
batch5.to_csv('/content/drive/MyDrive/Fewshotproject/SecondExperiment/Data/batch5.csv', index=False)
print(batch5.head(10))

      label                                               text  \
6176      1  Ich weiß nicht, wie erfreut  ist, dass ausgere...   
2787      1  DB-Streik heute, 21.05.2015- Donnerstag, um 19...   
2164      1  Pläne zum Bahnhofsumbau NIEDERNHAUSEN - (red)....   
1669      1  München: Streik der Lokführer in Deutschland ü...   
3661      1  Re: GDL und Bahn: Der Streik wird beendet. +Ha...   
3556      1  Liebe Freunde in Deutschland. Die Deutsche Bah...   
3094      1  Re: Stuttgart-Vaihingen Dem schließe ich mich ...   
1379      1  Mit der #db kann das fahren doch ganz angenehm...   
1537      0  #Bahn hat mal wieder Verspätung.... every fckn...   
1768      0  Guys. Meine Bahn steht nun. Es muss eine Ersat...   

                     label description  
6176  The text has positive sentiment.  
2787  The text has positive sentiment.  
2164  The text has positive sentiment.  
1669  The text has positive sentiment.  
3661  The text has positive sentiment.  
3556  The text has positive

In [71]:
# pose sequence as a NLI premise and label as a hypothesis
from transformers import AutoModelForSequenceClassification, AutoTokenizer
model = AutoModelForSequenceClassification.from_pretrained('joeddav/xlm-roberta-large-xnli')
tokenizer = AutoTokenizer.from_pretrained('joeddav/xlm-roberta-large-xnli')



#delete below string
'''
premise = sequence
hypothesis = f'This example is {label}.'

# run through model pre-trained on MNLI
x = tokenizer.encode(premise, hypothesis, return_tensors='pt',
                     truncation_strategy='only_first')
logits = model(x.to(device))[0]

# we throw away "neutral" (dim 1) and take the probability of
# "entailment" (2) as the probability of the label being true
entail_contradiction_logits = logits[:,[0,2]]
probs = entail_contradiction_logits.softmax(dim=1)
prob_label_is_true = probs[:,1]
'''

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


'\npremise = sequence\nhypothesis = f\'This example is {label}.\'\n\n# run through model pre-trained on MNLI\nx = tokenizer.encode(premise, hypothesis, return_tensors=\'pt\',\n                     truncation_strategy=\'only_first\')\nlogits = model(x.to(device))[0]\n\n# we throw away "neutral" (dim 1) and take the probability of\n# "entailment" (2) as the probability of the label being true\nentail_contradiction_logits = logits[:,[0,2]]\nprobs = entail_contradiction_logits.softmax(dim=1)\nprob_label_is_true = probs[:,1]\n'

In [72]:
#ROUND 1 OF EXPERIMENT
#train set is few_shot_batch1
#repeat this cell with 'few_shot_batch2.csv' and 'few_shot_batch3.csv' for the next two rounds
data_files = {'train': 'batch1.csv', 'test': 'test.csv', 'validation': 'validation.csv'}

dataset = load_dataset("/content/drive/MyDrive/Fewshotproject/SecondExperiment/Data/", data_files=data_files)


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [73]:
#tokenize the text
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [74]:
#tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/1490 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [75]:
#round 1
torch.cuda.empty_cache()
tokenized_datasets = tokenized_dataset.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.remove_columns(["label description"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

small_eval_dataset = tokenized_datasets["validation"] #rename validation set as small_eval_dataset
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=4,
    num_workers=2,
    pin_memory=True
    )
#try increasing evaluation batch_size from 8 to 16
eval_dataloader = DataLoader(small_eval_dataset, batch_size=16)


optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
#new training loop
from tqdm.auto import tqdm
import torch

# Create progress bar
progress_bar = tqdm(range(num_training_steps), desc="Training", leave=True)

model.train()
for epoch in range(num_epochs):
    # Update the description to include the current epoch
    progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs}")

    # Training loop
    total_train_loss = 0  # Track the total training loss for the epoch
    num_train_batches = 0  # Track the number of training batches

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Accumulate training loss
        total_train_loss += loss.item()
        num_train_batches += 1

        # Update the progress bar with the current loss
        progress_bar.update(1)
        progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches)

    # Evaluation loop
    model.eval()  # Set model to evaluation mode
    eval_loss = 0
    num_eval_batches = 0

    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():  # Disable gradient calculations
            outputs = model(**batch)
            eval_loss += outputs.loss.item()  # Accumulate evaluation loss
        num_eval_batches += 1

    eval_loss /= num_eval_batches  # Average evaluation loss
    progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches, eval_loss=eval_loss)  # Update progress bar with both losses

    print(f"Epoch {epoch+1} | Training Loss: {total_train_loss / num_train_batches:.4f} | Evaluation Loss: {eval_loss:.4f}")

    model.train()  # Set model back to training mode


Training:   0%|          | 0/20 [00:00<?, ?it/s]

  self.pid = os.fork()


Epoch 1 | Training Loss: 1.6565 | Evaluation Loss: 1.6003
Epoch 2 | Training Loss: 0.4523 | Evaluation Loss: 0.9625
Epoch 3 | Training Loss: 0.2338 | Evaluation Loss: 0.8661
Epoch 4 | Training Loss: 0.1342 | Evaluation Loss: 0.8659
Epoch 5 | Training Loss: 0.1113 | Evaluation Loss: 0.8774


In [76]:
#get both accuracy and f1-macro on the full test set
import evaluate

# Prepare the evaluation dataloader
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=16)

# Load the F1 and accuracy metrics
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")

model.eval()  # Set the model to evaluation mode

# Evaluation loop
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)  # Get the predicted class

    # Add predictions and references (true labels) to both metrics
    f1_metric.add_batch(predictions=predictions, references=batch["labels"])
    accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])

# Compute both metrics
f1_score = f1_metric.compute(average="macro")  # Use weighted average for F1 if you have imbalanced classes
accuracy = accuracy_metric.compute()

print(f"Macro-F1 Score: {f1_score['f1']:.4f}")
print(f"Accuracy: {accuracy['accuracy']:.4f}")


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Macro-F1 Score: 0.3690
Accuracy: 0.6497


In [80]:
#ROUND 2 OF EXPERIMENT

model = AutoModelForSequenceClassification.from_pretrained('joeddav/xlm-roberta-large-xnli')
tokenizer = AutoTokenizer.from_pretrained('joeddav/xlm-roberta-large-xnli')
#repeat this cell with 'few_shot_batch2.csv' and 'few_shot_batch3.csv' for the next two rounds
data_files = {'train': 'batch2.csv', 'test': 'test.csv', 'validation': 'validation.csv'}

dataset = load_dataset("/content/drive/MyDrive/Fewshotproject/SecondExperiment/Data/", data_files=data_files)

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [81]:
#tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

In [82]:
#round 2
torch.cuda.empty_cache()
tokenized_datasets = tokenized_dataset.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.remove_columns(["label description"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

small_eval_dataset = tokenized_datasets["validation"] #rename validation set as small_eval_dataset
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=4,
    num_workers=2,
    pin_memory=True
    )
#try increasing evaluation batch_size from 8 to 16
eval_dataloader = DataLoader(small_eval_dataset, batch_size=16)


optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
#new training loop
from tqdm.auto import tqdm
import torch

# Create progress bar
progress_bar = tqdm(range(num_training_steps), desc="Training", leave=True)

model.train()
for epoch in range(num_epochs):
    # Update the description to include the current epoch
    progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs}")

    # Training loop
    total_train_loss = 0  # Track the total training loss for the epoch
    num_train_batches = 0  # Track the number of training batches

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Accumulate training loss
        total_train_loss += loss.item()
        num_train_batches += 1

        # Update the progress bar with the current loss
        progress_bar.update(1)
        progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches)

    # Evaluation loop
    model.eval()  # Set model to evaluation mode
    eval_loss = 0
    num_eval_batches = 0

    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():  # Disable gradient calculations
            outputs = model(**batch)
            eval_loss += outputs.loss.item()  # Accumulate evaluation loss
        num_eval_batches += 1

    eval_loss /= num_eval_batches  # Average evaluation loss
    progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches, eval_loss=eval_loss)  # Update progress bar with both losses

    print(f"Epoch {epoch+1} | Training Loss: {total_train_loss / num_train_batches:.4f} | Evaluation Loss: {eval_loss:.4f}")

    model.train()  # Set model back to training mode

Training:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 1 | Training Loss: 1.4716 | Evaluation Loss: 1.8283
Epoch 2 | Training Loss: 0.5797 | Evaluation Loss: 1.0557
Epoch 3 | Training Loss: 0.2381 | Evaluation Loss: 0.9707
Epoch 4 | Training Loss: 0.0864 | Evaluation Loss: 0.9847
Epoch 5 | Training Loss: 0.1161 | Evaluation Loss: 1.0007


In [83]:
#get both accuracy and f1-macro on the full test set
import evaluate

# Prepare the evaluation dataloader
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=16)

# Load the F1 and accuracy metrics
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")

model.eval()  # Set the model to evaluation mode

# Evaluation loop
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)  # Get the predicted class

    # Add predictions and references (true labels) to both metrics
    f1_metric.add_batch(predictions=predictions, references=batch["labels"])
    accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])

# Compute both metrics
f1_score = f1_metric.compute(average="macro")  # Use weighted average for F1 if you have imbalanced classes
accuracy = accuracy_metric.compute()

print(f"Macro-F1 Score: {f1_score['f1']:.4f}")
print(f"Accuracy: {accuracy['accuracy']:.4f}")


Macro-F1 Score: 0.3468
Accuracy: 0.5940


In [84]:
#ROUND 3 OF EXPERIMENT

model = AutoModelForSequenceClassification.from_pretrained('joeddav/xlm-roberta-large-xnli')
tokenizer = AutoTokenizer.from_pretrained('joeddav/xlm-roberta-large-xnli')
#repeat this cell with 'few_shot_batch2.csv' and 'few_shot_batch3.csv' for the next two rounds
data_files = {'train': 'batch3.csv', 'test': 'test.csv', 'validation': 'validation.csv'}

dataset = load_dataset("/content/drive/MyDrive/Fewshotproject/SecondExperiment/Data/", data_files=data_files)

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [85]:
#tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/1490 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [86]:
#round 3
torch.cuda.empty_cache()
tokenized_datasets = tokenized_dataset.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.remove_columns(["label description"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

small_eval_dataset = tokenized_datasets["validation"] #rename validation set as small_eval_dataset
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=4,
    num_workers=2,
    pin_memory=True
    )
#try increasing evaluation batch_size from 8 to 16
eval_dataloader = DataLoader(small_eval_dataset, batch_size=16)


optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
#new training loop
from tqdm.auto import tqdm
import torch

# Create progress bar
progress_bar = tqdm(range(num_training_steps), desc="Training", leave=True)

model.train()
for epoch in range(num_epochs):
    # Update the description to include the current epoch
    progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs}")

    # Training loop
    total_train_loss = 0  # Track the total training loss for the epoch
    num_train_batches = 0  # Track the number of training batches

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Accumulate training loss
        total_train_loss += loss.item()
        num_train_batches += 1

        # Update the progress bar with the current loss
        progress_bar.update(1)
        progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches)

    # Evaluation loop
    model.eval()  # Set model to evaluation mode
    eval_loss = 0
    num_eval_batches = 0

    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():  # Disable gradient calculations
            outputs = model(**batch)
            eval_loss += outputs.loss.item()  # Accumulate evaluation loss
        num_eval_batches += 1

    eval_loss /= num_eval_batches  # Average evaluation loss
    progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches, eval_loss=eval_loss)  # Update progress bar with both losses

    print(f"Epoch {epoch+1} | Training Loss: {total_train_loss / num_train_batches:.4f} | Evaluation Loss: {eval_loss:.4f}")

    model.train()  # Set model back to training mode

Training:   0%|          | 0/20 [00:00<?, ?it/s]

  self.pid = os.fork()


Epoch 1 | Training Loss: 2.5237 | Evaluation Loss: 2.1019
Epoch 2 | Training Loss: 0.5606 | Evaluation Loss: 1.3036
Epoch 3 | Training Loss: 0.3872 | Evaluation Loss: 0.9950
Epoch 4 | Training Loss: 0.3301 | Evaluation Loss: 0.9498
Epoch 5 | Training Loss: 0.2793 | Evaluation Loss: 0.9615


In [87]:
#get both accuracy and f1-macro on the full test set
import evaluate

# Prepare the evaluation dataloader
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=16)

# Load the F1 and accuracy metrics
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")

model.eval()  # Set the model to evaluation mode

# Evaluation loop
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)  # Get the predicted class

    # Add predictions and references (true labels) to both metrics
    f1_metric.add_batch(predictions=predictions, references=batch["labels"])
    accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])

# Compute both metrics
f1_score = f1_metric.compute(average="macro")  # Use weighted average for F1 if you have imbalanced classes
accuracy = accuracy_metric.compute()

print(f"Macro-F1 Score: {f1_score['f1']:.4f}")
print(f"Accuracy: {accuracy['accuracy']:.4f}")

Macro-F1 Score: 0.3461
Accuracy: 0.5940


In [88]:
#ROUND 4 OF EXPERIMENT

model = AutoModelForSequenceClassification.from_pretrained('joeddav/xlm-roberta-large-xnli')
tokenizer = AutoTokenizer.from_pretrained('joeddav/xlm-roberta-large-xnli')
#repeat this cell with 'few_shot_batch2.csv' and 'few_shot_batch3.csv' for the next two rounds
data_files = {'train': 'batch3.csv', 'test': 'test.csv', 'validation': 'validation.csv'}

dataset = load_dataset("/content/drive/MyDrive/Fewshotproject/SecondExperiment/Data/", data_files=data_files)

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [89]:
#tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1490 [00:00<?, ? examples/s]

In [90]:
#round 4
torch.cuda.empty_cache()
tokenized_datasets = tokenized_dataset.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.remove_columns(["label description"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

small_eval_dataset = tokenized_datasets["validation"] #rename validation set as small_eval_dataset
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=4,
    num_workers=2,
    pin_memory=True
    )
#try increasing evaluation batch_size from 8 to 16
eval_dataloader = DataLoader(small_eval_dataset, batch_size=16)


optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 4
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
#new training loop
from tqdm.auto import tqdm
import torch

# Create progress bar
progress_bar = tqdm(range(num_training_steps), desc="Training", leave=True)

model.train()
for epoch in range(num_epochs):
    # Update the description to include the current epoch
    progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs}")

    # Training loop
    total_train_loss = 0  # Track the total training loss for the epoch
    num_train_batches = 0  # Track the number of training batches

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Accumulate training loss
        total_train_loss += loss.item()
        num_train_batches += 1

        # Update the progress bar with the current loss
        progress_bar.update(1)
        progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches)

    # Evaluation loop
    model.eval()  # Set model to evaluation mode
    eval_loss = 0
    num_eval_batches = 0

    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():  # Disable gradient calculations
            outputs = model(**batch)
            eval_loss += outputs.loss.item()  # Accumulate evaluation loss
        num_eval_batches += 1

    eval_loss /= num_eval_batches  # Average evaluation loss
    progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches, eval_loss=eval_loss)  # Update progress bar with both losses

    print(f"Epoch {epoch+1} | Training Loss: {total_train_loss / num_train_batches:.4f} | Evaluation Loss: {eval_loss:.4f}")

    model.train()  # Set model back to training mode

Training:   0%|          | 0/16 [00:00<?, ?it/s]

  self.pid = os.fork()


Epoch 1 | Training Loss: 2.7226 | Evaluation Loss: 2.5199
Epoch 2 | Training Loss: 0.7020 | Evaluation Loss: 1.6201
Epoch 3 | Training Loss: 0.5542 | Evaluation Loss: 1.1826
Epoch 4 | Training Loss: 0.4049 | Evaluation Loss: 1.0978


In [91]:
#get both accuracy and f1-macro on the full test set
import evaluate

# Prepare the evaluation dataloader
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=16)

# Load the F1 and accuracy metrics
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")

model.eval()  # Set the model to evaluation mode

# Evaluation loop
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)  # Get the predicted class

    # Add predictions and references (true labels) to both metrics
    f1_metric.add_batch(predictions=predictions, references=batch["labels"])
    accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])

# Compute both metrics
f1_score = f1_metric.compute(average="macro")  # Use weighted average for F1 if you have imbalanced classes
accuracy = accuracy_metric.compute()

print(f"Macro-F1 Score: {f1_score['f1']:.4f}")
print(f"Accuracy: {accuracy['accuracy']:.4f}")

Macro-F1 Score: 0.3198
Accuracy: 0.5356


In [96]:
#ROUND 5 OF EXPERIMENT

model = AutoModelForSequenceClassification.from_pretrained('joeddav/xlm-roberta-large-xnli')
tokenizer = AutoTokenizer.from_pretrained('joeddav/xlm-roberta-large-xnli')
#repeat this cell with 'few_shot_batch2.csv' and 'few_shot_batch3.csv' for the next two rounds
data_files = {'train': 'batch3.csv', 'test': 'test.csv', 'validation': 'validation.csv'}

dataset = load_dataset("/content/drive/MyDrive/Fewshotproject/SecondExperiment/Data/", data_files=data_files)

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [93]:
#tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [97]:
#round 5
torch.cuda.empty_cache()
tokenized_datasets = tokenized_dataset.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.remove_columns(["label description"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

small_eval_dataset = tokenized_datasets["validation"] #rename validation set as small_eval_dataset
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=4,
    num_workers=2,
    pin_memory=True
    )
#try increasing evaluation batch_size from 8 to 16
eval_dataloader = DataLoader(small_eval_dataset, batch_size=16)


optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
#new training loop
from tqdm.auto import tqdm
import torch

# Create progress bar
progress_bar = tqdm(range(num_training_steps), desc="Training", leave=True)

model.train()
for epoch in range(num_epochs):
    # Update the description to include the current epoch
    progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs}")

    # Training loop
    total_train_loss = 0  # Track the total training loss for the epoch
    num_train_batches = 0  # Track the number of training batches

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Accumulate training loss
        total_train_loss += loss.item()
        num_train_batches += 1

        # Update the progress bar with the current loss
        progress_bar.update(1)
        progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches)

    # Evaluation loop
    model.eval()  # Set model to evaluation mode
    eval_loss = 0
    num_eval_batches = 0

    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():  # Disable gradient calculations
            outputs = model(**batch)
            eval_loss += outputs.loss.item()  # Accumulate evaluation loss
        num_eval_batches += 1

    eval_loss /= num_eval_batches  # Average evaluation loss
    progress_bar.set_postfix(train_loss=total_train_loss / num_train_batches, eval_loss=eval_loss)  # Update progress bar with both losses

    print(f"Epoch {epoch+1} | Training Loss: {total_train_loss / num_train_batches:.4f} | Evaluation Loss: {eval_loss:.4f}")

    model.train()  # Set model back to training mode

Training:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 1 | Training Loss: 2.7242 | Evaluation Loss: 1.7631
Epoch 2 | Training Loss: 0.6589 | Evaluation Loss: 1.0920
Epoch 3 | Training Loss: 0.4635 | Evaluation Loss: 0.8537
Epoch 4 | Training Loss: 0.4419 | Evaluation Loss: 0.8349
Epoch 5 | Training Loss: 0.3854 | Evaluation Loss: 0.8518


In [98]:
#get both accuracy and f1-macro on the full test set
import evaluate

# Prepare the evaluation dataloader
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=16)

# Load the F1 and accuracy metrics
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")

model.eval()  # Set the model to evaluation mode

# Evaluation loop
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)  # Get the predicted class

    # Add predictions and references (true labels) to both metrics
    f1_metric.add_batch(predictions=predictions, references=batch["labels"])
    accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])

# Compute both metrics
f1_score = f1_metric.compute(average="macro")  # Use weighted average for F1 if you have imbalanced classes
accuracy = accuracy_metric.compute()

print(f"Macro-F1 Score: {f1_score['f1']:.4f}")
print(f"Accuracy: {accuracy['accuracy']:.4f}")

Macro-F1 Score: 0.3507
Accuracy: 0.6174


In [99]:
# Python program to get average of a list
def Average(lst):
    return sum(lst) / len(lst)

# Driver Code
macrof1_scores = [0.3690, 0.3468, 0.3461, 0.3198, 0.3507]
average_macrof1 = Average(macrof1_scores)
accuracy_scores =[0.6497, 0.5940, 0.5940, 0.5356, 0.6174]
average_accuracy = Average(accuracy_scores)

# Printing average of the list
print("Average macro f1 =", round(average_macrof1, 4))
print("Average accuracy =", round(average_accuracy, 4))

Average macro f1 = 0.3465
Average accuracy = 0.5981
