In [1]:
# Some settings
# Which pre-trained model to use.
# See https://huggingface.co/models for options.
checkpoint = "microsoft/deberta-v3-small"

# How much training data to use.
# 1.0 uses the whole training set but it can take a bit of time to train.
train_data_sample_ratio = 0.1

# Example sentence to use
# We print out predictions for this sentence before and after training
example_sentence = "this was by far the best movie of the year"
     

In [2]:
# Install the necessary libraries
!pip install datasets evaluate transformers[sentencepiece]  

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting transformers[sentencepiece]
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting filelock (from datasets)
  Using cached filelock-3.17.0-py3-none-any.whl.metadata (2.9 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Downloading xxhash

In [3]:
!pip install torch

Collecting torch
  Using cached torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl.metadata (28 kB)
Collecting networkx (from torch)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.5-py3-none-any.whl.metadata (2.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Using cached nvidia_cublas_cu12-12.4.5.8

In [2]:
# Import the libraries
import torch
import evaluate

from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AdamW
from transformers import AutoModelForSequenceClassification
from transformers import get_scheduler
from tqdm.auto import tqdm

In [3]:
# Checking whether you are running on CPU or GPU.
# If the output here says "cuda" then it's running on GPU. Otherwise it's probably CPU.
# In order to run your code in Colab on the GPU, go to Edit -> Notebook settings -> Hardware accelerator and set it to "GPU".
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [4]:
# Loading the pretrained model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model = model.to(device)

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
import pandas as pd

In [6]:
train_dataset_path ="../dataset/dpm_pcl_train.csv"
train_dataset = pd.read_csv(train_dataset_path)

val_dataset_path ="../dataset/dpm_pcl_val.csv"
val_dataset = pd.read_csv(val_dataset_path)

test_dataset_path ="../dataset/dpm_pcl_test.csv"
test_dataset = pd.read_csv(test_dataset_path)

train_dataset.head()

Unnamed: 0,par_id,art_id,keyword,country,text,orig_label
0,5825,@@9438566,in-need,au,Describing local police as under resourced and...,0
1,7171,@@1934487,hopeless,ng,The only force capable of stopping them is the...,3
2,680,@@9525972,in-need,nz,Plans by the Government to return to bulk fund...,0
3,4906,@@22596758,refugee,bd,"New figures reveal that more than 48,000 Rohin...",0
4,8180,@@13717053,migrant,ph,"He then listed several immigrants , mostly fro...",0


In [7]:
train_dataset["label"] = train_dataset["orig_label"].apply(lambda x : 0 if (x == 0 or x == 1) else 1)
val_dataset["label"] = val_dataset["orig_label"].apply(lambda x : 0 if (x == 0 or x == 1) else 1)
test_dataset["label"] = test_dataset["orig_label"].apply(lambda x : 0 if (x == 0 or x == 1) else 1)

train_dataset.loc[train_dataset["text"].isna(), "text"] = ""
val_dataset.loc[val_dataset["text"].isna(), "text"] = ""
test_dataset.loc[test_dataset["text"].isna(), "text"] = ""

train_dataset = train_dataset.drop(['par_id', 'art_id', 'keyword', 'country', 'orig_label'], axis=1)
val_dataset = val_dataset.drop(['par_id', 'art_id', 'keyword', 'country', 'orig_label'], axis=1)
test_dataset = test_dataset.drop(['par_id', 'art_id', 'keyword', 'country', 'orig_label'], axis=1)

train_dataset.head()

Unnamed: 0,text,label
0,Describing local police as under resourced and...,0
1,The only force capable of stopping them is the...,1
2,Plans by the Government to return to bulk fund...,0
3,"New figures reveal that more than 48,000 Rohin...",0
4,"He then listed several immigrants , mostly fro...",0


In [8]:
# Perform tokenization
tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=512)

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

raw_datasets_train = Dataset.from_pandas(train_dataset[['text', 'label']])
raw_datasets_val = Dataset.from_pandas(val_dataset[['text', 'label']])
raw_datasets_test = Dataset.from_pandas(test_dataset[['text', 'label']])

tokenized_datasets_train = raw_datasets_train.map(tokenize_function, batched=True, remove_columns=['text'])
tokenized_datasets_val = raw_datasets_val.map(tokenize_function, batched=True, remove_columns=['text'])
tokenized_datasets_test = raw_datasets_test.map(tokenize_function, batched=True, remove_columns=['text'])

# DataCollatorWithPadding constructs batches that are padded to the length of the longest sentence in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataloader = DataLoader(
    tokenized_datasets_train, shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets_val, batch_size=8, collate_fn=data_collator
)
test_dataloader = DataLoader(
    tokenized_datasets_test, batch_size=8, collate_fn=data_collator
)


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/6700 [00:00<?, ? examples/s]

Map:   0%|          | 0/1675 [00:00<?, ? examples/s]

Map:   0%|          | 0/2094 [00:00<?, ? examples/s]

In [9]:
next(iter(train_dataloader))

{'input_ids': tensor([[     1,  15541,  19507,  ...,      0,      0,      0],
        [     1,  42370,    270,  ...,      0,      0,      0],
        [     1,  10066,  24266,  ...,      0,      0,      0],
        ...,
        [     1,   4785,  11905,  ...,      0,      0,      0],
        [     1,  18443,   1964,  ...,      0,      0,      0],
        [     1, 108434,    357,  ...,      0,      0,      0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 0, 0, 1, 0, 0, 0, 0])}

In [10]:
# Printing out the shapes in one batch
example_batch = None
for batch in train_dataloader:
    example_batch = batch
    break

print({k: v.shape for k, v in example_batch.items()})


# Then printing out the loss, output shape and output values from one batch.
outputs = model(**example_batch.to(device))
print("output.loss: ", outputs.loss)
print("output.logits.shape: ", outputs.logits.shape)
print("output.logits: ", outputs.logits)

# Generating predictions for an example sentence.
# Haven't trained the model yet so these will be random.
def print_example_predictions(example_sentence, example_model):
    _e = tokenize_function({"text": example_sentence})
    _k = {k: torch.tensor([_e[k]]).to(device) for k in _e}
    model.eval()
    example_outputs = model(**_k)
    example_logits = example_outputs.logits.cpu().detach().numpy()
    example_probabilities = torch.nn.functional.softmax(example_outputs.logits, dim=1).cpu().detach().numpy()
    print(example_probabilities)
    print("Example sentence: ", example_sentence)
    print("Predicted logits: ", example_logits)
    print("Predicted probabilities: ", example_probabilities)
    print("Prediction: ", "negative" if example_probabilities[0][0] > example_probabilities[0][1] else "positive")

print_example_predictions(example_sentence, model)

{'input_ids': torch.Size([8, 512]), 'token_type_ids': torch.Size([8, 512]), 'attention_mask': torch.Size([8, 512]), 'labels': torch.Size([8])}
output.loss:  tensor(0.6671, device='cuda:0', grad_fn=<NllLossBackward0>)
output.logits.shape:  torch.Size([8, 2])
output.logits:  tensor([[ 0.0530, -0.0206],
        [ 0.0518, -0.0173],
        [ 0.0558, -0.0232],
        [ 0.0506, -0.0183],
        [ 0.0487, -0.0137],
        [ 0.0566, -0.0187],
        [ 0.0497, -0.0158],
        [ 0.0523, -0.0182]], device='cuda:0', grad_fn=<AddmmBackward0>)
[[0.5155225  0.48447752]]
Example sentence:  this was by far the best movie of the year
Predicted logits:  [[ 0.04703892 -0.01507097]]
Predicted probabilities:  [[0.5155225  0.48447752]]
Prediction:  negative


In [11]:

# Setting up model training for fine-tuning
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)



In [12]:

# Setting the model to training mode
model.train()

# Running the training
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/2514 [00:00<?, ?it/s]

In [15]:


# Setting the model to evaluation mode
model.eval()

# Running evaluation
metric = evaluate.load("f1")
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

print(metric.compute())

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

{'f1': 0.47368421052631576}


In [16]:
# Getting predictions for the example sentence again, now that we have trained the model
print_example_predictions(example_sentence, model)

[[0.9951402  0.00485979]]
Example sentence:  this was by far the best movie of the year
Predicted logits:  [[ 2.9714403 -2.3504477]]
Predicted probabilities:  [[0.9951402  0.00485979]]
Prediction:  negative
