In [1]:
# !pip install transformers datasets evaluate peft

In [2]:
# importing the libraries

from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig

import evaluate
import torch
import numpy as np


In [3]:
# setting the base model

model_checkpoint = "distilbert-base-uncased"

#define label maps
id2label = {0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"}
label2id = {"NEGATIVE": 0, "NEUTRAL": 1, "POSITIVE": 2}

In [4]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=3, id2label=id2label, label2id=label2id
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# load dataset
import pandas as pd

dataset = load_dataset('csv', data_files='/content/cleaned_sentiment_data.csv')
dataset



DatasetDict({
    train: Dataset({
        features: ['Cleaned_Sentence', 'Sentiment'],
        num_rows: 5842
    })
})

In [7]:
# from datasets import Dataset

# dataset = [
#     {"text":"geosolutions technology leverage benefon gps solution provide location base search technology community platform location relevant multimedia content new powerful commercial model .",
#      "sentiment": "2"},
#     {"text":"$ esi low $ 1.50 $ 2.50 bk real possibility",
#      "sentiment": "0"},
#     {"text":"accord finnishrussian chamber commerce major construction company finland operate russia .","sentiment": "1"
#     },
# ]

# dataset = Dataset.from_list(dataset)

In [8]:
# Preprocessing the data
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)


#create tokenize function
def tokenize_function(examples):
  #extract text and sentiment
  text = examples["Cleaned_Sentence"]
  sentiment = examples["Sentiment"]

  # tokenize and truncate text
  tokenizer.truncate_side = "left"
  tokenized_inputs = tokenizer(
      text,
      return_tensors="np",
      truncation = True,
      max_length = 512
      )

  # Add labels to the tokenized inputs
  tokenized_inputs["labels"] = [int(s) for s in sentiment] # Convert sentiment to integer labels

  return tokenized_inputs

# add pad token if none exits
if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})
  model.resize_token_embeddings(len(tokenizer))

# tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

tokenized_datasets # Corrected variable name to display the result

DatasetDict({
    train: Dataset({
        features: ['Cleaned_Sentence', 'Sentiment', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5842
    })
})

In [13]:
print(type(tokenized_datasets))


<class 'datasets.dataset_dict.DatasetDict'>


In [25]:
# Split the dataset into training and validation sets
train_test_split = tokenized_datasets.train_test_split(test_size=0.2) # 80% train, 20% validation

# Create a new DatasetDict with split data and remove unnecessary columns
tokenized_datasets = DatasetDict({
    'train': train_test_split['train'].remove_columns(['Cleaned_Sentence', 'Sentiment']),
    'validation': train_test_split['test'].remove_columns(['Cleaned_Sentence', 'Sentiment'])
})

display(tokenized_datasets)

AttributeError: 'DatasetDict' object has no attribute 'train_test_split'

In [16]:
display(tokenized_datasets)


DatasetDict({
    train: Dataset({
        features: ['Cleaned_Sentence', 'Sentiment', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4673
    })
    validation: Dataset({
        features: ['Cleaned_Sentence', 'Sentiment', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1169
    })
})

In [17]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

## Evaluation metrics

In [18]:
# Evaluation metrics

accuracy = evaluate.load("accuracy")

#define an evaluation function to pass into trainer later

def compute_metrics(p):
  predictions, labels = p
  predictions = np.argmax(predictions, axis=1)

  acc = accuracy.compute(predictions=predictions, references=labels)
  return {"accuracy": accuracy.compute(predictions=predictions,
  references = labels)}

## Untrained model performance

In [19]:
# define list of examples
text_list = [
    "The GeoSolutions technology will leverage Benefon 's GPS solutions by providing Location Based Search Technology , a Communities Platform , location relevant multimedia content and a new and powerful commercial model .",
    "$ESI on lows, down $1.50 to $2.50 BK a real possibility",
    "According to the Finnish-Russian Chamber of Commerce , all the major construction companies of Finland are operating in Russia ."
]

print("Untrained model predictions.")
print("-----------------------------")

# Determine the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device) # Move model to the device

for text in text_list:
  #tokenize text
  inputs = tokenizer.encode(text, return_tensors="pt").to(device) # Move input tensor to the device

  #compute logits
  logits = model(inputs).logits

  #convert logits to label
  predictions = torch.argmax(logits)

  print(text + " - "+ id2label[predictions.tolist()])
  print(inputs)

Untrained model predictions.
-----------------------------
The GeoSolutions technology will leverage Benefon 's GPS solutions by providing Location Based Search Technology , a Communities Platform , location relevant multimedia content and a new and powerful commercial model . - NEUTRAL
tensor([[  101,  1996, 20248, 19454, 13700,  2015,  2974,  2097, 21155,  3841,
         12879,  2239,  1005,  1055, 14658,  7300,  2011,  4346,  3295,  2241,
          3945,  2974,  1010,  1037,  4279,  4132,  1010,  3295,  7882, 14959,
          4180,  1998,  1037,  2047,  1998,  3928,  3293,  2944,  1012,   102]],
       device='cuda:0')
$ESI on lows, down $1.50 to $2.50 BK a real possibility - NEUTRAL
tensor([[  101,  1002,  9686,  2072,  2006,  2659,  2015,  1010,  2091,  1002,
          1015,  1012,  2753,  2000,  1002,  1016,  1012,  2753, 23923,  1037,
          2613,  6061,   102]], device='cuda:0')
According to the Finnish-Russian Chamber of Commerce , all the major construction companies of Fi

## Fine-tuning with LORA


In [20]:
peft_config = LoraConfig(task_type="SEQ_CLS", #Sequence calssificaion
                         r=4,  #intrinsic rank of trainable weight matrix
                         lora_alpha = 32, # this is like a learning rate
                         lora_dropout=0.01, #probablity of the dropout
                         target_modules = ['q_lin'] #we apply lora to query layer
                         )

In [21]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 629,763 || all params: 67,585,542 || trainable%: 0.9318


In [22]:
# hyperparameters
lr = 1e-3 # size of optimization step
batch_size = 4 # number of examples processed per optimization step
num_epochs = 10

#defining training arguments
training_args = TrainingArguments(
    output_dir = "sentiment_model",
    learning_rate = lr,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = num_epochs,
    weight_decay = 0.01,
    eval_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
)

In [17]:
!pip install transformers==4.45.2 peft==0.17.1




In [28]:
# creater trainer object

trainer = Trainer(
    model = model, # our peft model
    args = training_args, # hyperparameter
    train_dataset = tokenized_datasets['train'], # Use the training split
    eval_dataset = tokenized_datasets['validation'], # Use the validation split
    tokenizer = tokenizer, #define tokenizer
    data_collator = data_collator, # this will dynamically pad examples
    compute_metrics = compute_metrics, # evaluates model using compute_metrics
)

#train model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6191,0.730628,{'accuracy': 0.7476475620188195}
2,0.5948,0.746123,{'accuracy': 0.7279726261762189}
3,0.6541,0.805193,{'accuracy': 0.7553464499572284}
4,0.5904,0.759899,{'accuracy': 0.7279726261762189}
5,0.5901,0.7827,{'accuracy': 0.7331052181351583}
6,0.5335,0.802292,{'accuracy': 0.7433704020530368}
7,0.4725,0.908017,{'accuracy': 0.7467921300256629}
8,0.4561,0.888691,{'accuracy': 0.7459366980325064}
9,0.4512,0.979279,{'accuracy': 0.739093242087254}
10,0.4029,0.970635,{'accuracy': 0.737382378100941}


Trainer is attempting to log a value of "{'accuracy': 0.7476475620188195}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.7279726261762189}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.7553464499572284}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.7279726261762189}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.7331052181351583}" o

TrainOutput(global_step=11690, training_loss=0.5326281290568278, metrics={'train_runtime': 290.5429, 'train_samples_per_second': 160.837, 'train_steps_per_second': 40.235, 'total_flos': 376972821223200.0, 'train_loss': 0.5326281290568278, 'epoch': 10.0})

## Trained model performance

In [29]:
# Determine the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device) # moving model to the device

print("Trained model predictions:")

print("---------------------------")

for text in text_list:
  inputs = tokenizer.encode(text, return_tensors="pt").to(device) # Move input tensor to the device

  logits = model(inputs).logits

  predictions = torch.argmax(logits)

  print(text + " - " + id2label[predictions.tolist()])

Trained model predictions:
---------------------------
The GeoSolutions technology will leverage Benefon 's GPS solutions by providing Location Based Search Technology , a Communities Platform , location relevant multimedia content and a new and powerful commercial model . - POSITIVE
$ESI on lows, down $1.50 to $2.50 BK a real possibility - NEGATIVE
According to the Finnish-Russian Chamber of Commerce , all the major construction companies of Finland are operating in Russia . - NEUTRAL


In [30]:
# Define a directory to save the model
save_directory = "./fine_tuned_sentiment_model"

# Save the PEFT model (LoRA adapter weights)
model.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

print(f"Fine-tuned model and tokenizer saved to {save_directory}")

Fine-tuned model and tokenizer saved to ./fine_tuned_sentiment_model


In [31]:
import os
from google.colab import files

# Define the directory where the model was saved
save_directory = "./fine_tuned_sentiment_model"

# List files in the directory
for filename in os.listdir(save_directory):
    filepath = os.path.join(save_directory, filename)
    # Check if it's a file before attempting to download
    if os.path.isfile(filepath):
        print(f"Downloading {filename}...")
        files.download(filepath)

print("Download process initiated.")

Downloading adapter_config.json...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading adapter_model.safetensors...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading vocab.txt...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading special_tokens_map.json...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading tokenizer.json...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading README.md...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading tokenizer_config.json...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download process initiated.


In [33]:
# Example text for prediction
text_to_predict = "This is a great day!"

# Tokenize the input text using the loaded tokenizer
inputs = loaded_tokenizer(text_to_predict, return_tensors="pt")

# Move the input tensors to the same device as the loaded model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device)
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Get the model's predictions (logits)
with torch.no_grad(): # Disable gradient calculation for inference
    outputs = loaded_model(**inputs)
    logits = outputs.logits

# Get the predicted class index
predicted_class_id = torch.argmax(logits, dim=-1).item()

# Map the predicted class index to the sentiment label
predicted_label = id2label[predicted_class_id]

print(f"The sentiment of the text '{text_to_predict}' is: {predicted_label}")

The sentiment of the text 'This is a great day!' is: POSITIVE


In [32]:
from transformers import AutoModelForSequenceClassification
from peft import PeftModel, PeftConfig
import torch

# Define the directory where you saved the model
save_directory = "./fine_tuned_sentiment_model"

# Load the base model
model_checkpoint = "distilbert-base-uncased"
base_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

# Load the PEFT model (LoRA adapter weights)
# We load the base model first and then add the PEFT layers
loaded_model = PeftModel.from_pretrained(base_model, save_directory)

# You can also load the tokenizer
from transformers import AutoTokenizer
loaded_tokenizer = AutoTokenizer.from_pretrained(save_directory)

print("Fine-tuned model and tokenizer loaded successfully.")

# You can now use loaded_model and loaded_tokenizer for inference

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fine-tuned model and tokenizer loaded successfully.
