In [1]:
import pandas as pd
from datasets import Dataset
from dotenv import load_dotenv
import os

from pathlib import Path

# Specify your .env file path
dotenv_path = Path("../.env").resolve()

load_dotenv(dotenv_path=dotenv_path.resolve(), override=True)

columns = ['tweet_id', 'entity', 'sentiment', 'tweet_content']

base_path = os.getenv('BASE_DATASET_PATH')

train_df = pd.read_csv(f'{base_path}\\twitter_training.csv', names=columns, header=None)
val_df = pd.read_csv(f'{base_path}\\twitter_validation.csv', names=columns, header=None)


In [2]:
train_df.head(2)

Unnamed: 0,tweet_id,entity,sentiment,tweet_content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...


In [4]:
train_df = train_df.drop(columns=['tweet_id', 'entity'])
val_df = val_df.drop(columns=['tweet_id', 'entity'])

In [5]:
train_df.head(2)

Unnamed: 0,sentiment,tweet_content
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...


In [8]:
import numpy as np
labrel_map = {'neutral': 0, 'positive': 1, 'negative': 2}

train_df['label'] = train_df['sentiment'].str.strip().str.lower().map(labrel_map)
train_df = train_df.replace([np.inf, -np.inf], np.nan)
train_df['label'] = train_df['label'].fillna(0).astype(int)

val_df['label'] = val_df['sentiment'].str.strip().str.lower().map(labrel_map) 
val_df = val_df.replace([np.inf, -np.inf], np.nan)
val_df['label'] = val_df['label'].fillna(0).astype(int)

In [9]:
train_df.head(2)

Unnamed: 0,sentiment,tweet_content,label
0,Positive,im getting on borderlands and i will murder yo...,1
1,Positive,I am coming to the borders and I will kill you...,1


In [10]:
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)

train_ds = train_ds.filter(lambda x: x['label'] is not None)
val_ds   = val_ds.filter(lambda x: x['label'] is not None)

Filter:   0%|          | 0/74682 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

model.config.problem_type = "single_label_classification"

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def tokenize_function(examples):
    texts = [str(x) if x is not None else "" for x in examples['tweet_content']]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

train_ds = train_ds.map(tokenize_function, batched=True, remove_columns=['tweet_content'])
val_ds   = val_ds.map(tokenize_function,   batched=True, remove_columns=['tweet_content'])

Map:   0%|          | 0/74682 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [13]:
import transformers, peft, huggingface_hub
print(transformers.__version__)
print(peft.__version__)
print(huggingface_hub.__version__)

4.54.0
0.16.0
0.34.3


In [14]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=8,    
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

model = get_peft_model(model, lora_config)

In [None]:
#help(LoraConfig)


**How These Parameters Work Together**  
- During fine-tuning, LoRA replaces each targeted weight matrix $$W$$ with:  
  $$W + \frac{\alpha}{r} \Delta W,$$  
  where $$\Delta W = A \times B$$ with $$A \in \mathbb{R}^{d\times r}$$ and $$B \in \mathbb{R}^{r\times d}$$.  
- Setting `r` and `alpha` controls the capacity and influence of the adapter.  
- `lora_dropout` regularizes adapter training, and `bias` dictates whether additional bias parameters are trainable.  
- `target_modules` lets you focus adaptation on specific layers (here, attention query/value projections).  
- `inference_mode=False` (the default) ensures gradients and training hooks remain enabled.

In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    return {"accuracy": acc, "f1": f1}

training_args = TrainingArguments(
    output_dir="./bert-base-uncased-twitter-sentiment-finetuned-checkpoint",
    eval_strategy="steps",
    save_strategy="steps",
    logging_strategy="steps",
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    learning_rate=3e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3, 
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
#help(TrainingArguments)

In [16]:
trainer.train()

  return forward_call(*args, **kwargs)


Step,Training Loss,Validation Loss,Accuracy,F1
500,0.8018,0.697092,0.71,0.709557
1000,0.7437,0.688998,0.705,0.703908
1500,0.7085,0.580383,0.752,0.752087
2000,0.6905,0.567597,0.757,0.756572
2500,0.6338,0.545937,0.766,0.766324
3000,0.6239,0.49065,0.796,0.79614
3500,0.6093,0.502629,0.795,0.794356
4000,0.6177,0.446155,0.82,0.819891
4500,0.5715,0.471205,0.805,0.804323
5000,0.5251,0.407247,0.834,0.834293


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=7002, training_loss=0.6301653206229244, metrics={'train_runtime': 7178.2925, 'train_samples_per_second': 31.212, 'train_steps_per_second': 0.975, 'total_flos': 1.4788518876232704e+16, 'train_loss': 0.6301653206229244, 'epoch': 3.0})

In [17]:
output_dir="./bert-base-uncased-twitter-sentiment-finetuned-model"
trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
trainer.save_model(output_dir)