In [None]:
!pip uninstall transformers datasets

In [1]:
import os
import json
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


#### ABSA Dataset

In [3]:
# Pretrained uncased BERT tokenizer, shared by the dataset loader and the inference helper.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Polarity name -> integer class id used as the classification target.
label_map = dict(positive=2, neutral=1, negative=0)

def load_absa_dataset(folder_path):
  """Load every ``.json`` file under ``folder_path`` and tokenize its aspect examples.

  Each JSON file is read as an iterable of entries. An entry provides its
  sentence as a token list under ``"token"`` and, optionally, a list under
  ``"aspects"`` whose items carry a ``"term"`` token list and a
  ``"polarity"`` string. One example is produced per (sentence, aspect) pair
  by passing the space-joined sentence and the space-joined aspect term to
  the module-level ``tokenizer`` as a text pair, truncated/padded to a
  length of 128.

  Directories and files are visited in sorted order so that the order of the
  returned list -- and therefore any seeded split made from it -- does not
  depend on the operating system's directory enumeration order.

  Args:
    folder_path: Root directory that is searched recursively.

  Returns:
    A list of dicts with the keys ``"input_ids"``, ``"attention_mask"`` and
    ``"token_type_ids"`` (row 0 of the corresponding tokenizer output) plus
    ``"label"`` (the int found in the module-level ``label_map``; a polarity
    missing from the map falls back to 1, i.e. neutral). Entries without
    aspects contribute nothing; an empty or missing folder yields ``[]``.

  Raises:
    KeyError: If an entry lacks ``"token"`` or an aspect lacks ``"term"`` or
      ``"polarity"``.
  """
  all_data = []
  for root, dirs, files in os.walk(folder_path):
    # Sorting in place steers the order in which os.walk descends into
    # sub-directories (top-down traversal honours edits to this list).
    dirs.sort()
    for file in sorted(files):
      if not file.endswith(".json"):
        continue

      path = os.path.join(root, file)
      with open(path, "r", encoding="utf-8") as f:
        content = json.load(f)
      # The file is closed here; tokenization below does not need the handle.

      for entry in content:
        full_text = " ".join(entry["token"])

        for aspect in entry.get("aspects", []):
          aspect_text = " ".join(aspect["term"])
          label = label_map.get(aspect["polarity"], 1)  # default to neutral

          # Format input as: sentence [SEP] aspect
          encoding = tokenizer(
            full_text,
            aspect_text,
            truncation=True,
            padding="max_length",
            max_length=128,
            return_tensors="pt"
          )
          # The tokenizer output is batched; keep row 0 for this single pair.
          all_data.append({
            "input_ids": encoding["input_ids"][0],
            "attention_mask": encoding["attention_mask"][0],
            "token_type_ids": encoding["token_type_ids"][0],
            "label": label
          })
  return all_data


In [5]:
# Dataset root. Set the ABSA_DATA_DIR environment variable to point at another
# location; without it the original local path is used.
ABSA_DATA_DIR = os.environ.get("ABSA_DATA_DIR", r"C:\Users\HP\Downloads\ABSA")

absa_data = load_absa_dataset(ABSA_DATA_DIR)

# A missing or empty folder yields an empty list rather than an error, so fail
# here with a clear message instead of later during the train/validation split.
assert absa_data, f"No ABSA examples found under {ABSA_DATA_DIR!r}"
len(absa_data)

46472

In [7]:
# Inspect only the first encoded example; echoing the whole list would print
# the tensors of every one of the loaded examples.
absa_data[:1]

[{'input_ids': tensor([ 101, 9573, 2051, 2003, 3565, 3435, 1010, 2105, 5973, 2013, 3486, 3823,
          2000, 1015, 3371, 1012,  102, 9573, 2051,  102,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0]),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       

#### PyTorch Model

In [9]:
# pytorch dataset
class ABSADataset(Dataset):
  """Map-style dataset over a list of pre-tokenized example dicts.

  Every element of ``data`` must provide ``"input_ids"``,
  ``"attention_mask"``, ``"token_type_ids"`` and an integer ``"label"``.
  """

  def __init__(self, data):
    # Keep a reference to the caller's list; nothing is copied.
    self.data = data

  def __len__(self):
    """Number of examples."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return example ``idx`` with its label as a ``torch.long`` tensor under ``"labels"``."""
    record = self.data[idx]
    # The three encoded fields are passed through unchanged.
    sample = {
      field: record[field]
      for field in ("input_ids", "attention_mask", "token_type_ids")
    }
    sample["labels"] = torch.tensor(record["label"], dtype=torch.long)
    return sample


In [11]:
# Hold out 10% of the examples for validation; the fixed random_state makes
# the shuffle repeatable for a given input order.
train_data, val_data = train_test_split(
  absa_data,
  test_size=0.1,
  random_state=42,
)

# Wrap each partition in the Dataset class defined above.
train_dataset, val_dataset = (
  ABSADataset(partition) for partition in (train_data, val_data)
)


In [13]:
# Pretrained uncased BERT with a sequence-classification head sized for three
# classes (num_labels=3), matching the three ids in label_map.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Fine-tuning configuration: evaluation, logging and checkpointing all happen
# once per epoch, and the checkpoint with the best validation accuracy is
# reloaded when training finishes.
training_args = TrainingArguments(
  output_dir="./absa_model",
  num_train_epochs=2,
  eval_strategy="epoch",
  # Per-epoch logging. A step interval (logging_steps) only applies to
  # logging_strategy="steps", so none is set here.
  logging_strategy="epoch",
  save_strategy="epoch",
  logging_dir="./logs",
  load_best_model_at_end=True,
  metric_for_best_model="accuracy",  # key produced by compute_metrics
  save_total_limit=1,
  report_to="none",  # disable WandB/MLFlow etc.
)

In [23]:
def compute_metrics(eval_pred):
  """Compute accuracy and macro-averaged F1 from a ``(logits, labels)`` pair.

  Args:
    eval_pred: Two-item iterable of class scores (one row per example) and
      the reference class ids.

  Returns:
    Dict with the keys ``"accuracy"`` and ``"f1"``.
  """
  scores, references = eval_pred
  # The predicted class is the column holding the highest score in each row.
  predicted = np.asarray(scores).argmax(axis=1)

  metrics = {}
  metrics["accuracy"] = accuracy_score(references, predicted)
  metrics["f1"] = f1_score(references, predicted, average="macro")
  return metrics


In [25]:
# Bundle the model, the training configuration, both datasets and the metric
# function into a Trainer.
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=train_dataset,
  eval_dataset=val_dataset,
  compute_metrics=compute_metrics
)

# Run fine-tuning. As the last expression of the cell, the value returned by
# train() is displayed as the cell output.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4929,0.289729,0.923838,0.919667
2,0.1518,0.166277,0.97031,0.96852




TrainOutput(global_step=10456, training_loss=0.3223378970227172, metrics={'train_runtime': 52468.7071, 'train_samples_per_second': 1.594, 'train_steps_per_second': 0.199, 'total_flos': 5502227791527936.0, 'train_loss': 0.3223378970227172, 'epoch': 2.0})

In [27]:
def classify_sentiment(text, aspect):
    """Predict the sentiment polarity expressed towards ``aspect`` in ``text``.

    The sentence and the aspect are encoded as a text pair with the
    module-level ``tokenizer`` (truncated/padded to a length of 128) and
    scored by the module-level ``model``.

    Args:
        text: The full sentence or review.
        aspect: The aspect term to classify within ``text``.

    Returns:
        ``"negative"``, ``"neutral"`` or ``"positive"``.
    """
    # Inverse of the training label encoding. Named differently from the
    # module-level ``label_map`` so the two mappings cannot be confused.
    id_to_polarity = {0: "negative", 1: "neutral", 2: "positive"}

    inputs = tokenizer(
        text, aspect,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128
    )
    # The inputs must live on the same device as the model weights; otherwise
    # the forward pass fails once the model has been moved to an accelerator.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Switch off dropout so the prediction is deterministic even if the model
    # was left in training mode.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
    prediction = torch.argmax(logits, dim=1).item()
    return id_to_polarity[prediction]

In [37]:
# One review that mentions two aspects with different sentiment.
text = "I think the best thing about this laptop is its CPU. But the rest of the hardware needs upgrading, like the RAM, it's fair but not good enough."

# Classify the same sentence once per aspect and print one result per line
# (RAM first, then CPU).
print(
    *(classify_sentiment(text, target) for target in ("RAM", "CPU")),
    sep="\n",
)

negative
positive


### To continue training

##### 1- From checkpoint (same dataset)

In [None]:
# Continue training with the existing trainer, restoring state from a saved checkpoint.
# NOTE(review): "checkpoint-10456" equals the global_step reported at the end of the
# 2-epoch run above, so with num_train_epochs still set to 2 there may be nothing
# left to train -- presumably num_train_epochs has to be raised first; confirm.
# NOTE(review): the path is relative to the working directory and differs from
# output_dir="./absa_model"; verify that it resolves to the same folder.
trainer.train(resume_from_checkpoint="Python Notebooks/absa_model/checkpoint-10456")

##### 2- Load Model (different dataset)

In [None]:
# Load the fine-tuned classifier from the checkpoint directory as the starting
# point for training on a different dataset.
# NOTE(review): this only rebinds the name `model`; the existing `trainer` was
# constructed with the previous object, so a new Trainer presumably has to be
# created around this model -- confirm.
model = BertForSequenceClassification.from_pretrained("Python Notebooks/absa_model/checkpoint-10456")

### Deployment

In [None]:
# save to a different folder for deployment
model.save_pretrained("./Sentemint_Model")
tokenizer.save_pretrained("./Sentemint_Model")