<a href="https://colab.research.google.com/github/Sivsai/ABSA/blob/main/ABSA_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    DebertaForTokenClassification,
    DebertaForSequenceClassification,
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    DataCollatorForTokenClassification,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import re
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##Define helper functions (metrics, label creator)




In [3]:
def create_bio_labels(row):
    sentence = row['Sentence']
    aspect_terms = row['Aspect Term']
    from_indices = row['from']
    to_indices = row['to']

    # Tokenize the sentence
    tokenized_output = tokenizer(
        sentence,
        return_offsets_mapping=True,
        return_special_tokens_mask=True,
        truncation=True,
    )
    tokens = tokenizer.convert_ids_to_tokens(tokenized_output['input_ids'])
    offsets = tokenized_output['offset_mapping']
    special_mask = tokenized_output['special_tokens_mask']

    # Initialize labels with 'O'
    labels = [label_map['O']] * len(tokens)


     # Align BIO using character spans
    for i in range(len(aspect_terms)):
        start_char = from_indices[i]
        end_char = to_indices[i]
        is_first_token = True
        for j, (offset_start, offset_end) in enumerate(offsets):
            if special_mask[j] == 1:  # special tokens
                continue
            if offset_start >= start_char and offset_end <= end_char:
                labels[j] = label_map['B-ASP'] if is_first_token else label_map['I-ASP']
                is_first_token = False
            elif offset_start < end_char and offset_end > start_char:
                labels[j] = label_map['B-ASP'] if is_first_token else label_map['I-ASP']
                is_first_token = False

    # Ignore special tokens during loss
    for j, is_special in enumerate(special_mask):
        if is_special == 1:
            labels[j] = -100

    tokenized_output['labels'] = labels
    return tokenized_output

#Evalution or Compute metrices for extraction for model
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    labels = p.label_ids

    true_predictions = [
        [inv_label_map[p_] for (p_, l_) in zip(pred, lab) if l_ != -100]
        for pred, lab in zip(preds, labels)
    ]
    true_labels = [
        [inv_label_map[l_] for (p_, l_) in zip(pred, lab) if l_ != -100]
        for pred, lab in zip(preds, labels)
    ]

    flat_preds = [x for seq in true_predictions for x in seq]
    flat_labels = [x for seq in true_labels for x in seq]

    return {
        "accuracy": accuracy_score(flat_labels, flat_preds),
        "f1": f1_score(flat_labels, flat_preds, average='macro'),
    }

#Evalution for sentiment classfication model
def compute_sentiment_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average='macro')
    }

##Define dataset classes

In [4]:
# Dataset for Aspect Extraction
class AspectDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        return {
            'input_ids': torch.tensor(item['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(item['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(item['labels'], dtype=torch.long)
        }

# Dataset for Sentiment Classification
class SentimentDataset(Dataset):
    def __init__(self, data, tokenizer, polarity_map, max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.polarity_map = polarity_map

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = self.data[idx]['sentence']
        aspect = self.data[idx]['aspect_term']
        polarity = self.data[idx]['polarity']
        text = f"{aspect} [SEP] {sentence}"

        encoding = self.tokenizer(
            text, truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.polarity_map[polarity], dtype=torch.long)
        }


##Define the training function (run_absa_training)

In [11]:
def run_absa_training(dataset_path, model_save_prefix):
    """
    Loads a dataset, trains aspect and sentiment models, and saves them.
    """
    print(f"--- Starting training for {dataset_path} ---")

    # 1. Load and preprocess data
    df = pd.read_csv(dataset_path)
    agg_df = df.groupby('id').agg({
        'Sentence': 'first',
        'Aspect Term': list,
        'polarity': list,
        'from': list,
        'to': list
    }).reset_index()

    # --- Aspect Extraction Model Training ---
    print("\n--- Training Aspect Extraction Model ---")
    processed_data = agg_df.apply(create_bio_labels, axis=1).tolist()
    train_data, test_data = train_test_split(processed_data, test_size=0.2, random_state=42)

    train_dataset = AspectDataset(train_data)
    test_dataset = AspectDataset(test_data)

    aspect_model = DebertaForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(label_map), ignore_mismatched_sizes=True)
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    aspect_output_dir = f'./results_{model_save_prefix}_aspect'
    training_args = TrainingArguments(
        output_dir=aspect_output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=8,
        logging_steps=50,
        eval_strategy="epoch",
        save_strategy="epoch",
        report_to="none"
    )

    trainer = Trainer(
        model=aspect_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator
    )
    trainer.train()
    trainer.save_model(aspect_output_dir)
    tokenizer.save_pretrained(aspect_output_dir)

    print(f"Aspect model saved to {aspect_output_dir}")

    # --- Sentiment Classification Model Training ---
    print("\n--- Training Sentiment Classification Model ---")
    sentiment_data = []
    for _, row in df.iterrows():
        if pd.notna(row['Aspect Term']):
            sentiment_data.append({
                'sentence': row['Sentence'],
                'aspect_term': row['Aspect Term'],
                'polarity': row['polarity']
            })

    polarity_map = {p: i for i, p in enumerate(df['polarity'].unique())}

    train_sentiment_data, test_sentiment_data = train_test_split(sentiment_data, test_size=0.2, random_state=42)
    train_sentiment_dataset = SentimentDataset(train_sentiment_data, tokenizer, polarity_map=polarity_map)
    test_sentiment_dataset = SentimentDataset(test_sentiment_data, tokenizer, polarity_map=polarity_map)

    sentiment_model = DebertaForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=len(polarity_map),ignore_mismatched_sizes=True
    )

    sentiment_output_dir = f'./results_{model_save_prefix}_sentiment'
    sentiment_training_args = TrainingArguments(
        output_dir=sentiment_output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=8,
        logging_steps=50,
        eval_strategy="epoch",
        save_strategy="epoch",
        report_to="none"
    )

    sentiment_trainer = Trainer(
        model=sentiment_model,
        args=sentiment_training_args,
        train_dataset=train_sentiment_dataset,
        eval_dataset=test_sentiment_dataset,
        compute_metrics=compute_sentiment_metrics
    )
    sentiment_trainer.train()
    sentiment_trainer.save_model(sentiment_output_dir)
    print(f"Sentiment model saved to {sentiment_output_dir}")

    return aspect_output_dir, sentiment_output_dir, polarity_map

##Run training for one or multiple datasets

In [12]:
# Define model name and tokenizer before running this cell
MODEL_NAME = "microsoft/deberta-v3-base"
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

laptopDataset = "/content/drive/MyDrive/Colab_Data/ABSA_Datasets/Laptop_Train_v2.csv"

# Label map for aspect tagging (BIO labels)
label_map = {'B-ASP': 0, 'I-ASP': 1, 'O': 2}
inv_label_map = {v: k for k, v in label_map.items()}


# Example: Train for the laptop dataset
laptop_aspect_path, laptop_sentiment_path, laptop_polarity_map = run_absa_training(
    dataset_path=laptopDataset,
    model_save_prefix='laptop'
)



--- Starting training for /content/drive/MyDrive/Colab_Data/ABSA_Datasets/Laptop_Train_v2.csv ---


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



--- Training Aspect Extraction Model ---


You are using a model of type deberta-v2 to instantiate a model of type deberta. This is not supported for all configurations of models and can yield errors.
Some weights of DebertaForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'deberta.encoder.layer.0.attention.self.in_proj.weight', 'deberta.encoder.layer.0.attention.self.pos_proj.weight', 'deberta.encoder.layer.0.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.0.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.0.attention.self.q_bias', 'deberta.encoder.layer.0.attention.self.v_bias', 'deberta.encoder.layer.1.attention.self.in_proj.weight', 'deberta.encoder.layer.1.attention.self.pos_proj.weight', 'deberta.encoder.layer.1.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.1.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.1.attention.self.q_bias', 'deberta.encoder.layer.1.attention.self

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.236,0.203238,0.927471,0.725392
2,0.171,0.204957,0.930121,0.739528
3,0.1291,0.195968,0.936744,0.75623


Aspect model saved to ./results_laptop_aspect

--- Training Sentiment Classification Model ---


You are using a model of type deberta-v2 to instantiate a model of type deberta. This is not supported for all configurations of models and can yield errors.
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'deberta.encoder.layer.0.attention.self.in_proj.weight', 'deberta.encoder.layer.0.attention.self.pos_proj.weight', 'deberta.encoder.layer.0.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.0.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.0.attention.self.q_bias', 'deberta.encoder.layer.0.attention.self.v_bias', 'deberta.encoder.layer.1.attention.self.in_proj.weight', 'deberta.encoder.layer.1.attention.self.pos_proj.weight', 'deberta.encoder.layer.1.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.1.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.1.attention.self.q_bias', 'deberta.encoder.layer.1.attention.s

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.1041,1.053978,0.563559,0.3181
2,1.0341,0.985104,0.610169,0.344156
3,0.8293,0.980444,0.616525,0.34532


Sentiment model saved to ./results_laptop_sentiment


In [15]:
RestaurantDataSet = "/content/drive/MyDrive/Colab_Data/ABSA_Datasets/Restaurants_Train_v2.csv"



Restaurant_aspect_path, Restaurant_sentiment_path, Restaurant_polarity_map = run_absa_training(
    dataset_path=RestaurantDataSet,
    model_save_prefix='Restaurant'
)




--- Starting training for /content/drive/MyDrive/Colab_Data/ABSA_Datasets/Restaurants_Train_v2.csv ---


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



--- Training Aspect Extraction Model ---


You are using a model of type deberta-v2 to instantiate a model of type deberta. This is not supported for all configurations of models and can yield errors.
Some weights of DebertaForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'deberta.encoder.layer.0.attention.self.in_proj.weight', 'deberta.encoder.layer.0.attention.self.pos_proj.weight', 'deberta.encoder.layer.0.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.0.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.0.attention.self.q_bias', 'deberta.encoder.layer.0.attention.self.v_bias', 'deberta.encoder.layer.1.attention.self.in_proj.weight', 'deberta.encoder.layer.1.attention.self.pos_proj.weight', 'deberta.encoder.layer.1.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.1.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.1.attention.self.q_bias', 'deberta.encoder.layer.1.attention.self

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2062,0.222474,0.922489,0.72322
2,0.188,0.198703,0.929137,0.752107
3,0.1188,0.198244,0.933239,0.769472


Aspect model saved to ./results_Restaurant_aspect

--- Training Sentiment Classification Model ---


You are using a model of type deberta-v2 to instantiate a model of type deberta. This is not supported for all configurations of models and can yield errors.
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'deberta.encoder.layer.0.attention.self.in_proj.weight', 'deberta.encoder.layer.0.attention.self.pos_proj.weight', 'deberta.encoder.layer.0.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.0.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.0.attention.self.q_bias', 'deberta.encoder.layer.0.attention.self.v_bias', 'deberta.encoder.layer.1.attention.self.in_proj.weight', 'deberta.encoder.layer.1.attention.self.pos_proj.weight', 'deberta.encoder.layer.1.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.1.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.1.attention.self.q_bias', 'deberta.encoder.layer.1.attention.s

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.9363,0.93043,0.572395,0.182014
2,0.9134,1.083671,0.589986,0.2233
3,0.8481,0.897311,0.65088,0.334683


Sentiment model saved to ./results_Restaurant_sentiment


##Load saved models and test prediction



##Laptop Models

In [13]:
from transformers import DebertaForTokenClassification, DebertaForSequenceClassification


save_dir = "/content/drive/MyDrive/Colab_Models/ABSA_Models"
os.makedirs(save_dir, exist_ok=True)

#Laptop Models
laptop_aspect_model = DebertaForTokenClassification.from_pretrained(laptop_aspect_path)
laptop_sentiment_model = DebertaForSequenceClassification.from_pretrained(laptop_sentiment_path)

laptop_aspect_model.save_pretrained(f"{save_dir}/laptop_aspect_model")
tokenizer.save_pretrained(f"{save_dir}/laptop_aspect_model")

laptop_sentiment_model.save_pretrained(f"{save_dir}/laptop_sentiment_model")
tokenizer.save_pretrained(f"{save_dir}/laptop_sentiment_model")





('/content/drive/MyDrive/Colab_Models/ABSA_Models/laptop_sentiment_model/tokenizer_config.json',
 '/content/drive/MyDrive/Colab_Models/ABSA_Models/laptop_sentiment_model/special_tokens_map.json',
 '/content/drive/MyDrive/Colab_Models/ABSA_Models/laptop_sentiment_model/spm.model',
 '/content/drive/MyDrive/Colab_Models/ABSA_Models/laptop_sentiment_model/added_tokens.json',
 '/content/drive/MyDrive/Colab_Models/ABSA_Models/laptop_sentiment_model/tokenizer.json')

##Restaurant Models


In [16]:
#Restaurant Models

Restaurant_aspect_model = DebertaForTokenClassification.from_pretrained(Restaurant_aspect_path)
Restaurant_sentiment_model = DebertaForSequenceClassification.from_pretrained(Restaurant_sentiment_path)

Restaurant_aspect_model.save_pretrained(f"{save_dir}/Restaurant_aspect_model")
tokenizer.save_pretrained(f"{save_dir}/Restaurant_aspect_model")

Restaurant_sentiment_model.save_pretrained(f"{save_dir}/Restaurant_sentiment_model")
tokenizer.save_pretrained(f"{save_dir}/Restaurant_sentiment_model")

('/content/drive/MyDrive/Colab_Models/ABSA_Models/Restaurant_sentiment_model/tokenizer_config.json',
 '/content/drive/MyDrive/Colab_Models/ABSA_Models/Restaurant_sentiment_model/special_tokens_map.json',
 '/content/drive/MyDrive/Colab_Models/ABSA_Models/Restaurant_sentiment_model/spm.model',
 '/content/drive/MyDrive/Colab_Models/ABSA_Models/Restaurant_sentiment_model/added_tokens.json',
 '/content/drive/MyDrive/Colab_Models/ABSA_Models/Restaurant_sentiment_model/tokenizer.json')

##Predict function

In [17]:
#Predict function

import torch
import numpy as np

def predict_absa(sentence, aspect_model, sentiment_model, tokenizer, inv_label_map, inv_polarity_map, max_len=128):
    aspect_model.eval()
    sentiment_model.eval()

    # ========== 1️⃣ ASPECT EXTRACTION ==========
    inputs = tokenizer(
        sentence,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=max_len
    )

    with torch.no_grad():
        outputs = aspect_model(**inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=2).squeeze().tolist()

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze())
    labels = [inv_label_map[p] for p in preds]

    # Extract aspect terms from BIO tags
    aspects = []
    current_aspect = []
    for token, label in zip(tokens, labels):
        if label == 'B-ASP':
            if current_aspect:
                aspects.append(' '.join(current_aspect))
                current_aspect = []
            current_aspect.append(token)
        elif label == 'I-ASP' and current_aspect:
            current_aspect.append(token)
        else:
            if current_aspect:
                aspects.append(' '.join(current_aspect))
                current_aspect = []
    if current_aspect:
        aspects.append(' '.join(current_aspect))

    # Clean up subword tokens like "##ing", "##ly"
    clean_aspects = []
    for asp in aspects:
        asp = asp.replace("##", "")
        asp = asp.replace("[CLS]", "").replace("[SEP]", "").strip()
        if asp:
            clean_aspects.append(asp)

    if not clean_aspects:
        return {"aspects": [], "sentiments": []}

    # ========== 2️⃣ SENTIMENT CLASSIFICATION ==========
    sentiments = []
    for aspect in clean_aspects:
        combined_text = f"{aspect} [SEP] {sentence}"
        enc = tokenizer(
            combined_text,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=max_len
        )

        with torch.no_grad():
            output = sentiment_model(**enc)
            pred = torch.argmax(output.logits, dim=1).item()

        sentiments.append(inv_polarity_map[pred])

    return {"aspects": clean_aspects, "sentiments": sentiments}




###
# Load the fine-tuned laptop models

laptop_aspect_model = DebertaForTokenClassification.from_pretrained(f"{save_dir}/laptop_aspect_model")
laptop_sentiment_model = DebertaForSequenceClassification.from_pretrained(f"{save_dir}/laptop_sentiment_model")
tokenizer = AutoTokenizer.from_pretrained(f"{save_dir}/laptop_aspect_model")

inv_laptop_polarity_map = {v: k for k, v in laptop_polarity_map.items()}

# Test on a sample review
laptop_review = "The keyboard is great but the battery dies quickly."
results = predict_absa(
    laptop_review,
    laptop_aspect_model,
    laptop_sentiment_model,
    tokenizer,
    inv_label_map,
    inv_laptop_polarity_map
)
print(results)


{'aspects': ['▁keyboard', '▁battery'], 'sentiments': ['positive', 'positive']}
