# Aspect-based Sentiment Regression and Evaluation for Beer Reviews
### (Fine-tuned from a Pretrained BERT Model: GiRak/beer-sentiment-bert)
### 1. Base Model selection
- We chose a domain-specific pre-trained model (GiRak/beer-sentiment-bert)
- This model was already fine-tuned on beer-related text
- Provides a strong foundation for beer review analysis
### 2. Model Architecture Modification
#### Parameter Freezing:
- All BERT parameters are frozen (freeze_encoder=True)
- Prevents catastrophic forgetting
- Preserves pre-trained knowledge
#### New Layers Added:
- Dropout layer (rate=0.1) for regularization
- Linear regression head for score prediction
- Only these new layers are trainable
### 3. Input Processing Modifications
- Modified input format to include aspect information
- Combined aspect and text using template: "{aspect}: {text}"
- Enables aspect-specific sentiment analysis
### 4. Training Strategy Modifications
- Changed from classification to regression
- Using MSE loss for score prediction
- Suitable for 1-5 rating scale

### 5. Evaluation Modifications
- Added regression-specific metrics (MAE, RMSE)
- Suitable for continuous score prediction
- Better evaluation of model performance

In [1]:
# train_absa.py

import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoConfig,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
import datetime

## Data Preprocessing
### This function preprocesses the raw dataset into ABSA format.
- Loads raw dataset
- Creates one ABSA sample per review and aspect with a non-null score
- Splits the ABSA samples into train/test sets
- Returns train/test DataFrames


In [2]:
def preprocess_absa_data(
    input_path: str = "../data/train_preprocessed_binned.csv",
    aspect_list: "list | None" = None,
    test_size: float = 0.2,
    random_state: int = 42
):
    """
    Load the raw dataset and produce train/test splits in ABSA format.

    Every review row is expanded into one sample per aspect whose score is
    non-null, so a single review can contribute several samples.

    Args:
        input_path: CSV file with a ``text`` column and one column per aspect.
        aspect_list: Aspect column names to expand. ``None`` (the default)
            means ``["look", "smell", "taste", "feel"]``. Names that are not
            columns of the CSV contribute no samples.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        ``(train_df, test_df)``: DataFrames with the columns ``text``,
        ``aspect`` and ``score``.
    """
    # Build the default inside the call so no list object is shared between calls.
    if aspect_list is None:
        aspect_list = ["look", "smell", "taste", "feel"]

    df = pd.read_csv(input_path)

    # Read whole columns once instead of materialising a Series per row.
    aspects = [aspect for aspect in aspect_list if aspect in df.columns]
    texts = df["text"].tolist()
    score_columns = [df[aspect].tolist() for aspect in aspects]

    # Row-major order (all aspects of review 0, then review 1, ...) is kept so
    # that a given random_state keeps selecting the same split.
    absa_samples = [
        {"text": text, "aspect": aspect, "score": score}
        for text, scores in zip(texts, zip(*score_columns))
        for aspect, score in zip(aspects, scores)
        if pd.notnull(score)
    ]
    absa_df = pd.DataFrame(absa_samples, columns=["text", "aspect", "score"])

    # NOTE(review): the split is done per (review, aspect) sample, so samples
    # built from the same review text can land in both train and test.
    train_df, test_df = train_test_split(
        absa_df,
        test_size=test_size,
        random_state=random_state
    )
    print(f"Preprocessing complete. Train samples: {len(train_df)}, Test samples: {len(test_df)}")
    return train_df, test_df

## Dataset Class
- Handles text and aspect data preparation
- Implements tokenization using pre-trained tokenizer
- Manages input sequence length
- Prepares data for model training

In [3]:
class ABSADataSet(Dataset):
    """
    PyTorch Dataset for ABSA regression.

    Each item is built from one (aspect, text, score) sample: the aspect and
    the text are joined as "{aspect}: {text}", tokenized, and returned together
    with the score as a float tensor under the key "labels".
    """

    def __init__(
        self,
        data,
        tokenizer_name: str = "GiRak/beer-sentiment-bert",
        max_length: int = 128,
        tokenizer=None
    ):
        """
        Args:
            data: a pandas DataFrame, or a path (``str`` or ``os.PathLike``) to
                a CSV file, with the columns ['text', 'aspect', 'score'].
            tokenizer_name: name passed to ``AutoTokenizer.from_pretrained``
                when no ``tokenizer`` is supplied.
            max_length: length every sample is truncated / padded to.
            tokenizer: optional ready-made tokenizer; when given it is used
                as-is and ``tokenizer_name`` is not loaded. This lets several
                datasets share one tokenizer instance.
        """
        import os  # local import: only needed for the path check below

        if isinstance(data, (str, os.PathLike)):
            df = pd.read_csv(data)
        else:
            # The columns are converted to fresh lists below, so the caller's
            # frame is never modified and no defensive copy is required.
            df = data

        self.texts = df["text"].tolist()
        self.aspects = df["aspect"].tolist()
        self.labels = df["score"].tolist()
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx: int):
        """
        Tokenize sample ``idx`` and return a dict holding the tokenizer's
        outputs (batch dimension removed) plus the float tensor "labels".
        """
        text = self.texts[idx]
        aspect = self.aspects[idx]
        label = self.labels[idx]
        input_text = f"{aspect}: {text}"
        # padding="max_length" makes every sample exactly max_length long, so a
        # padding collator applied afterwards has nothing left to pad.
        encoding = self.tokenizer(
            input_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        # return_tensors="pt" adds a leading batch dimension of size 1; drop it.
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        item["labels"] = torch.tensor(label, dtype=torch.float)
        return item


## Model Architecture
### This class implements the main model architecture using a pre-trained BERT model. It includes:
- Pre-trained BERT model initialization
- Optional parameter freezing for transfer learning
- Dropout layer for regularization
- Linear regression head for score prediction


In [None]:
class ABSARegressor(nn.Module):
    """
    Regression model for ABSA using a (frozen or unfrozen) BERT encoder and a linear head.

    The encoder's pooled representation goes through dropout and a single
    linear layer that outputs one score per sample.
    """

    def __init__(
        self,
        base_model_name: str = "GiRak/beer-sentiment-bert",
        dropout_rate: float = 0.1,
        freeze_encoder: bool = True
    ):
        """
        Args:
            base_model_name: checkpoint loaded with ``AutoConfig`` / ``AutoModel``.
            dropout_rate: dropout probability applied before the linear head.
            freeze_encoder: if True, all encoder parameters get
                ``requires_grad = False``; only the linear head is trained.
        """
        super().__init__()
        config = AutoConfig.from_pretrained(base_model_name)
        self.bert = AutoModel.from_pretrained(base_model_name, config=config)
        if freeze_encoder:
            # NOTE(review): this only stops gradient updates; the encoder still
            # follows the module's train()/eval() mode — confirm that is intended.
            for param in self.bert.parameters():
                param.requires_grad = False
        self.dropout = nn.Dropout(dropout_rate)
        hidden_size = config.hidden_size
        self.regressor = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask, labels=None, token_type_ids=None):
        """
        Run the encoder and the regression head.

        Args:
            input_ids: token ids for the batch.
            attention_mask: attention mask for the batch.
            labels: optional target scores; when given, an MSE loss is added
                to the output.
            token_type_ids: optional segment ids; forwarded to the encoder only
                when provided, so batches that carry them are accepted.

        Returns:
            dict with "logits" (last dimension squeezed away) and, when
            ``labels`` is given, "loss".
        """
        encoder_inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        if token_type_ids is not None:
            encoder_inputs["token_type_ids"] = token_type_ids
        bert_outputs = self.bert(**encoder_inputs)

        # Prefer the encoder's pooled output; encoders that do not provide one
        # fall back to the hidden state of the first token.
        pooled = getattr(bert_outputs, "pooler_output", None)
        if pooled is None:
            pooled = bert_outputs.last_hidden_state[:, 0]

        x = self.dropout(pooled)
        logits = self.regressor(x).squeeze(-1)
        output = {"logits": logits}
        if labels is not None:
            loss_fct = nn.MSELoss()
            # Cast so integer or differently-typed labels do not break the loss.
            output["loss"] = loss_fct(logits, labels.to(logits.dtype))
        return output

## Evaluation Metrics
- Mean Absolute Error (MAE)
- Root Mean Squared Error (RMSE)
- Used for model performance evaluation

In [5]:
import numpy as np

def compute_metrics(eval_pred):
    """
    Compute Mean Absolute Error and Root Mean Squared Error.

    Args:
        eval_pred: a pair ``(predictions, labels)`` of array-likes holding one
            value per sample. Shapes such as ``(N,)`` and ``(N, 1)`` are both
            accepted for either element.

    Returns:
        dict with the keys "mae" and "rmse" as Python floats.

    Raises:
        ValueError: if predictions and labels hold a different number of values.
    """
    predictions, labels = eval_pred
    # Flatten both sides: subtracting a (N,) array from a (N, 1) array would
    # broadcast to (N, N) and silently yield wrong metrics.
    preds = np.asarray(predictions, dtype=np.float64).reshape(-1)
    targets = np.asarray(labels, dtype=np.float64).reshape(-1)
    if preds.shape != targets.shape:
        raise ValueError(
            f"predictions and labels differ in size: {preds.size} vs {targets.size}"
        )
    errors = preds - targets
    mae = float(np.mean(np.abs(errors)))
    rmse = float(np.sqrt(np.mean(errors ** 2)))
    return {"mae": mae, "rmse": rmse}


## Training Loop
### Main training pipeline:
- Data preparation and loading
- Model initialization
- Trainer setup
- Training execution
- Evaluation

In [None]:
def main():
    """
    Full training routine using Huggingface Trainer.

    Steps: build the ABSA train/test splits, wrap them in ABSADataSet, train an
    ABSARegressor with a frozen encoder, evaluate on the test split, then save
    the model and tokenizer under ./models/absa_bert_regressor_02.
    """
    # One value drives the train/eval batch sizes and the checkpoint interval,
    # so they cannot drift apart.
    batch_size = 16

    # Preprocess data
    train_df, test_df = preprocess_absa_data()

    # Prepare datasets
    train_dataset = ABSADataSet(data=train_df)
    eval_dataset = ABSADataSet(data=test_df)

    # ABSADataSet already pads every sample to max_length, so no additional
    # padding is expected here; the collator assembles the samples into batches.
    data_collator = DataCollatorWithPadding(tokenizer=train_dataset.tokenizer)

    # Training arguments (adjust for small-scale testing if needed)
    training_args = TrainingArguments(
        output_dir="./results",
        overwrite_output_dir=True,
        num_train_epochs=5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        logging_steps=100,
        # len(dataset) // batch_size: about one checkpoint per pass over the
        # training set when training on a single device.
        save_steps=max(len(train_dataset) // batch_size, 1),
        save_total_limit=2,
        fp16=False  # set True if GPU with mixed precision available
    )

    # Initialize model with frozen encoder
    model = ABSARegressor(freeze_encoder=True)

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Train and evaluate
    trainer.train()
    eval_results = trainer.evaluate(eval_dataset=eval_dataset)
    print(f"Evaluation results: {eval_results}")

    # Save final model together with the tokenizer used to build its inputs
    model_save_path = "./models/absa_bert_regressor_02"
    trainer.save_model(model_save_path)
    train_dataset.tokenizer.save_pretrained(model_save_path)


# Run the training pipeline only when executed directly, not on import.
if __name__ == "__main__":
    main()

In [11]:
# Build the ABSA train/test splits with the default settings.
train_df, test_df = preprocess_absa_data()

# Training split: first rows, then the per-column summary.
train_head = train_df.head()
print("Train DataFrame Preview:")
display(train_head)

print("\nTrain DataFrame Info:")
train_df.info()

# Test split: first rows only.
test_head = test_df.head()
print("\nTest DataFrame Preview:")
display(test_head)



Preprocessing complete. Train samples: 85929, Test samples: 21483
Train DataFrame Preview:


Unnamed: 0,text,aspect,score
47077,comes in a clear 550ml.glass bottle and pou...,smell,4.0
1297,"500 ml. bottle. capped, batch # 246 b. date...",smell,1.0
75327,"on cask at meridian pint (march 2014), serv...",feel,4.75
13420,"here we go, we the oddly placed pantheon of...",look,2.5
34534,poured from a bottle. expiration 4/22/14. a...,taste,1.75



Train DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 85929 entries, 47077 to 15795
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   text    85929 non-null  object 
 1   aspect  85929 non-null  object 
 2   score   85929 non-null  float64
dtypes: float64(1), object(2)
memory usage: 2.6+ MB

Test DataFrame Preview:


Unnamed: 0,text,aspect,score
28278,appearance: clear yellow with vaporware hea...,taste,2.0
80569,appearance: poured into a chimay glass with...,smell,4.5
64724,"thanks to readbaron for the bottle, spicela...",look,4.0
35023,341ml brown bottle with a twist-off cap pou...,feel,2.0
88146,acquired via trade...thanks frank. 12 ounce...,taste,4.5
