In [12]:
import transformers
print(transformers.__version__)

4.51.3


In [13]:
!pip install -U transformers



In [15]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoConfig,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)

In [16]:
# Cell 2: Data Preprocessing Function
def preprocess_absa_data(
    input_path: str = "../data/reviews_undersampled_dataset.csv",
    aspect_list: list = ["look", "smell", "taste", "feel"],
    test_size: float = 0.2,
    random_state: int = 42
):
    """
    Load raw dataset and produce train/test splits in ABSA format.
    Each review is split into multiple samples, one per non-null aspect score.
    """
    df = pd.read_csv(input_path)
    absa_samples = []
    for _, row in df.iterrows():
        text = row["text"]
        for aspect in aspect_list:
            score = row.get(aspect)
            if pd.notnull(score):
                absa_samples.append({
                    "text": text,
                    "aspect": aspect,
                    "score": score
                })
    absa_df = pd.DataFrame(absa_samples)
    train_df, test_df = train_test_split(
        absa_df,
        test_size=test_size,
        random_state=random_state
    )
    print(f"Preprocessing complete. Train samples: {len(train_df)}, Test samples: {len(test_df)}")
    return train_df, test_df

In [17]:
# Cell 3: PyTorch Dataset Class
class ABSADataSet(Dataset):
    """
    PyTorch Dataset for ABSA regression.
    Prepares (input_ids, attention_mask, labels) for each (aspect, text, score) sample.
    """
    def __init__(
        self,
        data,
        tokenizer_name: str = "GiRak/beer-sentiment-bert",
        max_length: int = 128
    ):
        if isinstance(data, str):
            df = pd.read_csv(data)
        else:
            df = data.copy()
        self.texts = df["text"].tolist()
        self.aspects = df["aspect"].tolist()
        self.labels = df["score"].tolist()
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx: int):
        text = self.texts[idx]
        aspect = self.aspects[idx]
        label = self.labels[idx]
        input_text = f"{aspect}: {text}"
        encoding = self.tokenizer(
            input_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        item["labels"] = torch.tensor(label, dtype=torch.float)
        return item

In [18]:
# Cell 4: ABSA Regressor Model with SmoothL1Loss and Last Two Layers Unfrozen
class ABSARegressor(nn.Module):
    """
    Regression model for ABSA using a (partially unfrozen) BERT encoder and a linear head.
    Uses SmoothL1Loss (Huber Loss) for robustness.
    """
    def __init__(
        self,
        base_model_name: str = "GiRak/beer-sentiment-bert",
        dropout_rate: float = 0.1,
        freeze_encoder: bool = True
    ):
        super().__init__()
        config = AutoConfig.from_pretrained(base_model_name)
        self.bert = AutoModel.from_pretrained(base_model_name, config=config)
        # Freeze all layers first
        for param in self.bert.parameters():
            param.requires_grad = False
        # Unfreeze the last two encoder layers and pooler
        for name, param in self.bert.named_parameters():
            if "encoder.layer.10" in name or "encoder.layer.11" in name or "pooler" in name:
                param.requires_grad = True
        self.dropout = nn.Dropout(dropout_rate)
        hidden_size = config.hidden_size
        self.regressor = nn.Linear(hidden_size, 1)
        self.loss_fn = nn.SmoothL1Loss()  # Use Huber Loss

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        bert_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        pooled = bert_outputs.pooler_output
        x = self.dropout(pooled)
        logits = self.regressor(x).squeeze(-1)
        output = {"logits": logits}
        if labels is not None:
            output["loss"] = self.loss_fn(logits, labels)
        return output

In [19]:
# Cell 5: Metrics Function
def compute_metrics(eval_pred):
    """
    Compute Mean Absolute Error and Root Mean Squared Error.
    """
    predictions, labels = eval_pred
    preds = predictions.squeeze(-1) if predictions.ndim > 1 else predictions
    mae = np.mean(np.abs(preds - labels))
    rmse = np.sqrt(np.mean((preds - labels) ** 2))
    return {"mae": mae, "rmse": rmse}

In [24]:
# Cell 6: Training and Evaluation
def main():
    # 1. Preprocess data
    train_df, test_df = preprocess_absa_data()

    # 2. Prepare datasets
    train_dataset = ABSADataSet(data=train_df)
    eval_dataset  = ABSADataSet(data=test_df)

    # 3. Data collator for dynamic padding
    data_collator = DataCollatorWithPadding(tokenizer=train_dataset.tokenizer)

        # 4. Training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        overwrite_output_dir=True,
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        logging_steps=100,
        save_steps=max(len(train_dataset)//16, 1),
        save_total_limit=2,
        fp16=False,
        evaluation_strategy="epoch",
        metric_for_best_model="eval_loss"  # Required for EarlyStoppingCallback
    )


    # 5. Initialize model with last two layers unfrozen
    model = ABSARegressor(freeze_encoder=True)

    # 6. Initialize Trainer with EarlyStopping
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    # 7. Train and evaluate
    trainer.train()
    eval_results = trainer.evaluate(eval_dataset=eval_dataset)
    print(f"Evaluation results: {eval_results}")

    # 8. Save final model
    trainer.save_model("./models/absa_bert_regressor")

if __name__ == "__main__":
    main()

Preprocessing complete. Train samples: 85929, Test samples: 21483


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'