# SAAM-R Workflow: Aspect-Based Sentiment Regression

This workflow implements the SAAM-R (Sentiment-Aspect Attribution Module for Regression) model to extract fine-grained sentiment scores for predefined aspects (e.g., look, smell, taste, feel) from beer reviews.

---

## 1. Prepare aspect score-labeled dataset

In [4]:
import pandas as pd
csv_path = '../data/reviews_undersampled_dataset.csv'
# Load the CSV file with Dask (lazy evaluation)
df = pd.read_csv(csv_path)

In [5]:
df

Unnamed: 0,beer_id,username,date,text,look,smell,taste,feel,overall,score,has_look,has_smell,has_taste,has_feel,mentioned_aspects,sentiment
0,1428,Aethien,2012-02-12,"poured into a tulip, it looks a nice golden...",3.00,1.5,1.5,2.0,2.0,1.74,True,True,True,True,"look, smell, taste, feel",negative
1,4351,kojevergas,2011-07-31,can served into norrebro bryghus stemware i...,2.50,2.0,2.0,1.5,2.0,1.98,True,True,True,True,"look, smell, taste, feel",negative
2,70485,CMUbrew,2012-03-10,reviewed from notes 500ml can poured into a...,3.00,1.0,1.0,1.5,1.0,1.17,True,True,True,True,"look, smell, taste, feel",negative
3,27069,maximum12,2010-05-12,rec'd a bottle of this unbidden in a recent...,2.50,2.0,2.0,2.0,1.0,1.83,True,True,True,True,"look, smell, taste, feel",negative
4,44788,HarleyRider,2010-06-26,heading out on the boat my wife brings home...,3.00,1.0,1.0,3.0,2.0,1.52,True,True,True,True,"look, smell, taste, feel",negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27697,90141,BlackBelt5112203,2013-07-26,5 oz pour into a tasting glass on tap at th...,4.25,4.5,4.5,4.0,4.5,4.44,True,True,True,True,"look, smell, taste, feel",positive
27698,2128,woodychandler,2011-08-08,an afternoon of aus beers continued at my a...,4.00,4.0,4.0,4.0,4.0,4.00,True,True,True,True,"look, smell, taste, feel",positive
27699,19216,05Harley,2011-02-25,bottled on: (should be on the bottom right ...,5.00,4.0,4.0,4.0,4.0,4.06,True,True,True,True,"look, smell, taste, feel",positive
27700,41951,brewdlyhooked13,2008-07-19,appearance - pours a dark reddish caramel w...,4.00,4.0,4.0,4.5,5.0,4.25,True,True,True,True,"look, smell, taste, feel",positive


In [6]:
import pandas as pd
import numpy as np

# Prepare aspect-level score columns
df['look_score'] = df['look'].where(df['has_look'], np.nan)
df['smell_score'] = df['smell'].where(df['has_smell'], np.nan)
df['taste_score'] = df['taste'].where(df['has_taste'], np.nan)
df['feel_score'] = df['feel'].where(df['has_feel'], np.nan)

# Drop rows with empty text or no aspect scores
df_cleaned = df.dropna(subset=['text'])
df_cleaned = df_cleaned.dropna(
    subset=['look_score', 'smell_score', 'taste_score', 'feel_score'],
    how='all'
)

# Keep only required columns
df_final = df_cleaned[['text', 'look_score', 'smell_score', 'taste_score', 'feel_score']]

# Save to CSV for SAAM-R
df_final.to_csv('../data/beer_saamr_ready.csv', index=False)
print("Dataset saved as: beer_saamr_ready.csv")

Dataset saved as: beer_saamr_ready.csv


In [7]:
df_final.head()

Unnamed: 0,text,look_score,smell_score,taste_score,feel_score
0,"poured into a tulip, it looks a nice golden...",3.0,1.5,1.5,2.0
1,can served into norrebro bryghus stemware i...,2.5,2.0,2.0,1.5
2,reviewed from notes 500ml can poured into a...,3.0,1.0,1.0,1.5
3,rec'd a bottle of this unbidden in a recent...,2.5,2.0,2.0,2.0
4,heading out on the boat my wife brings home...,3.0,1.0,1.0,3.0


In [9]:
import pandas as pd

# Assuming you already have df_final (containing text, look_score, smell_score, taste_score, feel_score)
aspects = ['look', 'smell', 'taste', 'feel']
records = []

# Iterate through each row of the DataFrame
for _, row in df_final.iterrows():
    for aspect in aspects:
        score_col = f"{aspect}_score"
        if pd.notna(row[score_col]):  # Skip if score is missing
            records.append({
                'text': row['text'],
                'aspect': aspect,
                'score': row[score_col]
            })

# Create a new DataFrame in SAAM-R format
df_saamr_format = pd.DataFrame(records)

# Save to CSV file for SAAM-R
df_saamr_format.to_csv('../data/beer_saamr_train_ready.csv', index=False)
print("SAAM-R training format data saved as: beer_saamr_train_ready.csv")

# Display the first few rows of the resulting DataFrame
df_saamr_format.head()

SAAM-R training format data saved as: beer_saamr_train_ready.csv


Unnamed: 0,text,aspect,score
0,"poured into a tulip, it looks a nice golden...",look,3.0
1,"poured into a tulip, it looks a nice golden...",smell,1.5
2,"poured into a tulip, it looks a nice golden...",taste,1.5
3,"poured into a tulip, it looks a nice golden...",feel,2.0
4,can served into norrebro bryghus stemware i...,look,2.5


In [10]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

# Download Punkt tokenizer if not already downloaded
nltk.download('punkt')

# Load the formatted dataset (from previous step)
df = pd.read_csv('../data/beer_saamr_train_ready.csv')

# Split each review text into individual sentences
df['sentences'] = df['text'].apply(sent_tokenize)

# Explode the sentence list so each sentence becomes a separate row (linked to the same aspect and score)
df_exploded = df.explode('sentences').reset_index(drop=True)

# Rename the exploded sentence column to 'sentence' for clarity
df_exploded = df_exploded.rename(columns={'sentences': 'sentence'})

# Save the sentence-level dataset for SAAM-R model input
df_exploded.to_csv('../data/beer_saamr_sentences.csv', index=False)
print("Saved sentence-level dataset as: beer_saamr_sentences.csv")

# Preview the result
df_exploded.head()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zhangzihan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Saved sentence-level dataset as: beer_saamr_sentences.csv


Unnamed: 0,text,aspect,score,sentence
0,"poured into a tulip, it looks a nice golden...",look,3.0,"poured into a tulip, it looks a nice golden..."
1,"poured into a tulip, it looks a nice golden...",look,3.0,"the smell is just alcohol, alcohol and alcohol..."
2,"poured into a tulip, it looks a nice golden...",look,3.0,the taste is just sugary sweetness and alcohol...
3,"poured into a tulip, it looks a nice golden...",look,3.0,the mouthfeel isn't bad but that's about all i...
4,"poured into a tulip, it looks a nice golden...",look,3.0,"all in all, it just tastes artificial to me, i..."


In [None]:
# Rename columns to standard names for SAAM-R
df_input = df_exploded.rename(columns={
    'sentence': 'text',
    'aspect': 'aspect',
    'score': 'label'
})

# Save the model-ready input
df_input.to_csv('../data/beer_saamr_model_input.csv', index=False)
print("Final SAAM-R training data saved as: beer_saamr_model_input.csv")

Final SAAM-R training data saved as: beer_saamr_model_input.csv


Unnamed: 0,text,aspect,label,text.1
0,"poured into a tulip, it looks a nice golden...",look,3.0,"poured into a tulip, it looks a nice golden..."
1,"poured into a tulip, it looks a nice golden...",look,3.0,"the smell is just alcohol, alcohol and alcohol..."
2,"poured into a tulip, it looks a nice golden...",look,3.0,the taste is just sugary sweetness and alcohol...
3,"poured into a tulip, it looks a nice golden...",look,3.0,the mouthfeel isn't bad but that's about all i...
4,"poured into a tulip, it looks a nice golden...",look,3.0,"all in all, it just tastes artificial to me, i..."


In [13]:
df_input.head()

Unnamed: 0,text,aspect,label,text.1
0,"poured into a tulip, it looks a nice golden...",look,3.0,"poured into a tulip, it looks a nice golden..."
1,"poured into a tulip, it looks a nice golden...",look,3.0,"the smell is just alcohol, alcohol and alcohol..."
2,"poured into a tulip, it looks a nice golden...",look,3.0,the taste is just sugary sweetness and alcohol...
3,"poured into a tulip, it looks a nice golden...",look,3.0,the mouthfeel isn't bad but that's about all i...
4,"poured into a tulip, it looks a nice golden...",look,3.0,"all in all, it just tastes artificial to me, i..."


In [29]:
import torch
import torch.nn as nn
from transformers import AutoModel

# Sentence Encoder
class SentenceEncoder(nn.Module):
    def __init__(self, model_name='bert-base-uncased'):
        super(SentenceEncoder, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return output.last_hidden_state[:, 0]  # CLS embedding

# Aspect Encoder
class AspectEncoder(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(AspectEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, output_dim),
            nn.ReLU()
        )

    def forward(self, x):
        return self.encoder(x)

# Regression Head
class RegressionHead(nn.Module):
    def __init__(self, input_dim):
        super(RegressionHead, self).__init__()
        self.regressor = nn.Linear(input_dim, 1)

    def forward(self, x):
        return self.regressor(x).squeeze(-1)

# SAAM-R model
class SAAMRModel(nn.Module):
    def __init__(self, sentence_encoder, aspect_encoder, regression_head):
        super(SAAMRModel, self).__init__()
        self.sentence_encoder = sentence_encoder
        self.aspect_encoder = aspect_encoder
        self.regression_head = regression_head

    def forward(self, input_ids, attention_mask):
        sentence_rep = self.sentence_encoder(input_ids, attention_mask)
        aspect_rep = self.aspect_encoder(sentence_rep)
        score = self.regression_head(aspect_rep)
        return score

In [30]:
# Define each module
sentence_encoder = SentenceEncoder('bert-base-uncased')
aspect_encoder = AspectEncoder(input_dim=768, output_dim=768)
regression_head = RegressionHead(input_dim=768)

# Compose into SAAM-R model
model = SAAMRModel(sentence_encoder, aspect_encoder, regression_head)

# Define loss and optimizer
loss_fn = nn.MSELoss()
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

In [31]:
import torch
import torch.nn as nn
from torch.optim import AdamW

# Define loss function
loss_fn = nn.MSELoss()

# Initialize optimizer (you can set model.parameters())
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

In [32]:
from tqdm import tqdm

def train_one_epoch(model, dataloader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        aspect_indices = batch['aspect_idx'].to(device)
        labels = batch['score'].to(device)

        optimizer.zero_grad()
        preds = model(input_ids, attention_mask, aspect_indices)
        loss = loss_fn(preds, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [33]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

def evaluate(model, dataloader, device):
    model.eval()
    preds, targets = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            aspect_indices = batch['aspect_idx'].to(device)
            labels = batch['score'].to(device)

            outputs = model(input_ids, attention_mask, aspect_indices)
            preds.extend(outputs.cpu().numpy())
            targets.extend(labels.cpu().numpy())

    mae = mean_absolute_error(targets, preds)
    rmse = mean_squared_error(targets, preds, squared=False)
    return mae, rmse

In [35]:
from torch.utils.data import Dataset

class BeerReviewDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        inputs = self.tokenizer(
            row['sentence'],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'score': torch.tensor(row['score'], dtype=torch.float)
        }

In [36]:
df_exploded

Unnamed: 0,text,aspect,score,sentence
0,"poured into a tulip, it looks a nice golden...",look,3.0,"poured into a tulip, it looks a nice golden..."
1,"poured into a tulip, it looks a nice golden...",look,3.0,"the smell is just alcohol, alcohol and alcohol..."
2,"poured into a tulip, it looks a nice golden...",look,3.0,the taste is just sugary sweetness and alcohol...
3,"poured into a tulip, it looks a nice golden...",look,3.0,the mouthfeel isn't bad but that's about all i...
4,"poured into a tulip, it looks a nice golden...",look,3.0,"all in all, it just tastes artificial to me, i..."
...,...,...,...,...
1550331,"22 ounce bottle into tulip glass, bottled o...",feel,4.0,"medium carbonation and body; with a smooth, cr..."
1550332,"22 ounce bottle into tulip glass, bottled o...",feel,4.0,alcohol is very well hidden with only a light ...
1550333,"22 ounce bottle into tulip glass, bottled o...",feel,4.0,overall this is an awesome dipa.
1550334,"22 ounce bottle into tulip glass, bottled o...",feel,4.0,all around great complexity of citrus/floral h...


In [37]:
from sklearn.model_selection import train_test_split

df_train, df_val = train_test_split(df_exploded, test_size=0.1, random_state=42)

In [38]:
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

train_dataset = BeerReviewDataset(df_train, tokenizer)
val_dataset = BeerReviewDataset(df_val, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [40]:
def train_one_epoch(model, dataloader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        scores = batch['score'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.squeeze(), scores)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

In [41]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

def evaluate(model, dataloader, device):
    model.eval()
    predictions, targets = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            scores = batch['score'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = outputs.squeeze().cpu().numpy()
            labels = scores.cpu().numpy()

            predictions.extend(preds)
            targets.extend(labels)

    mae = mean_absolute_error(targets, predictions)
    rmse = np.sqrt(mean_squared_error(targets, predictions))
    return mae, rmse

In [44]:
from transformers import get_linear_schedule_with_warmup

num_epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    train_loss = train_one_epoch(model, train_loader, optimizer, loss_fn, device)
    mae, rmse = evaluate(model, val_loader, device)
    print(f"Train Loss: {train_loss:.4f} | Val MAE: {mae:.3f} | RMSE: {rmse:.3f}")
    scheduler.step()


Epoch 1/3


KeyboardInterrupt: 