In [19]:
csv_path='../data/reviews_stratified_sampled.csv'
df = dd.read_csv(csv_path)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
import torch.nn as nn
import os
from typing import Dict, Optional, Union, List

class ABSARegressor(nn.Module):
    """
    BERT-based regression model for aspect-based sentiment analysis
    """
    def __init__(
        self,
        base_model_name: str = "GiRak/beer-sentiment-bert",
        dropout_rate: float = 0.1,
        freeze_encoder: bool = True
    ):
        super().__init__()
        config = AutoConfig.from_pretrained(base_model_name)
        self.bert = AutoModel.from_pretrained(base_model_name, config=config)
        if freeze_encoder:
            for param in self.bert.parameters():
                param.requires_grad = False
        self.dropout = nn.Dropout(dropout_rate)
        hidden_size = config.hidden_size
        self.regressor = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask, labels=None):
        bert_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled = bert_outputs.pooler_output
        x = self.dropout(pooled)
        logits = self.regressor(x).squeeze(-1)
        output = {"logits": logits}
        if labels is not None:
            loss_fct = nn.MSELoss()
            output["loss"] = loss_fct(logits, labels)
        return output

class BeerReviewAnalyzer:
    """
    啤酒评论分析器类，封装了模型加载和预测功能
    """
    def __init__(
        self,
        model_path: str = "1-sentiment-analysis/models/absa_bert_regressor",
        base_model_name: str = "GiRak/beer-sentiment-bert",
        device: str = "cuda" if torch.cuda.is_available() else "cpu"
    ):
        """
        初始化分析器
        
        Args:
            model_path: 模型路径
            base_model_name: 基础BERT模型名称
            device: 运行设备 ('cuda' 或 'cpu')
        """
        self.device = device
        self.model_path = model_path
        self.base_model_name = base_model_name
        self.aspects = ["look", "smell", "taste", "feel"]
        
        # 加载模型和分词器
        self._load_model_and_tokenizer()
        
    def _load_model_and_tokenizer(self):
        """加载模型和分词器"""
        try:
            # 加载分词器
            self.tokenizer = AutoTokenizer.from_pretrained(self.base_model_name)
            print("Tokenizer loaded successfully!")

            # 检查模型路径
            if not os.path.exists(self.model_path):
                print(f"Warning: Model path {self.model_path} does not exist")
                alt_path = "../1-sentiment-analysis/models/absa_bert_regressor"
                if os.path.exists(alt_path):
                    self.model_path = alt_path
                else:
                    raise FileNotFoundError(f"Model not found in {self.model_path} or {alt_path}")

            print(f"Loading model from: {self.model_path}")

            # 初始化模型
            self.model = ABSARegressor()
            
            # 加载模型权重
            state_dict = torch.load(
                os.path.join(self.model_path, "model.safetensors"),
                map_location=torch.device(self.device),
                weights_only=False
            )
            self.model.load_state_dict(state_dict)
            self.model.to(self.device)
            self.model.eval()
            print("Model loaded successfully!")
            
        except Exception as e:
            print(f"Error loading model: {str(e)}")
            raise

    def analyze_review(
        self,
        text: str,
        aspects: Optional[List[str]] = None,
        return_dict: bool = True
    ) -> Union[Dict[str, float], List[float]]:
        """
        分析啤酒评论
        
        Args:
            text: 评论文本
            aspects: 要分析的具体方面，如果为None则分析所有方面
            return_dict: 是否返回字典格式的结果
            
        Returns:
            如果 return_dict=True，返回 {aspect: score} 格式的字典
            如果 return_dict=False，返回 [score1, score2, ...] 格式的列表
        """
        if aspects is None:
            aspects = self.aspects
            
        try:
            results = {}
            with torch.no_grad():
                for aspect in aspects:
                    input_text = f"{aspect}: {text}"
                    inputs = self.tokenizer(
                        input_text,
                        truncation=True,
                        padding="max_length",
                        max_length=128,
                        return_tensors="pt"
                    ).to(self.device)
                    
                    outputs = self.model(**inputs)
                    score = outputs["logits"].item()
                    results[aspect] = round(score, 2)
            
            return results if return_dict else list(results.values())
            
        except Exception as e:
            print(f"Error during prediction: {str(e)}")
            return None

    def analyze_batch(
        self,
        texts: List[str],
        aspects: Optional[List[str]] = None
    ) -> List[Dict[str, float]]:
        """
        批量分析多个评论
        
        Args:
            texts: 评论文本列表
            aspects: 要分析的具体方面
            
        Returns:
            包含每个评论分析结果的列表
        """
        return [self.analyze_review(text, aspects) for text in texts]

    def get_model_info(self) -> Dict:
        """
        获取模型信息
        
        Returns:
            包含模型信息的字典
        """
        return {
            "device": self.device,
            "model_path": self.model_path,
            "base_model": self.base_model_name,
            "num_parameters": sum(p.numel() for p in self.model.parameters()),
            "vocab_size": len(self.tokenizer),
            "aspects": self.aspects
        }

# 使用示例
if __name__ == "__main__":
    # 初始化分析器
    analyzer = BeerReviewAnalyzer()
    
    # 分析单个评论
    review = "This beer has a beautiful golden color with a nice head. The aroma is fruity and hoppy. The taste is well-balanced with a slight bitterness. The mouthfeel is smooth and creamy."
    results = analyzer.analyze_review(review)
    print("\nSingle review analysis:")
    for aspect, score in results.items():
        print(f"{aspect}: {score}")
    
    # 批量分析多个评论
    reviews = [
        "The beer looks cloudy and dark. It smells like coffee and chocolate. The taste is very bitter and strong. The mouthfeel is thick and heavy.",
        "Clear golden color with a white head. Aroma of citrus and pine. Taste is balanced with a nice hop bitterness. Smooth mouthfeel."
    ]
    batch_results = analyzer.analyze_batch(reviews)
    print("\nBatch analysis results:")
    for i, result in enumerate(batch_results, 1):
        print(f"\nReview {i}:")
        for aspect, score in result.items():
            print(f"{aspect}: {score}")
    
    # 获取模型信息
    model_info = analyzer.get_model_info()
    print("\nModel Information:")
    for key, value in model_info.items():
        print(f"{key}: {value}")

Tokenizer loaded successfully!
Loading model from: ../1-sentiment-analysis/models/absa_bert_regressor
Error loading model: invalid load key, '\xe0'.


UnpicklingError: invalid load key, '\xe0'.

In [3]:
df

Unnamed: 0_level_0,beer_id,username,date,text,look,smell,taste,feel,overall,score,has_look,has_smell,has_taste,has_feel,mentioned_aspects,sentiment
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
,int64,string,string,string,float64,float64,float64,float64,float64,float64,bool,bool,bool,bool,string,string
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [23]:
import pandas as pd
from tqdm import tqdm
import dask.dataframe as dd
import torch
from torch.nn.functional import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load beer review dataset (must contain a 'text' column)
csv_path='../data/reviews_stratified_sampled.csv'
df = pd.read_csv(csv_path) 

# Load the tokenizer and fine-tuned BERT model for beer sentiment classification
model_name = "GiRak/beer-sentiment-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()  # Set the model to evaluation mode

# Prepare a list to store sentiment predictions
predicted_sentiments = []

# Loop through each review and classify sentiment
for text in tqdm(df['text'], desc="Predicting sentiment"):
    # Tokenize the input text and prepare it for the model
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    
    # Disable gradient calculation for inference
    with torch.no_grad():
        # Get raw logits from the model
        outputs = model(**inputs)
        
        # Apply softmax to convert logits into probability distribution
        probs = softmax(outputs.logits, dim=1)
        
        # Get the predicted label (0 = negative, 1 = positive)
        label_id = torch.argmax(probs).item()
        predicted_label = 'positive' if label_id == 1 else 'negative'
        
        # Store the predicted sentiment label
        predicted_sentiments.append(predicted_label)

# Add the predictions to the original DataFrame
df['bert_sentiment'] = predicted_sentiments

# Save the DataFrame with sentiment predictions to a new CSV file
df.to_csv('../data/beer_reviews_with_bert_sentiment.csv', index=False)
print("Sentiment analysis completed and results saved to 'beer_reviews_with_bert_sentiment.csv'")


Predicting sentiment: 100%|██████████| 27702/27702 [49:49<00:00,  9.27it/s]  


Sentiment analysis completed and results saved to 'beer_reviews_with_bert_sentiment.csv'


In [24]:
df_bert = pd.read_csv('../data/beer_reviews_with_bert_sentiment.csv')
df_bert

Unnamed: 0,beer_id,username,date,text,look,smell,taste,feel,overall,score,has_look,has_smell,has_taste,has_feel,mentioned_aspects,sentiment,bert_sentiment
0,1428,Aethien,2012-02-12,"poured into a tulip, it looks a nice golden...",3.00,1.5,1.5,2.0,2.0,1.74,True,True,True,True,"look, smell, taste, feel",negative,negative
1,4351,kojevergas,2011-07-31,can served into norrebro bryghus stemware i...,2.50,2.0,2.0,1.5,2.0,1.98,True,True,True,True,"look, smell, taste, feel",negative,negative
2,70485,CMUbrew,2012-03-10,reviewed from notes 500ml can poured into a...,3.00,1.0,1.0,1.5,1.0,1.17,True,True,True,True,"look, smell, taste, feel",negative,negative
3,27069,maximum12,2010-05-12,rec'd a bottle of this unbidden in a recent...,2.50,2.0,2.0,2.0,1.0,1.83,True,True,True,True,"look, smell, taste, feel",negative,negative
4,44788,HarleyRider,2010-06-26,heading out on the boat my wife brings home...,3.00,1.0,1.0,3.0,2.0,1.52,True,True,True,True,"look, smell, taste, feel",negative,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27697,90141,BlackBelt5112203,2013-07-26,5 oz pour into a tasting glass on tap at th...,4.25,4.5,4.5,4.0,4.5,4.44,True,True,True,True,"look, smell, taste, feel",positive,positive
27698,2128,woodychandler,2011-08-08,an afternoon of aus beers continued at my a...,4.00,4.0,4.0,4.0,4.0,4.00,True,True,True,True,"look, smell, taste, feel",positive,positive
27699,19216,05Harley,2011-02-25,bottled on: (should be on the bottom right ...,5.00,4.0,4.0,4.0,4.0,4.06,True,True,True,True,"look, smell, taste, feel",positive,positive
27700,41951,brewdlyhooked13,2008-07-19,appearance - pours a dark reddish caramel w...,4.00,4.0,4.0,4.5,5.0,4.25,True,True,True,True,"look, smell, taste, feel",positive,positive


In [30]:
# Define the aspect columns to include in score computation
aspect_columns = ['look', 'smell', 'taste', 'feel']

# Step 1: Compute the mean score across the 4 aspects
df_bert['average_score'] = df_bert[aspect_columns].mean(axis=1).round(2)

# Step 2: Convert the average score into binary sentiment labels
# Rule: score >= 3.5 → 'positive', else → 'negative'
df_bert['new_sentiment'] = df_bert['average_score'].apply(
    lambda x: 'positive' if x >= 3.5 else 'negative'
)

# Step 3: Display updated DataFrame preview
print(df_bert[['look', 'smell', 'taste', 'feel', 'average_score','new_sentiment']].head())


   look  smell  taste  feel  average_score new_sentiment
0   3.0    1.5    1.5   2.0           2.00      negative
1   2.5    2.0    2.0   1.5           2.00      negative
2   3.0    1.0    1.0   1.5           1.62      negative
3   2.5    2.0    2.0   2.0           2.12      negative
4   3.0    1.0    1.0   3.0           2.00      negative


In [31]:
# Ensure both sentiment columns are lowercase strings
df_bert['new_sentiment'] = df_bert['new_sentiment'].astype(str).str.lower()
df_bert['bert_sentiment'] = df_bert['bert_sentiment'].astype(str).str.lower()

# Create a boolean column indicating whether the prediction matches the reference
df_bert['match'] = df_bert['new_sentiment'] == df_bert['bert_sentiment']

# Calculate match statistics
total = len(df_bert)
matched = df_bert['match'].sum()
unmatched = total - matched
accuracy = matched / total

# Print summary
print(f"Sentiment agreement: {accuracy:.2%} ({matched}/{total} matched)")
print(f"Mismatches: {unmatched}")

# Display a sample of mismatched rows for manual inspection
print("\n Sample mismatches:")
display(df_bert[df_bert['match'] == False][['text', 'new_sentiment', 'bert_sentiment']].head(10))


Sentiment agreement: 81.16% (22483/27702 matched)
Mismatches: 5219

 Sample mismatches:


Unnamed: 0,text,new_sentiment,bert_sentiment
9,"$2.30 for a 12oz single @ jacks. a ""robust ...",negative,positive
16,thanks to bitterpachyderm (!) for sending m...,negative,positive
20,"presentation: 1 pint 0.9 fluid ounce, half ...",negative,positive
21,"355ml foil-topped brown bottle, not sure if...",negative,positive
31,chilled bottle into a glass. a generous ext...,negative,positive
37,11.2 oz. bottle into a pint glass. appearan...,negative,positive
41,i was highly anticipating trying orval afte...,negative,positive
47,poured into a snifter on 1/7/11 shared by s...,negative,positive
49,very clear and crisp medium dark amber colo...,negative,positive
53,poured this 22 oz. brown bottle into a dfh ...,negative,positive
