# Benchmark SemEval với best_model_bitfit.pt
Model DistilBERT + LSTM với BitFit (chỉ train bias và layernorm)

In [None]:
# =============================================================================
# BENCHMARK THE SEMEVAL DATASET WITH BEST_MODEL_BITFIT.PT
# =============================================================================
# Model: DistilBERT + LSTM with BitFit (DistilBERT frozen, only bias + layernorm trained)

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"üñ•Ô∏è Using device: {device}")

# =============================================================================
# MODEL ARCHITECTURE DEFINITION (same as the training notebook)
# =============================================================================
class DistilBertLSTMClassifier(nn.Module):
    """DistilBERT encoder followed by a bidirectional LSTM and a linear classifier.

    The pretrained ``distilbert-base-uncased`` encoder is frozen except for the
    parameters selected by :meth:`_is_bitfit_parameter` (bias terms and
    layer-norm parameters), which is the BitFit setup named in this notebook.
    Submodule names (``distilbert``, ``lstm``, ``dropout``, ``fc``) define the
    ``state_dict`` keys and must stay as they are for checkpoints to load.

    Args:
        n_classes: Number of output classes (size of the logit vector).
        lstm_hidden_size: Hidden size of each LSTM direction.
        lstm_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied between LSTM layers (only when
            ``lstm_layers > 1``) and before the final linear layer.
    """

    def __init__(self, n_classes=3, lstm_hidden_size=128, lstm_layers=2, dropout=0.3):
        super().__init__()

        # Pretrained encoder.
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')

        # BitFit: freeze everything in the encoder except bias terms and
        # layer-norm parameters. requires_grad has no effect on inference or on
        # state_dict loading; it only matters if the model is trained further.
        for name, param in self.distilbert.named_parameters():
            param.requires_grad = self._is_bitfit_parameter(name)

        # Bidirectional LSTM over the encoder's token representations.
        self.lstm = nn.LSTM(
            input_size=768,  # DistilBERT hidden size
            hidden_size=lstm_hidden_size,
            num_layers=lstm_layers,
            batch_first=True,
            # nn.LSTM only applies dropout between layers, so it is disabled
            # for a single-layer LSTM.
            dropout=dropout if lstm_layers > 1 else 0,
            bidirectional=True
        )

        self.dropout = nn.Dropout(dropout)

        # Forward and backward final states are concatenated, hence "* 2".
        self.fc = nn.Linear(lstm_hidden_size * 2, n_classes)

    @staticmethod
    def _is_bitfit_parameter(name):
        """Return True if the parameter called ``name`` stays trainable under BitFit.

        A parameter qualifies when its name contains ``"bias"`` or refers to a
        layer norm. Underscores are ignored for the layer-norm check so that
        both ``LayerNorm`` and ``sa_layer_norm`` / ``output_layer_norm`` style
        names match; a plain ``"layernorm"`` substring test misses the latter.
        """
        normalized = name.lower().replace("_", "")
        return "bias" in name or "layernorm" in normalized

    def forward(self, input_ids, attention_mask):
        """Compute class logits for a batch of tokenized inputs.

        Args:
            input_ids: Token id tensor passed to DistilBERT.
            attention_mask: Attention mask passed to DistilBERT.

        Returns:
            Tensor of unnormalized class scores, one row per batch element.
        """
        encoder_output = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # last_hidden_state: [batch_size, seq_len, 768]
        sequence_output = encoder_output.last_hidden_state

        # The attention mask is only used by DistilBERT; the LSTM consumes
        # every position of the sequence, padding included.
        _, (hidden, _) = self.lstm(sequence_output)

        # The last two entries of ``hidden`` belong to the top LSTM layer:
        # index -2 is the forward direction, index -1 the backward direction.
        hidden_fwd = hidden[-2, :, :]
        hidden_bwd = hidden[-1, :, :]

        hidden_concat = torch.cat((hidden_fwd, hidden_bwd), dim=1)
        hidden_concat = self.dropout(hidden_concat)

        return self.fc(hidden_concat)

print("‚úì Model class defined (BitFit architecture)")

In [None]:
# =============================================================================
# LOAD MODEL WEIGHTS
# =============================================================================
model_path = 'H:/SentimentAnalystSchool/MidtermExam/ModelWeight/best_model_bitfit.pt'

print(f"üì¶ Loading model from: {model_path}")

# Build the architecture first; the checkpoint only supplies the weights.
model = DistilBertLSTMClassifier(n_classes=3)

# SECURITY NOTE: weights_only=False lets torch.load unpickle arbitrary Python
# objects, which can execute code. Only load checkpoints from a trusted source.
checkpoint = torch.load(model_path, map_location=device, weights_only=False)

# The checkpoint may be a wrapper dict holding the weights under a known key,
# or the state_dict itself.
if isinstance(checkpoint, dict):
    print(f"  ‚Üí Checkpoint keys: {list(checkpoint.keys())}")

    if 'model_state_dict' in checkpoint:
        model.load_state_dict(checkpoint['model_state_dict'])
        print("  ‚Üí Loaded from 'model_state_dict' key")
    elif 'state_dict' in checkpoint:
        model.load_state_dict(checkpoint['state_dict'])
        print("  ‚Üí Loaded from 'state_dict' key")
    else:
        # No known wrapper key: try the dict itself as a state_dict.
        try:
            model.load_state_dict(checkpoint)
        except Exception:
            # Report and re-raise. Continuing here would benchmark a model whose
            # LSTM and classifier weights are still randomly initialised.
            print("  ‚Üí ERROR: Cannot load checkpoint")
            raise
        print("  ‚Üí Loaded directly as state_dict")
else:
    # Not a dict: hand it to load_state_dict as-is.
    model.load_state_dict(checkpoint)
    print("  ‚Üí Loaded as state_dict")

model = model.to(device)
model.eval()
print("‚úì Model loaded successfully!")

# Tokenizer matching the pretrained encoder used by the model.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
print("‚úì Tokenizer loaded")

# Label encoder with the order negative=0, neutral=1, positive=2.
# NOTE(review): this presumably has to match the label ids used in training - confirm.
label_encoder = LabelEncoder()
label_encoder.fit(['negative', 'neutral', 'positive'])
print(f"‚úì Label encoder ready: {label_encoder.classes_}")

In [None]:
# =============================================================================
# LOAD DATASET SEMEVAL BALANCED 750
# =============================================================================
data_path = 'H:/SentimentAnalystSchool/MidtermExam/Semeval/semeval_balanced_750.csv'
df_benchmark = pd.read_csv(data_path)

# Quick look at the loaded data: source, size, label balance and first rows.
print(f"üìÅ Dataset: {data_path}")
print(f"üìä Total samples: {df_benchmark.shape[0]}")

print("\nüìà Distribution:")
sentiment_counts = df_benchmark['Sentiment'].value_counts()
print(sentiment_counts)

print("\nüîç Sample data:")
display(df_benchmark.head())

In [None]:
# =============================================================================
# BENCHMARK FUNCTION
# =============================================================================

def benchmark_model(model, tokenizer, df, label_encoder, device, max_len=128):
    """Run the classifier over every row of ``df`` and compute evaluation metrics.

    Rows are scored one at a time (batch size 1). The model input for a row is
    the string ``"[CLS] review: <reviewText> [SEP] aspect: <AspectTerm>[SEP]"``,
    tokenized with ``add_special_tokens=True`` and padded/truncated to
    ``max_len`` tokens.

    Args:
        model: Called as ``model(input_ids, attention_mask)``; its output is
            passed through a softmax over dim 1. Switched to eval mode here.
        tokenizer: Tokenizer exposing ``encode_plus``.
        df: DataFrame with ``reviewText`` and ``Sentiment`` columns. An
            optional ``AspectTerm`` column supplies the aspect; when it is
            missing, ``'general'`` is used for every row.
        label_encoder: Fitted encoder mapping ``Sentiment`` strings to class
            ids and predicted ids back to strings.
        device: Device the input tensors are moved to.
        max_len: Maximum token length used for padding and truncation.

    Returns:
        dict with ``accuracy``, ``f1_macro``, ``f1_weighted``,
        ``precision_macro``, ``recall_macro``, per-class arrays
        (``f1_per_class``, ``precision_per_class``, ``recall_per_class``),
        ``confusion_matrix``, ``predictions`` (class ids), ``pred_labels_text``,
        ``probabilities`` (one array per row), ``elapsed_time`` (seconds) and
        ``samples_per_second``. The per-class arrays and the confusion matrix
        always cover every class known to ``label_encoder``, in encoder order.
    """
    model.eval()

    texts = df['reviewText'].tolist()

    # Use the aspect column when present; otherwise fall back to a fixed aspect.
    if 'AspectTerm' in df.columns:
        aspects = df['AspectTerm'].tolist()
    else:
        aspects = ['general'] * len(texts)
        print("‚ö†Ô∏è No 'AspectTerm' column found, using 'general' as default aspect")

    true_labels = label_encoder.transform(df['Sentiment'].tolist())

    # Pin the label set for per-class metrics and the confusion matrix. Without
    # it scikit-learn drops classes that appear in neither the truth nor the
    # predictions, and callers that index these results by class id break.
    class_ids = list(range(len(label_encoder.classes_)))

    predictions = []
    probabilities = []

    print("=" * 80)
    print("üöÄ BENCHMARK STARTING")
    print("=" * 80)
    print(f"üìä Total samples: {len(texts)}")
    print(f"‚öôÔ∏è Max length: {max_len}")
    print(f"üñ•Ô∏è Device: {device}")
    print(f"üìù Input format: [CLS] review: <text> [SEP] aspect: <aspect> [SEP]")
    print("-" * 80)

    # perf_counter is monotonic and high resolution, unlike the wall clock.
    start_time = time.perf_counter()

    with torch.no_grad():
        for done, (text, aspect) in enumerate(zip(texts, aspects), start=1):
            # NOTE(review): the literal "[CLS]"/"[SEP]" markers are kept even
            # though add_special_tokens=True also adds special tokens; the
            # original comment says this mirrors the training format - confirm
            # against the training notebook before changing it.
            combined_text = "[CLS] review: " + str(text) + " [SEP] aspect: " + str(aspect) + "[SEP]"

            encoding = tokenizer.encode_plus(
                combined_text,
                add_special_tokens=True,
                max_length=max_len,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt'
            )

            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask)
            probs = torch.softmax(outputs, dim=1)
            predicted_class = torch.argmax(probs, dim=1).item()

            predictions.append(predicted_class)
            probabilities.append(probs[0].cpu().numpy())

            # Progress line every 100 samples.
            if done % 100 == 0:
                print(f"  Processed: {done}/{len(texts)} samples...")

    elapsed = time.perf_counter() - start_time
    # Guard the rate against a zero-length interval.
    samples_per_second = len(texts) / elapsed if elapsed > 0 else 0.0
    print(f"\n‚úì Completed in {elapsed:.2f}s ({samples_per_second:.2f} samples/sec)")

    # Map predicted class ids back to their string labels.
    pred_labels_text = label_encoder.inverse_transform(predictions)

    results = {
        'accuracy': accuracy_score(true_labels, predictions),
        'f1_macro': f1_score(true_labels, predictions, average='macro'),
        'f1_weighted': f1_score(true_labels, predictions, average='weighted'),
        'precision_macro': precision_score(true_labels, predictions, average='macro'),
        'recall_macro': recall_score(true_labels, predictions, average='macro'),
        # zero_division=0 scores a class with no predictions or no true samples
        # as 0, which is what the default does after emitting a warning.
        'f1_per_class': f1_score(
            true_labels, predictions, labels=class_ids, average=None, zero_division=0
        ),
        'precision_per_class': precision_score(
            true_labels, predictions, labels=class_ids, average=None, zero_division=0
        ),
        'recall_per_class': recall_score(
            true_labels, predictions, labels=class_ids, average=None, zero_division=0
        ),
        'confusion_matrix': confusion_matrix(true_labels, predictions, labels=class_ids),
        'predictions': predictions,
        'pred_labels_text': pred_labels_text,
        'probabilities': probabilities,
        'elapsed_time': elapsed,
        'samples_per_second': samples_per_second
    }

    return results

print("‚úì Benchmark function defined")
print("üìù Input format: [CLS] review: <text> [SEP] aspect: <aspect> [SEP]")

In [None]:
# =============================================================================
# RUN BENCHMARK
# =============================================================================
results = benchmark_model(
    model=model,
    tokenizer=tokenizer,
    df=df_benchmark,
    label_encoder=label_encoder,
    device=device,
    max_len=128,
)

# =============================================================================
# DISPLAY RESULTS
# =============================================================================
separator = "=" * 80

print("\n" + separator)
print("üìä BENCHMARK RESULTS - SEMEVAL BALANCED 750 (BitFit Model)")
print(separator)
print(f"‚è±Ô∏è Time elapsed: {results['elapsed_time']:.2f}s")
print(f"üöÄ Speed: {results['samples_per_second']:.2f} samples/second")

# Aggregate metrics over the whole dataset.
print("\n" + separator)
print("üìà OVERALL METRICS")
print(separator)
accuracy = results['accuracy']
print(f"  Accuracy:         {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"  F1 Macro:         {results['f1_macro']:.4f}")
print(f"  F1 Weighted:      {results['f1_weighted']:.4f}")
print(f"  Precision Macro:  {results['precision_macro']:.4f}")
print(f"  Recall Macro:     {results['recall_macro']:.4f}")

# One table row per class, in label-encoder order.
print("\n" + separator)
print("üìä PER-CLASS METRICS")
print(separator)
classes = label_encoder.classes_
print(f"{'Class':<15} {'Precision':<12} {'Recall':<12} {'F1-Score':<12}")
print("-" * 51)
for class_idx, class_name in enumerate(classes):
    class_precision = results['precision_per_class'][class_idx]
    class_recall = results['recall_per_class'][class_idx]
    class_f1 = results['f1_per_class'][class_idx]
    print(f"{class_name:<15} {class_precision:.4f}       {class_recall:.4f}       {class_f1:.4f}")

In [None]:
# =============================================================================
# DETAILED CLASSIFICATION REPORT
# =============================================================================
print("=" * 80)
print("üìã CLASSIFICATION REPORT")
print("=" * 80)

true_labels = label_encoder.transform(df_benchmark['Sentiment'].tolist())

# Pin the label set so target_names always lines up with the reported rows.
# Without `labels`, a class missing from both the truth and the predictions
# makes classification_report raise, because the number of observed classes no
# longer matches the number of target names.
print(classification_report(
    true_labels,
    results['predictions'],
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
    digits=4,
    # Score undefined precision/recall as 0 (what the default does after warning).
    zero_division=0,
))

In [None]:
# =============================================================================
# PLOT CONFUSION MATRIX
# =============================================================================
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
class_names = label_encoder.classes_

# Left panel: raw counts.
cm = results['confusion_matrix']
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    annot_kws={"size": 14},
    ax=axes[0]
)
axes[0].set_title('Confusion Matrix (Counts)\nSemEval Balanced 750 - BitFit Model', fontsize=14)
axes[0].set_ylabel('True Label', fontsize=12)
axes[0].set_xlabel('Predicted Label', fontsize=12)

# Right panel: each row divided by its total, i.e. the share of every true
# class that went to each predicted class. A true class with no samples would
# divide by zero and fill its row with NaN, so its divisor is replaced by 1 and
# the row stays at 0.
row_totals = cm.sum(axis=1)[:, np.newaxis]
row_totals[row_totals == 0] = 1
cm_normalized = cm.astype('float') / row_totals
sns.heatmap(
    cm_normalized,
    annot=True,
    fmt='.2%',
    cmap='Greens',
    xticklabels=class_names,
    yticklabels=class_names,
    annot_kws={"size": 14},
    ax=axes[1]
)
axes[1].set_title('Normalized Confusion Matrix (%)\nSemEval Balanced 750 - BitFit Model', fontsize=14)
axes[1].set_ylabel('True Label', fontsize=12)
axes[1].set_xlabel('Predicted Label', fontsize=12)

plt.tight_layout()
plt.show()

# Print the confusion matrix as a plain-text table. Column headers and widths
# come from the label encoder instead of being hard-coded for three classes.
n_classes = len(class_names)
predicted_width = 15 * n_classes
print("\nüìä Confusion Matrix (numeric):")
print(f"{'':15} {'Predicted':^{predicted_width}}")
print(f"{'':15} " + " ".join(f"{name:^15}" for name in class_names))
print("-" * (15 + predicted_width))
for true_class, counts in zip(class_names, cm):
    row = f"{true_class:15}" + "".join(f" {count:^14}" for count in counts)
    print(row)

In [None]:
# =============================================================================
# SAMPLE PREDICTIONS
# =============================================================================
# Attach the predictions to the dataframe and flag the rows the model got right.
df_benchmark['Predicted'] = results['pred_labels_text']
df_benchmark['Correct'] = df_benchmark['Sentiment'] == df_benchmark['Predicted']

print("=" * 80)
print("üîç SAMPLE PREDICTIONS")
print("=" * 80)


def _print_prediction_samples(samples, marker, max_chars=100):
    """Print each sampled row's (truncated) text with its true and predicted label."""
    for _, row in samples.iterrows():
        # Coerce to str so missing (NaN) or non-string review text cannot break
        # slicing and len(); benchmark_model applies the same coercion.
        text = str(row['reviewText'])
        text_display = text[:max_chars] + "..." if len(text) > max_chars else text
        print(f"Text: {text_display}")
        print(f"  {marker} True: {row['Sentiment']} | Predicted: {row['Predicted']}")
        print()


# Draw up to five correct and five incorrect rows, reproducibly.
correct_rows = df_benchmark[df_benchmark['Correct']]
incorrect_rows = df_benchmark[~df_benchmark['Correct']]
correct_samples = correct_rows.sample(min(5, len(correct_rows)), random_state=42)
incorrect_samples = incorrect_rows.sample(min(5, len(incorrect_rows)), random_state=42)

print("\n‚úÖ CORRECT PREDICTIONS:")
print("-" * 80)
_print_prediction_samples(correct_samples, "‚úì")

print("\n‚ùå INCORRECT PREDICTIONS:")
print("-" * 80)
_print_prediction_samples(incorrect_samples, "‚úó")

In [None]:
# =============================================================================
# BENCHMARK SUMMARY
# =============================================================================
# Collect the summary as a list of lines, then print them in order.
n_total = len(df_benchmark)
n_correct = df_benchmark['Correct'].sum()

summary_lines = [
    "=" * 80,
    "üìã BENCHMARK SUMMARY - BitFit Model",
    "=" * 80,
    "üìÅ Dataset:           semeval_balanced_750.csv",
    f"üî¢ Total samples:     {n_total}",
    "üéØ Model:             best_model_bitfit.pt",
    "üîß Architecture:      DistilBERT + BiLSTM (BitFit)",
    f"‚è±Ô∏è Inference time:    {results['elapsed_time']:.2f}s",
    f"üöÄ Speed:             {results['samples_per_second']:.2f} samples/sec",
    "-" * 80,
    f"üìä Accuracy:          {results['accuracy']:.4f} ({results['accuracy']*100:.2f}%)",
    f"üìä F1 Macro:          {results['f1_macro']:.4f}",
    f"üìä F1 Weighted:       {results['f1_weighted']:.4f}",
    f"üìä Precision Macro:   {results['precision_macro']:.4f}",
    f"üìä Recall Macro:      {results['recall_macro']:.4f}",
    "-" * 80,
    f"‚úÖ Correct predictions:   {n_correct}",
    f"‚ùå Incorrect predictions: {n_total - n_correct}",
    "=" * 80,
]
for summary_line in summary_lines:
    print(summary_line)