# Text Classification TOPSIS Analysis
This notebook performs sentiment analysis on the IMDB dataset using 6 different Hugging Face models and ranks them using TOPSIS.

In [19]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import pipeline
import time
import os
import torch
from topsis_package import topsis

# Check if GPU is available
device = 0 if torch.cuda.is_available() else -1
device_type = "GPU" if torch.cuda.is_available() else "CPU"
print(f"Using device: {device_type}\n")

Using device: CPU



In [20]:
# Load IMDB dataset
dataset = load_dataset("imdb")
train_data = dataset["train"].select(range(1000))

# Create a balanced test set with both positive and negative samples
test_negative = dataset["test"].filter(lambda x: x["label"] == 0).select(range(100))
test_positive = dataset["test"].filter(lambda x: x["label"] == 1).select(range(100))

# Combine them
from datasets import concatenate_datasets
test_data = concatenate_datasets([test_negative, test_positive])

# Extract test labels and texts
test_texts = test_data["text"]
test_labels = test_data["label"]

# Convert to lists
test_texts = list(test_texts)
test_labels = list(test_labels)

print(f"Loaded {len(test_texts)} test samples")
print(f"Positive samples: {sum(test_labels)}, Negative samples: {len(test_labels) - sum(test_labels)}")


Loaded 200 test samples
Positive samples: 100, Negative samples: 100


In [21]:
# Define models
models = [
    "distilbert-base-uncased-finetuned-sst-2-english",
    "textattack/bert-base-uncased-SST-2",
    "siebert/sentiment-roberta-large-english",
    "cardiffnlp/twitter-roberta-base-sentiment",
    "nlptown/bert-base-multilingual-uncased-sentiment"
]

print(f"Models to test: {len(models)}")
for i, model in enumerate(models, 1):
    print(f"{i}. {model}")


Models to test: 5
1. distilbert-base-uncased-finetuned-sst-2-english
2. textattack/bert-base-uncased-SST-2
3. siebert/sentiment-roberta-large-english
4. cardiffnlp/twitter-roberta-base-sentiment
5. nlptown/bert-base-multilingual-uncased-sentiment


In [22]:
results = []

for model_name in models:
    print(f"\nTesting model: {model_name}")
    
    try:
        # Load pipeline with timeout settings
        classifier = pipeline("sentiment-analysis", model=model_name, device=device, revision="main")
        
        # Measure inference time
        start_time = time.time()
        predictions = classifier(test_texts, truncation=True, max_length=512)
        inference_time = time.time() - start_time
        
        # Process predictions
        pred_labels = []
        for pred in predictions:
            label = pred["label"]
            score = pred.get("score", 0.5)
            # Map different label formats to binary (0=negative, 1=positive)
            if label in ["POSITIVE", "POS", "LABEL_1"]:
                pred_labels.append(1)
            elif label in ["NEGATIVE", "NEG", "LABEL_0"]:
                pred_labels.append(0)
            elif label in ["neutral", "LABEL_2"]:
                # For neutral, map based on score
                pred_labels.append(1 if score > 0.5 else 0)
            else:
                # For multilingual models that output star ratings or other formats
                if "star" in label.lower():
                    stars = int(label.split()[0])
                    pred_labels.append(1 if stars >= 3 else 0)
                else:
                    # Check score for generic binary classification
                    pred_labels.append(1 if score > 0.5 else 0)
        
        # Calculate accuracy
        accuracy = sum([1 for i in range(len(test_labels)) if pred_labels[i] == test_labels[i]]) / len(test_labels)
        
        # Count predictions
        num_positive_pred = sum(pred_labels)
        num_negative_pred = len(pred_labels) - num_positive_pred
        num_positive_actual = sum(test_labels)
        num_negative_actual = len(test_labels) - num_positive_actual
        
        # Calculate F1 score 
        tp = sum(1 for i in range(len(test_labels)) if test_labels[i] == 1 and pred_labels[i] == 1)
        fp = sum(1 for i in range(len(test_labels)) if test_labels[i] == 0 and pred_labels[i] == 1)
        fn = sum(1 for i in range(len(test_labels)) if test_labels[i] == 1 and pred_labels[i] == 0)
        
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        # Get model size
        model_path = classifier.model.config._name_or_path
        model_size = 0
        try:
            # Estimate model size from parameters
            total_params = sum(p.numel() for p in classifier.model.parameters())
            # Approximate size in MB (assuming float32 = 4 bytes per parameter)
            model_size = (total_params * 4) / (1024 * 1024)
        except:
            model_size = 0
        
        results.append({
            "Model": model_name,
            "Accuracy": accuracy,
            "F1": f1_score,
            "Inference_Time": inference_time,
            "Model_Size_MB": model_size
        })
        
        print(f"  Accuracy: {accuracy:.4f}, F1: {f1_score:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")
        print(f"  Pred: {num_positive_pred} pos / {num_negative_pred} neg, Actual: {num_positive_actual} pos / {num_negative_actual} neg")
        print(f"  Time: {inference_time:.2f}s, Size: {model_size:.2f}MB")
        
    except Exception as e:
        print(f"  Error loading model: {str(e)[:100]}...skipping this model")
        continue



Testing model: distilbert-base-uncased-finetuned-sst-2-english


Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

  Accuracy: 0.8750, F1: 0.8731, Precision: 0.8866, Recall: 0.8600
  Pred: 97 pos / 103 neg, Actual: 100 pos / 100 neg
  Time: 30.88s, Size: 255.41MB

Testing model: textattack/bert-base-uncased-SST-2


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

  Accuracy: 0.8800, F1: 0.8846, Precision: 0.8519, Recall: 0.9200
  Pred: 108 pos / 92 neg, Actual: 100 pos / 100 neg
  Time: 61.62s, Size: 417.65MB

Testing model: siebert/sentiment-roberta-large-english


Loading weights:   0%|          | 0/393 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: siebert/sentiment-roberta-large-english
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


  Accuracy: 0.9650, F1: 0.9655, Precision: 0.9515, Recall: 0.9800
  Pred: 103 pos / 97 neg, Actual: 100 pos / 100 neg
  Time: 209.37s, Size: 1355.60MB

Testing model: cardiffnlp/twitter-roberta-base-sentiment


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: cardiffnlp/twitter-roberta-base-sentiment
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


  Accuracy: 0.8700, F1: 0.8774, Precision: 0.8304, Recall: 0.9300
  Pred: 112 pos / 88 neg, Actual: 100 pos / 100 neg
  Time: 61.02s, Size: 475.49MB

Testing model: nlptown/bert-base-multilingual-uncased-sentiment


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

  Accuracy: 0.7850, F1: 0.8186, Precision: 0.7080, Recall: 0.9700
  Pred: 137 pos / 63 neg, Actual: 100 pos / 100 neg
  Time: 65.46s, Size: 638.43MB


In [23]:
# Create DataFrame
df = pd.DataFrame(results)
print("\nInitial Results:")
print(df.to_string(index=False))


Initial Results:
                                           Model  Accuracy       F1  Inference_Time  Model_Size_MB
 distilbert-base-uncased-finetuned-sst-2-english     0.875 0.873096       30.882878     255.413094
              textattack/bert-base-uncased-SST-2     0.880 0.884615       61.624366     417.647469
         siebert/sentiment-roberta-large-english     0.965 0.965517      209.374728    1355.597664
       cardiffnlp/twitter-roberta-base-sentiment     0.870 0.877358       61.022336     475.494152
nlptown/bert-base-multilingual-uncased-sentiment     0.785 0.818565       65.462829     638.428730


In [24]:
# Apply TOPSIS
weights = [0.3, 0.3, 0.2, 0.2]
impacts = ['+', '+', '-', '-']

print(f"\nApplying TOPSIS with:")
print(f"  Weights: {weights}")
print(f"  Criteria: Accuracy (+), F1 (+), Inference_Time (-), Model_Size_MB (-)")
print(f"  Impacts: {impacts}")

# Prepare data for TOPSIS (exclude Model column)
topsis_data = df[["Accuracy", "F1", "Inference_Time", "Model_Size_MB"]].values

# Apply TOPSIS using topsis_package
result = topsis(topsis_data, weights, impacts)

# Add TOPSIS results to DataFrame
df["TOPSIS_Score"] = result.scores
df["Rank"] = result.ranks

print("\nTOPSIS applied successfully")


Applying TOPSIS with:
  Weights: [0.3, 0.3, 0.2, 0.2]
  Criteria: Accuracy (+), F1 (+), Inference_Time (-), Model_Size_MB (-)
  Impacts: ['+', '+', '-', '-']

TOPSIS applied successfully


In [25]:
# Sort by rank
df = df.sort_values("Rank")

# Display final DataFrame
print("\n" + "="*120)
print("FINAL RESULTS - MODELS RANKED BY TOPSIS SCORE")
print("="*120)
print(df.to_string(index=False))
print("="*120)


FINAL RESULTS - MODELS RANKED BY TOPSIS SCORE
                                           Model  Accuracy       F1  Inference_Time  Model_Size_MB  TOPSIS_Score  Rank
 distilbert-base-uncased-finetuned-sst-2-english     0.875 0.873096       30.882878     255.413094      0.911201     1
              textattack/bert-base-uncased-SST-2     0.880 0.884615       61.624366     417.647469      0.820434     2
       cardiffnlp/twitter-roberta-base-sentiment     0.870 0.877358       61.022336     475.494152      0.797925     3
nlptown/bert-base-multilingual-uncased-sentiment     0.785 0.818565       65.462829     638.428730      0.695399     4
         siebert/sentiment-roberta-large-english     0.965 0.965517      209.374728    1355.597664      0.149865     5
