In [18]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer
from time import time
import torch

# Example list of models
models = {
    "GPT2": "gpt2",
    "T5": "t5-small",  # T5 model needs AutoModelForSeq2SeqLM
    "BART": "facebook/bart-large"  # BART can be used with AutoModelForSeq2SeqLM as well
}

# Test prompts for generating responses
test_prompts = [
    "What's the weather like today?",
    "Tell me a joke.",
    "How are you?",
    "Who won the world series in 2020?",
    "What's the capital of France?"
]

# Function to evaluate each model based on given criteria
def evaluate_model(model_name, model_id):
    # Dynamically choose the correct model class
    if "t5" in model_id or "bart" in model_id:  # For T5 and BART
        model_class = AutoModelForSeq2SeqLM
    else:  # For causal language models like GPT-2, GPT-3, etc.
        model_class = AutoModelForCausalLM

    # Load the model and tokenizer
    model = model_class.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Start the evaluation for this model
    start_time = time()

    # Generate responses for each test prompt and measure time
    total_time = 0
    responses = []
    for prompt in test_prompts:
        inputs = tokenizer(prompt, return_tensors="pt")
        with torch.no_grad():
            outputs = model.generate(inputs['input_ids'], max_length=50)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        responses.append(response)
        total_time += time() - start_time  # measure inference time

    # Measure inference time (seconds per response)
    avg_response_time = total_time / len(test_prompts)

    # Dummy metrics for the sake of this example
    # In a real scenario, you should evaluate the accuracy of the model on a real dataset.
    accuracy = np.random.uniform(0.85, 0.95)  # Simulated accuracy for demonstration

    # Calculate F1 score based on dummy predicted and true labels
    true_labels = [
        "I'm good, thank you!", "My name is GPT.", "Why don't scientists trust atoms? Because they make up everything!",
        "42", "AI is the simulation of human intelligence in machines."
    ]
    predicted_labels = responses

    f1 = f1_score(true_labels, predicted_labels, average='weighted', zero_division=1)

    # Simulate memory usage (MB) based on model size (just for demonstration)
    model_size = model.num_parameters() / 1e6  # Convert from parameters to MB approximation
    memory_usage = model_size

    # Simulate diversity score based on the variety of responses
    diversity_score = np.random.uniform(0.7, 1.0)  # This would require a more complex evaluation

    return accuracy, avg_response_time, memory_usage, f1, diversity_score

# Initialize lists to store the scores for each model
accuracy_scores = []
speed_scores = []
memory_scores = []
f1_scores = []
diversity_scores = []

# Loop over each model and collect their performance metrics
for model_name, model_id in models.items():
    accuracy, speed, memory, f1, diversity = evaluate_model(model_name, model_id)
    accuracy_scores.append(accuracy)
    speed_scores.append(speed)
    memory_scores.append(memory)
    f1_scores.append(f1)
    diversity_scores.append(diversity)

# Collect the results in a table format
results_df = pd.DataFrame({
    "Model": list(models.keys()),
    "Accuracy": accuracy_scores,
    "Response Speed (s)": speed_scores,
    "Memory Usage (MB)": memory_scores,
    "F1 Score": f1_scores,
    "Diversity Score": diversity_scores
})

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

In [19]:
# Normalize the scores (apply MinMax scaling between 0 and 1)
scaler = MinMaxScaler()
normalized_df = results_df.copy()
normalized_df.iloc[:, 1:] = scaler.fit_transform(results_df.iloc[:, 1:])

# Define the ideal and negative ideal solutions
ideal_solution = normalized_df.drop("Model", axis=1).max()  # Ideal solution (max value for each metric)
negative_ideal_solution = normalized_df.drop("Model", axis=1).min()  # Negative ideal solution (min value for each metric)

# Convert ideal_solution and negative_ideal_solution to numpy arrays for broadcasting
ideal_solution = ideal_solution.values
negative_ideal_solution = negative_ideal_solution.values

# Calculate the Euclidean distance to the ideal and negative ideal solutions
distance_to_ideal = np.sqrt(((normalized_df.drop("Model", axis=1).values - ideal_solution) ** 2).sum(axis=1))
distance_to_negative_ideal = np.sqrt(((normalized_df.drop("Model", axis=1).values - negative_ideal_solution) ** 2).sum(axis=1))

# Calculate the TOPSIS score
topsis_score = distance_to_negative_ideal / (distance_to_ideal + distance_to_negative_ideal)

# Add the TOPSIS score to the DataFrame
normalized_df["TOPSIS Score"] = topsis_score

# Rank the models based on the TOPSIS score
normalized_df["Rank"] = normalized_df["TOPSIS Score"].rank(ascending=False)

# Display the final results with ranking
print(normalized_df[["Model", "TOPSIS Score", "Rank"]])

  Model  TOPSIS Score  Rank
0  GPT2      0.525058   2.0
1    T5      0.154582   3.0
2  BART      0.581161   1.0
