In [None]:
#@title Evaluation
!pip install datasets
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk

# Download necessary NLTK data
# (tokenizer resources; nltk.word_tokenize is used later for the BLEU computation)
nltk.download('punkt')
nltk.download('punkt_tab')

# Check for GPU availability
# `device` is a module-level global read by the helper functions defined below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the model and tokenizer from Hugging Face
# Both are module-level globals; the model is moved to `device` once here.
model_name = "iZELX1/CodePath"
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the dataset
# Evaluation uses rows 16000-19999 (4000 examples) of the 'train' split.
# NOTE(review): the inline comment says this range matches the one used in
# training; if these rows were trained on, the scores below measure fit on
# seen data rather than generalization -- confirm which rows were held out.
dataset = load_dataset("iZELX1/Comsci-Concepts-25k")
eval_dataset = dataset['train'].select(range(16000, 20000))  # Using the same range as in the training

def generate_response(input_text, *, strip_prompt=True):
    """Generate the model's reply to ``input_text``.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        input_text: Prompt string; it is tokenized with truncation to 512
            tokens before generation.
        strip_prompt: When true (the default), return only the newly
            generated continuation. When false, return the decoded prompt
            followed by the continuation, i.e. the raw ``generate`` output.

    Returns:
        The decoded text with special tokens removed.
    """
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50, num_return_sequences=1)

    sequence = outputs[0]
    if strip_prompt:
        # A causal LM returns the prompt ids followed by the new tokens.
        # Dropping the prompt keeps it out of downstream BLEU scores,
        # substring checks and response-length statistics.
        prompt_length = inputs["input_ids"].shape[1]
        sequence = sequence[prompt_length:]
    return tokenizer.decode(sequence, skip_special_tokens=True)

def evaluate_model(dataset):
    """Generate a response for every example and collect labels and BLEU.

    Each example must provide ``'input'`` and ``'output'`` string fields.
    The prompt is ``"Human: {input}\\nAI:"`` and the reply comes from
    ``generate_response``.

    Args:
        dataset: Iterable of example mappings.

    Returns:
        A ``(true_labels, predicted_labels, bleu_scores)`` tuple of three
        lists with one entry per example. Labels are 0/1 ints; BLEU scores
        are sentence-level floats smoothed with NLTK's ``method1``.
    """
    true_labels = []
    predicted_labels = []
    bleu_scores = []

    # The smoothing callable does not change between examples, so build it
    # once instead of constructing a SmoothingFunction per iteration.
    smoothing = SmoothingFunction().method1

    for example in tqdm(dataset):
        input_text = f"Human: {example['input']}\nAI:"
        response = generate_response(input_text)

        # Simple classification based on response content.
        # NOTE(review): the "true" label is 1 only when the reference answer
        # appears verbatim inside the prompt, so it never depends on the
        # model -- confirm this is the intended ground truth.
        true_label = 1 if example['output'] in example['input'] else 0
        pred_label = 1 if example['output'] in response else 0

        true_labels.append(true_label)
        predicted_labels.append(pred_label)

        # Sentence-level BLEU of the response against the single reference.
        reference = nltk.word_tokenize(example['output'])
        candidate = nltk.word_tokenize(response)
        bleu_scores.append(
            sentence_bleu([reference], candidate, smoothing_function=smoothing)
        )

    return true_labels, predicted_labels, bleu_scores

# Run generation over the whole evaluation split and collect per-example
# labels and BLEU scores (one model.generate call per example).
print("Evaluating model...")
true_labels, predicted_labels, bleu_scores = evaluate_model(eval_dataset)

# Calculate metrics
# average='binary' reports precision/recall/F1 for the positive class only.
# These module-level names are printed again in the final summary.
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='binary')
recall = recall_score(true_labels, predicted_labels, average='binary')
f1 = f1_score(true_labels, predicted_labels, average='binary')
avg_bleu = np.mean(bleu_scores)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Average BLEU Score: {avg_bleu:.4f}")

# Confusion Matrix
# Counts of (true label, predicted label) pairs drawn as an annotated heatmap;
# rows correspond to true labels and columns to predicted labels.
cm = confusion_matrix(true_labels, predicted_labels)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# ROC Curve
# NOTE(review): predicted_labels holds hard 0/1 decisions rather than
# continuous scores, so roc_curve only has two distinct values to threshold
# on -- confirm that a ROC/AUC built this way is what is wanted.
from sklearn.metrics import roc_curve, auc
fpr, tpr, _ = roc_curve(true_labels, predicted_labels)
roc_auc = auc(fpr, tpr)  # also printed in the final metrics summary

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Dashed diagonal: reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Response Length Distribution
# Length is the whitespace-separated word count of whatever generate_response returns.
# NOTE: this calls generate_response once more for every example in
# eval_dataset, independently of the evaluate_model pass, so the full
# generation cost is paid a second time here.
response_lengths = [len(generate_response(f"Human: {ex['input']}\nAI:").split()) for ex in tqdm(eval_dataset)]
plt.figure(figsize=(10, 6))
plt.hist(response_lengths, bins=30, edgecolor='black')
plt.title('Distribution of Response Lengths')
plt.xlabel('Number of Words')
plt.ylabel('Frequency')
plt.show()

# BLEU Score Distribution
# Histogram of the per-example sentence BLEU scores returned by evaluate_model.
plt.figure(figsize=(10, 6))
plt.hist(bleu_scores, bins=30, edgecolor='black')
plt.title('Distribution of BLEU Scores')
plt.xlabel('BLEU Score')
plt.ylabel('Frequency')
plt.show()

# Additional Metrics
# Perplexity of the model on the evaluation examples (prompt plus reference answer)
def calculate_perplexity(dataset, model, tokenizer, device, batch_size=4, *, verbose=True):
    """Compute token-level perplexity of ``model`` over ``dataset``.

    Items that are dicts with ``'input'`` and ``'output'`` keys are rendered
    as ``"Human: {input}\\nAI: {output}"``; any other batch is passed to the
    tokenizer unchanged (assumed to be a list of strings). The per-batch
    check looks only at the first item of each batch.

    Padding positions are excluded from the loss, and each batch's mean loss
    is weighted by the number of tokens it actually scored, so the result is
    ``exp`` of the average negative log-likelihood per predicted token.

    Args:
        dataset: Sized iterable of examples.
        model: Causal LM that accepts ``labels`` and returns an object with
            a ``loss`` attribute (mean cross-entropy over non-ignored labels).
        tokenizer: Tokenizer callable returning ``input_ids`` and, normally,
            ``attention_mask``.
        device: Device the tokenized batch is moved to.
        batch_size: Number of examples per forward pass.
        verbose: When true (the default), print the dataset and per-batch
            debug information.

    Returns:
        The perplexity as a float, or ``nan`` when no token could be scored
        (for example, an empty dataset).
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    if verbose:
        print(f"Type of dataset: {type(dataset)}")
        print(f"Length of dataset: {len(dataset)}")

    # Materialize so the data can be sliced into batches by index.
    dataset_list = list(dataset)

    with torch.no_grad():
        for i in tqdm(range(0, len(dataset_list), batch_size), desc="Calculating perplexity"):
            batch = dataset_list[i:i + batch_size]
            first = batch[0]

            if verbose:
                print(f"Batch size: {len(batch)}")
                print(f"Type of first item in batch: {type(first)}")
                print(f"Keys in first item: {first.keys() if isinstance(first, dict) else 'Not a dictionary'}")

            # Dict examples are rendered into the chat-style text; anything
            # else is assumed to already be a list of strings.
            if isinstance(first, dict) and 'input' in first and 'output' in first:
                texts = [f"Human: {ex['input']}\nAI: {ex['output']}" for ex in batch]
            else:
                texts = batch

            inputs = tokenizer(
                texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            ).to(device)

            # Mask padding with -100 (the ignore index of the LM loss) so pad
            # tokens do not contribute to the loss.
            labels = inputs["input_ids"].clone()
            attention_mask = inputs.get("attention_mask")
            if attention_mask is not None:
                labels[attention_mask == 0] = -100

            # The model shifts labels by one internally, so the tokens that
            # are scored are the non-ignored labels from position 1 onward.
            scored_tokens = int((labels[:, 1:] != -100).sum().item())
            if scored_tokens == 0:
                continue

            outputs = model(**inputs, labels=labels)
            # outputs.loss is a mean over scored tokens; re-weight by their
            # count so batches of different lengths are combined correctly.
            total_loss += outputs.loss.item() * scored_tokens
            total_tokens += scored_tokens

    if total_tokens == 0:
        return float("nan")

    avg_loss = total_loss / total_tokens
    perplexity = torch.exp(torch.tensor(avg_loss))
    return perplexity.item()

# Calculate perplexity
# Uses the default batch_size of calculate_perplexity; the result is kept in
# the module-level `perplexity` for the final metrics summary.
print("Calculating perplexity...")
perplexity = calculate_perplexity(eval_dataset, model, tokenizer, device)
print(f"Perplexity: {perplexity:.4f}")

# Token-level accuracy
def calculate_token_accuracy(dataset, max_length=512):
    """Compute teacher-forced next-token accuracy on the reference answers.

    For each example the prompt ``"Human: {input}\\nAI:"`` and the reference
    answer are tokenized separately and concatenated. The model is run once
    over the combined sequence, and its greedy prediction at each position is
    compared with the reference token that actually follows. Only answer
    tokens are scored; prompt tokens and padding never enter the count.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        dataset: Iterable of mappings with ``'input'`` and ``'output'``
            string fields.
        max_length: Cap on the combined prompt + answer length in tokens.
            The answer is trimmed to fit; an example whose prompt already
            fills the budget is skipped.

    Returns:
        Fraction of reference-answer tokens predicted correctly, or ``0.0``
        when no token could be scored.
    """
    total_correct = 0
    total_tokens = 0

    for example in tqdm(dataset):
        input_text = f"Human: {example['input']}\nAI:"
        # Leading space mirrors the "AI: {output}" layout used by the
        # perplexity calculation in this file.
        # NOTE(review): confirm this matches the format the model was trained on.
        target_text = f" {example['output']}"

        prompt_ids = tokenizer(
            input_text, return_tensors="pt", truncation=True, max_length=max_length
        )['input_ids']
        # No special tokens here: the answer is a continuation of the prompt,
        # not the start of a new sequence.
        target_ids = tokenizer(
            target_text,
            return_tensors="pt",
            add_special_tokens=False,
            truncation=True,
            max_length=max_length,
        )['input_ids']

        prompt_len = prompt_ids.size(1)
        # Trim the answer so the combined sequence stays within max_length.
        target_ids = target_ids[:, :max_length - prompt_len]
        target_len = target_ids.size(1)
        if prompt_len == 0 or target_len == 0:
            continue

        full_ids = torch.cat([prompt_ids, target_ids], dim=1).to(device)

        with torch.no_grad():
            logits = model(input_ids=full_ids).logits

        # The logits at position i predict token i + 1, so the predictions for
        # the answer tokens start at the last prompt position and stop one
        # short of the end.
        predictions = torch.argmax(logits[:, prompt_len - 1:-1, :], dim=-1)
        total_correct += (predictions == full_ids[:, prompt_len:]).sum().item()
        total_tokens += target_len

    if total_tokens == 0:
        return 0.0
    return total_correct / total_tokens

# Calculate token-level accuracy
# One forward pass per example; the result is kept in the module-level
# `token_accuracy` for the final metrics summary.
print("Calculating token-level accuracy...")
token_accuracy = calculate_token_accuracy(eval_dataset)
print(f"Token-level Accuracy: {token_accuracy:.4f}")

# Print all metrics
# Consolidated summary of the values computed earlier in the script.
print("\nFinal Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Average BLEU Score: {avg_bleu:.4f}")
print(f"Perplexity: {perplexity:.4f}")
print(f"Token-level Accuracy: {token_accuracy:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")

# BLEU Score percentiles
# Quartiles of the per-example BLEU scores.
bleu_percentiles = np.percentile(bleu_scores, [25, 50, 75])
print("\nBLEU Score Percentiles:")
print(f"25th Percentile: {bleu_percentiles[0]:.4f}")
print(f"50th Percentile (Median): {bleu_percentiles[1]:.4f}")
print(f"75th Percentile: {bleu_percentiles[2]:.4f}")

# GPU Memory Management
# Release PyTorch's unused cached blocks, then report usage on device 0 in MB.
# The "Cached" figure is torch.cuda.memory_reserved.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(f"\nGPU Memory Usage:")
    print(f"Allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
    print(f"Cached: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")