In [1]:
# Install required libraries
!pip install transformers datasets accelerate

# Import required libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import random

from huggingface_hub import login
# Use your Hugging Face token
login(token="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")

# Step 1: Load the Anatomy Subset of MMLU Dataset
dataset = load_dataset("cais/mmlu", "anatomy", split="test")

# Step 2: Preprocess the Data
def preprocess_mmlu(data):
    """
    Prepares the MMLU dataset by randomly masking one incorrect option.
    """
    processed_data = []
    for example in data:  # Access the dataset directly
        question = example["question"]
        options = example["choices"]
        correct_index = example["answer"]  # Correct answer index
        
        # Mask one incorrect option randomly
        incorrect_indices = [i for i in range(len(options)) if i != correct_index]
        masked_index = random.choice(incorrect_indices)
        masked_options = options.copy()
        masked_options[masked_index] = "[MASK]"
        
        processed_data.append({
            "question": question,
            "options": options,
            "masked_options": masked_options,
            "correct_index": correct_index,
            "masked_index": masked_index,
        })
    return processed_data

processed_data = preprocess_mmlu(dataset)

# Step 3: Load LLaMA 2 Model and Tokenizer
model_name = "meta-llama/Llama-2-7b-chat-hf"  # Replace with 8B when available
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fix the padding token issue
tokenizer.pad_token = tokenizer.eos_token  # Set eos_token as the pad_token

# Load the model without quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # Automatically maps the model to available GPUs
    torch_dtype=torch.float16  # Use mixed precision for better performance
)

# Step 4: Predict the Masked Option
def predict_with_llama(question, masked_options, tokenizer, model):
    """
    Predicts the missing option in a multiple-choice question using LLaMA.
    """
    # Prepare the input prompt
    input_prompt = (
        f"Please fill in the [MASK] in option A based on your benchmark knowledge. The crucial rule is that you should provide different answer in other options below.\n" 
        f"Question: {question}\n"
        f"Options: {' | '.join(masked_options)}\n"
        f"The missing option is:"
    )
    
    # Tokenize the input
    inputs = tokenizer(
        input_prompt,
        return_tensors="pt",
        padding=True,  # Apply padding
        truncation=True,
        return_attention_mask=True
    ).to("cuda")  # Move input to GPU
    
    # Generate prediction
    with torch.no_grad():
        output_ids = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=inputs.input_ids.shape[1] + 20,
            num_return_sequences=1
        )
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    # Extract the predicted option
    predicted_text = output_text.replace(input_prompt, "").strip()
    return predicted_text

# Step 5: Check Substring Match
def substring_match(predicted_output, masked_option):
    """
    Checks if the masked option is a substring of the predicted output.
    """
    return masked_option.lower() in predicted_output.lower()

# Step 6: Evaluate the Model with Substring Matching
def evaluate_llama_with_substring(data, tokenizer, model):
    """
    Evaluates LLaMA on the MMLU dataset by checking if the masked option is in the predicted output.
    """
    correct_predictions = 0
    total_predictions = 0
    
    for example in data:
        question = example["question"]
        masked_options = example["masked_options"]
        masked_option = example["options"][example["masked_index"]]

        temp = example["options"]
        print(f"question: {question}")
        print(f"orignal options: {temp}")
        print(f"masked options: {masked_options}")
        # Predict the masked option
        predicted_output = predict_with_llama(question, masked_options, tokenizer, model)
        print(f"Predicted output: {predicted_output}")
        print(f"Masked option: {masked_option}")
        
        # Check if the masked option is in the predicted output
        if substring_match(predicted_output, masked_option):
            correct_predictions += 1
            print("Its a match!")
        total_predictions += 1
    
    # Calculate accuracy
    accuracy = (correct_predictions / total_predictions) * 100
    return accuracy

# Step 7: Run Evaluation
accuracy = evaluate_llama_with_substring(processed_data[:50], tokenizer, model)
print(f"LLaMA Model Accuracy on Anatomy Subset with Substring Matching: {accuracy:.2f}%")


Defaulting to user installation because normal site-packages is not writeable




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


question: A lesion causing compression of the facial nerve at the stylomastoid foramen will cause ipsilateral
orignal options: ['paralysis of the facial muscles.', 'paralysis of the facial muscles and loss of taste.', 'paralysis of the facial muscles, loss of taste and lacrimation.', 'paralysis of the facial muscles, loss of taste, lacrimation and decreased salivation.']
masked options: ['paralysis of the facial muscles.', 'paralysis of the facial muscles and loss of taste.', 'paralysis of the facial muscles, loss of taste and lacrimation.', '[MASK]']
Predicted output: paralysis of the facial muscles, loss of taste, lacrimation, and
Masked option: paralysis of the facial muscles, loss of taste, lacrimation and decreased salivation.
question: A "dished face" profile is often associated with
orignal options: ['a protruding mandible due to reactivation of the condylar cartilage by acromegaly.', 'a recessive maxilla due to failure of elongation of the cranial base.', 'an enlarged frontal b

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


question: A lesion causing compression of the facial nerve at the stylomastoid foramen will cause ipsilateral
orignal options: ['paralysis of the facial muscles.', 'paralysis of the facial muscles and loss of taste.', 'paralysis of the facial muscles, loss of taste and lacrimation.', 'paralysis of the facial muscles, loss of taste, lacrimation and decreased salivation.']
masked options: ['paralysis of the facial muscles.', '[MASK]', 'paralysis of the facial muscles, loss of taste and lacrimation.', 'paralysis of the facial muscles, loss of taste, lacrimation and decreased salivation.']
Predicted output: Option B: paralysis of the facial muscles, loss of taste and lacrim
Masked option: paralysis of the facial muscles and loss of taste.
question: A "dished face" profile is often associated with
orignal options: ['a protruding mandible due to reactivation of the condylar cartilage by acromegaly.', 'a recessive maxilla due to failure of elongation of the cranial base.', 'an enlarged fronta

In [5]:
# Install required libraries
!pip install transformers datasets accelerate

# Import required libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import random

from huggingface_hub import login
# Use your Hugging Face token
login(token="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")

# Step 1: Load the Anatomy Subset of MMLU Dataset
dataset = load_dataset("cais/mmlu", "anatomy", split="test")

# Step 2: Preprocess the Data
def preprocess_mmlu(data):
    """
    Prepares the MMLU dataset by randomly masking one incorrect option.
    """
    processed_data = []
    for example in data:  # Access the dataset directly
        question = example["question"]
        options = example["choices"]
        correct_index = example["answer"]  # Correct answer index
        
        # Mask one incorrect option randomly
        incorrect_indices = [i for i in range(len(options)) if i != correct_index]
        masked_index = random.choice(incorrect_indices)
        masked_options = options.copy()
        masked_options[masked_index] = "[MASK]"
        
        processed_data.append({
            "question": question,
            "options": options,
            "masked_options": masked_options,
            "correct_index": correct_index,
            "masked_index": masked_index,
        })
    return processed_data

processed_data = preprocess_mmlu(dataset)

# Step 3: Load LLaMA 2 Model and Tokenizer
model_name = "meta-llama/Llama-2-7b-hf"  # Replace with 8B when available
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fix the padding token issue
tokenizer.pad_token = tokenizer.eos_token  # Set eos_token as the pad_token

# Load the model without quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # Automatically maps the model to available GPUs
    torch_dtype=torch.float16  # Use mixed precision for better performance
)

# Step 4: Predict the Masked Option
def predict_with_llama(question, masked_options, tokenizer, model):
    """
    Predicts the missing option in a multiple-choice question using LLaMA.
    """
    # Prepare the input prompt
    input_prompt = (
        f"Please fill in the [MASK] in option A based on your benchmark knowledge. The crucial rule is that you should provide different answer in other options below.\n" 
        f"Question: {question}\n"
        f"Options: {' | '.join(masked_options)}\n"
        f"The missing option is:"
    )
    
    # Tokenize the input
    inputs = tokenizer(
        input_prompt,
        return_tensors="pt",
        padding=True,  # Apply padding
        truncation=True,
        return_attention_mask=True
    ).to("cuda")  # Move input to GPU
    
    # Generate prediction
    with torch.no_grad():
        output_ids = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=inputs.input_ids.shape[1] + 20,
            num_return_sequences=1
        )
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    # Extract the predicted option
    predicted_text = output_text.replace(input_prompt, "").strip()
    return predicted_text

# Step 5: Check Substring Match
def substring_match(predicted_output, masked_option):
    """
    Checks if the masked option is a substring of the predicted output.
    """
    return masked_option.lower() in predicted_output.lower()

# Step 6: Evaluate the Model with Substring Matching
def evaluate_llama_with_substring(data, tokenizer, model):
    """
    Evaluates LLaMA on the MMLU dataset by checking if the masked option is in the predicted output.
    """
    correct_predictions = 0
    total_predictions = 0
    
    for example in data:
        question = example["question"]
        masked_options = example["masked_options"]
        masked_option = example["options"][example["masked_index"]]

        temp = example["options"]
        print(f"question: {question}")
        print(f"orignal options: {temp}")
        print(f"masked options: {masked_options}")
        # Predict the masked option
        predicted_output = predict_with_llama(question, masked_options, tokenizer, model)
        print(f"Predicted output: {predicted_output}")
        print(f"Masked option: {masked_option}")
        
        # Check if the masked option is in the predicted output
        if substring_match(predicted_output, masked_option):
            correct_predictions += 1
            print("Its a match!")
        total_predictions += 1
    
    # Calculate accuracy
    accuracy = (correct_predictions / total_predictions) * 100
    return accuracy

# Step 7: Run Evaluation
accuracy = evaluate_llama_with_substring(processed_data[:50], tokenizer, model)
print(f"LLaMA Model Accuracy on Anatomy Subset with Substring Matching: {accuracy:.2f}%")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


question: A lesion causing compression of the facial nerve at the stylomastoid foramen will cause ipsilateral
orignal options: ['paralysis of the facial muscles.', 'paralysis of the facial muscles and loss of taste.', 'paralysis of the facial muscles, loss of taste and lacrimation.', 'paralysis of the facial muscles, loss of taste, lacrimation and decreased salivation.']
masked options: ['paralysis of the facial muscles.', '[MASK]', 'paralysis of the facial muscles, loss of taste and lacrimation.', 'paralysis of the facial muscles, loss of taste, lacrimation and decreased salivation.']
Predicted output: paralysis of the facial muscles, loss of taste, lacrimation and decreased
Masked option: paralysis of the facial muscles and loss of taste.
question: A "dished face" profile is often associated with
orignal options: ['a protruding mandible due to reactivation of the condylar cartilage by acromegaly.', 'a recessive maxilla due to failure of elongation of the cranial base.', 'an enlarged 