In [None]:
import os
import re
import string

import pandas as pd
import requests
from langchain_core.messages import AIMessage
from langchain_ollama import ChatOllama
from openai import OpenAI
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# OpenAI API Configuration
# Prefer the key from the OPENAI_API_KEY environment variable so the secret
# does not have to be written into the notebook; the original "addkey"
# placeholder is kept as the fallback for anyone who edits the value in place.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "addkey")
client = OpenAI(api_key=OPENAI_API_KEY)

# Fetch the NQ-Open validation split from the Hugging Face Hub as a DataFrame.
splits = {
    # split name -> parquet file path inside the dataset repository
    "validation": "nq_open/validation-00000-of-00001.parquet",
}
df_validation = pd.read_parquet(
    "hf://datasets/google-research-datasets/nq_open/" + splits["validation"]
)

# Parallel lists: questions[i] is paired with the reference answers in answers[i].
questions, answers = (
    df_validation[column].tolist() for column in ("question", "answer")
)

# Load GPT-2
# The same "gpt2" checkpoint id supplies both the tokenizer and the weights;
# after this block `model_gpt2` refers to the loaded model object.
tokenizer_gpt2 = AutoTokenizer.from_pretrained("gpt2")
model_gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
gpt2_pipeline = pipeline(
    "text-generation",
    model=model_gpt2,
    tokenizer=tokenizer_gpt2,
)

# Initialize the LangChain ChatOllama clients (Llama 2, Llama 3.2, DeepSeek-R1).
# All three share temperature=0; they are constructed in the listed order.
llm_llama2, llm_llama3, llm_deepseek = (
    ChatOllama(model=ollama_tag, temperature=0)
    for ollama_tag in ("llama2", "llama3.2", "deepseek-r1")
)

def answer_question_openai(question):
    """Ask gpt-3.5-turbo for a one- or two-word answer to ``question``.

    Args:
        question: Question text; a "?" is appended before it is sent.

    Returns:
        The reply text with surrounding whitespace removed, or an empty
        string when the response carries no text content.
    """
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "user", "content": f"{question}? Answer concisely in one or two words only with no punctuation."}
        ],
        model="gpt-3.5-turbo",
        # Same temperature=0 as the ChatOllama models in this file, so every
        # model is scored under the same decoding setting.
        temperature=0,
    )
    # `content` is optional in the SDK's response type; fall back to "" so a
    # text-less reply is scored as a miss instead of raising AttributeError.
    content = chat_completion.choices[0].message.content
    return (content or "").strip()

def answer_question_llama2(question):
    """Return the Llama 2 client's whitespace-stripped reply to a short-answer prompt."""
    prompt = f"{question}? Answer concisely in one or two words only with no punctuation."
    reply = llm_llama2.invoke([("human", prompt)])
    return reply.content.strip()

def answer_question_llama3(question):
    """Return the Llama 3.2 client's whitespace-stripped reply to a short-answer prompt."""
    prompt = f"{question}? Answer concisely in one or two words only with no punctuation."
    reply = llm_llama3.invoke([("human", prompt)])
    return reply.content.strip()

def answer_question_deepseek(question):
    """Ask the DeepSeek-R1 client for a one- or two-word answer to ``question``.

    Args:
        question: Question text; a "?" is appended before it is sent.

    Returns:
        The final answer text with surrounding whitespace removed. Any
        ``<think>...</think>`` reasoning preamble is dropped.
    """
    # Same wording as the other chat models (and "punctuation" spelled
    # correctly) so all models are compared on an identical instruction.
    messages = [("human", f"{question}? Answer concisely in one or two words only with no punctuation.")]
    reply = llm_deepseek.invoke(messages).content
    # DeepSeek-R1 typically places its reasoning inside <think>...</think>
    # ahead of the answer; keep only what follows the last closing tag.
    # rpartition returns the whole string in slot 2 when the tag is absent.
    return reply.rpartition("</think>")[2].strip()

def answer_question_gpt2(question):
    """Generate a short GPT-2 continuation for a short-answer prompt.

    Args:
        question: Question text; a "?" is appended before prompting.

    Returns:
        Only the newly generated text (prompt excluded), with surrounding
        whitespace removed.
    """
    # Same instruction as the chat models, with "punctuation" spelled correctly.
    prompt = f"{question}? Answer concisely in one or two words only with no punctuation."
    response = gpt2_pipeline(
        prompt,
        # max_length would count the prompt tokens too, leaving long questions
        # with little or no room to generate; budget new tokens explicitly.
        max_new_tokens=20,
        # Greedy decoding keeps the evaluation repeatable, in line with the
        # temperature=0 setting used for the other models in this file.
        do_sample=False,
        # Return only the continuation instead of splitting the prompt off
        # the full text by string matching.
        return_full_text=False,
        truncation=True,
        pad_token_id=tokenizer_gpt2.eos_token_id,
    )
    return response[0]['generated_text'].strip()

# Evaluate with Exact Match Score and Save Predictions
_ARTICLES_RE = re.compile(r"\b(a|an|the)\b")
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def _normalize_answer(text):
    """Lowercase, drop ASCII punctuation and English articles, collapse whitespace."""
    lowered = str(text).lower().translate(_PUNCT_TABLE)
    without_articles = _ARTICLES_RE.sub(" ", lowered)
    return " ".join(without_articles.split())


def exact_match(pred, true):
    """Return 1 if ``pred`` matches any reference answer after normalization, else 0.

    Both sides are normalized (case, punctuation, the articles a/an/the, and
    whitespace) before comparison, so a reply such as "Paris." still matches
    the reference "Paris". Every pair that matched under plain
    strip-and-lowercase comparison still matches.

    Args:
        pred: Predicted answer text, or None when the model returned nothing.
        true: Iterable of acceptable reference answers.

    Returns:
        1 on a match, 0 otherwise (including when ``pred`` is None or
        ``true`` is empty).
    """
    if pred is None:
        return 0
    target = _normalize_answer(pred)
    return int(any(_normalize_answer(t) == target for t in true))

def evaluate_and_save(model_name, model_func, n_samples=50):
    """Score a model on the first ``n_samples`` questions and save its predictions.

    Each question from the module-level ``questions`` list is passed to
    ``model_func``; the reply is scored against the matching entry of
    ``answers`` with ``exact_match``. Per-question rows are written to
    ``outputs/{model_name}_predictions.csv``.

    Args:
        model_name: Label used in the output CSV file name.
        model_func: Callable taking a question string and returning the
            predicted answer string.
        n_samples: Maximum number of questions to evaluate; capped at the
            dataset size.

    Returns:
        Fraction of evaluated questions answered correctly, or 0.0 when no
        question was evaluated.
    """
    # Cap the request at the dataset size so an oversized n_samples cannot
    # raise IndexError; negative values are treated as zero.
    n_eval = max(0, min(n_samples, len(questions)))

    results = []
    correct = 0
    for question, references in zip(questions[:n_eval], answers[:n_eval]):
        pred = model_func(question)
        is_correct = exact_match(pred, references)
        correct += is_correct
        results.append({
            "question": question,
            "true_answers": ", ".join(references),
            "predicted": pred,
            "correct": is_correct
        })

    # Avoid ZeroDivisionError when there is nothing to evaluate.
    accuracy = correct / n_eval if n_eval else 0.0

    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs("outputs", exist_ok=True)
    df_results = pd.DataFrame(results)
    df_results.to_csv(f"outputs/{model_name}_predictions.csv", index=False)
    return accuracy

# Run Evaluation and Save Predictions
# Every model is scored on the same leading slice of the validation set.
nb_Samples = 1000

accuracy_openai = evaluate_and_save(
    model_name="openai_gpt3.5", model_func=answer_question_openai, n_samples=nb_Samples
)
accuracy_llama2 = evaluate_and_save(
    model_name="llama2", model_func=answer_question_llama2, n_samples=nb_Samples
)
accuracy_llama3 = evaluate_and_save(
    model_name="llama3.2", model_func=answer_question_llama3, n_samples=nb_Samples
)
# DeepSeek-R1 run is currently disabled:
# accuracy_deepseek = evaluate_and_save("deepseek-r1", answer_question_deepseek, nb_Samples)
accuracy_gpt2 = evaluate_and_save(
    model_name="gpt2", model_func=answer_question_gpt2, n_samples=nb_Samples
)

# Summary report.
# NOTE(review): the first line is a hard-coded literal, not a detected device;
# confirm it is intended.
print("Device set to use cpu")
print(f"Exact Match Accuracy (OpenAI GPT-3.5): {accuracy_openai:.2f}")
print(f"Exact Match Accuracy (LLaMA 2 via LangChain): {accuracy_llama2:.2f}")
print(f"Exact Match Accuracy (LLaMA 3 via LangChain): {accuracy_llama3:.2f}")
# print(f"Exact Match Accuracy (DeepSeek-R1 via LangChain): {accuracy_deepseek:.2f}")
print(f"Exact Match Accuracy (GPT-2): {accuracy_gpt2:.2f}")

Device set to use cpu


Device set to use cpu
Exact Match Accuracy (OpenAI GPT-3.5): 0.33
Exact Match Accuracy (LLaMA 2 via LangChain): 0.11
Exact Match Accuracy (LLaMA 3 via LangChain): 0.20
Exact Match Accuracy (GPT-2): 0.00
