In [None]:
# Attach Google Drive to the Colab runtime so files stored under MyDrive
# can be read through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import pandas as pd

# Read the dataset (a CSV file on the mounted Google Drive) into a DataFrame.
file_path = '/content/drive/MyDrive/NLP_PROJECT/DS_tests_with_difficulty.csv'
df = pd.read_csv(file_path)

# Keep only the rows whose question_type is "Open", renumber them from 0 and
# replace every missing value with an empty string.
# NOTE(review): fillna("") is applied to all columns, so a numeric column with
# gaps (e.g. "Difficulty") would end up holding "" next to numbers - confirm
# the label column has no missing values.
is_open_question = df["question_type"] == "Open"
open_df = df[is_open_question].reset_index(drop=True).fillna("")

In [None]:
# OMRI Experiment with LLaMA 2 7B in Google Colab

# 1. Install Required Libraries
!pip install -q transformers accelerate datasets scikit-learn matplotlib

# 2. Import the libraries used for evaluation, plotting and model loading
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

In [None]:
# Authenticate this runtime with the Hugging Face Hub before downloading the model.
# NOTE(review): login() is called without arguments, so it presumably asks for
# an access token interactively - confirm; never paste a token into the notebook.
from huggingface_hub import login

login()

In [None]:
# 3. Load the LLaMA 2 7B chat checkpoint from the Hugging Face Hub
model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Weights are loaded in float16; device placement is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Text-generation pipeline around the model; every call may emit up to 512 new tokens.
llama_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
)


In [None]:
# Prompt sent to the model for every question. It contains no worked examples:
# only the instructions plus two str.format placeholders, {question} and
# {answer}. The text deliberately ends with the "Estimated Difficulty:" label,
# which is the same label the response parser searches for, so the number is
# expected to follow it.
PROMPT_TEMPLATE = """
You are an expert in question difficulty estimation, in the field of data structures.
Estimate the difficulty of the new question based on a semantic and technical analysis of the question and its' answer.
Please estimate the difficulty of the question on a scale from 0 (very easy) to 1 (very hard), rounded to 3 decimal points.

New Question:
"{question}"
New Answer:
"{answer}"

Estimated Difficulty:
"""

In [None]:
import re

def extract_estimated_difficulty(text):
    """Pull the numeric difficulty estimate out of a model response.

    Searches for the first "Estimated Difficulty:" label that is followed
    (after optional whitespace, including newlines) by a number.

    Args:
        text: The string to search, e.g. the prompt plus the model completion.

    Returns:
        The number as a float, or None when no label is followed by a number.
        The value is returned as written; it is not checked against the
        0-1 scale.
    """
    # Capture a well-formed decimal only: digits with an optional fractional
    # part, or a bare fraction such as ".5". The previous character class
    # "[0-9.]+" also swallowed trailing punctuation ("0.75.") or dots alone
    # ("..."), which made float() raise ValueError on ordinary model output.
    match = re.search(r"Estimated Difficulty:\s*(\d+(?:\.\d+)?|\.\d+)", text)
    if match:
        return float(match.group(1))
    return None

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

# Parallel lists: entry k of each list belongs to the k-th evaluated question.
true_difficulties = []
estimated_difficulties = []

# Seed the torch RNG so that re-running this cell repeats the same generations.
# NOTE(review): only matters if the model's generation settings sample tokens -
# confirm against the checkpoint's generation config.
torch.manual_seed(42)

# Iterate through all rows (or a sample if you want)
sample_df = open_df.reset_index(drop=True)
n_questions = len(sample_df)

for idx, row in sample_df.iterrows():
    question = row["question_translated_to_English"]
    answer = row["answer_translated_to_English"]
    true_diff = row["Difficulty"]

    # Build a simple prompt without any external database
    prompt = PROMPT_TEMPLATE.format(question=question, answer=answer)

    # Progress indicator; idx runs from 0 because sample_df was re-indexed above.
    print(f"Processing question {idx + 1}/{n_questions}")

    # Send to LLaMA
    # NOTE(review): 'generated_text' presumably contains the prompt followed by
    # the completion (pipeline default) - the parser relies on the label that
    # ends the prompt, so do not switch to completion-only output without
    # adapting it.
    response = llama_pipeline(prompt)[0]['generated_text']

    # None when the response holds no parseable number; kept in the list so
    # both lists stay aligned row by row.
    est_diff = extract_estimated_difficulty(response)

    true_difficulties.append(true_diff)
    estimated_difficulties.append(est_diff)


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

# Ground-truth difficulty labels collected during the evaluation loop.
true_vals = np.array(true_difficulties)

# Responses without a parseable estimate are stored as None. They are scored
# as 0 so every question stays in the evaluation; the count is reported below
# because these imputed values pull the error metrics towards "very easy".
n_unparsed = sum(v is None for v in estimated_difficulties)
est_vals = np.array([v if v is not None else 0 for v in estimated_difficulties])

mse = mean_squared_error(true_vals, est_vals)
mae = mean_absolute_error(true_vals, est_vals)
rmse = sqrt(mse)

print("\nEvaluation Metrics for QDE with LLaMA 2 7B:")
print(f"  MSE  = {round(mse, 4)}")
print(f"  RMSE = {round(rmse, 4)}")
print(f"  MAE  = {round(mae, 4)}")
if n_unparsed:
    # Make the imputation visible instead of silently folding it into the scores.
    print(f"  Note: {n_unparsed} of {len(estimated_difficulties)} responses had no parseable estimate and were scored as 0")