# Script: sending MMLU questions to the local LLM and to the backend server

In [5]:
import random
import json

In [1]:
from datasets import load_dataset
import requests
import time
import re
import matplotlib.pyplot as plt
# Load the "all" configuration of the "cais/mmlu" dataset.
mmlu_ds = load_dataset("cais/mmlu", "all")
# Keep the "validation" split as a pandas DataFrame; the cells below filter and sample from it.
df = mmlu_ds["validation"].to_pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def convert_num_to_letter(num):
    """Translate a 0-based answer index (0..3) into its option letter.

    Returns 'a', 'b', 'c' or 'd' for 0, 1, 2 or 3 respectively, and None for
    any other value.
    """
    index_to_letter = dict(enumerate("abcd"))
    return index_to_letter.get(num)


In [3]:
# Module-level result holders, each starting out as its own empty list.
response_times, correct_Server, correct_llm = [], [], []

In [4]:
import re

def extract_final_answer(model_response: str,kind:str) -> str | None:
    print(f"{kind}:\n{model_response}\n")
    match = re.search(r'Final answer:\s*([a-dA-D])', model_response)
    if match:
        return match.group(1).lower()
    return None


In [42]:
def ask_local_llm(question: str, timeout: float | None = 120.0):
    """Send one user message to the local LLM HTTP endpoint and return its reply.

    Args:
        question: Text placed in the single "user" message of the request.
        timeout: Seconds to wait for the server before giving up. Pass ``None``
            to wait indefinitely (the behaviour before this parameter existed).

    Returns:
        The decoded JSON body of the response, or ``None`` when the request
        failed, the server answered with an error status, or the body was not
        valid JSON. The failure reason is printed.
    """
    url = "http://127.0.0.1:8013/generate"
    headers = {"Content-Type": "application/json"}
    payload = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question}
                ]
            }
        ]
    }

    try:
        # json= lets requests serialise the payload; timeout keeps a stalled
        # server from hanging the notebook forever.
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException.
        print(f"❌ שגיאה בשליחה לשרת: {e}")
        return None

In [7]:
def send_message_to_backend(request_text: str, timeout: float | None = 120.0):
    """POST a request text to the backend's /api/message/add endpoint.

    The message is always sent on behalf of the fixed user id "demo_user".

    Args:
        request_text: Text sent in the "request" field of the payload.
        timeout: Seconds to wait for the server before giving up. Pass ``None``
            to wait indefinitely (the behaviour before this parameter existed).

    Returns:
        The decoded JSON body of the response as a dict, or ``None`` when the
        request failed, the server answered with an error status, or the body
        was not valid JSON. The failure reason is printed.
    """
    url = "http://localhost:8002/api/message/add"
    headers = {"Content-Type": "application/json"}
    payload = {
        "userId": "demo_user",
        "request": request_text
    }

    try:
        # json= lets requests serialise the payload; timeout keeps a stalled
        # server from hanging the notebook forever.
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()  # returns the response as a dict
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException.
        print(f"❌ שגיאה בשליחה לשרת הראשי: {e}")
        return None

In [61]:
def send_message_to_add_test(user_id: str, request_text: str, timeout: float | None = 120.0):
    """POST a request text to the backend's /api/message/addtest endpoint.

    Args:
        user_id: Value sent in the "userId" field of the payload.
        request_text: Text sent in the "request" field of the payload.
        timeout: Seconds to wait for the server before giving up. Pass ``None``
            to wait indefinitely (the behaviour before this parameter existed).

    Returns:
        The decoded JSON body of the response, or ``None`` when the request
        failed, the server answered with an error status, or the body was not
        valid JSON. The failure reason is printed.
    """
    url = "http://localhost:8002/api/message/addtest"
    headers = {"Content-Type": "application/json"}
    payload = {
        "userId": user_id,
        "request": request_text
    }

    try:
        # json= lets requests serialise the payload; timeout keeps a stalled
        # server from hanging the notebook forever.
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException.
        print(f"❌ שגיאה בשליחה לשרת: {e}")
        return None

In [17]:
def run_evaluation(sample_df):
    """Ask every question in ``sample_df`` to the backend server and the local LLM.

    For each row a server prompt and an LLM prompt are built from the
    'question' and 'choices' columns, both systems are queried, and the letter
    extracted from each reply is compared with the gold letter derived from the
    'answer' column.

    Args:
        sample_df: DataFrame with 'question', 'choices' (four options) and
            'answer' (0-based index of the correct option) columns.

    Returns:
        A tuple ``(response_times, correct_flags_s, correct_mlm_flags)``:
        the server round-trip time in seconds, whether the server answer was
        correct, and whether the LLM answer was correct. The three lists always
        have the same length; a question for which either call fails is
        reported and left out of all three.
    """
    response_times = []
    correct_flags_s = []
    correct_mlm_flags = []
    for index, row in sample_df.iterrows():
        choices = row["choices"].tolist() if not isinstance(row["choices"], list) else row["choices"]
        gold_letter = convert_num_to_letter(int(row["answer"]))

        prompt_M = f"""
    Question: {row['question']}
    Answers:
    a. {choices[0]}
    b. {choices[1]}
    c. {choices[2]}
    d. {choices[3]}

    Answer the question by selecting one option from each of the answers. 
    Briefly explain your reasoning, and then give a final answer of the letter of the selected answer in this exact format:
    Final answer: <letter>
    """

        prompt_S =f"""Which of the following answers answers the question?
    Question: {row['question']}
    Answers:
    a. {choices[0]}
    b. {choices[1]}
    c. {choices[2]}
    d. {choices[3]}

    Answer the in this exact format:
    Final answer: <letter>
    """

        try:
            # perf_counter is monotonic, so the measured duration cannot be
            # distorted by system clock adjustments.
            start_time = time.perf_counter()
            response = send_message_to_backend(prompt_S)['answer']
            duration = time.perf_counter() - start_time
            model_output = ask_local_llm(prompt_M)['text']

            pred_s = extract_final_answer(response, "Server")
            pred_m = extract_final_answer(model_output, "LLM")
        except Exception as e:
            # Best effort: report the failure and move on. Nothing has been
            # recorded yet for this question, so the result lists stay aligned.
            print(f"[{index}] ❌ error: {e}")
            continue

        # A missing prediction always counts as wrong, for each system independently.
        server_correct = bool(pred_s) and pred_s == gold_letter
        llm_correct = bool(pred_m) and pred_m == gold_letter

        # Record all three results together so the lists never drift apart.
        response_times.append(duration)
        correct_flags_s.append(server_correct)
        correct_mlm_flags.append(llm_correct)
        print(f"[{index}] \nServer: {pred_s or '?'} \nLLM:{pred_m or '?'} \n---excpected: {gold_letter} in {duration:.2f}s  \n")

    return response_times, correct_flags_s , correct_mlm_flags


In [20]:
import matplotlib.pyplot as plt
import numpy as np

def summarize_and_plot(response_times, correct_flags,correct_mlm_flags):
    """Print accuracy and response-time statistics for one evaluation run.

    Despite its name, this function only prints; it draws no plot.

    Args:
        response_times: Server response times in seconds, one per question.
        correct_flags: Booleans, True where the server answer was correct.
        correct_mlm_flags: Booleans, True where the local LLM answer was correct.

    Returns:
        None. Empty inputs are reported as "n/a" instead of raising
        ZeroDivisionError.
    """

    def _accuracy_text(flags):
        # Percentage of True flags, or a placeholder when nothing was recorded.
        if len(flags) == 0:
            return "n/a (no answers recorded)"
        return f"{100 * sum(flags) / len(flags):.2f}%"

    print("\n--- Summary ---")
    print(f"Total questions: {len(response_times)}")
    print(f"Accuracy: {_accuracy_text(correct_flags)}")
    if len(response_times) > 0:
        # Timing statistics are only meaningful with at least one measurement.
        avg_time = np.mean(response_times)
        median_time = np.median(response_times)
        std_time = np.std(response_times)
        print(f"Response time: mean {avg_time:.2f}s, median {median_time:.2f}s, std {std_time:.2f}s")

    print("\n----Model LLM---")
    print(f"Accuracy: {_accuracy_text(correct_mlm_flags)}")


In [49]:
# Distinct values of the 'subject' column; the bare name below displays them as the cell output.
subjects = df['subject'].unique()
subjects


array(['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics',
       'clinical_knowledge', 'college_biology', 'college_chemistry',
       'college_computer_science', 'college_mathematics',
       'college_medicine', 'college_physics', 'computer_security',
       'conceptual_physics', 'econometrics', 'electrical_engineering',
       'elementary_mathematics', 'formal_logic', 'global_facts',
       'high_school_biology', 'high_school_chemistry',
       'high_school_computer_science', 'high_school_european_history',
       'high_school_geography', 'high_school_government_and_politics',
       'high_school_macroeconomics', 'high_school_mathematics',
       'high_school_microeconomics', 'high_school_physics',
       'high_school_psychology', 'high_school_statistics',
       'high_school_us_history', 'high_school_world_history',
       'human_aging', 'human_sexuality', 'international_law',
       'jurisprudence', 'logical_fallacies', 'machine_learning',
       'management', 'marketing

In [51]:
# Draw 2 rows from the 'high_school_computer_science' subject, with a fixed random_state for the draw.
sample_df = df[df['subject'] == 'high_school_computer_science'].sample(n=2, random_state=42)
# Disabled: evaluation of this sample (the machine_learning sample is evaluated in the last cell instead).
# response_times, correct_flags ,correct_flags_LLM = run_evaluation(sample_df)

In [54]:
def get_response_process_asking(row) -> None:
    """Send one question row to the backend test endpoint and print the exchange.

    Args:
        row: A single dataset row indexable by column name, providing
            'question', 'choices' (four options) and 'answer' (0-based index
            of the correct option), e.g. ``sample_df.iloc[1]``. It is not a
            plain string, which is what the previous annotation claimed.

    Prints the gold letter, the question, the choices and the backend reply as
    indented JSON ("null" when the request failed). Returns nothing.
    """
    raw_choices = row["choices"]
    # Accept both a plain list and an array-like that offers .tolist().
    choices = raw_choices if isinstance(raw_choices, list) else raw_choices.tolist()
    gold_letter = convert_num_to_letter(int(row["answer"]))
    prompt = f"""
    Question: {row['question']}
    Answers:
    a. {choices[0]}
    b. {choices[1]}
    c. {choices[2]}
    d. {choices[3]}

    """
    output = send_message_to_add_test("demo_user", prompt)
    print("Gold answer:",gold_letter)
    print("Q:",row['question'])
    print("Choices:")
    for c in choices:
        print(c)
    # ensure_ascii=False keeps non-ASCII text in the reply readable.
    print(json.dumps(output, indent=2, ensure_ascii=False))
    

In [46]:
def get_response_llm(row) -> None:
    """Send one question row to the local LLM and print the exchange.

    Args:
        row: A single dataset row indexable by column name, providing
            'question', 'choices' (four options) and 'answer' (0-based index
            of the correct option), e.g. ``sample_df.iloc[1]``. It is not a
            plain string, which is what the previous annotation claimed.

    Prints the gold letter, the question, the choices and the LLM reply as
    indented JSON ("null" when the request failed). Returns nothing.
    """
    raw_choices = row["choices"]
    # Accept both a plain list and an array-like that offers .tolist().
    choices = raw_choices if isinstance(raw_choices, list) else raw_choices.tolist()
    gold_letter = convert_num_to_letter(int(row["answer"]))
    prompt = f"""Which of the following answers answers the question?
    Question: {row['question']}
    Answers:
    a. {choices[0]}
    b. {choices[1]}
    c. {choices[2]}
    d. {choices[3]}

    Respond only answer with the letter (a, b, c, d) of the correct answer. 
    Do not write words or explanations - only the appropriate letter.
    """
    output = ask_local_llm(prompt)
    print("Gold answer:",gold_letter)
    print("Q:",row['question'])
    print("Choices:")
    for c in choices:
        print(c)
    # ensure_ascii=False keeps non-ASCII text in the reply readable.
    print(json.dumps(output, indent=2, ensure_ascii=False))
    

In [52]:
# Display the second row (position 1) of the current sample.
sample_df.iloc[1]

question    Which of the following best describes a Web se...
subject                          high_school_computer_science
choices     [A computer system that delivers Web pages to ...
answer                                                      0
Name: 345, dtype: object

תשובה נכונה

In [62]:
# Send the second sampled row to the backend "addtest" endpoint and print the exchange.
get_response_process_asking(sample_df.iloc[1])

❌ שגיאה בשליחה לשרת: 500 Server Error: Internal Server Error for url: http://localhost:8002/api/message/addtest
Gold answer: a
Q: Which of the following best describes a Web server?
Choices:
A computer system that delivers Web pages to clients
A computer system that determines the shortest path between two computers over the Internet
A computer system running software that provides a user-friendly interface for creating Web pages
A computer system that translates domain names to IP addresses
null


תשובה שגויה מהמודל הקטן

In [47]:
# Send the second sampled row to the local LLM and print the exchange.
get_response_llm(sample_df.iloc[1])

Gold answer: b
Q: Which of the following guidelines is applicable to initialization of the weight vector in a fully connected neural network.
Choices:
Should not set it to zero since otherwise it will cause overfitting
Should not set it to zero since otherwise (stochastic) gradient descent will explore a very small space
Should set it to zero since otherwise it causes a bias
Should set it to zero in order to preserve symmetry across all neurons
{
  "text": "a",
  "tokens": 3,
  "duration_s": 0.36277174949645996
}


In [16]:
# NOTE(review): correct_flags and correct_flags_LLM are assigned only by the
# run_evaluation call in the last cell (the earlier call is commented out), so on a
# top-to-bottom run this cell raises NameError. Run the evaluation cell first or
# move this cell after it.
summarize_and_plot(response_times, correct_flags,correct_flags_LLM)


--- Summary ---
Total questions: 2
Accuracy: 0.00%

----Model LLM---
Accuracy: 0.00%


In [None]:
# Re-draw the sample from the 'machine_learning' subject (2 rows, fixed random_state);
# this replaces the sample_df used by the cells above.
sample_df = df[df['subject'] == 'machine_learning'].sample(n=2, random_state=42)
# Query both systems for every sampled row; the three results feed summarize_and_plot.
response_times, correct_flags ,correct_flags_LLM = run_evaluation(sample_df)