In [None]:
import sys
import importlib
import typing_extensions
import wandb
from datasets import load_dataset
import pandas as pd
import re
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer
from datasets import Dataset
import os
import json
from tqdm import tqdm
import time
from openai import OpenAI

['/scratch/users/njain26/grpo-venv/lib/python3.9/site-packages',
 '/scratch/users/njain26',
 '',
 '/share/software/user/open/py-jupyterlab/4.0.8_py39/lib/python3.9/site-packages',
 '/share/software/user/open/python/3.9.0/lib/python39.zip',
 '/share/software/user/open/python/3.9.0/lib/python3.9',
 '/share/software/user/open/python/3.9.0/lib/python3.9/lib-dynload',
 '/scratch/users/njain26/grpo-venv/lib/python3.9/site-packages']

In [None]:

ds_test = load_dataset("osunlp/TravelPlanner", "test")['test']
ds_train = load_dataset("osunlp/TravelPlanner", "train")['train']
ds_val = load_dataset("osunlp/TravelPlanner", "validation")['validation']

  from .autonotebook import tqdm as notebook_tqdm


In [None]:

data = []
for item in ds_train:
    data.append({
        'org': item.get('org'),
        'dest': item.get('dest'),
        'days': item.get('days'),
        'visiting_city_number': item.get('visiting_city_number'),
        'date': item.get('date'),
        'people_number': item.get('people_number'),
        'local_constraint': item.get('local_constraint'),
        'budget': item.get('budget'),
        'query': item.get('query'),
        'level': item.get('level'),
        'annotated_plan': item.get('annotated_plan'),
        'reference_information': item.get('reference_information')
    })
df_train = pd.DataFrame(data)

print(df_train.columns)

Index(['org', 'dest', 'days', 'visiting_city_number', 'date', 'people_number',
       'local_constraint', 'budget', 'query', 'level', 'annotated_plan',
       'reference_information'],
      dtype='object')


In [None]:
def create_system_prompt(row):
    """
    Creates a system prompt from a row in the travel dataset.
    
    Args:
        row: A pandas DataFrame row containing travel information
        
    Returns:
        A string with the formatted system prompt
    """
    prompt = f"""You are a professional travel agent. Create a detailed travel plan based on the following information:

Origin: {row['org']}
Destination: {row['dest']}
Duration: {row['days']} days
Travel Date: {row['date']}
Number of People: {row.get('people_number', 'Not specified')}
Budget: {row.get('budget', 'Not specified')}

Instructions:
1. Enclose your thinking process in <think></think> tags
2. Enclose your final answer in <answer></answer> tags
3. When checking for flights or hotels, enclose those API calls in <tool></tool> tags

For example:
<think>
I'll need to find suitable flights from {row['org']} to {row['dest']}, then look for accommodations within the budget.
</think>

<tool>
SearchFlights(origin="{row['org']}", destination="{row['dest']}", date="{row['date']}", passengers={row.get('people_number', 1)})
</tool>

<answer>
Here is your personalized travel itinerary...
</answer>

Please provide a comprehensive travel plan that includes transportation and accomidations.
"""
    return prompt

df_train['prompt'] = df_train.apply(create_system_prompt, axis=1)

In [None]:

def reward_function(prompts, completions, **kwargs):
    """
    Evaluates model responses based on adherence to tag syntax and correct tool usage.
    
    Args:
        prompts: List of input prompts
        completions: List of model completions/responses
        **kwargs: Additional keyword arguments
        
    Returns:
        List[float]: Reward scores between 0.0 and 1.0 for each completion
    """
    rewards = []
    
    for i, (prompt, completion) in enumerate(zip(prompts, completions)):
        # Handle different completion formats that TRL might pass
        if isinstance(completion, list) and len(completion) > 0 and isinstance(completion[0], dict):
            completion_content = completion[0]["content"]
        else:
            completion_content = completion
            
        # Extract travel details from the prompt
        travel_details = {}
        
        # Extract origin
        origin_match = re.search(r'from\s+(\w+)', prompt, re.IGNORECASE)
        if origin_match:
            travel_details['org'] = origin_match.group(1)
            
        # Extract destination
        dest_match = re.search(r'to\s+(\w+)', prompt, re.IGNORECASE)
        if dest_match:
            travel_details['dest'] = dest_match.group(1)
            
        # Extract people number
        people_match = re.search(r'with\s+(\d+)\s+people', prompt, re.IGNORECASE)
        if people_match:
            travel_details['people_number'] = people_match.group(1)
            
        # Extract dates
        date_match = re.search(r'starting\s+on\s+(\S+)', prompt, re.IGNORECASE)
        if date_match:
            travel_details['date'] = date_match.group(1)
            
        # Extract days
        days_match = re.search(r'for\s+(\d+)\s+days', prompt, re.IGNORECASE)
        if days_match:
            travel_details['days'] = days_match.group(1)
        
        # Base scoring
        base_score = 0.0
        max_score = 1.0
        
        # Check for basic tag structure
        has_think_tags = bool(re.search(r'<think>.*?</think>', completion_content, re.DOTALL))
        has_answer_tags = bool(re.search(r'<answer>.*?</answer>', completion_content, re.DOTALL))
        
        # Check for tool formats
        flights_pattern = r'<tool>\s*SearchFlights\(.*?\)\s*</tool>'
        hotel_pattern = r'<tool>\s*SearchHotel\(.*?\)\s*</tool>'
        
        has_flights_tool = bool(re.search(flights_pattern, completion_content, re.DOTALL))
        has_hotel_tool = bool(re.search(hotel_pattern, completion_content, re.DOTALL))
        
        # Initialize scores for different components
        think_score = 0.2 if has_think_tags else 0.0
        answer_score = 0.3 if has_answer_tags else 0.0
        tool_score = 0.0
        
        # Evaluate tool usage for flights
        if has_flights_tool:
            # Base score for having the correct flights tool format
            flight_score = 0.25  
            
            # Extract all SearchFlights calls
            search_flights_calls = re.findall(r'<tool>\s*SearchFlights\((.*?)\)\s*</tool>', completion_content, re.DOTALL)
            
            if search_flights_calls:
                # Check for parameter correctness
                param_scores = []
                
                for call in search_flights_calls:
                    param_score = 0.0
                    
                    # Check origin parameter
                    if travel_details.get('org') and re.search(fr'origin\s*=\s*["\']?{travel_details["org"]}["\']?', call, re.IGNORECASE):
                        param_score += 0.2
                    
                    # Check destination parameter
                    if travel_details.get('dest') and re.search(fr'destination\s*=\s*["\']?{travel_details["dest"]}["\']?', call, re.IGNORECASE):
                        param_score += 0.2
                    
                    # Check passengers parameter
                    if travel_details.get('people_number') and re.search(fr'passengers\s*=\s*{travel_details["people_number"]}', call):
                        param_score += 0.1
                    
                    # Check date parameter (just check if it exists)
                    if re.search(r'date\s*=\s*[\'\"]', call):
                        param_score += 0.1
                    
                    param_scores.append(param_score)
                
                # Adjust flight score based on parameter correctness (max additional 0.15)
                if param_scores:
                    avg_param_score = sum(param_scores) / len(param_scores)
                    flight_score += min(0.15, avg_param_score)
            
            tool_score += flight_score
        
        if has_hotel_tool:

            hotel_score = 0.25
            search_hotel_calls = re.findall(r'<tool>\s*SearchHotel\((.*?)\)\s*</tool>', completion_content, re.DOTALL)
            
            if search_hotel_calls:
                param_scores = []
                
                for call in search_hotel_calls:
                    param_score = 0.0
                    
                    # Check location parameter (should match destination)
                    if travel_details.get('dest') and re.search(fr'location\s*=\s*["\']?{travel_details["dest"]}["\']?', call, re.IGNORECASE):
                        param_score += 0.2
                    
                    # Check check_in parameter
                    if re.search(r'check_in\s*=\s*[\'\"]', call):
                        param_score += 0.1
                        
                    # Check check_out parameter 
                    if re.search(r'check_out\s*=\s*[\'\"]', call):
                        param_score += 0.1
                    
                    # Check guests parameter
                    if travel_details.get('people_number') and re.search(fr'guests\s*=\s*{travel_details["people_number"]}', call):
                        param_score += 0.2
                    
                    param_scores.append(param_score)
                
                if param_scores:
                    avg_param_score = sum(param_scores) / len(param_scores)
                    hotel_score += min(0.15, avg_param_score)
            
            tool_score += hotel_score
        
        final_score = base_score + think_score + answer_score + tool_score
        
        # Normalize to ensure maximum is 1.0
        rewards.append(min(max_score, final_score))
    
    for i in range(len(rewards)):
        rw = rewards[i]
        print(f"\n\nreward: {rw}")
        print("prompt:")
        print(prompts[i])
        print("prompt")    
        print("completion:")
        print(completions[i])
        print("completion\n\n")
    
    return rewards

In [None]:
train_dataset = Dataset.from_pandas(df_train)



In [None]:
model_id = "Qwen/Qwen2.5-1.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir=CACHE_DIR,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    cache_dir=CACHE_DIR
)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [None]:
# Setting GRPO training parameters
training_args = GRPOConfig(
    output_dir="qwen-travel-agent-grpo-run2",
    learning_rate=2e-5,
    remove_unused_columns=False,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    bf16=torch.cuda.is_available(),  
    fp16=not torch.cuda.is_available() and torch.cuda.is_available(),  
    
    max_completion_length=512,
    num_generations=4,
    max_prompt_length=256,

    optim="adamw_torch",
    gradient_checkpointing=True,  
    
    report_to="wandb",
    logging_strategy="steps",
    logging_steps=1,
    save_strategy="steps",
    save_steps=1,
    evaluation_strategy="no",  
)



In [12]:
trainer = GRPOTrainer(
    model=model,
    reward_funcs=[reward_function],
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mnjain26[0m ([33mnjain26-stanford-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


KeyboardInterrupt: 

In [None]:
trainer.save_model(training_args.output_dir)
trainer.push_to_hub(dataset_name=dataset_id)

In [None]:
def load_model_and_tokenizer(model_path):
    print(f"Loading model from {model_path}")
    
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
        cache_dir=CACHE_DIR
    )
    
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    
    return model, tokenizer

def generate_response(model, tokenizer, system_prompt):
    """Generate a response from the model using only a system prompt."""
    messages = [
        {"role": "system", "content": system_prompt}
    ]
    
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=1024,
            temperature=0.7,
            do_sample=True,
            top_p=0.9
        )
    
    full_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    response = full_output[len(prompt):].strip()
    
    return response


In [None]:
grpo_model, grpo_tokenizer = load_model_and_tokenizer("/scratch/users/njain26/qwen-travel-agent-grpo-run2/checkpoint-9/")

Loading model from /scratch/users/njain26/qwen-travel-agent-grpo-run2/checkpoint-8/


In [1]:
baseline_model, baseline_tokenizer = load_model_and_tokenizer("Qwen/Qwen2.5-1.5B-Instruct")

NameError: name 'load_model_and_tokenizer' is not defined

In [15]:
large_model, large_tokenizer = load_model_and_tokenizer("Qwen/Qwen2.5-7B-Instruct")

Loading model from Qwen/Qwen2.5-7B-Instruct


Loading checkpoint shards: 100%|██████████| 4/4 [00:11<00:00,  2.79s/it]


In [17]:
test_data = []
for item in ds_test:
    test_data.append({
        'org': item.get('org'),
        'dest': item.get('dest'),
        'days': item.get('days'),
        'visiting_city_number': item.get('visiting_city_number'),
        'date': item.get('date'),
        'people_number': item.get('people_number'),
        'local_constraint': item.get('local_constraint'),
        'budget': item.get('budget'),
        'query': item.get('query'),
        'level': item.get('level'),
        'annotated_plan': item.get('annotated_plan'),
        'reference_information': item.get('reference_information')
    })
df_test = pd.DataFrame(test_data)

df_test['prompt'] = df_test.apply(create_system_prompt, axis=1)
df_sample = df_test.sample(n=200, random_state=42)


In [None]:
def create_response_dataframe(model, tokenizer, dataset):
    results = []
    
    for example in tqdm(dataset, desc="Generating responses"):
        # Get only the system prompt from the dataset
        system_prompt = example.get("prompt", "You are a helpful AI travel planning agent.")
        response = generate_response(model, tokenizer, system_prompt)
        
        # Result dictionary with all original fields plus the response
        result = {**example, "model_response": response}
        results.append(result)
    
    df_results = pd.DataFrame(results)
    return df_results

In [19]:
output_data = Dataset.from_pandas(df_sample)

output_data = Dataset.from_pandas(df_sample)
output_test = create_response_dataframe(grpo_model, grpo_tokenizer, output_data) 

In [20]:
output_test = create_response_dataframe(grpo_model, grpo_tokenizer, output_data)

Generating responses: 100%|██████████| 200/200 [1:50:55<00:00, 33.28s/it]


In [21]:
baseline_test = create_response_dataframe(baseline_model, baseline_tokenizer, output_data)

Generating responses: 100%|██████████| 200/200 [25:20<00:00,  7.60s/it]


In [19]:
large_test = create_response_dataframe(large_model,large_tokenizer, output_data)

Generating responses: 100%|██████████| 200/200 [1:14:21<00:00, 22.31s/it]


In [21]:
output_test.to_csv("model_outputs.csv", index=False)

In [23]:
baseline_test.to_csv("baseline_outputs.csv", index=False)

In [20]:
large_test.to_csv("large_outputs.csv", index=False)

In [43]:
print(output_test)

               org            dest  days visiting_city_number  \
0         Hartford           Texas     5                 None   
1        Knoxville         Atlanta     3                 None   
2          Chicago          Newark     3                 None   
3           Tucson        Illinois     7                 None   
4      Little Rock         Atlanta     3                 None   
..             ...             ...   ...                  ...   
195        Memphis           Tampa     3                 None   
196       Key West  North Carolina     7                 None   
197   Palm Springs           Texas     5                 None   
198       San Jose      Washington     7                 None   
199  San Francisco           Tampa     3                 None   

                                                  date people_number  \
0    ['2022-03-04', '2022-03-05', '2022-03-06', '20...          None   
1           ['2022-03-06', '2022-03-07', '2022-03-08']          None   
2  

In [None]:
client = OpenAI(api_key="nopeeking!")

def create_evaluation_prompt(row):
    """
    Create a prompt for the evaluator model to assess the quality of tool calls.
    
    Arguments:
        row: DataFrame row containing model_response and annotated_plan
    
    Returns:
        Evaluation prompt string
    """
    prompt = f"""You are an expert evaluator of AI travel planning assistants. Your job is to score how well a model generates appropriate tool calls for flight and hotel searches.

TRAVEL CONTEXT:
{row['annotated_plan']}

MODEL RESPONSE:
{row['model_response']}

EVALUATION INSTRUCTIONS:
1. Focus ONLY on evaluating the quality of the "search_flights" and "search_hotels" tool calls, making sure they are present and properly called.
2. De-emphasize any other aspects of the response like itinerary details or other tool calls, but use the context to make sure the model is not hallucinating, and generating relevant information to the original prompt.
3. Score the response on a scale of 1-5, where:
   - 5: Perfect tool calls with correct parameters matching the travel context
   - 4: Good tool calls with minor issues
   - 3: Acceptable tool calls but with noticeable errors
   - 2: Poor tool calls with major errors
   - 1: Missing or completely incorrect tool calls

Please provide:
1. A numeric score (1-5)
2. A brief explanation of your reasoning (2-3 sentences)
3. End with "FINAL_SCORE: X" where X is your numeric score

Your evaluation:"""
    
    return prompt

def generate_evaluation(prompt, model="gpt-4o-mini"):
    """Generate an evaluation using OpenAI API."""
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an expert evaluator of travel planning AI responses."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2, 
            max_tokens=512
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error calling OpenAI API: {e}")
        time.sleep(5)
        return "Error: Could not generate evaluation. FINAL_SCORE: 0"

def extract_score(evaluation_text):
    """Extract the numerical score from the evaluation text."""
    if "FINAL_SCORE:" in evaluation_text:
        try:
            score_text = evaluation_text.split("FINAL_SCORE:")[-1].strip()
            score = float(score_text.split()[0])
            return score
        except:
            import re
            scores = re.findall(r'\b[1-5]\.?0?\b', evaluation_text)
            if scores:
                return float(scores[-1])
    
    return None

def evaluate_model_responses(results_df, batch_size=10, save_path="evaluation_results.csv", model="gpt-4o-mini"):
    """
    Evaluate model responses using OpenAI API.
    
    Args:
        results_df: DataFrame containing model_response and annotated_plan columns
        batch_size: Number of examples to evaluate before saving intermediate results
        save_path: Path to save the results
        model: OpenAI model to use for evaluation ("gpt-3.5-turbo" or "gpt-4o-mini")
    
    Returns:
        DataFrame with evaluation results
    """
    eval_df = results_df.copy()
    eval_df['evaluation'] = None
    eval_df['score'] = None
    
    # Process in batches 
    for i in tqdm(range(0, len(eval_df), batch_size), desc="Evaluating responses"):
        batch = eval_df.iloc[i:i+batch_size]
        
        for idx, row in batch.iterrows():
            if pd.notna(eval_df.loc[idx, 'evaluation']):
                continue
            
            eval_prompt = create_evaluation_prompt(row)
            evaluation = generate_evaluation(eval_prompt, model=model)
            score = extract_score(evaluation)
            
            eval_df.loc[idx, 'evaluation'] = evaluation
            eval_df.loc[idx, 'score'] = score
       
            time.sleep(0.5)
        
        eval_df.to_csv(save_path, index=False)
        print(f"Saved intermediate results after {i+len(batch)} evaluations")
    
    print("\nEvaluation Summary:")
    print(f"Average Score: {eval_df['score'].mean():.2f}")
    print(f"Score Distribution:")
    print(eval_df['score'].value_counts().sort_index())
    
    return eval_df


In [None]:

evaluation_results = evaluate_model_responses(output_test, batch_size=10,model="gpt-4o-mini")
evaluation_results.to_csv("final_evaluation_grpo_results.csv", index=False)

summary = {
    "average_score": float(evaluation_results['score'].mean()),
    "median_score": float(evaluation_results['score'].median()),
    "score_distribution": evaluation_results['score'].value_counts().sort_index().to_dict(),
    "total_evaluated": len(evaluation_results),
    "examples": {
    "best": evaluation_results.loc[evaluation_results['score'].idxmax()].to_dict(),
    "worst": evaluation_results.loc[evaluation_results['score'].idxmin()].to_dict(),
    "average": evaluation_results.loc[(evaluation_results['score'] - evaluation_results['score'].mean()).abs().idxmin()].to_dict()
    }
}
    
with open("evaluation_summary.json", "w") as f: json.dump(summary, f, indent=2)
    
print(f"Evaluation complete. Results saved to final_evaluation_results.csv and evaluation_summary.json")

In [None]:
evaluation_results_baseline = evaluate_model_responses(baseline_test, batch_size=10,model="gpt-4o-mini")
evaluation_results_baseline.to_csv("final_evaluation_baseline_results.csv", index=False)

summary = {
    "average_score": float(evaluation_results_baseline['score'].mean()),
    "median_score": float(evaluation_results_baseline['score'].median()),
    "score_distribution": evaluation_results_baseline['score'].value_counts().sort_index().to_dict(),
    "total_evaluated": len(evaluation_results_baseline),
    "examples": {
    "best": evaluation_results_baseline.loc[evaluation_results['score'].idxmax()].to_dict(),
    "worst": evaluation_results_baseline.loc[evaluation_results['score'].idxmin()].to_dict(),
    "average": evaluation_results_baseline.loc[(evaluation_results['score'] - evaluation_results['score'].mean()).abs().idxmin()].to_dict()
    }
}
    
with open("evaluation_summary_baseline.json", "w") as f: json.dump(summary, f, indent=2)
    
print(f"Evaluation complete. Results saved to final_evaluation_baseline_results.csv and evaluation_summary_baseline.json")

Evaluating responses:   5%|▌         | 1/20 [00:24<07:46, 24.57s/it]

Saved intermediate results after 10 evaluations


Evaluating responses:  10%|█         | 2/20 [00:48<07:14, 24.12s/it]

Saved intermediate results after 20 evaluations


Evaluating responses:  15%|█▌        | 3/20 [01:09<06:24, 22.62s/it]

Saved intermediate results after 30 evaluations


Evaluating responses:  20%|██        | 4/20 [01:31<05:57, 22.37s/it]

Saved intermediate results after 40 evaluations


Evaluating responses:  25%|██▌       | 5/20 [02:00<06:13, 24.91s/it]

Saved intermediate results after 50 evaluations


Evaluating responses:  30%|███       | 6/20 [02:23<05:41, 24.39s/it]

Saved intermediate results after 60 evaluations


Evaluating responses:  35%|███▌      | 7/20 [02:50<05:24, 25.00s/it]

Saved intermediate results after 70 evaluations


Evaluating responses:  40%|████      | 8/20 [03:13<04:54, 24.54s/it]

Saved intermediate results after 80 evaluations


Evaluating responses:  45%|████▌     | 9/20 [03:36<04:24, 24.05s/it]

Saved intermediate results after 90 evaluations


Evaluating responses:  50%|█████     | 10/20 [04:00<03:59, 24.00s/it]

Saved intermediate results after 100 evaluations


Evaluating responses:  55%|█████▌    | 11/20 [04:23<03:34, 23.78s/it]

Saved intermediate results after 110 evaluations


Evaluating responses:  60%|██████    | 12/20 [04:47<03:09, 23.65s/it]

Saved intermediate results after 120 evaluations


Evaluating responses:  65%|██████▌   | 13/20 [05:13<02:51, 24.46s/it]

Saved intermediate results after 130 evaluations


Evaluating responses:  70%|███████   | 14/20 [05:39<02:28, 24.77s/it]

Saved intermediate results after 140 evaluations


Evaluating responses:  75%|███████▌  | 15/20 [06:03<02:02, 24.52s/it]

Saved intermediate results after 150 evaluations


Evaluating responses:  80%|████████  | 16/20 [06:24<01:34, 23.68s/it]

Saved intermediate results after 160 evaluations


Evaluating responses:  85%|████████▌ | 17/20 [06:51<01:14, 24.68s/it]

Saved intermediate results after 170 evaluations


Evaluating responses:  90%|█████████ | 18/20 [07:24<00:54, 27.04s/it]

Saved intermediate results after 180 evaluations


Evaluating responses:  95%|█████████▌| 19/20 [07:45<00:25, 25.41s/it]

Saved intermediate results after 190 evaluations


Evaluating responses: 100%|██████████| 20/20 [08:10<00:00, 24.52s/it]

Saved intermediate results after 200 evaluations

Evaluation Summary:
Average Score: 3.54
Score Distribution:
score
1.0     13
2.0      8
3.0     48
4.0    120
5.0     11
Name: count, dtype: int64





NameError: name 'evaluation_results' is not defined

In [1]:
!nvidia-smi

Unable to determine the device handle for GPU0000:C4:00.0: Unknown Error


In [None]:
evaluation_results_large = evaluate_model_responses(large_test, batch_size=10,model="gpt-4o-mini")
evaluation_results_large.to_csv("final_evaluation_large_results.csv", index=False)
   
summary = {
    "average_score": float(evaluation_results_large['score'].mean()),
    "median_score": float(evaluation_results_large['score'].median()),
    "score_distribution": evaluation_results_large['score'].value_counts().sort_index().to_dict(),
    "total_evaluated": len(evaluation_results_large),
    "examples": {
    "best": evaluation_results_large.loc[evaluation_results_large['score'].idxmax()].to_dict(),
    "worst": evaluation_results_large.loc[evaluation_results_large['score'].idxmin()].to_dict(),
    "average": evaluation_results_large.loc[(evaluation_results_large['score'] - evaluation_results_large['score'].mean()).abs().idxmin()].to_dict()
    }
}
    
with open("evaluation_summary_baseline.json", "w") as f: json.dump(summary, f, indent=2)
    
print(f"Evaluation complete. Results saved to final_evaluation_baseline_results.csv and evaluation_summary_baseline.json")

Evaluating responses:   5%|▌         | 1/20 [00:26<08:22, 26.47s/it]

Saved intermediate results after 10 evaluations


Evaluating responses:  10%|█         | 2/20 [00:49<07:15, 24.19s/it]

Saved intermediate results after 20 evaluations


Evaluating responses:  15%|█▌        | 3/20 [01:14<07:00, 24.75s/it]

Saved intermediate results after 30 evaluations


Evaluating responses:  20%|██        | 4/20 [01:39<06:35, 24.74s/it]

Saved intermediate results after 40 evaluations


Evaluating responses:  25%|██▌       | 5/20 [02:03<06:07, 24.49s/it]

Saved intermediate results after 50 evaluations


Evaluating responses:  30%|███       | 6/20 [02:31<05:59, 25.71s/it]

Saved intermediate results after 60 evaluations


Evaluating responses:  35%|███▌      | 7/20 [02:54<05:22, 24.78s/it]

Saved intermediate results after 70 evaluations


Evaluating responses:  40%|████      | 8/20 [03:23<05:13, 26.15s/it]

Saved intermediate results after 80 evaluations


Evaluating responses:  45%|████▌     | 9/20 [03:48<04:43, 25.80s/it]

Saved intermediate results after 90 evaluations


Evaluating responses:  50%|█████     | 10/20 [04:13<04:15, 25.52s/it]

Saved intermediate results after 100 evaluations


Evaluating responses:  55%|█████▌    | 11/20 [04:41<03:56, 26.26s/it]

Saved intermediate results after 110 evaluations


Evaluating responses:  60%|██████    | 12/20 [05:07<03:29, 26.16s/it]

Saved intermediate results after 120 evaluations


Evaluating responses:  65%|██████▌   | 13/20 [05:34<03:05, 26.53s/it]

Saved intermediate results after 130 evaluations


Evaluating responses:  70%|███████   | 14/20 [06:05<02:47, 27.88s/it]

Saved intermediate results after 140 evaluations


Evaluating responses:  75%|███████▌  | 15/20 [06:33<02:19, 27.93s/it]

Saved intermediate results after 150 evaluations


Evaluating responses:  80%|████████  | 16/20 [07:05<01:56, 29.21s/it]

Saved intermediate results after 160 evaluations


Evaluating responses:  85%|████████▌ | 17/20 [07:31<01:24, 28.19s/it]

Saved intermediate results after 170 evaluations


Evaluating responses:  90%|█████████ | 18/20 [07:56<00:54, 27.33s/it]

Saved intermediate results after 180 evaluations


Evaluating responses:  95%|█████████▌| 19/20 [08:20<00:26, 26.22s/it]

Saved intermediate results after 190 evaluations


Evaluating responses: 100%|██████████| 20/20 [08:44<00:00, 26.23s/it]

Saved intermediate results after 200 evaluations

Evaluation Summary:
Average Score: 3.89
Score Distribution:
score
3.0     25
4.0    172
5.0      3
Name: count, dtype: int64





Evaluation complete. Results saved to final_evaluation_baseline_results.csv and evaluation_summary_baseline.json
