#### Evaluating the accuracy of open-question reviews

In [1]:
import json
from agent.state import OverallState
from agent.graphs import execute_graph

from IPython.display import Image, display
from langchain_core.runnables.graph import MermaidDrawMethod

#LOAD Benchmark:
def load_json_file(filename):
    """Read ``filename`` and return its parsed JSON content.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

def read_text_file(filename, encoding='utf-8'):
    """Return the full content of a text file as a single string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's default encoding.

    Returns:
        The file content as ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid for ``encoding``.
    """
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

# Benchmark input: a JSON mapping keyed by student id (e.g. "student_1"),
# each value being that student's list of open questions.
benchmark_lesson_1 = load_json_file('data/evals/open_questions_lesson_1_input.json')
# Raw lesson text, later passed to the graphs as `lesson_doc`.
lesson_1_doc = read_text_file('data/lesson1.txt')

In [2]:
# Use the first student's submission as the working sample for the smoke tests.
sample_case = benchmark_lesson_1["student_1"]
sample_case_input = {"open_questions": sample_case}

# Collect every concept evaluated across the sample's questions.
total_concepts_eval = []
for elem in sample_case:
    total_concepts_eval += elem['concepts_evaluated']

# De-duplicate, then sort: the iteration order of a set of strings changes
# between interpreter runs (hash randomization), which would make the
# serialized concept list passed to the graphs non-reproducible.
unique_list_of_concepts = sorted(set(total_concepts_eval))

In [3]:
# Inspect the sample student's question records (rich display of the last expression).
sample_case

[{'id': 0,
  'question': 'What is the role of a prototype program in problem solving?',
  'student_answer': 'High risk problems are address in the prototype program to make sure that the program is feasible.  A prototype may also be used to show a company that the software can be possibly programmed.',
  'concepts_evaluated': ['problemsolving', 'prototyping']},
 {'id': 29,
  'question': 'What stages in the software life cycle are influenced by the testing stage?',
  'student_answer': 'Refining and possibly the design if the testing phase reveals problems in the design. Production can be affected if the program is unworkable in its current form which will lead to a later production time than originally estimated. Also affects coding because after testing you may need to rewrite the code for the program to remove errors.',
  'concepts_evaluated': ['sdlc', 'testing_phase']},
 {'id': 58,
  'question': 'What are the main advantages associated with object-oriented programming?',
  'student_a

In [None]:
from agent.graphs import one_shot_graph
# Smoke test of the one-shot graph on the single sample student.
# The state carries the student's questions, the JSON-encoded concept list,
# a fixed Bloom's level ("understand"), the lesson text and an empty message list.
sample_state = OverallState(
    user_input = {"open_questions": sample_case},
    concepts_to_evaluate=json.dumps(unique_list_of_concepts),
    blooms_state="understand",
    lesson_doc = lesson_1_doc,
    messages=[]
)

# Build the one-shot graph and render its diagram as a PNG.
# NOTE(review): MermaidDrawMethod.API presumably renders through a remote
# service, so this line likely needs network access - confirm.
graph = one_shot_graph()
display(Image(graph.get_graph().draw_mermaid_png(draw_method=MermaidDrawMethod.API,)))

# Run the graph on the sample state; the result is kept in `r` for inspection.
r = execute_graph(sample_state, graph)

KeyboardInterrupt: 

In [None]:
# Smoke test of the basic RAG graph on the single sample student.
from agent.graphs import basic_rag_graph
# Same state as the one-shot smoke test: questions, JSON-encoded concept list,
# fixed Bloom's level, lesson text and an empty message list.
sample_state = OverallState(
    user_input = {"open_questions": sample_case},
    concepts_to_evaluate=json.dumps(unique_list_of_concepts),
    blooms_state="understand",
    lesson_doc = lesson_1_doc,
    messages=[]
)

# Build the graph and render its diagram as a PNG.
# NOTE(review): MermaidDrawMethod.API presumably renders through a remote
# service, so this line likely needs network access - confirm.
graph = basic_rag_graph()
display(Image(graph.get_graph().draw_mermaid_png(draw_method=MermaidDrawMethod.API,)))

# Run the graph on the sample state; the result is kept in `r` for inspection.
r = execute_graph(sample_state, graph)

Tool Calls:
  query_lesson (call_TvQBUE3CCi3uZszPeDCCAsdQ)
 Call ID: call_TvQBUE3CCi3uZszPeDCCAsdQ
  Args:
    question: What is the role of a prototype program in problem solving?
  query_lesson (call_KreibVqzFT7JWUBix44cDgHn)
 Call ID: call_KreibVqzFT7JWUBix44cDgHn
  Args:
    question: What stages in the software life cycle are influenced by the testing stage?
  query_lesson (call_iOrIBsXVwMJHl79iWqiRlvvN)
 Call ID: call_iOrIBsXVwMJHl79iWqiRlvvN
  Args:
    question: What are the main advantages associated with object-oriented programming?
  query_lesson (call_Asn0x2YeFBBjM7qWWUoUItN2)
 Call ID: call_Asn0x2YeFBBjM7qWWUoUItN2
  Args:
    question: Where do C++ programs begin to execute?
  query_lesson (call_lY0xb5eqjdqXHal2qvpmJ09z)
 Call ID: call_lY0xb5eqjdqXHal2qvpmJ09z
  Args:
    question: What is a variable?
  query_lesson (call_DwyFYrRzeU65lLvwRh4qqRrk)
 Call ID: call_DwyFYrRzeU65lLvwRh4qqRrk
  Args:
    question: Where are variables declared in a C++ program?
  query_lesson (c

In [None]:
# Smoke test of the one-shot-with-reflection graph on the single sample student.
from agent.graphs import one_shot_with_reflection_graph

# Same state as the other smoke tests, plus a `reflection_steps` counter
# initialised to 0 for the reflection graph.
sample_state = OverallState(
    user_input = {"open_questions": sample_case},
    concepts_to_evaluate=json.dumps(unique_list_of_concepts),
    blooms_state="understand",
    lesson_doc = lesson_1_doc,
    reflection_steps=0, #add it for reflection
    messages=[]
)

# Build the graph and render its diagram as a PNG.
# NOTE(review): MermaidDrawMethod.API presumably renders through a remote
# service, so this line likely needs network access - confirm.
graph = one_shot_with_reflection_graph()
display(Image(graph.get_graph().draw_mermaid_png(draw_method=MermaidDrawMethod.API,)))
# Run the graph on the sample state; the result is kept in `r` for inspection.
r = execute_graph(sample_state, graph)


```json
{
  "evaluated_questions": [
    {
      "question": "What is the role of a prototype program in problem solving?",
      "score": "B",
      "reason": "The answer correctly identifies the role of prototypes in addressing high-risk problems and demonstrating feasibility, but lacks detail on how prototypes are developed and their iterative nature.",
      "cited_paragraph": "A prototype program serves as a preliminary model to test concepts and feasibility, allowing for early detection of potential issues in problem-solving before full-scale development."
    },
    {
      "question": "What stages in the software life cycle are influenced by the testing stage?",
      "score": "A",
      "reason": "The answer provides a thorough understanding of how the testing stage influences refining, design, production, and coding phases, demonstrating a nuanced comprehension of the software development life cycle.",
      "cited_paragraph": "The testing phase in the software life cycle is

#### Run Benchmarks

In [None]:
from agent.graphs import *

def save_json(data, file_name):
    """Write ``data`` to ``file_name`` as indented JSON.

    The payload is serialized to a string *before* the file is opened, so a
    serialization failure cannot leave a truncated file behind. Values that
    ``json`` cannot encode natively (for example exception objects stored for
    failed runs) are written as their ``str()`` representation.

    Args:
        data: JSON-compatible object to persist.
        file_name: Destination path; an existing file is overwritten.

    Raises:
        OSError: If the destination cannot be opened for writing.
    """
    payload = json.dumps(data, indent=4, default=str)
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(payload)

# Benchmark run: grade every student with the one-shot graph and persist the results.
one_shot_final_responses = {}
graph = one_shot_graph()
for student_id in benchmark_lesson_1.keys():
    exec_state = OverallState(
        user_input={"open_questions": benchmark_lesson_1[student_id]},
        # NOTE(review): the concept list was built from student_1's questions
        # only - confirm every student answers the same question set.
        concepts_to_evaluate=json.dumps(unique_list_of_concepts),
        blooms_state="understand",
        lesson_doc=lesson_1_doc,
        messages=[],
    )
    try:
        grade_dict = execute_graph(exec_state, graph)
    except Exception as e:
        # Keep going so one failing student does not abort the whole run, but
        # store the error as text: a raw exception object is not JSON
        # serializable and would make the final save fail.
        grade_dict = {"ERROR": repr(e)}
    one_shot_final_responses[student_id] = grade_dict

save_json(one_shot_final_responses, 'data/gpt_4o_one_shot_final_responses.json')


{"evaluated_questions":[{"question":"What is the role of a prototype program in problem solving?","score":"B","reason":"The student identifies that prototypes address high-risk problems and demonstrate feasibility. However, they could elaborate on how prototypes provide early feedback and help refine requirements.","cited_paragraph":"Prototyping is used to explore feasibility, gather early feedback, and test assumptions in problem-solving, allowing for adjustments before full-scale development."},{"question":"What stages in the software life cycle are influenced by the testing stage?","score":"A","reason":"The student correctly identifies multiple aspects of the SDLC influenced by testing, such as design, production, and coding, showing a comprehensive understanding of the testing phase's impact.","cited_paragraph":"The testing phase influences the design, coding, and production stages by identifying issues that may require redesign or code modifications, ensuring quality and function

In [None]:
from agent.graphs import *
# Benchmark run: grade every student with the one-shot-with-reflection graph
# and persist the results.
one_shot_reflection_final_responses = {}
graph = one_shot_with_reflection_graph()

for student_id in benchmark_lesson_1.keys():
    exec_state = OverallState(
        user_input={"open_questions": benchmark_lesson_1[student_id]},
        # NOTE(review): the concept list was built from student_1's questions
        # only - confirm every student answers the same question set.
        concepts_to_evaluate=json.dumps(unique_list_of_concepts),
        blooms_state="understand",
        lesson_doc=lesson_1_doc,
        reflection_steps=0,  # reflection counter starts at zero for this graph
        messages=[],
    )
    try:
        grade_dict = execute_graph(exec_state, graph)
    except Exception as e:
        # Keep going so one failing student does not abort the whole run, but
        # store the error as text: a raw exception object is not JSON
        # serializable and would make the final save fail.
        grade_dict = {"ERROR": repr(e)}
    one_shot_reflection_final_responses[student_id] = grade_dict

save_json(one_shot_reflection_final_responses, 'data/one_shot_reflection_final_responses.json')


```json
{
  "evaluated_questions": [
    {
      "question": "What is the role of a prototype program in problem solving?",
      "score": "B",
      "reason": "The answer demonstrates a mostly correct understanding of the role of prototypes in addressing high-risk problems and feasibility, but lacks depth in explaining how prototyping contributes to overall problem-solving processes.",
      "cited_paragraph": "A prototype in problem solving serves to explore ideas and assess feasibility, allowing stakeholders to visualize and refine requirements. While the student mentions addressing high-risk problems, a more comprehensive explanation could include iterative feedback from users or stakeholders."
    },
    {
      "question": "What stages in the software life cycle are influenced by the testing stage?",
      "score": "A",
      "reason": "The response accurately identifies multiple stages affected by testing, including design, production, and coding, showing a thorough understandi

In [None]:
from agent.graphs import *
# Benchmark run: grade every student with the basic RAG graph and persist the results.
graph_rag_final_responses = {}
graph = basic_rag_graph()

for student_id in benchmark_lesson_1.keys():
    exec_state = OverallState(
        user_input={"open_questions": benchmark_lesson_1[student_id]},
        # NOTE(review): the concept list was built from student_1's questions
        # only - confirm every student answers the same question set.
        concepts_to_evaluate=json.dumps(unique_list_of_concepts),
        blooms_state="understand",
        lesson_doc=lesson_1_doc,
        messages=[],
    )
    try:
        grade_dict = execute_graph(exec_state, graph)
    except Exception as e:
        # Keep going so one failing student does not abort the whole run, but
        # store the error as text: a raw exception object is not JSON
        # serializable and would make the final save fail.
        grade_dict = {"ERROR": repr(e)}
    graph_rag_final_responses[student_id] = grade_dict

save_json(graph_rag_final_responses, 'data/graph_rag_final_responses.json')

Tool Calls:
  query_lesson (call_i0ed2dlPPjiZLCUI3AephoOf)
 Call ID: call_i0ed2dlPPjiZLCUI3AephoOf
  Args:
    question: What is the role of a prototype program in problem solving?
  query_lesson (call_PHDcgjGI1205aEFVzUCYNDBf)
 Call ID: call_PHDcgjGI1205aEFVzUCYNDBf
  Args:
    question: What stages in the software life cycle are influenced by the testing stage?
  query_lesson (call_rJFtQxTUVl0MfcMLCzNUm4L7)
 Call ID: call_rJFtQxTUVl0MfcMLCzNUm4L7
  Args:
    question: What are the main advantages associated with object-oriented programming?
  query_lesson (call_SIerQH5IUptK0eLU65WWpF9T)
 Call ID: call_SIerQH5IUptK0eLU65WWpF9T
  Args:
    question: Where do C++ programs begin to execute?
  query_lesson (call_42NjyUzj9xzKqImL8pRy6Kfi)
 Call ID: call_42NjyUzj9xzKqImL8pRy6Kfi
  Args:
    question: What is a variable?
  query_lesson (call_j6MJJkqRY5sHN96Zbe5sEA3z)
 Call ID: call_j6MJJkqRY5sHN96Zbe5sEA3z
  Args:
    question: Where are variables declared in a C++ program?
  query_lesson (c

#### Evaluate

In [20]:
import pandas as pd
def format_dict_to_dataframe(questions_dict, type='output'):
    """Flatten a per-student dictionary into a DataFrame with one row per question.

    Args:
        questions_dict: Mapping of student id to that student's data. With
            ``type='output'`` each value must hold its question records under
            the ``'evaluated_questions'`` key; with any other ``type`` the
            value itself is iterated as the list of question records.
        type: ``'output'`` for model predictions, anything else for ground
            truth. (The name shadows the builtin but is kept for callers.)

    Returns:
        A DataFrame containing every field of each question record plus
        ``student_id`` and a numeric target column: ``y_pred`` (letter
        ``score`` mapped A=5.0 ... F=0.0; unknown letters become NaN) for
        ``'output'``, or ``y_true`` (copy of ``ground_truth_score``) otherwise.

    Notes:
        Students whose prediction entry has no ``'evaluated_questions'`` key
        (for example ``{"ERROR": ...}`` placeholders written for failed runs)
        are skipped with a warning instead of raising ``KeyError``.
    """
    import warnings  # local import keeps this block self-contained

    is_output = type == 'output'
    records = []
    for student_id, entry in questions_dict.items():
        if is_output:
            if not isinstance(entry, dict) or 'evaluated_questions' not in entry:
                warnings.warn(
                    f"Skipping student {student_id!r}: no 'evaluated_questions' in entry"
                )
                continue
            to_iterate = entry['evaluated_questions']
        else:
            to_iterate = entry

        for question in to_iterate:
            record = question.copy()
            record['student_id'] = student_id
            records.append(record)

    grade_mapping = {
        'A': 5.0, 'B': 4.0, 'C': 3.0, 'D': 2.0, 'E': 1.0, 'F': 0.0
    }

    formated_df = pd.DataFrame(records)
    if is_output:
        formated_df['y_pred'] = formated_df['score'].map(grade_mapping)
    else:
        formated_df['y_true'] = formated_df['ground_truth_score']
    return formated_df
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
import numpy as np
from sklearn.metrics import (
    mean_absolute_error, 
    mean_absolute_percentage_error, 
    mean_squared_error, 
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    confusion_matrix
)

def print_error_metrics(df, pred_col='y_pred', true_col='y_true'):
    """
    Print regression and classification metrics comparing predictions to truth.

    Regression view: MAE and RMSE between the two columns.
    Classification view: accuracy, weighted precision/recall/F1 and the
    confusion matrix, treating each distinct value as a class.

    Args:
        df: DataFrame holding both columns.
        pred_col: Name of the column with predicted values.
        true_col: Name of the column with true values.

    Returns:
        None. Everything is written to stdout.
    """
    y_pred = df[pred_col]
    y_true = df[true_col]

    # Regression Metrics
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    print("Regression Metrics:")
    print(f"  MAE  (Mean Absolute Error): {mae:.2f}")
    print(f"  RMSE (Root Mean Squared Error): {rmse:.2f}")

    # Classification Metrics
    print("\nClassification Metrics:")
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    # Pin the class order explicitly so the matrix can be labelled with the
    # real class values. Labelling by position (0..n-1) is wrong as soon as
    # one grade is absent from both columns.
    labels = np.union1d(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred, labels=labels)

    def _label_text(value):
        # Render whole-number grades as "5" rather than "5.0".
        if isinstance(value, (int, float, np.integer, np.floating)):
            return f"{value:g}"
        return str(value)

    print(f"  Accuracy: {accuracy:.2%}")
    print(f"  Precision (Weighted): {precision:.2%}")
    print(f"  Recall (Weighted): {recall:.2%}")
    print(f"  F1 Score (Weighted): {f1:.2%}")
    print("\nConfusion Matrix:")
    print(pd.DataFrame(conf_matrix,
                       index=[f"True {_label_text(v)}" for v in labels],
                       columns=[f"Pred {_label_text(v)}" for v in labels]))


def plot_error_distributions(df, pred_col='y_pred', true_col='y_true'):
    """
    Visualize how far the predictions fall from the true values.

    Draws a two-panel figure of the absolute errors ``|pred - true|``:
    - left: histogram (10 bins) with a KDE overlay, showing how errors cluster;
    - right: boxplot, showing spread and outliers.
    """
    abs_err = (df[pred_col] - df[true_col]).abs()

    _, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: distribution of the absolute errors.
    sns.histplot(abs_err, kde=True, ax=hist_ax, bins=10)
    hist_ax.set(
        title="Absolute Error Distribution",
        xlabel="Absolute Error",
        ylabel="Frequency",
    )

    # Right panel: spread and outliers of the same errors.
    sns.boxplot(x=abs_err, ax=box_ax)
    box_ax.set(title="Absolute Error Boxplot", xlabel="Absolute Error")

    plt.tight_layout()
    plt.show()

In [19]:
ground_truth_data = load_json_file('data/evals/open_questions_lesson_1_ground_truth.json')

# NOTE(review): the benchmark cells save their results under 'data/', but the
# files below are read from the working directory, and no cell in this
# notebook writes 'one_shot_final_responses.json'. Confirm these are the
# intended files before trusting the numbers.
gpt_4o_one_shot = load_json_file('gpt_4o_one_shot_final_responses.json')
one_shot_mini = load_json_file('one_shot_final_responses.json')
one_shot_reflection = load_json_file('one_shot_reflection_final_responses.json')
basic_rag_mini = load_json_file('graph_rag_final_responses.json')

# The ground-truth frame is the same for every model, so build it once.
df_true = format_dict_to_dataframe(ground_truth_data, type='gt')

model_runs = {
    'gpt_4o_one_shot': gpt_4o_one_shot,
    'one_shot_mini': one_shot_mini,
    'one_shot_reflection': one_shot_reflection,
    'basic_rag_mini': basic_rag_mini,
}

for model_name, model_data in model_runs.items():
    df_preds = format_dict_to_dataframe(model_data, 'output')

    # NOTE(review): fillna(0.0) turns predictions with no matching
    # ground-truth row (and unmapped grades) into a score of 0, which feeds
    # straight into the metrics - confirm this is intended.
    merged_df = df_preds.merge(df_true, on=['student_id','question'], how='left').fillna(0.0)

    # Label each report so the four outputs can be told apart.
    print(f"\n=== {model_name} ===")
    print_error_metrics(merged_df, pred_col='y_pred', true_col='y_true')

Regression Metrics:
  MAE  (Mean Absolute Error): 0.84
  RMSE (Root Mean Squared Error): 1.33

Classification Metrics:
  Accuracy: 45.81%
  Precision (Weighted): 55.08%
  Recall (Weighted): 45.81%
  F1 Score (Weighted): 47.73%

Confusion Matrix:
        Pred 0  Pred 1  Pred 2  Pred 3  Pred 4  Pred 5
True 0       2       0       1       0       0       0
True 1       1       0       0       3       2       0
True 2       1       0       0       6       9       5
True 3       2       0       1       6      13       3
True 4       0       0       0      10      16       4
True 5       4       0       0      10      35      69
Regression Metrics:
  MAE  (Mean Absolute Error): 0.87
  RMSE (Root Mean Squared Error): 1.35

Classification Metrics:
  Accuracy: 46.31%
  Precision (Weighted): 57.56%
  Recall (Weighted): 46.31%
  F1 Score (Weighted): 47.62%

Confusion Matrix:
        Pred 0  Pred 1  Pred 2  Pred 3  Pred 4  Pred 5
True 0       1       0       1       1       0       0
True 1       