In [1]:
import os
import json
import re
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM

## Opening the file

In [37]:
with open("full_question_solution_updated.json", "r") as f:
    content = json.load(f)

## Import the tokenizer and the model

In [3]:
model_name = "Qwen/Qwen2.5-Coder-14B-Instruct"
# Load the tokenizer and model.
cache_custom_dir = "/data/gpfs/projects/punim2402/huggingface"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir = cache_custom_dir)
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir = cache_custom_dir, device_map="cuda")
model.eval()  # set the model to evaluation mode

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 5120)
    (layers): ModuleList(
      (0-47): 48 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=5120, out_features=5120, bias=True)
          (k_proj): Linear(in_features=5120, out_features=1024, bias=True)
          (v_proj): Linear(in_features=5120, out_features=1024, bias=True)
          (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((5120,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((5120,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((5120,), eps=1e-06)
    (rotary_emb

## Main function for the Log Prob Calculation

In [40]:
# Main function for the log prob
def compute_log_prob(context_question: str, reference_answer: str) -> float:
    """
    Compute the total log probability of the expected_answer given the question.
    """
    system_prompt = """You are a beginner programming student. Based on your buggy code, you are having a conversation with a teacher 
    that is helping you solve the problem. The teacher is a Socratic Tutor that is leading you to the correct answer with varying level
    of question directness. Based on the final question, do your best to think of the solution to the problem. Provide a response to the 
    final question from the teacher."""

    context = """You are a beginner programming student, and you are trying to debug a piece of code. You are having a conversation with a teacher who is guiding you 
    through the debugging process. The teacher follows the Socratic method, asking questions that lead you to the correct solution. 
    Your teacher's questions vary in directness, ranging from questions that directly point out the error in your code to more exploratory, 
    indirect questions that help you think about concepts or related issues.
    Based on the final question from the teacher, think about the solution to the problem and provide a response to the question.
    Here are the guidelines:
    - A **direct question** is specific and asks for a clear, concrete answer (e.g., "What is wrong with line 3?").
    - An **indirect question** is more general or conceptual and may require you to think more broadly about the issue (e.g., "What do you understand about syntax errors?").
    The goal is to evaluate how the level of **directness** in the teacher's question affects your ability and confidence to identify the solution to the problem."""
    
    # Tokenize the question and answer.
    messages_with_answer = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": context + context_question},
        {"role": "assistant", "content": "Student: "  + reference_answer}
    ]
    input_ids = tokenizer.apply_chat_template(messages_with_answer, return_tensors="pt").to("cuda")

    messages_to_last_utterance = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": context + context_question}
    ]
    input_ids_up_to_user = tokenizer.apply_chat_template(messages_to_last_utterance, return_tensors="pt")
    user_length = input_ids_up_to_user.shape[1]

    # Extract the answer token ids
    answer_token_ids = input_ids[:, user_length:]
    answer_length = answer_token_ids.shape[1]
    
    with torch.no_grad():
        outputs = model(input_ids)
        logits = outputs.logits
    
    # Slice logits to get those corresponding to the answer tokens
    logits_for_answer = logits[:, user_length - 1 : user_length + answer_length - 1, :]
    
    # Compute log probabilities
    log_probs = F.log_softmax(logits_for_answer, dim=-1)
    token_log_probs = log_probs.gather(2, answer_token_ids.unsqueeze(-1)).squeeze(-1)
    total_log_prob = token_log_probs.sum().item()
    
    return total_log_prob

## Running the calculation for each question and solution pair

In [41]:
# Compute and print the log probability for each pair.
results = {}
for filename, value in content.items():
    print(filename)
    results[filename] = {}
    reference_answer = value['solution']

    # Retrieve the levels
    levels = list(value.keys())
    levels.remove('solution')
    
    for level in levels:
        context_question = value[level]
        log_prob = compute_log_prob(context_question, reference_answer)
        results[filename][level] = log_prob

0_1_fibonacci
0_2_fibonacci
0_5_fibonacci
0_6_fibonacci
10_39_xnglxsh
11_40_palindrome
12_41_reversing_a_list
13_42_limit
14_43_used_twice
15_44_sequential_search
15_45_sequential_search
16_46_substring_length
16_56_substring_length
17_47_topk_socratic
18_48_password_validator
19_49_word_counter
19_50_word_counter
1_10_calculating_a_grade
1_11_calculating_a_grade
1_13_calculating _a_grade
1_8_calculating_a_grade
1_9_calculating_a_grade
20_51_spell_checker
21_52_fahrenheit_to_celsius
22_53_cookie_purchase
24_29_factorial_socratic
25_55_insert_to_linked_list
2_18_splitting_cookies
3_20_counting_down
4_22_removing_even_number
4_23_removing_even_numer
4_25_removing_even_numer
4_26_removing_even_numbers
4_28_removing_even_numbers
56_15_compute_average
58_58_splitting_apples
58_59_splitting_apples
59_60_product
5_30_sorted_words
60_61_largest_number
61_62_is_even
62_63_summing_between_integers
63_64_good_dinner
64_65_count_ones
65_66_list_range
66_67_last_index_of
66_68_last_index_of
66_69_l

## Saving output to file

In [43]:
with open("log_prob_score_full_question_testing.json", "w") as f:
    json.dump(results, f)

In [42]:
print(results)

{'0_1_fibonacci': {'least': -152.23660278320312, 'more': -143.716796875, 'most': -150.20327758789062}, '0_2_fibonacci': {'least': -131.60476684570312, 'more': -148.72409057617188, 'most': -148.0623779296875}, '0_5_fibonacci': {'least': -125.0540542602539, 'more': -120.92655181884766, 'most': -125.02173614501953}, '0_6_fibonacci': {'least': -130.5362091064453, 'more': -132.58868408203125, 'most': -142.78533935546875}, '10_39_xnglxsh': {'least': -155.59913635253906, 'more': -150.81544494628906, 'most': -156.98348999023438}, '11_40_palindrome': {'least': -102.663818359375, 'more': -96.79672241210938, 'most': -95.99673461914062}, '12_41_reversing_a_list': {'least': -129.89659118652344, 'more': -136.08409118652344, 'most': -133.07806396484375}, '13_42_limit': {'least': -135.551025390625, 'more': -129.29991149902344, 'most': -131.4855499267578}, '14_43_used_twice': {'least': -101.49087524414062, 'more': -109.9072036743164, 'most': -104.3679428100586}, '15_44_sequential_search': {'least': -13