In [64]:
import re
import torch
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm 
from time import sleep
from sklearn.metrics import mean_absolute_error
from huggingface_hub import notebook_login

In [65]:
# Helper function for debugging
def dprint(s, debug):
    if debug:
        print(s)

In [66]:
# --- Check Device ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [67]:
# --- Model Loading ---
# We'll run our models using Hugging Face's transformers library on HPC/ Google Colab/ Lightning.ai
# The Llama models are gated, meaning you must request access on their Hugging Face pages.
# Once you have access, you need to log in here to download the model weights.

# Run this command in your terminal when you are running this notebook for the 1st time
# git config --global credential.helper store

print("Please log in to your Hugging Face account.")
notebook_login()

Please log in to your Hugging Face account.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Token has not been saved to git credential helper.


In [68]:
# This will be our primary model for most of the assignment.
model_id_1 = "meta-llama/Llama-2-7b-chat-hf"

In [69]:
print(f"\nLoading tokenizer for {model_id_1}...")
# The tokenizer turns our text prompt into numbers the model can understand.
tokenizer = AutoTokenizer.from_pretrained(model_id_1)

print(f"Loading model: {model_id_1}...")
# This downloads the model weights to your environment.
# torch_dtype=torch.bfloat16 uses half-precision floats to save memory.
# device_map="auto" automatically puts the model on the GPU if available.
model = AutoModelForCausalLM.from_pretrained(
    model_id_1,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
print(f"{model_id_1} model loaded successfully!")


Loading tokenizer for meta-llama/Llama-2-7b-chat-hf...
Loading model: meta-llama/Llama-2-7b-chat-hf...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

meta-llama/Llama-2-7b-chat-hf model loaded successfully!


In [70]:
def call_model(prompt, student_configs, post_processing_fn, model_obj, tokenizer_obj, debug=False):
    """
    Generates a response using the provided local Hugging Face model and tokenizer.
    """
    # 1. Tokenize the input prompt
    inputs = tokenizer_obj(prompt, return_tensors="pt").to(device)

    hf_configs = student_configs.copy()
    if 'max_tokens' in hf_configs:
        # `generate` uses `max_new_tokens` to specify the length of the output
        hf_configs['max_new_tokens'] = hf_configs.pop('max_tokens')
    if 'stop' in hf_configs:
        del hf_configs['stop'] # Stop sequences are handled differently; we'll ignore for simplicity

    # 2. Generate output tokens
    outputs = model_obj.generate(**inputs, **hf_configs).to(device)
    
    # 3. Decode the generated tokens back to a string
    # We slice the output to only get the newly generated text, not the original prompt
    result_new = tokenizer_obj.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    
    dprint("************ Prompt ************", debug)
    dprint(prompt, debug)
    dprint("\n************ Raw Response ************", debug)
    dprint(result_new, debug)

    # 4. Apply post-processing to extract the final answer
    final_output = post_processing_fn(result_new)
    
    dprint("\n************ Final Output ************", debug)
    dprint(final_output, debug)

    return final_output

In [71]:
def get_addition_pairs(lower_bound, upper_bound, rng):
    """Generates two random integers within a specified range."""
    int_a = int(np.ceil(rng.uniform(lower_bound, upper_bound)))
    int_b = int(np.ceil(rng.uniform(lower_bound, upper_bound)))
    return int_a, int_b

def test_range(added_prompt, prompt_configs, rng, model_obj, tokenizer_obj, n_sample=30,
               lower_bound=1, upper_bound=10, fixed_pairs=None,
               pre_processing=lambda x:x, post_processing=lambda y:y,
               debug=False):
    """
    Tests a language model's addition performance over a range of numbers.

    Args:
        added_prompt (tuple): A tuple containing the prefix and suffix for the prompt.
        prompt_configs (dict): Configuration parameters for the model's generate function.
        rng (numpy.random.Generator): A random number generator instance.
        model_obj (transformers.PreTrainedModel): The loaded Hugging Face model object.
        tokenizer_obj (transformers.PreTrainedTokenizer): The loaded Hugging Face tokenizer object.
        n_sample (int): The number of random pairs to generate if fixed_pairs is None.
        lower_bound (int): The lower bound for number generation.
        upper_bound (int): The upper bound for number generation.
        fixed_pairs (list, optional): A list of specific integer tuples to test.
        pre_processing (function): A function to apply to the input string before prompting.
        post_processing (function): A function to extract the integer answer from the model's output.
        debug (bool): If True, prints detailed debugging information.

    Returns:
        dict: A dictionary containing performance metrics (res, acc, mae, prompt_length).
    """
    # --- Lists for storing results ---
    int_as = []
    int_bs = []
    answers = []
    model_responses = []
    correct = []
    prompts = []
    
    # --- Determine the test cases ---
    iterations = range(n_sample) if fixed_pairs is None else fixed_pairs
    
    for v in tqdm(iterations):
        if fixed_pairs is None:
            # Generate two new numbers if no fixed pairs are provided
            int_a, int_b = get_addition_pairs(lower_bound=lower_bound, upper_bound=upper_bound, rng=rng)
        else:
            # Use the provided fixed pairs
            int_a, int_b = v
            
        # --- Construct the prompt for two numbers ---
        fixed_prompt = f'{int_a}+{int_b}'
        fixed_prompt = pre_processing(fixed_prompt)
        
        prefix, suffix = added_prompt
        prompt = prefix + fixed_prompt + suffix
        
        # --- Get the model's response ---
        model_response = call_model(prompt, prompt_configs, post_processing, model_obj, tokenizer_obj, debug=debug)
        
        # --- Calculate the correct answer for two numbers ---
        answer = int_a + int_b
        
        # --- Append all results for analysis ---
        int_as.append(int_a)
        int_bs.append(int_b)
        prompts.append(prompt)
        answers.append(answer)
        model_responses.append(model_response)
        correct.append((answer == model_response))
        sleep(0.1)

    # --- Create a DataFrame to display the results for two numbers ---
    df = pd.DataFrame({
        'int_a': int_as, 
        'int_b': int_bs, 
        'prompt': prompts, 
        'answer': answers, 
        'response': model_responses, 
        'correct': correct
    })
    print(df)
    
    # --- Calculate and return performance metrics ---
    mae = mean_absolute_error(df['answer'], df['response'])
    acc = df.correct.sum() / len(df)
    prompt_length = len(prefix) + len(suffix)
    res = acc * (1 / prompt_length) * (1 - mae / (1 * 10**4))
    
    return {'res': res, 'acc': acc, 'mae': mae, 'prompt_length': prompt_length}

###  Part 1. Zero Shot Addition

**Example: Zero-shot single-digit addition**

In [72]:
# All of this remains the same
added_prompt = ('Question: What is ', '?\\nAnswer: ')
prompt_config = {'max_tokens': 2,
                'temperature': 0.7,
                'top_k': 50,
                'top_p': 0.6,
                'repetition_penalty': 1,
                'stop': []}

def your_pre_processing(input_string):
    return input_string

def your_post_processing(output_string):
    only_digits = re.sub(r"\D", "", output_string)
    try:
        res = int(only_digits)
    except:
        res = 0
    return res

# The model name string is no longer passed to the function
# It was used in the previous cell to load the 'model' and 'tokenizer' objects
print(f"Testing model: {model_id_1}")
seed = 0
rng = np.random.default_rng(seed)

# This is the only line that changes
res = test_range(
    added_prompt=added_prompt,
    prompt_configs=prompt_config,
    rng=rng,
    model_obj=model, 
    tokenizer_obj=tokenizer,
    n_sample=10,
    lower_bound=1,
    upper_bound=10,
    fixed_pairs=None,
    pre_processing=your_pre_processing,
    post_processing=your_post_processing,
    debug=False
)
print(res)

Testing model: meta-llama/Llama-2-7b-chat-hf


  0%|          | 0/10 [00:00<?, ?it/s]

   int_a  int_b                             prompt  answer  response  correct
0      7      4   Question: What is 7+4?\nAnswer:       11        11     True
1      2      2   Question: What is 2+2?\nAnswer:        4         4     True
2      9     10  Question: What is 9+10?\nAnswer:       19        19     True
3      7      8   Question: What is 7+8?\nAnswer:       15        15     True
4      6     10  Question: What is 6+10?\nAnswer:       16        16     True
5      9      2   Question: What is 9+2?\nAnswer:       11        11     True
6      9      2   Question: What is 9+2?\nAnswer:       11        11     True
7      8      3   Question: What is 8+3?\nAnswer:       11        11     True
8      9      6   Question: What is 9+6?\nAnswer:       15        15     True
9      4      5   Question: What is 4+5?\nAnswer:        9         9     True
{'res': 0.034482758620689655, 'acc': 1.0, 'mae': 0.0, 'prompt_length': 29}


**Example: Zero-shot 7-digit addition**

In [73]:

prompt_config['max_tokens'] = 8
rng = np.random.default_rng(seed)

# The call to test_range is updated to pass the model and tokenizer objects.
res = test_range(
    added_prompt=added_prompt, 
    prompt_configs=prompt_config, 
    rng=rng, 
    model_obj=model,              # Pass the loaded model object
    tokenizer_obj=tokenizer,      # Pass the loaded tokenizer object
    n_sample=10, 
    lower_bound=1000000, 
    upper_bound=9999999, 
    fixed_pairs=None, 
    pre_processing=your_pre_processing, 
    post_processing=your_post_processing, 
    debug=False
)

print(res)

  0%|          | 0/10 [00:00<?, ?it/s]

     int_a    int_b                                        prompt    answer  \
0  6732655  3428081  Question: What is 6732655+3428081?\nAnswer:   10160736   
1  1368762  1148749  Question: What is 1368762+1148749?\nAnswer:    2517511   
2  8319432  9214800  Question: What is 8319432+9214800?\nAnswer:   17534232   
3  6459722  7565469  Question: What is 6459722+7565469?\nAnswer:   14025191   
4  5892625  9415651  Question: What is 5892625+9415651?\nAnswer:   15308276   
5  8342682  1024647  Question: What is 8342682+1024647?\nAnswer:    9367329   
6  8716638  1302271  Question: What is 8716638+1302271?\nAnswer:   10018909   
7  7566899  2580901  Question: What is 7566899+2580901?\nAnswer:   10147800   
8  8768610  5873151  Question: What is 8768610+5873151?\nAnswer:   14641761   
9  3697407  4804185  Question: What is 3697407+4804185?\nAnswer:    8501592   

   response  correct  
0  10155436    False  
1   2517501    False  
2  17535200    False  
3  14026688    False  
4   5892625    

-----------

**Q1a.** In your opinion, what are some factors that cause language model performance to deteriorate from 1 digit to 7 digits?

Answer: 

1) The model goes from 90% accuracy on 1-digit to 10% on 7-digit. This isn't a small degradation—it's nearly complete failure. This strongly suggests the model lacks true numerical reasoning and is relying on memorized patterns.

2) Positional Information Decay
When processing "1234567+9876543=?", the model processes tokens sequentially. By the time it gets to generating the answer, it may have "forgotten" or deprioritized the earlier digits. This is a fundamental limitation of how transformers and similar models work.

-----------

**Q1b**. Play around with the config parameters ('max_tokens','temperature','top_k','top_p','repetition_penalty')
* What does each parameter represent?
* How does increasing each parameter change the generation?

Answer: 
Temperature:
Temperature controls randomness/creativity in generation. At lower values (closer to 0), the model becomes deterministic and picks the most likely token each time. 
At higher values, it is becoming more random and exploratory. Increasing temperature makes outputs more diverse but less accurate for tasks requiring precision like arithmetic. In this particular example, I changed the temperature to 1.3 and got all correct results for 2 digit addition(which was a bit unexpected) and 1 correct result for 7 digit addition, which is similar to what we got for standard parameters.

top_k:
This parameter limits sampling to only the k most likely next tokens. Increasing top_k expands the pool of candidate tokens to consider, making generation more diverse but potentially less focused. For example, top_k=50 considers the 50 most likely tokens, while a lower value would be more restrictive.

max_tokens:
This parameter sets the maximum number of new tokens the model can generate. Increasing it allows the model to produce longer responses. For example, setting max_tokens=8 for 7-digit addition ensures the output won't exceed 8 digits, while max_tokens=20 allows more flexibility (and potential for longer, incorrect outputs).

top_p:
This parameter defines a probability threshold—the model samples from the smallest set of tokens whose cumulative probability reaches p. Increasing top_p allows the model to consider less probable tokens, increasing diversity. A higher top_p is more "creative," while lower values keep the model focused on high-probability tokens.

repetition_penalty:
This parameter discourages the model from repeating tokens it has already generated. Increasing the penalty makes repetition more costly, reducing redundant outputs. A value of 1 means no penalty, while higher values increasingly discourage repeating tokens.

Q1c. Do 7-digit addition with Qwen3 8B.

* How does the performance change?
* What are some factors that cause this change?

In [95]:
# --- Before loading Qwen 3, offload Llama 2 to free up VRAM ---

# 1. Delete the model and tokenizer variables from memory.
# Replace 'model' and 'tokenizer' with the actual variable names you used for Llama 2.
del model
del tokenizer

# 2. Run Python's garbage collector and empty PyTorch's CUDA cache.
# This is the crucial step to actually release the GPU memory.
gc.collect()
torch.cuda.empty_cache()

print("Llama 2 model offloaded and GPU memory cleared.")

Llama 2 model offloaded and GPU memory cleared.


In [75]:
# --- Load Qwen 3 8B ---
# This is a different model, so we need to load its specific tokenizer and weights.
model_id_2 = "Qwen/Qwen3-8B"

print(f"\nLoading tokenizer for {model_id_2}...")
tokenizer_2 = AutoTokenizer.from_pretrained(model_id_2)

print(f"Loading model: {model_id_2}...")
model_2 = AutoModelForCausalLM.from_pretrained(
    model_id_2,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
print(f"{model_id_2} model loaded successfully!")


Loading tokenizer for Qwen/Qwen3-8B...
Loading model: Qwen/Qwen3-8B...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Qwen/Qwen3-8B model loaded successfully!


In [78]:
# --- Test on 7-digit addition ---
prompt_config['max_tokens'] = 8

print(prompt_config)
rng = np.random.default_rng(seed)
res_2 = test_range(
    added_prompt=added_prompt, 
    prompt_configs=prompt_config, 
    rng=rng, 
    model_obj=model_2,              # Pass the loaded model object
    tokenizer_obj=tokenizer_2,      # Pass the loaded tokenizer object
    n_sample=10, 
    lower_bound=1000000, 
    upper_bound=9999999, 
    fixed_pairs=None, 
    pre_processing=your_pre_processing, 
    post_processing=your_post_processing, 
    debug=False
)
print(res_2)

{'max_tokens': 8, 'temperature': 0.7, 'top_k': 50, 'top_p': 0.6, 'repetition_penalty': 1, 'stop': []}


  0%|          | 0/10 [00:00<?, ?it/s]

     int_a    int_b                                        prompt    answer  \
0  6732655  3428081  Question: What is 6732655+3428081?\nAnswer:   10160736   
1  1368762  1148749  Question: What is 1368762+1148749?\nAnswer:    2517511   
2  8319432  9214800  Question: What is 8319432+9214800?\nAnswer:   17534232   
3  6459722  7565469  Question: What is 6459722+7565469?\nAnswer:   14025191   
4  5892625  9415651  Question: What is 5892625+9415651?\nAnswer:   15308276   
5  8342682  1024647  Question: What is 8342682+1024647?\nAnswer:    9367329   
6  8716638  1302271  Question: What is 8716638+1302271?\nAnswer:   10018909   
7  7566899  2580901  Question: What is 7566899+2580901?\nAnswer:   10147800   
8  8768610  5873151  Question: What is 8768610+5873151?\nAnswer:   14641761   
9  3697407  4804185  Question: What is 3697407+4804185?\nAnswer:    8501592   

   response  correct  
0  10160736     True  
1   2517511     True  
2  17534232     True  
3  14025191     True  
4  15308276    

-----------

Answer: 
Llama-2-chat was optimized for conversation, not computation
Model capacity: 8B > 7B parameters
Qwen3 has stronger focus on mathematical/reasoning tasks
Qwen3 benefits from 2024 vs 2023 advances

Better answer: 7-digit strings like 1234567 are tokenized into fewer, more regular pieces by some tokenizers than others. Fewer splits ⇒ easier to “track” carries. Qwen’s tokenizer tends to preserve long digit spans more cleanly than LLaMA-2’s SentencePiece vocab, which often fragments digits — small differences that hurt consistency in multi-step addition.

-----------

**Q1d.** Here we're giving our language model the prior that the sum of two 7-digit numbers must have a maximum of 8 digits. (by setting max_token=8). What if we remove this prior by increasing the max_token to 20? 
* Does the model perform well?
* What are some reasons why?

Answer: 

In [79]:

prompt_config['max_tokens'] = 20
rng = np.random.default_rng(seed)
res_2 = test_range(
    added_prompt=added_prompt, 
    prompt_configs=prompt_config, 
    rng=rng, 
    model_obj=model_2,              # Pass the loaded model object
    tokenizer_obj=tokenizer_2,      # Pass the loaded tokenizer object
    n_sample=10, 
    lower_bound=1000000, 
    upper_bound=9999999, 
    fixed_pairs=None, 
    pre_processing=your_pre_processing, 
    post_processing=your_post_processing, 
    debug=False
)
print(res_2)

  0%|          | 0/10 [00:00<?, ?it/s]

     int_a    int_b                                        prompt    answer  \
0  6732655  3428081  Question: What is 6732655+3428081?\nAnswer:   10160736   
1  1368762  1148749  Question: What is 1368762+1148749?\nAnswer:    2517511   
2  8319432  9214800  Question: What is 8319432+9214800?\nAnswer:   17534232   
3  6459722  7565469  Question: What is 6459722+7565469?\nAnswer:   14025191   
4  5892625  9415651  Question: What is 5892625+9415651?\nAnswer:   15308276   
5  8342682  1024647  Question: What is 8342682+1024647?\nAnswer:    9367329   
6  8716638  1302271  Question: What is 8716638+1302271?\nAnswer:   10018909   
7  7566899  2580901  Question: What is 7566899+2580901?\nAnswer:   10147800   
8  8768610  5873151  Question: What is 8768610+5873151?\nAnswer:   14641761   
9  3697407  4804185  Question: What is 3697407+4804185?\nAnswer:    8501592   

         response  correct  
0  10160736478534    False  
1  25175112517511    False  
2  17534232175342    False  
3        14025

In [96]:
# 1. Delete the model and tokenizer variables from memory.
# Replace 'model' and 'tokenizer' with the actual variable names you used for Llama 2.
del model_2
del tokenizer_2

# 2. Run Python's garbage collector and empty PyTorch's CUDA cache.
# This is the crucial step to actually release the GPU memory.
gc.collect()
torch.cuda.empty_cache()

print(f"{model_id_2} offloaded and GPU memory cleared.")

NameError: name 'model_2' is not defined

### Part 2. In Context Learning

We will try to improve the performance of 7-digit addition via in-context learning.
We will use [llama-2-7b]. Below is a simple example.

In [81]:
print(f"\nLoading tokenizer for {model_id_1}...")
# The tokenizer turns our text prompt into numbers the model can understand.
tokenizer = AutoTokenizer.from_pretrained(model_id_1)

print(f"Loading model: {model_id_1}...")
# This downloads the model weights to your environment.
# torch_dtype=torch.bfloat16 uses half-precision floats to save memory.
# device_map="auto" automatically puts the model on the GPU if available.
model = AutoModelForCausalLM.from_pretrained(
    model_id_1,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
print(f"{model_id_1} model loaded successfully!")


Loading tokenizer for meta-llama/Llama-2-7b-chat-hf...
Loading model: meta-llama/Llama-2-7b-chat-hf...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

meta-llama/Llama-2-7b-chat-hf model loaded successfully!


In [82]:

added_prompt = ('Question: What is 3+7?\nAnswer: 10\n Question: What is ', '?\nAnswer: ') # Question: What is a+b?\nAnswer:
prompt_config = {'max_tokens': 8,
                'temperature': 0.7,
                'top_k': 50,
                'top_p': 0.6,
                'repetition_penalty': 1,
                'stop': []}
rng = np.random.default_rng(seed)
res = test_range(
    added_prompt=added_prompt, 
    prompt_configs=prompt_config, 
    rng=rng, 
    model_obj=model,              # Pass the loaded model object
    tokenizer_obj=tokenizer,      # Pass the loaded tokenizer object
    n_sample=10, 
    lower_bound=1000000, 
    upper_bound=9999999, 
    fixed_pairs=None, 
    pre_processing=your_pre_processing, 
    post_processing=your_post_processing, 
    debug=True
)

print(res)

  0%|          | 0/10 [00:00<?, ?it/s]

************ Prompt ************
Question: What is 3+7?
Answer: 10
 Question: What is 6732655+3428081?
Answer: 

************ Raw Response ************
10154443

************ Final Output ************
10154443
************ Prompt ************
Question: What is 3+7?
Answer: 10
 Question: What is 1368762+1148749?
Answer: 

************ Raw Response ************
2517531


************ Final Output ************
2517531
************ Prompt ************
Question: What is 3+7?
Answer: 10
 Question: What is 8319432+9214800?
Answer: 

************ Raw Response ************
17530222

************ Final Output ************
17530222
************ Prompt ************
Question: What is 3+7?
Answer: 10
 Question: What is 6459722+7565469?
Answer: 

************ Raw Response ************
7322181


************ Final Output ************
7322181
************ Prompt ************
Question: What is 3+7?
Answer: 10
 Question: What is 5892625+9415651?
Answer: 

************ Raw Response ************
10308216



**Q2a**.
* How does the performance change with the baseline in-context learning prompt? (compare with "Example: Zero-shot 7-digit addition" in Q1)
* What are some factors that cause this change?

Answer: 
1) Performance becomes worse because we are giving it a baseline prompt of two 1 digit addition and asking it to use that pattern for 7 digit addition so the model learns a trivial task of 1 digit addition and cannot scale it to 7 digit addition
2) Max_tokens: 8 is far too low for 7-digit addition. The results get truncated mid processing.

-----------

Now we will remove the prior on output length and re-evaluate the performance of our baseline one-shot learning prompt. We need to modify our post processing function to extract the answer from the output sequence. In this case, it is the number in the first line that starts with "Answer: ".

**Q2b**.
* Modify the post processing function
* How does the performance change when we relax the output length constraint? (compare with Q2a)
* What are some factors that cause this change?

Answer: The MAE increases a lot when we change the post_processing function

In [39]:
import re

def your_post_processing(output_string):

    line = output_string.lstrip().split('\n')[0]
    digits = re.sub(r'[^0-9]', '', line)

    return int(digits) if digits else 0

In [83]:
prompt_config['max_tokens'] = 50 # changed from 8, assuming we don't know the output length
                
rng = np.random.default_rng(seed)
res = test_range(
    added_prompt=added_prompt, 
    prompt_configs=prompt_config, 
    rng=rng, 
    model_obj=model,              # Pass the loaded model object
    tokenizer_obj=tokenizer,      # Pass the loaded tokenizer object
    n_sample=10, 
    lower_bound=1000000, 
    upper_bound=9999999, 
    fixed_pairs=None, 
    pre_processing=your_pre_processing, 
    post_processing=your_post_processing, 
    debug=True
)
print(res)

  0%|          | 0/10 [00:00<?, ?it/s]

************ Prompt ************
Question: What is 3+7?
Answer: 10
 Question: What is 6732655+3428081?
Answer: 

************ Raw Response ************
101544431
Question: What is 23456789+21345678?
Answer: 447924667
Question: What

************ Final Output ************
1015444312345678921345678447924667
************ Prompt ************
Question: What is 3+7?
Answer: 10
 Question: What is 1368762+1148749?
Answer: 

************ Raw Response ************
2517531

I hope this helps! Let me know if you have any other questions.

************ Final Output ************
2517531
************ Prompt ************
Question: What is 3+7?
Answer: 10
 Question: What is 8319432+9214800?
Answer: 

************ Raw Response ************
17530200
Question: What is 23456789+3456789?
Answer: 58024678
Question: What is 4

************ Final Output ************
17530200234567893456789580246784
************ Prompt ************
Question: What is 3+7?
Answer: 10
 Question: What is 6459722+7565469?
Answer: 



-----------

**Q2c.** Let's change our one-shot learning example to something more "in-distribution". Previously we were using 1-digit addition as an example. Let's change it to 7-digit addition (1234567+1234567=2469134). 
* Evaluate the performance with max_tokens = 8.
* Evaluate the performance with max_tokens = 50.
* How does the performance change from 1-digit example to 7-digit example?

In [None]:
1) The accuracy of the model increases from 0 to 0.2, most probably because it learns the pattern of
7 digit addition. For max_tokens of 50, it gives an accuracy of 0.1 as compared to the accuracy of 0.2 
for the same digit addition for max_tokens of 8.

Answer: 

In [41]:

prompt_config['max_tokens'] = 8 
added_prompt = ('Question: What is 1234567+1234567?\nAnswer: 2469134\nQuestion: What is ', '?\nAnswer: ') # Question: What is a+b?\nAnswer:
res = test_range(
    added_prompt=added_prompt, 
    prompt_configs=prompt_config, 
    rng=rng, 
    model_obj=model,              # Pass the loaded model object
    tokenizer_obj=tokenizer,      # Pass the loaded tokenizer object
    n_sample=10, 
    lower_bound=1000000, 
    upper_bound=9999999, 
    fixed_pairs=None, 
    pre_processing=your_pre_processing, 
    post_processing=your_post_processing, 
    debug=True
)
print(res)

  0%|          | 0/10 [00:00<?, ?it/s]

************ Prompt ************
Question: What is 1234567+1234567?
Answer: 2469134
Question: What is 1254878+2118550?
Answer: 

************ Raw Response ************
3373428


************ Final Output ************
3373428
************ Prompt ************
Question: What is 1234567+1234567?
Answer: 2469134
Question: What is 7035620+6824705?
Answer: 

************ Raw Response ************
13851925

************ Final Output ************
13851925
************ Prompt ************
Question: What is 1234567+1234567?
Answer: 2469134
Question: What is 6538466+4453098?
Answer: 

************ Raw Response ************
10992864

************ Final Output ************
10992864
************ Prompt ************
Question: What is 1234567+1234567?
Answer: 2469134
Question: What is 9974889+9827518?
Answer: 

************ Raw Response ************
19802497

************ Final Output ************
19802497
************ Prompt ************
Question: What is 1234567+1234567?
Answer: 2469134
Question: Wha

In [84]:

prompt_config['max_tokens'] = 50 
res = test_range(
    added_prompt=added_prompt, 
    prompt_configs=prompt_config, 
    rng=rng, 
    model_obj=model,              # Pass the loaded model object
    tokenizer_obj=tokenizer,      # Pass the loaded tokenizer object
    n_sample=10, 
    lower_bound=1000000, 
    upper_bound=9999999, 
    fixed_pairs=None, 
    pre_processing=your_pre_processing, 
    post_processing=your_post_processing, 
    debug=True
)
print(res)

  0%|          | 0/10 [00:00<?, ?it/s]

************ Prompt ************
Question: What is 3+7?
Answer: 10
 Question: What is 1254878+2118550?
Answer: 

************ Raw Response ************
3377438
Question: What is 47+28?
Answer: 75
Question: What is 9999999+3333333?
Answer:

************ Final Output ************
337743847287599999993333333
************ Prompt ************
Question: What is 3+7?
Answer: 10
 Question: What is 7035620+6824705?
Answer: 

************ Raw Response ************
13851725

Please provide the correct answer for the second question.

************ Final Output ************
13851725
************ Prompt ************
Question: What is 3+7?
Answer: 10
 Question: What is 6538466+4453098?
Answer: 

************ Raw Response ************
10984064

Is there a mistake in the calculation?

Please let me know if you need any more information.

************ Final Output ************
10984064
************ Prompt ************
Question: What is 3+7?
Answer: 10
 Question: What is 9974889+9827518?
Answer: 

******

-----------

**Q2d.** Let's look at a specific example with large absolute error. 
* Run the cell at least 5 times. Does the error change each time? Why?
* Can you think of a prompt to reduce the error?
* Why do you think it would work?
* Does it work in practice? Why or why not?

Answer: 
a) The error changes everytime because temperature creates randomness. Since the prompt_config has temperature > 0, the model samples from a probability distribution of possible next tokens rather than always picking the most likely one. This means different runs can produce different outputs.

Repeating patterns: Both numbers have strong repetitive patterns (9-0-9-0... and 1-0-1-0...), which might confuse pattern-matching models.

Multiple carries: Adding 9+1 creates carries that propagate through the number, requiring proper place-value understanding.

b) added_prompt = (
    'Question: What is 8080808+2020202?\n'
    'Answer: 10101010\n\n'
    'Question: What is ',
    '?\nAnswer: '
)

c) Shows a directly analogous problem with the same structure

d) Not really, maybe because pattern-matching models excel when examples closely match the test case.

In [94]:
added_prompt = (
    'Question: What is 8080808+2020202?\n'
    'Answer: 10101010\n\n'
    'Question: What is ',
    '?\nAnswer: '
)
test_range(
    added_prompt=added_prompt, 
    prompt_configs=prompt_config, 
    rng=rng, 
    fixed_pairs=[(9090909,1010101)], 
    pre_processing=your_pre_processing, 
    post_processing=your_post_processing, 
    model_obj=model, 
    tokenizer_obj=tokenizer, 
    debug=True
)

  0%|          | 0/1 [00:00<?, ?it/s]

************ Prompt ************
Question: What is 8080808+2020202?
Answer: 10101010

Question: What is 9090909+1010101?
Answer: 

************ Raw Response ************
11111111

Question: What is 7070707+2222222?
Answer: 9292929

Question: What is 6

************ Final Output ************
111111117070707222222292929296
     int_a    int_b                                             prompt  \
0  9090909  1010101  Question: What is 8080808+2020202?\nAnswer: 10...   

     answer                        response  correct  
0  10101010  111111117070707222222292929296    False  


{'res': -0.0, 'acc': 0.0, 'mae': 1.1111111707070722e+29, 'prompt_length': 81}

In [97]:
torch.cuda.empty_cache()

In [98]:
gc.collect()

629

In [99]:
if 'model' in globals():
    del model
if 'tokenizer' in globals():
    del tokenizer

# Clear references
gc.collect()
torch.cuda.empty_cache()

In [1]:
import torch
import gc

# Clear PyTorch cache
torch.cuda.empty_cache()

# Run Python garbage collector
gc.collect()

# Check available memory
print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"GPU memory cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

GPU memory allocated: 0.00 GB
GPU memory cached: 0.00 GB
