In [None]:
import os
import torch
import transformers
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

HF_TOKEN = os.getenv("HF_TOKEN")

# model_name = "meta-llama/Llama-3.2-1B"
# model_name = "meta-llama/Llama-3.2-1B-Instruct"
# model_name = "meta-llama/Llama-3.2-3B"
model_name = "meta-llama/Llama-3.2-3B-Instruct"
# model_name = "google/gemma-2-2b"
# model_name = "google/gemma-2-2b-it"
# model_name = "google/gemma-2-9b"
# model_name = "google/gemma-2-9b-it"

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    output_hidden_states=True,  # Enable hidden states
    token=HF_TOKEN,
)
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name,
    token=HF_TOKEN,
)

print(model, model.config)




VBox(children=(Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s],))

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm

In [2]:
from src.util.json_io import *

train_qnas = load_jsonlines(f'data/gsm8k/train.jsonl')
test_qnas = load_jsonlines(f'data/gsm8k/test.jsonl')
len(train_qnas), len(test_qnas)

(7473, 1319)

In [None]:
import random; rseed = 42; random.seed(rseed)

nshot_prompt = f""
for top_logit_indices in random.sample(range(len(train_qnas)), 8):
    nshot_prompt += f"Question: {train_qnas[top_logit_indices]['question']}\nAnswer: {train_qnas[top_logit_indices]['answer']}\n\n"

print(nshot_prompt)

Question: For every 12 cans you recycle, you receive $0.50, and for every 5 kilograms of newspapers, you receive $1.50. If your family collected 144 cans and 20 kilograms of newspapers, how much money would you receive?
Answer: There are 144/12 = <<144/12=12>>12 sets of 12 cans that the family collected.
So, the family would receive $0.50 x 12 = $<<0.50*12=6>>6 for the cans.
There are 20/5 = <<20/5=4>>4 sets of 5 kilograms of newspapers that the family collected.
So, the family would receive $1.50 x 4 = $<<1.50*4=6>>6 for the newspapers.
Therefore, the family would receive a total of $6 + $6 = $<<6+6=12>>12.
#### 12

Question: Betty picked 16 strawberries. Matthew picked 20 more strawberries than Betty and twice as many as Natalie. They used their strawberries to make jam. One jar of jam used 7 strawberries and they sold each jar at $4. How much money were they able to make from the strawberries they picked?
Answer: Matthew picked 16 + 20 = <<16+20=36>>36 strawberries.
Natalie picked 3

In [16]:
def question_to_prompt(question):
    return f"{nshot_prompt}Question: {question} Let's think step by step.\nAnswer: "

sample_i = 5
print(question_to_prompt(test_qnas[sample_i]['question']))

from src.util.gsm8k_helper import *
print('Answer:', extract_num_from_ans(test_qnas[sample_i]['answer']))
print('Answer in integer:', extract_num_from_ans(test_qnas[sample_i]['answer']))

Question: For every 12 cans you recycle, you receive $0.50, and for every 5 kilograms of newspapers, you receive $1.50. If your family collected 144 cans and 20 kilograms of newspapers, how much money would you receive?
Answer: There are 144/12 = <<144/12=12>>12 sets of 12 cans that the family collected.
So, the family would receive $0.50 x 12 = $<<0.50*12=6>>6 for the cans.
There are 20/5 = <<20/5=4>>4 sets of 5 kilograms of newspapers that the family collected.
So, the family would receive $1.50 x 4 = $<<1.50*4=6>>6 for the newspapers.
Therefore, the family would receive a total of $6 + $6 = $<<6+6=12>>12.
#### 12

Question: Betty picked 16 strawberries. Matthew picked 20 more strawberries than Betty and twice as many as Natalie. They used their strawberries to make jam. One jar of jam used 7 strawberries and they sold each jar at $4. How much money were they able to make from the strawberries they picked?
Answer: Matthew picked 16 + 20 = <<16+20=36>>36 strawberries.
Natalie picked 3

In [None]:
def generate_answer(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_length=input_ids.shape[1] + 512,
            do_sample=True, top_k=1,
            # temperature=1e-4,
            eos_token_id=tokenizer.encode(text='\n\n', add_special_tokens=False)[0],
            pad_token_id=tokenizer.eos_token_id,
            return_dict_in_generate=True, 
            output_logits=True, 
            output_hidden_states=True,
        )

    output_text = tokenizer.decode(outputs.sequences[0])
    generated_answer = output_text.split('Answer: ')[-1].split('\n\n')[0]
    generated_len = len(outputs.logits)

    k = 3
    topk_indices = torch.zeros((generated_len, k), dtype=torch.long)
    topk_logits = torch.zeros((generated_len, k))
    topk_probabilities = torch.zeros((generated_len, k))

    # Iterate over each sequence position to find the top-3 indices and their logits and probabilities
    for seq_idx, logits_tensor in enumerate(outputs.logits): # outputs.logits: (seq_length, batch_size, vocab_size)
        logits = logits_tensor[0]  # score_tensor.shape: (batch_size, vocab_size)
        
        top_logit_values, top_logit_indices = torch.topk(logits, k=3)
        
        topk_indices[seq_idx] = top_logit_indices  # Indices of the top-3 tokens
        topk_logits[seq_idx] = top_logit_values  # Logits of the top-3 tokens
        topk_probabilities[seq_idx] = torch.nn.functional.softmax(logits, dim=-1)[top_logit_indices]  # Probabilities of the top-3 tokens

    return {
        'generated_answer': generated_answer,
        'generated_indices': outputs.sequences[0][input_ids.shape[1]:],
        'generated_tokens': [tokenizer.decode(i) for i in outputs.sequences[0][input_ids.shape[1]:]],
        'generated_token_len': len(outputs.sequences[0][input_ids.shape[1]:]),
        'topk_indices': topk_indices,
        'topk_tokens': [[tokenizer.decode(i) for i in row] for row in topk_indices],
        'topk_logits': topk_logits,
        'topk_probabilities': topk_probabilities,
        'vocab_size': outputs.logits[0].shape[-1],
    }

In [100]:
generate_answer(question_to_prompt(test_qnas[sample_i]))

{'generated_answer': '60% of the price of one glass is 60/100 * 5 = $<<60/100*5=3>>3.\nKylar will buy 8 glasses at the discounted price of $3 each, so he will pay 8 * $3 = $<<8*3=24>>24 for those glasses.\nKylar will buy 8 glasses at the regular price of $5 each, so he will pay 8 * $5 = $<<8*5=40>>40 for those glasses.\nIn total, Kylar will pay $24 + $40 = $<<24+40=64>>64 for the glasses.\n#### 64',
 'generated_indices': tensor([ 1399,     4,   315,   279,  3430,   315,   832,  9168,   374,   220,
          1399,    14,  1041,   353,   220,    20,   284,   400,  2501,  1399,
            14,  1041,     9,    20,    28,    18,  2511,    18,   627,    42,
          4010,   277,   690,  3780,   220,    23, 29247,   520,   279, 48774,
          3430,   315,   400,    18,  1855,    11,   779,   568,   690,  2343,
           220,    23,   353,   400,    18,   284,   400,  2501,    23,     9,
            18,    28,  1187,  2511,  1187,   369,  1884, 29247,   627,    42,
          4010,   277, 

In [101]:
print("* Sample Question:", test_qnas[sample_i]['question'])
print("* Expected Answer:", test_qnas[sample_i]['answer'])

* Sample Question: Kylar went to the store to buy glasses for his new apartment. One glass costs $5, but every second glass costs only 60% of the price. Kylar wants to buy 16 glasses. How much does he need to pay for them?
* Expected Answer: The discount price of one glass is 60/100 * 5 = $<<60/100*5=3>>3.
If every second glass is cheaper, that means Kylar is going to buy 16 / 2 = <<16/2=8>>8 cheaper glasses.
So for the cheaper glasses, Kylar is going to pay 8 * 3 = $<<8*3=24>>24.
And for the regular-priced glasses, Kylar will pay 8 * 5 = $<<8*5=40>>40.
So in total Kylar needs to pay 24 + 40 = $<<24+40=64>>64 for the glasses he wants to buy.
#### 64
