In [1]:
# Shell escape: print the version of the `python` executable the shell resolves.
!python -V

Python 3.11.9


In [2]:
## Uncomment the following lines to install the required packages
# !python.exe -m pip install --upgrade pip
# !pip install --upgrade jupyter ipywidgets
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# !pip install transformers
# !pip install bitsandbytes
# !pip install accelerate
# !pip install flash-attn

## HuggingFace Setting

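For Windows users, run the following command in Command Prompt, then open a new Command Prompt and restart Jupyter (`setx` only affects sessions started afterwards):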

```
setx HF_TOKEN "your_token_here"
```

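For macOS/Linux users, run the following command in the terminal session that launches Jupyter (add it to your shell profile, e.g. `~/.zshrc`, to make it permanent):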

```
export HF_TOKEN="your_token_here"
```

In [3]:
import os
HF_TOKEN = os.getenv("HF_TOKEN")
HF_TOKEN[:3]+'...'

'hf_...'

## Select Model and Data

In [4]:
# Hugging Face checkpoint to evaluate; keep exactly one `model_name` line active.
# NOTE(review): the trailing numbers look like a per-model benchmark score (%)
# -- confirm which metric and split they refer to.
# model_name = "meta-llama/Meta-Llama-3-8B-Instruct"  # 77.4
# model_name = "meta-llama/Llama-3.1-8B-Instruct"  # 81.2
# model_name = "meta-llama/Llama-3.2-1B-Instruct"  # 35.3
model_name = "meta-llama/Llama-3.2-3B-Instruct"  # 73.2

In [5]:
from src.util.json_io import *

# Pool of examples handed to `nshot_chats` as few-shot demonstrations.
nshot_data = load_jsonlines('data/gsm8k/train.jsonl')

# Dataset/split whose questions get answered below; this string is also
# embedded in the log and contrastive output file names.
# NOTE(review): with 'gsm8k/train' the questions come from the same file as the
# few-shot pool, so a question could show up among its own demonstrations --
# confirm how `nshot_chats` selects its examples.
question_data_path = 'gsm8k/train' # For creating contrastive training data
# question_data_path = 'gsm8k/test' # For final evaluation

# Records are indexed below with the 'question' and 'answer' keys.
question_data = load_jsonlines(f'data/{question_data_path}.jsonl')

## Load Model

In [6]:
import transformers
import torch

# Seed the Python, NumPy and PyTorch RNGs so that generation (and any random
# few-shot selection) is repeatable after Restart & Run All.
# NOTE(review): the sampling options in the pipeline call below are commented
# out, but the checkpoint's own generation config may still enable sampling --
# confirm; if decoding is greedy, seeding is simply a no-op.
SEED = 42
transformers.set_seed(SEED)

# 4-bit NF4 quantisation with double quantisation; matmuls are computed in bfloat16.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantised causal LM; `device_map="auto"` lets the library decide
# where to place the weights.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    token=HF_TOKEN,
)

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name, 
    token=HF_TOKEN, 
)

# Text-generation pipeline shared by every cell below. The EOS token id is
# reused as the padding id, and each reply is capped at 1024 new tokens.
generator = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=1024,
    # do_sample=True,
    # temperature=0.6,
    # top_p=0.9,
)

def get_response(chats):
    """Run the shared text-generation pipeline on a chat and return the reply text.

    Args:
        chats: Chat messages (dicts with 'role' and 'content') forming the prompt.

    Returns:
        The 'content' of the last message in the first returned sequence.
    """
    sequences = generator(chats)
    last_message = sequences[0]['generated_text'][-1]
    return last_message['content']

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Load Data

In [7]:
from src.util.gsm8k_helper import *
# Value passed as `n` to `nshot_chats`, i.e. the "8" in "8-shot prompt".
N_SHOT = 8

# Build the chat-format prompt for the first question and display it for inspection.
messages = nshot_chats(nshot_data=nshot_data, n=N_SHOT, question=question_data[0]['question'])  # 8-shot prompt
messages

[{'role': 'user',
  'content': 'Question: For every 12 cans you recycle, you receive $0.50, and for every 5 kilograms of newspapers, you receive $1.50. If your family collected 144 cans and 20 kilograms of newspapers, how much money would you receive?'},
 {'role': 'assistant',
  'content': 'Answer: There are 144/12 = <<144/12=12>>12 sets of 12 cans that the family collected.\nSo, the family would receive $0.50 x 12 = $<<0.50*12=6>>6 for the cans.\nThere are 20/5 = <<20/5=4>>4 sets of 5 kilograms of newspapers that the family collected.\nSo, the family would receive $1.50 x 4 = $<<1.50*4=6>>6 for the newspapers.\nTherefore, the family would receive a total of $6 + $6 = $<<6+6=12>>12.\n#### 12'},
 {'role': 'user',
  'content': 'Question: Betty picked 16 strawberries. Matthew picked 20 more strawberries than Betty and twice as many as Natalie. They used their strawberries to make jam. One jar of jam used 7 strawberries and they sold each jar at $4. How much money were they able to make fr

In [8]:
# Smoke test: generate a reply for the prompt built above and show the raw text.
response = get_response(messages)
print(response)

# Reduce the reply to its final answer.
# NOTE(review): `extract_ans_from_response` comes from a star import -- presumably
# it parses the value after '####'; confirm against the helper module.
pred_ans = extract_ans_from_response(response)
pred_ans

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Answer: In April, Natalia sold 48 clips.
In May, she sold half as many, which is 48/2 = 24 clips.
To find the total number of clips sold, add the clips sold in April and May: 48 + 24 = 72
#### 72


72

In [9]:
# Reference solution for the same question, printed for comparison with the reply above.
ground_truth = question_data[0]['answer']
print(ground_truth)

# The reference goes through the same extractor as the model reply so the two
# answers can be compared directly.
true_ans = extract_ans_from_response(ground_truth)
true_ans

Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72


72

In [10]:
# Repeat the smoke test on the second question, this time passing an empty `prompt`.
# NOTE(review): what `prompt=""` changes depends on `nshot_chats` -- presumably it
# drops an instruction text that is otherwise added; confirm.
messages = nshot_chats(nshot_data=nshot_data, n=N_SHOT, question=question_data[1]['question'], prompt="")  # 8-shot prompt
print(messages)

response = get_response(messages)
print(response)

# Final answer extracted from the reply; displayed as the cell output.
pred_ans = extract_ans_from_response(response)
pred_ans

[{'role': 'user', 'content': 'Question: For every 12 cans you recycle, you receive $0.50, and for every 5 kilograms of newspapers, you receive $1.50. If your family collected 144 cans and 20 kilograms of newspapers, how much money would you receive?'}, {'role': 'assistant', 'content': 'Answer: There are 144/12 = <<144/12=12>>12 sets of 12 cans that the family collected.\nSo, the family would receive $0.50 x 12 = $<<0.50*12=6>>6 for the cans.\nThere are 20/5 = <<20/5=4>>4 sets of 5 kilograms of newspapers that the family collected.\nSo, the family would receive $1.50 x 4 = $<<1.50*4=6>>6 for the newspapers.\nTherefore, the family would receive a total of $6 + $6 = $<<6+6=12>>12.\n#### 12'}, {'role': 'user', 'content': 'Question: Betty picked 16 strawberries. Matthew picked 20 more strawberries than Betty and twice as many as Natalie. They used their strawberries to make jam. One jar of jam used 7 strawberries and they sold each jar at $4. How much money were they able to make from the s

10

In [11]:
import os
import re


def _safe_name(text):
    """Make ``text`` usable as a single file-name component.

    Characters that are invalid in Windows file names (backslash, slash, colon,
    asterisk, question mark, double quote, angle brackets, pipe) are replaced
    with ``_`` and surrounding whitespace is stripped. The previous pattern
    escaped the slash but never matched a backslash.
    """
    return re.sub(r'[\\/:*?"<>|]', '_', text).strip()


# Run identifier "-<model>-<dataset>", shared by both output files.
file_suffix = f'-{_safe_name(model_name)}-{_safe_name(question_data_path)}'

# Human-readable log of wrong answers.
log_path = 'log/errors' + file_suffix + '.txt'
# JSON-lines file of (question, correct answer, incorrect answer) records.
# `question_data_path` is used unsanitised here on purpose: its '/' places the
# file in a per-dataset sub-directory.
contrastive_path = f'data/our_contrastive/{question_data_path}' + file_suffix + '.jsonl'

# Create the output directories; `exist_ok=True` replaces the check-then-create
# pattern and makes the cell safe to re-run.
os.makedirs(os.path.dirname(log_path), exist_ok=True)
os.makedirs(os.path.dirname(contrastive_path), exist_ok=True)

# The evaluation loop opens both files in append mode, so re-running it adds to
# existing content. Uncomment the lines below to start from empty files.
# with open(log_path, 'w') as log_file:
#     log_file.write('')
# with open(contrastive_path, 'w') as contrastive_file:
#     contrastive_file.write('')

In [12]:
# `json` is imported explicitly rather than relying on a star import to leak it.
import json

from tqdm import tqdm

# Running tallies; kept as notebook-level names so they can be inspected afterwards.
total = correct = 0

# Edit the slice to evaluate only a subset of the questions.
for qna in tqdm(question_data[:]):

    messages = nshot_chats(nshot_data=nshot_data, n=N_SHOT, question=qna['question'])
    response = get_response(messages)

    pred_ans = extract_ans_from_response(response)
    true_ans = extract_ans_from_response(qna['answer'])

    total += 1
    if pred_ans == true_ans:
        correct += 1
        continue

    # Wrong answer: record it. Both files are opened in append mode and closed
    # again per record, so results written so far survive an interrupted run.

    # To see what happened
    with open(log_path, 'a', encoding='utf-8') as log_file:
        log_file.write(f"[Question]\n{qna['question']}\n\n")
        log_file.write(f"[{model_name}]\n{response}\n\n")
        log_file.write(f"[Ground Truth]\n{qna['answer']}\n\n")
        log_file.write(f"Current Accuracy: {correct/total:.3f}\n\n\n")

    # To create contrastive training data
    with open(contrastive_path, 'a', encoding='utf-8') as contrastive_file:
        json.dump({
            'question': qna['question'],
            'answer_correct': qna['answer'],
            'answer_incorrect': response,
        }, contrastive_file, ensure_ascii=False)
        contrastive_file.write('\n')

# Guard against an empty dataset/slice, which would otherwise divide by zero.
if total:
    print(f"Total Accuracy: {correct/total:.3f}")
else:
    print("Total Accuracy: n/a (no questions evaluated)")

  0%|          | 8/7473 [00:42<11:26:06,  5.51s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 7473/7473 [13:40:19<00:00,  6.59s/it]   

Total Accuracy: 0.828



