In [1]:
# Record the Python interpreter version used for this run
!python --version

Python 3.10.12


In [None]:
# cache length issue workaround
!pip install transformers==4.52.4

Collecting transformers==4.52.4
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers==4.52.4)
  Downloading huggingface_hub-0.34.2-py3-none-any.whl.metadata (14 kB)
Collecting hf-xet<2.0.0,>=1.1.3 (from huggingface-hub<1.0,>=0.30.0->transformers==4.52.4)
  Downloading hf_xet-1.1.5-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (879 bytes)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m84.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.34.2-py3-none-any.whl (558 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m558.8/558.8 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hf_xet-1.1.5-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m77.6 MB/s

In [None]:
import os

# Model checkpoint and pipeline task
MODEL_NAME = 'google/gemma-3-1b-it'
TASK = 'text-generation'

# ITER_START = 1
# ITER_TRIALS = 1

# FILEDIR is used as a path prefix for the parquet files written by this notebook;
# DATASET_FILEPATH is the parquet file loaded as the input dataset.
FILEDIR = "/kaggle/working/tqa/gemma3-1B/iter_0/"
DATASET_FILEPATH = "/kaggle/input/truthfulqa-processed/truthfulqa_with_few_shot_gpt2_s2.parquet"

# Prefer a token that is already present in the environment so the secret never
# has to be pasted into the notebook; the literal placeholder stays as fallback.
HF_TOKEN = os.environ.get("HF_TOKEN") or "HF_TOKEN" # input huggingface token

# Load Dependencies

In [4]:
from langchain_core.prompts import PromptTemplate
from tqdm import tqdm  # For progress bars

In [5]:
# !pip install -qU transformers #undo: for mixed cache workaround
!pip install -qU accelerate
!pip install -qU bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m367.1/367.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed, pipeline, BitsAndBytesConfig
from transformers.pipelines.pt_utils import KeyDataset
import torch
import numpy as np
import os
import math
from datasets import concatenate_datasets, Dataset
from tqdm import tqdm
import time

In [7]:
# Seed the random number generators, then set the PyTorch CUDA allocator option
set_seed(1234)
os.environ.update(PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True')

In [None]:
import os

# Publish the configured token through the HF_TOKEN environment variable
os.environ.update(HF_TOKEN=HF_TOKEN)

In [9]:
# Log in to the Hugging Face Hub from the CLI using the configured token
!huggingface-cli login --token $HF_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `hf`CLI if you want to set the git credential as well.
Token is valid (permission: write).
The token `Auth` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# Prep Functions

In [10]:
# TruthfulQA reference answers that are treated as "Cannot be determined":
# an example whose correct answers contain one of these gets that label.
statements_CBD = [
    'I have no comment',
    'The question is ambiguous',
]

In [11]:
def query_model(data, column_to_prompt, num_new_tokens = 200, batch_size=5):
    """Generate one completion per conversation stored in ``data[column_to_prompt]``.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        data: Object indexable by column name (e.g. a dataset) whose
            ``column_to_prompt`` entry is a sequence of conversations accepted
            by ``tokenizer.apply_chat_template``.
        column_to_prompt: Name of the column holding the conversations.
        num_new_tokens: Forwarded to ``model.generate`` as ``max_new_tokens``.
        batch_size: Number of conversations sent to the model at once; must be
            a positive integer.

    Returns:
        list[str]: Decoded completions (prompt tokens removed, special tokens
        skipped) in the same order as the input column.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print(f"...processing in batches of {batch_size}")
    responses = []

    # Look the column up once instead of repeating the lookup for every batch.
    conversations = data[column_to_prompt]

    for i in range(0, len(conversations), batch_size):
        batch = conversations[i:i+batch_size]
        # print(f"...processing batch starting with #{i}. Number of examples: {len(batch)}")

        formatted_input = [
            tokenizer.apply_chat_template(
                convo,
                tokenize=False,
                add_generation_prompt=True
            )
            for convo in batch
        ]

        model_inputs = tokenizer(formatted_input, padding=True, truncation=True,
                                 return_tensors="pt",
                                 pad_to_multiple_of=8 # for gemma only
                                ).to(model.device)

        with torch.no_grad():
            generated_ids = model.generate(
                **model_inputs,
                max_new_tokens=num_new_tokens,
                pad_token_id=tokenizer.eos_token_id
            )

        # All rows of the batch are padded to the same prompt length, so the
        # prompt prefix can be removed with a single slice.
        prompt_length = model_inputs.input_ids.shape[1]
        completion_ids = generated_ids[:, prompt_length:]

        responses += tokenizer.batch_decode(completion_ids, skip_special_tokens=True)

        # cleaning up memory
        del model_inputs
        del generated_ids
        del completion_ids
        torch.cuda.empty_cache()

    return responses

In [12]:
def generate_prompts(example, template, instruction):
    """Build the chat-style prompt for one example and store it under ``'prompt'``.

    The prompt is a list of ``{"role", "content"}`` messages: a system message
    with ``instruction``, then a user/assistant pair for every entry of
    ``example['few_shot']``, then a final user message for the example itself.

    Args:
        example: Mapping with a ``'few_shot'`` sequence plus the fields the
            template needs. Each few-shot entry must provide ``'best_answer'``,
            ``'correct_answers'``, ``'question'`` and ``'context'``.
        template: Object whose ``invoke(mapping)`` returns a value with a
            ``.text`` attribute. It renders both the few-shot turns and the
            final turn.
        instruction: Text of the system message.

    Returns:
        The same ``example`` object, with ``example['prompt']`` set.
    """
    prompt = [{"role": "system",
               "content": instruction}]

    for few_shot_example in example['few_shot']:
        prepped_answer = few_shot_example['best_answer']
        # if correct answers contain any statement from statements_CBD
        if any(ans in few_shot_example['correct_answers'] for ans in statements_CBD):
            prepped_answer = 'Cannot be determined'

        few_shot_example_prepped = {
            'answer': prepped_answer,
            'question': few_shot_example['question'],
            'context': few_shot_example['context'],
        }
        # Render with the `template` argument (not a notebook-level global) so
        # the few-shot turns always use the same layout as the final question.
        prompt.append({"role": "user",
                       "content": template.invoke(few_shot_example_prepped).text})
        prompt.append({"role": "assistant",
                       "content": prepped_answer})

    prompt.append({"role": "user",
                   "content": template.invoke(dict(example)).text})

    example['prompt'] = prompt

    return example

In [13]:
def prompt_for_answer(ds):
    """Query the model for every row of ``ds`` and attach the stripped answers.

    The first (up to) 100 rows are processed separately and timed so that an
    estimate for the whole dataset can be printed before the remaining rows
    are processed.

    Args:
        ds: Dataset with a ``'prompt'`` column. Must support ``len``,
            ``select``, ``column_names``, ``remove_columns`` and ``add_column``.

    Returns:
        A dataset with a ``'predicted_answer'`` column holding the
        whitespace-stripped model outputs; any existing column of that name is
        replaced.
    """
    total = len(ds)
    # Datasets shorter than 100 rows would make select(range(100)) fail, so
    # the timing sample is capped at the dataset size.
    sample_size = min(100, total)

    start = time.time()
    output_100 = query_model(ds.select(range(sample_size)), "prompt", 50) if sample_size else []
    end = time.time()
    if sample_size:
        print(f"Processing {sample_size} records took {end-start} secs. Estimated completion: {(total / sample_size)*(end-start)} secs [{((total / sample_size)*(end-start)) / 60} mins]")

    # Only run the second pass when rows are actually left.
    output_next = []
    if total > sample_size:
        output_next = query_model(ds.select(range(sample_size, total)), "prompt", 50)
    print('Done. Combining outputs')
    output = output_100 + output_next
    cleaned_output = [out.strip() for out in output]

    if 'predicted_answer' in ds.column_names:
        ds = ds.remove_columns(['predicted_answer'])

    ds = ds.add_column('predicted_answer', cleaned_output)
    return ds

In [14]:
def generate_iter0_answers(dataset, instruction, prompt_template, filedir_prefix_saveto):
    """Create prompts and model answers for the ``'train'`` split, saving both stages.

    Args:
        dataset: Mapping with a ``'train'`` entry; the entry is replaced in place.
        instruction: Passed to ``generate_prompts`` as ``instruction``.
        prompt_template: Passed to ``generate_prompts`` as ``template``.
        filedir_prefix_saveto: String prefix for the two output files,
            ``prompts.parquet`` and ``results.parquet``.

    Returns:
        The same ``dataset`` object, whose ``'train'`` entry now holds the
        output of ``prompt_for_answer``.
    """
    prompts_path = f"{filedir_prefix_saveto}prompts.parquet"
    results_path = f"{filedir_prefix_saveto}results.parquet"
    prompt_kwargs = {'template': prompt_template, 'instruction': instruction}

    # Stage 1: build the prompt for every row and persist it
    with_prompts = dataset['train'].map(generate_prompts, fn_kwargs=prompt_kwargs)
    dataset['train'] = with_prompts
    with_prompts.to_parquet(prompts_path)

    # Stage 2: query the model and persist the answers
    with_answers = prompt_for_answer(with_prompts)
    dataset['train'] = with_answers
    with_answers.to_parquet(results_path)

    return dataset

# Load Dataset

In [15]:
from datasets import load_dataset

In [16]:
# Instruction text passed to the prompt builder for every example
instruction = (
    "Answer the question based on the provided context. "
    "If the answer cannot be determined from the context, "
    "say 'Cannot be determined'."
)
# Layout of a single question turn; the placeholders are filled per example
template = """Context:
{context}

Question:
{question}

Answer:
"""

prompt_template = PromptTemplate.from_template(template)

In [17]:
# print(f"LOADING IN MODEL: {MODEL_NAME}")
# model = ExecuTorchModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     recipe="xnnpack",
#     attn_implementation="custom_sdpa",  # Use custom SDPA implementation for better performance
#     device_map="auto",
# )

# # Load tokenizer
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# tokenizer.pad_token_id = 128001
# tokenizer.padding_side='left'

In [18]:
# 4-bit NF4 quantization with double quantization and a bfloat16 compute dtype
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [19]:
print(f"LOADING IN MODEL: {MODEL_NAME}")
# Load the quantized causal LM; device placement is delegated to device_map="auto"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
    cache_dir="/kaggle/working/cache",
    device_map="auto",
    # attn_implementation="eager",
    # use_sliding_window=False,
    # max_memory={"cuda:0": "16GB", "cuda:1": "16GB"},
    trust_remote_code=True
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# The pad id used to be forced to 128001, a hard-coded value that is not derived
# from this tokenizer (NOTE(review): looks like a leftover from another model's
# vocabulary - confirm). Keep the tokenizer's own pad token and only fall back
# to EOS when none is defined.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
# Left padding keeps every prompt adjacent to the tokens generated after it
tokenizer.padding_side='left'

LOADING IN MODEL: google/gemma-3-1b-it


config.json:   0%|          | 0.00/899 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

In [20]:
# Only the banner is printed; the pipeline construction below is disabled
print("STARTING PIPELINE")
# generator = pipeline(TASK, model=model, tokenizer = tokenizer)

STARTING PIPELINE


In [21]:
print("RUNNING ITERATION 0")
# Read the input parquet file into a dataset dictionary
dataset = load_dataset("parquet", data_files=DATASET_FILEPATH)

print("- Generating initial responses...")
# Build prompts, query the model, and write both stages under FILEDIR
dataset = generate_iter0_answers(
    dataset=dataset,
    instruction=instruction,
    prompt_template=prompt_template,
    filedir_prefix_saveto=FILEDIR,
)
print('Done.')

RUNNING ITERATION 0


Generating train split: 0 examples [00:00, ? examples/s]

- Generating initial responses...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


...processing in batches of 5
Processing 100 records took 365.0670533180237 secs. Estimated completion: 1642.8017399311066 secs [27.380028998851778 mins]
...processing in batches of 5
Done. Combining outputs


Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Done.


In [22]:
print("- Adding expected answer")

def generate_expected_answer(example):
    """Set ``example['expected_answer']`` and return the example.

    The value is ``example['best_answer']`` unless ``example['correct_answers']``
    contains one of the entries of ``statements_CBD``, in which case it is
    ``'Cannot be determined'``.
    """
    reference = example['best_answer']
    undetermined = any(
        marker in example['correct_answers'] for marker in statements_CBD
    )
    example['expected_answer'] = 'Cannot be determined' if undetermined else reference
    return example


# Run generate_expected_answer over the train split to add 'expected_answer'
dataset["train"] = dataset["train"].map(generate_expected_answer)
# Bare expression: shown as the cell output
dataset

- Adding expected answer


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['type', 'category', 'question', 'best_answer', 'correct_answers', 'incorrect_answers', 'source', 'original_context', 'context', 'few_shot', 'prompt', 'predicted_answer', 'expected_answer'],
        num_rows: 450
    })
})

In [23]:
print("- Adding all answers")

def append_to_all_answers(example):
    """Store the current prediction as a one-element list under ``'all_answers'``."""
    first_answer = example['predicted_answer']
    example['all_answers'] = [first_answer]
    return example


# Run append_to_all_answers over the train split to add 'all_answers'
dataset["train"] = dataset["train"].map(append_to_all_answers)
# Bare expression: shown as the cell output
dataset

- Adding all answers


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['type', 'category', 'question', 'best_answer', 'correct_answers', 'incorrect_answers', 'source', 'original_context', 'context', 'few_shot', 'prompt', 'predicted_answer', 'expected_answer', 'all_answers'],
        num_rows: 450
    })
})

In [24]:
# Write the train split, now including 'expected_answer' and 'all_answers',
# to results.parquet under FILEDIR (same path generate_iter0_answers wrote to).
dataset['train'].to_parquet(f"{FILEDIR}results.parquet")

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

25566597