In [1]:
# Print the interpreter version so the runtime used for this run is recorded in the cell output.
!python --version

Python 3.10.12


In [None]:
import os

# Checkpoint to load and the task label associated with it.
MODEL_NAME = 'Qwen/Qwen2.5-0.5B-Instruct-AWQ'
TASK = 'text-generation'

# ITER_START = 1
# ITER_TRIALS = 1

# FILEDIR is used as a string prefix for output files (keep the trailing slash);
# DATASET_FILEPATH is the parquet file loaded as the input dataset.
FILEDIR = "/kaggle/working/qwen2-0.5B/iter_0/"
DATASET_FILEPATH = "/kaggle/input/squad-v2-processed/squad_2_with_few_shot_gpt2_5000.parquet"

# Prefer a token that is already exported in the environment so that no
# credential has to be pasted into (and saved with) the notebook. The literal
# "HF_TOKEN" fallback is only a placeholder: replace it if no env var is set.
HF_TOKEN = os.environ.get("HF_TOKEN", "HF_TOKEN") # input huggingface token

# Load Dependencies

In [3]:
from langchain_core.prompts import PromptTemplate
from tqdm import tqdm  # For progress bars

In [4]:
# Quietly (-q) install/upgrade (-U) the Hugging Face runtime packages.
# NOTE(review): versions are unpinned, and the autoawq install further down
# resolves transformers<=4.47.1 (see its install log), so the transformers
# upgrade performed here may be superseded - consider pinning versions.
!pip install -qU transformers
!pip install -qU accelerate
!pip install -qU bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m78.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.4/481.4 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.7/354.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
# Install AutoAWQ - presumably needed to load the AWQ-quantized checkpoint named in MODEL_NAME.
# NOTE(review): unpinned; the log of this run shows autoawq 0.2.8 being collected
# together with transformers<=4.47.1,>=4.45.0.
!pip install autoawq

Collecting autoawq
  Downloading autoawq-0.2.8.tar.gz (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.6/71.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting triton (from autoawq)
  Downloading triton-3.3.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.5 kB)
Collecting transformers<=4.47.1,>=4.45.0 (from autoawq)
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting zstandard (from autoawq)
  Downloading zstandard-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Downloading transformers-4.47.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading triton-3.3.0-cp310-cp310

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed, pipeline, BitsAndBytesConfig
from transformers.pipelines.pt_utils import KeyDataset
import torch
import numpy as np
import os
import math
from datasets import concatenate_datasets, Dataset
from tqdm import tqdm
import time

In [7]:
# setting env vars
# Fix the random seed (via transformers.set_seed) so repeated runs are comparable.
set_seed(1234)
# Set the PyTorch CUDA allocator option through its environment variable.
# NOTE(review): this is assigned after `import torch`; presumably it must be in
# place before the first CUDA allocation to take effect - confirm.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [None]:
import os

# Publish the configured token as the HF_TOKEN environment variable for this
# process (and for anything it spawns).
os.environ.update(HF_TOKEN=HF_TOKEN)

In [9]:
# Log in to the Hugging Face Hub from the shell with the configured token.
# NOTE(review): the token is passed on the command line; make sure the cell
# output is reviewed before the notebook is shared.
!huggingface-cli login --token $HF_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `Auth` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# Prep Functions

In [10]:
def query_model(data, column_to_prompt, num_new_tokens = 200, batch_size=40):
    """Generate one completion per chat conversation stored in a dataset column.

    Uses the module-level ``tokenizer`` and ``model`` objects.

    Args:
        data: Mapping-like object (e.g. a dataset) where ``data[column_to_prompt]``
            yields a sliceable sequence of conversations, each in the format
            accepted by ``tokenizer.apply_chat_template``.
        column_to_prompt: Name of the column that holds the conversations.
        num_new_tokens: Passed to ``model.generate`` as ``max_new_tokens``.
        batch_size: Number of conversations sent to the model per call.

    Returns:
        list[str]: Decoded completions (prompt tokens removed, special tokens
        skipped), in the same order as the input conversations.
    """
    print(f"...processing in batches of {batch_size}")

    # Read the column once. Indexing `data[column_to_prompt]` inside the loop
    # re-fetched the whole column for every single batch.
    conversations = data[column_to_prompt]
    responses = []

    for batch_start in range(0, len(conversations), batch_size):
        batch = conversations[batch_start:batch_start + batch_size]

        # Render each conversation to text, ending with the generation prompt
        # so the model continues as the assistant.
        formatted_input = [
            tokenizer.apply_chat_template(
                convo,
                tokenize=False,
                add_generation_prompt=True
            )
            for convo in batch
        ]

        model_inputs = tokenizer(
            formatted_input, padding=True, truncation=True, return_tensors="pt"
        ).to(model.device)

        with torch.no_grad():
            generated_ids = model.generate(
                **model_inputs,
                max_new_tokens=num_new_tokens,
            )

        # Each output row starts with the (padded) prompt; keep only what
        # follows the prompt-length prefix.
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        responses.extend(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))

        # cleaning up memory
        del model_inputs
        del generated_ids
        torch.cuda.empty_cache()

    return responses

In [11]:
def generate_prompts(example, template, instruction):
    """Build the chat-format prompt for one example and store it under ``example['prompt']``.

    The prompt is a list of ``{"role", "content"}`` messages: one system
    message, then a user/assistant pair per few-shot example, then a final
    user message rendered from the example itself.

    Args:
        example: Mapping with a ``'few_shot'`` list (items expose ``'question'``,
            ``'context'`` and ``'answers']['text']``) plus whatever fields
            ``template`` needs to render the example itself.
        template: Object whose ``invoke(mapping)`` returns something with a
            ``.text`` attribute (e.g. a LangChain ``PromptTemplate``).
        instruction: Text of the system message.

    Returns:
        The same ``example`` object, mutated to include the ``'prompt'`` key.
    """
    prompt = [{"role": "system",
               "content": instruction}]

    for few_shot_example in example['few_shot']:
        answer_texts = few_shot_example['answers']['text']
        few_shot_example_prepped = {
            # An example without reference answers is shown as unanswerable.
            'answer': answer_texts[0] if len(answer_texts) > 0 else 'Cannot be determined',
            'question': few_shot_example['question'],
            'context': few_shot_example['context'],
        }
        # Render with the `template` argument. The previous code reached for a
        # global `prompt_template` here, silently ignoring the parameter.
        prompt.append({"role": "user",
                       "content": (template.invoke(few_shot_example_prepped).text)})
        prompt.append({"role": "assistant",
                       "content": few_shot_example_prepped['answer']})

    prompt.append({"role": "user",
                   "content": (template.invoke(dict(example)).text)})

    example['prompt'] = prompt

    return example

In [12]:
def prompt_for_answer(ds):
    """Query the model for every row's ``'prompt'`` and attach the answers.

    A leading sample of up to 100 rows is processed first and timed so an
    estimate for the whole dataset can be printed; the remaining rows are
    processed afterwards. Each query asks ``query_model`` for 50 new tokens.

    Args:
        ds: Dataset-like object supporting ``len``, ``select``,
            ``column_names``, ``remove_columns`` and ``add_column``, with a
            ``'prompt'`` column.

    Returns:
        A dataset with a ``'predicted_answer'`` column holding the
        whitespace-stripped model outputs (any existing column of that name is
        replaced).
    """
    total_rows = len(ds)
    # Clamp the timing sample: selecting range(100) on a shorter dataset
    # would ask for row indices that do not exist.
    sample_rows = min(100, total_rows)

    start = time.time()
    output_100 = query_model(ds.select(range(sample_rows)), "prompt", 50)
    end = time.time()

    if sample_rows:
        # Scale by the real sample size so the estimate stays meaningful for
        # small datasets (and nothing is divided by zero for empty ones).
        estimate_secs = (total_rows / sample_rows) * (end - start)
        print(f"Processing {sample_rows} records took {end-start} secs. Estimated completion: {estimate_secs} secs [{estimate_secs / 60} mins]")

    output_next = query_model(ds.select(range(sample_rows, total_rows)), "prompt", 50)
    print('Done. Combining outputs')
    cleaned_output = [out.strip() for out in output_100 + output_next]

    # add_column cannot overwrite, so drop a stale column from an earlier run.
    if 'predicted_answer' in ds.column_names:
        ds = ds.remove_columns(['predicted_answer'])

    ds = ds.add_column('predicted_answer', cleaned_output)
    return ds

In [13]:
def generate_iter0_answers(dataset, instruction, prompt_template, filedir_prefix_saveto):
    """Run iteration 0 on ``dataset['train']``: build prompts, then collect answers.

    Two parquet snapshots are written, both named by appending to
    ``filedir_prefix_saveto``: ``prompts.parquet`` after the prompts are built
    and ``results.parquet`` after the answers are attached.

    Args:
        dataset: Mapping with a ``'train'`` split; the split is replaced in place.
        instruction: System instruction forwarded to ``generate_prompts``.
        prompt_template: Template forwarded to ``generate_prompts`` as ``template``.
        filedir_prefix_saveto: String prefix for the two output paths.

    Returns:
        The same ``dataset`` mapping, with the updated ``'train'`` split.
    """
    prompts_path = f"{filedir_prefix_saveto}prompts.parquet"
    results_path = f"{filedir_prefix_saveto}results.parquet"

    # Stage 1: attach a chat prompt to every row and snapshot it.
    prompt_kwargs = {'template': prompt_template, 'instruction': instruction}
    train_with_prompts = dataset['train'].map(generate_prompts, fn_kwargs=prompt_kwargs)
    dataset['train'] = train_with_prompts
    train_with_prompts.to_parquet(prompts_path)

    # Stage 2: query the model and snapshot the answered split.
    train_with_answers = prompt_for_answer(train_with_prompts)
    dataset['train'] = train_with_answers
    train_with_answers.to_parquet(results_path)

    return dataset

# Load Dataset

In [14]:
from datasets import load_dataset

In [15]:
# System message sent at the start of every conversation.
instruction = (
    "Answer the question based on the provided context. "
    "If the answer cannot be determined from the context, say 'Cannot be determined'."
)

# Layout of each user turn; {context} and {question} are filled per example.
template = (
    "Context:\n"
    "{context}\n"
    "\n"
    "Question:\n"
    "{question}\n"
    "\n"
    "Answer:\n"
)

prompt_template = PromptTemplate.from_template(template)

In [16]:
# bitsandbytes 4-bit settings.
# NOTE(review): in the model-loading cell the only reference to this object is
# commented out, so it does not appear to affect the loaded model.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [17]:
print(f"LOADING IN MODEL: {MODEL_NAME}")
# Load the checkpoint in fp16 and let `device_map="auto"` place it on the
# available device(s); downloaded files are cached under /kaggle/working/cache.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    # quantization_config=quantization_config,
    torch_dtype=torch.float16,
    cache_dir="/kaggle/working/cache",
    device_map="auto",
    # use_sliding_window=False,
    # max_memory={"cuda:0": "16GB", "cuda:1": "16GB"},
    trust_remote_code=True
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Keep the padding token that ships with this checkpoint. A hard-coded id
# (previously 128001) is not derived from this tokenizer's vocabulary and is
# not guaranteed to be a padding/special token for it. Only when the tokenizer
# defines no pad token at all do we fall back to its EOS token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
# Pad on the left so every prompt in a batch ends right where generation starts.
tokenizer.padding_side='left'

LOADING IN MODEL: Qwen/Qwen2.5-0.5B-Instruct-AWQ


config.json:   0%|          | 0.00/837 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/731M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [18]:
# Progress marker only: the transformers `pipeline` helper stays disabled below
# and generation goes through `query_model` instead.
print("STARTING PIPELINE")
# generator = pipeline(TASK, model=model, tokenizer = tokenizer)

STARTING PIPELINE


In [19]:
print("RUNNING ITERATION 0")
# Load the parquet file as a dataset dict (its rows land in the 'train' split).
dataset = load_dataset("parquet", data_files=DATASET_FILEPATH)

print("- Generating initial responses...")
# Build prompts, query the model and write both parquet snapshots under FILEDIR.
dataset = generate_iter0_answers(
    dataset=dataset,
    instruction=instruction,
    prompt_template=prompt_template,
    filedir_prefix_saveto=FILEDIR,
)
print('Done.')

RUNNING ITERATION 0


Generating train split: 0 examples [00:00, ? examples/s]

- Generating initial responses...


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 14.138617753982544 secs. Estimated completion: 706.9308876991272 secs [11.78218146165212 mins]
...processing in batches of 40
Done. Combining outputs


Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Done.


In [20]:
print("- Adding expected answer")

def generate_expected_answer(example):
    """Store the reference answer for one example under ``'expected_answer'``.

    The first entry of ``example['answers']['text']`` is used; when that list
    is empty the literal ``'Cannot be determined'`` is stored instead.

    Returns:
        The same ``example`` object, mutated in place.
    """
    reference_texts = example['answers']['text']
    has_reference = len(reference_texts) > 0
    example['expected_answer'] = reference_texts[0] if has_reference else 'Cannot be determined'
    return example


# Add the 'expected_answer' column to the train split, then display the dataset summary.
dataset["train"] = dataset["train"].map(function=generate_expected_answer)
dataset

- Adding expected answer


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'few_shot', 'prompt', 'predicted_answer', 'expected_answer'],
        num_rows: 5000
    })
})

In [21]:
print("- Adding all answers")

def append_to_all_answers(example):
    """Create ``'all_answers'`` as a one-element list holding ``'predicted_answer'``.

    Returns:
        The same ``example`` object, mutated in place.
    """
    latest_answer = example['predicted_answer']
    example['all_answers'] = [latest_answer]
    return example


# Add the 'all_answers' column to the train split, then display the dataset summary.
dataset["train"] = dataset["train"].map(function=append_to_all_answers)
dataset

- Adding all answers


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'few_shot', 'prompt', 'predicted_answer', 'expected_answer', 'all_answers'],
        num_rows: 5000
    })
})

In [22]:
# Re-write the results file so it also contains the columns added after generation;
# the call's return value is shown as the cell output.
dataset['train'].to_parquet(FILEDIR + "results.parquet")

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

28830699