In [1]:
!python --version

Python 3.10.12


In [None]:
import os

# Hugging Face checkpoint id and the pipeline task name used by this notebook.
MODEL_NAME = 'Qwen/Qwen2.5-0.5B-Instruct-AWQ'
TASK = 'text-generation'

# The refinement loop runs iterations ITER_START .. ITER_START + ITER_TRIALS - 1.
ITER_START = 1
ITER_TRIALS = 11

# activate when feedback_results is saved in current iter
SKIP_FEEDBACK = False
# running test
RUN_LIMITED_TEST = False

# Per-iteration directories are formed by appending the iteration number to these prefixes.
FILEDIR_PREFIX = "/kaggle/working/tqa/qwen2-0.5B/iter_"
DATASET_FILEPATH_PREFIX = "/kaggle/input/selfrefl-results-tqa/qwen2-0.5B/iter_"

# Do not paste a real token into the notebook: export HF_TOKEN in the environment
# (or inject it from a secrets store) before running. The literal is only the
# placeholder used when nothing is exported.
HF_TOKEN = os.environ.get("HF_TOKEN") or "HF_TOKEN"

# Load Dependencies

In [3]:
from langchain_core.prompts import PromptTemplate
from tqdm import tqdm  # For progress bars

In [4]:
!pip install -qU transformers
!pip install -qU accelerate
!pip install -qU bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m84.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m558.8/558.8 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m68.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m367.1/367.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
!pip install autoawq

Collecting autoawq
  Downloading autoawq-0.2.9.tar.gz (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.3/74.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting triton (from autoawq)
  Downloading triton-3.3.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.5 kB)
Collecting zstandard (from autoawq)
  Downloading zstandard-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Downloading triton-3.3.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (155.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.6/155.6 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading zstandard-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m87.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for co

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed, pipeline, BitsAndBytesConfig
from transformers.pipelines.pt_utils import KeyDataset
import torch
import numpy as np
import os
import math
from datasets import concatenate_datasets, Dataset
from tqdm import tqdm
import time

In [7]:
# setting env vars
# Fix the random seed through transformers' set_seed so repeated runs are comparable.
set_seed(1234)
# Allocator option for PyTorch's CUDA caching allocator; set before any model is loaded.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [None]:
import os

# Expose the token to Hugging Face tooling through the HF_TOKEN environment variable.
os.environ["HF_TOKEN"] = HF_TOKEN

In [9]:
!huggingface-cli login --token $HF_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `hf`CLI if you want to set the git credential as well.
Token is valid (permission: write).
The token `Auth` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# Prep Functions

In [10]:
def query_model(data, column_to_prompt, num_new_tokens = 200, batch_size=40):
    """Generate one completion per chat prompt stored in ``data[column_to_prompt]``.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        data: Mapping-like object (e.g. a dataset) whose ``column_to_prompt``
            entry is a sequence of chat conversations (lists of role/content dicts).
        column_to_prompt: Name of the column holding the conversations.
        num_new_tokens: Value passed to ``model.generate`` as ``max_new_tokens``.
        batch_size: Number of prompts tokenized and generated together.

    Returns:
        list[str]: Decoded completions (prompt tokens removed), in input order.
    """
    print(f"...processing in batches of {batch_size}")
    # Read the column once. Indexing ``data`` inside the loop repeats the column
    # lookup for every batch, which for dataset objects can mean re-reading the
    # whole column each time.
    prompts = data[column_to_prompt]
    responses = []

    for i in range(0, len(prompts), batch_size):
        batch = prompts[i:i+batch_size]

        formatted_input = [
            tokenizer.apply_chat_template(
                convo,
                tokenize=False,
                add_generation_prompt=True
            )
            for convo in batch
        ]

        model_inputs = tokenizer(formatted_input, padding=True, truncation=True, return_tensors="pt").to(model.device)

        with torch.no_grad():
            generated_ids = model.generate(
                **model_inputs,
                max_new_tokens=num_new_tokens,
            )

        # Each output row starts with its (padded) prompt; keep only the new tokens.
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        responses.extend(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))

        # cleaning up memory
        del model_inputs
        del generated_ids
        torch.cuda.empty_cache()

    return responses

In [11]:
def append_to_all_answers(example):
    """Record the current ``predicted_answer`` at the end of ``all_answers``.

    The list stored under ``example['all_answers']`` is extended in place and
    the same ``example`` object is returned.
    """
    history = example['all_answers']
    latest = example['predicted_answer']
    history.append(latest)
    return example

## Feedback functions

In [12]:
def generate_feedback_prompts(example, template, instruction, fs):
    """Attach a chat-style feedback prompt to ``example``.

    The conversation is: one system message with ``instruction``; for every
    few-shot item in ``fs`` a user message rendered by ``template`` followed by
    an assistant message with that item's ``predicted_answer_feedback``; and a
    final user message rendered from ``example`` itself.

    Returns:
        The same ``example`` with the message list stored under ``feedback_prompt``.
    """
    messages = [{"role": "system", "content": instruction}]

    for shot in fs:
        messages.extend((
            {"role": "user", "content": template.invoke(shot).text},
            {"role": "assistant", "content": shot['predicted_answer_feedback']},
        ))

    messages.append({"role": "user", "content": template.invoke(dict(example)).text})

    example['feedback_prompt'] = messages
    return example

In [13]:
def prompt_for_feedback(ds, warmup_size=100):
    """Query the model for feedback on every row and store it in the dataset.

    A first slice of up to ``warmup_size`` rows is timed to print a runtime
    estimate; the remaining rows are processed afterwards.

    Args:
        ds: Dataset with a ``feedback_prompt`` column. It must support
            ``len``, ``select``, ``column_names``, ``remove_columns`` and
            ``add_column``.
        warmup_size: Number of rows used for the timing estimate (default 100).

    Returns:
        The dataset with ``predicted_answer_feedback`` holding the stripped
        model outputs; any existing column of that name is replaced.
    """
    total = len(ds)
    # Clamp the warm-up slice: selecting a fixed range(100) is out of range for
    # datasets with fewer than 100 rows.
    warmup = max(0, min(warmup_size, total))

    output_head = []
    if warmup:
        start = time.time()
        output_head = query_model(ds.select(range(warmup)), "feedback_prompt", 200)
        elapsed = time.time() - start
        print(f"Processing {warmup} records took {elapsed} secs. Estimated completion: {(total / warmup)*elapsed} secs [{((total / warmup)*elapsed) / 60} mins]")

    output_tail = []
    if warmup < total:
        output_tail = query_model(ds.select(range(warmup, total)), "feedback_prompt", 200)
    print('Done. Combining outputs')
    cleaned_output = [out.strip() for out in output_head + output_tail]

    if 'predicted_answer_feedback' in ds.column_names:
        ds = ds.remove_columns(['predicted_answer_feedback'])

    ds = ds.add_column('predicted_answer_feedback', cleaned_output)
    return ds

In [14]:
def generate_feedback(dataset, feedback_instruction, feedback_prompt_template, feedback_fs, filedir_prefix_saveto):
    """Build feedback prompts for the ``train`` split, save them, then collect feedback.

    The prompts are written to ``<filedir_prefix_saveto>feedback_prompts.parquet``
    before the model is queried. ``dataset['train']`` is replaced in place and
    the same ``dataset`` object is returned.
    """
    prompt_kwargs = {
        'template': feedback_prompt_template,
        'instruction': feedback_instruction,
        'fs': feedback_fs,
    }
    prompts_path = f"{filedir_prefix_saveto}feedback_prompts.parquet"

    # Step 1: one feedback prompt per row, persisted for inspection.
    train_with_prompts = dataset['train'].map(generate_feedback_prompts, fn_kwargs=prompt_kwargs)
    dataset['train'] = train_with_prompts
    train_with_prompts.to_parquet(prompts_path)

    # Step 2: ask the model for feedback on every row.
    dataset['train'] = prompt_for_feedback(train_with_prompts)
    return dataset

## Refine functions

In [15]:
def generate_refine_prompts(example, template, instruction, fs):
    """Attach a chat-style refinement prompt to ``example``.

    The conversation is: one system message with ``instruction``; for every
    few-shot item in ``fs`` a user message rendered by ``template`` followed by
    an assistant message with that item's ``refined_answer``; and a final user
    message rendered from ``example`` itself.

    Returns:
        The same ``example`` with the message list stored under ``refine_prompt``.
    """
    conversation = [{"role": "system", "content": instruction}]

    for demo in fs:
        user_turn = {"role": "user", "content": template.invoke(demo).text}
        assistant_turn = {"role": "assistant", "content": demo['refined_answer']}
        conversation += [user_turn, assistant_turn]

    final_turn = {"role": "user", "content": template.invoke(dict(example)).text}
    conversation.append(final_turn)

    example['refine_prompt'] = conversation
    return example

In [16]:
def prompt_for_refinement(ds, warmup_size=100):
    """Query the model for a refined answer on every row and store it in the dataset.

    A first slice of up to ``warmup_size`` rows is timed to print a runtime
    estimate; the remaining rows are processed afterwards.

    Args:
        ds: Dataset with a ``refine_prompt`` column. It must support ``len``,
            ``select``, ``column_names``, ``remove_columns`` and ``add_column``.
        warmup_size: Number of rows used for the timing estimate (default 100).

    Returns:
        The dataset with ``predicted_answer`` holding the stripped model
        outputs; any existing column of that name is replaced.
    """
    total = len(ds)
    # Clamp the warm-up slice: selecting a fixed range(100) is out of range for
    # datasets with fewer than 100 rows.
    warmup = max(0, min(warmup_size, total))

    output_head = []
    if warmup:
        start = time.time()
        output_head = query_model(ds.select(range(warmup)), "refine_prompt", 50)
        elapsed = time.time() - start
        print(f"Processing {warmup} records took {elapsed} secs. Estimated completion: {(total / warmup)*elapsed} secs [{((total / warmup)*elapsed) / 60} mins]")

    output_tail = []
    if warmup < total:
        output_tail = query_model(ds.select(range(warmup, total)), "refine_prompt", 50)
    print('Done. Combining outputs')
    cleaned_output = [out.strip() for out in output_head + output_tail]

    if 'predicted_answer' in ds.column_names:
        ds = ds.remove_columns(['predicted_answer'])

    ds = ds.add_column('predicted_answer', cleaned_output)

    return ds

In [17]:
def generate_refined(dataset, refine_instruction, refine_prompt_template, refine_fs, filedir_prefix_saveto):
    """Build refine prompts for the ``train`` split, save them, then collect refined answers.

    The prompts are written to ``<filedir_prefix_saveto>refine_prompts.parquet``
    before the model is queried. ``dataset['train']`` is replaced in place and
    the same ``dataset`` object is returned.
    """
    builder_kwargs = {
        'template': refine_prompt_template,
        'instruction': refine_instruction,
        'fs': refine_fs,
    }
    prompts_file = f"{filedir_prefix_saveto}refine_prompts.parquet"

    # Step 1: one refine prompt per row, persisted for inspection.
    train_with_prompts = dataset['train'].map(generate_refine_prompts, fn_kwargs=builder_kwargs)
    dataset['train'] = train_with_prompts
    train_with_prompts.to_parquet(prompts_file)

    # Step 2: ask the model for the refined answers.
    dataset['train'] = prompt_for_refinement(train_with_prompts)
    return dataset

# Load Dataset

In [18]:
from datasets import load_dataset

In [19]:
static_fs = [
    {
        "context": "\"Don't cross your eyes -- they'll stick that way!\" That's something most of us have heard from our mothers at one time or another. Can they actually stick? Let's take a look at how your eyes work.  Your eyeballs are controlled by six muscles. When you look up, down, left or right, the muscles attached to your eyeballs make this movement happen. When you cross your eyes, you're simply telling your muscles to move your eyes inward together. This is something you naturally do when you look at something that's very close to your face.  So was mom's warning just a ruse so you would stop tormenting your younger brother with another one of your gross-out maneuvers? The answer to that question is yes. Although crossing your eyes for an extended period of time might cause a temporary strain on your eye muscles, no medical evidence suggests that they would stick that way. You would most likely suffer from some eye spasms or twitches, and your eyes might feel a bit fatigued, but they would certainly return to normal within an hour or so.",
        "question": "What happens if you cross your eyes for too long?",
        "predicted_answer": "If you cross your eyes for too long, you might suffer from some eye spasms or twitches, and your eyes might feel a bit fatigued. However, these effects are temporary and your eyes would most likely return to normal within an hour or so. There is no medical evidence suggesting that your eye muscles would stick in the position you crossed them to.",
        "predicted_answer_feedback": """To assess the quality of the answer, we assign a score to each trait:
1. Relevancy - how well does the answer address the question: Very well. The answer states that eye spasms or twitches and fatique may result from crossing your eyes for too long. 10/10
2. Consistency - does the answer match what is said in the context: Yes, the context explicitly states you might suffer from eye spasms or twitches and eye fatique which is also what is stated in the answer. 10/10
3. Brevity - does the answer only include the answer and nothing else: No, in addition to answering the question, the answer goes on to explain the effects are temporary and that there is no evidence of eyes stick in the position you crossed them to. Both these extra statements could be removed from the answer to make it more concise. 5/10""",
        "refined_answer": "If you cross your eyes for too long, you might suffer from some temporary eye spasms or twitches, and your eyes might feel a bit fatigued."
    },
    {
        "context": "![Torvosaurus](https://th-thumbnailer.cdn-si- edu.com/funiAgO9NV_TfzmFbfM4RSgWedM=/1000x750/filters:no_upscale\(\)/https://tf- cmsv2-smithsonianmag-media.s3.amazonaws.com/filer/20120412014020torvosaurus- thumb.jpg) About 83 million years separated Late Jurassic icons—such as this Torvosaurus—from Cretaceous celebrities like Tyrannosaurus. Photo by the author  You can’t understand dinosaurs without a sense of time. We need to know when a dinosaur lived to comprehend how it fits into what paleontologist William Diller Matthew called “life’s splendid drama.” But we throw around Deep Time estimates, framed in millions of years, so often that it’s easy to become inured to the wider context of life’s history.  The Mesozoic Era, which lasted from about 250 million to 66 million years ago, is often called the Age of Dinosaurs. As a kid, this brought to mind one endless summer when dinosaurs flourished. And many of the books I read picked one environment from three different periods within the era to represent dinosaur life. Little _Coelophysis_ was the canonical Triassic dinosaur; the huge sauropods and theropods of the Morrison Formation represented the Jurassic, and a Cretaceous _Tyrannosaurus_ versus _Triceratops_ face-off ultimately capped off the succession. With the periods juxtaposed this way, millions of years didn’t seem so very long.  But let’s unpack some of that scenery. _Diplodocus_ , _Apatosaurus_ , _Allosaurus_ , _Stegosaurus_ and their neighbors roamed western North America about 150 million years ago. This slice of time falls in the latter portion of the Jurassic. The traditional representatives of the latest Cretaceous scene— _Tyrannosaurus_ and _Triceratops_ —did not evolve until about 67 million years ago. By themselves, these dates are just labels, but think of them falling along evolution’s timeline. About 83 million years separated _Apatosaurus_ from _Tyrannosaurus_ and _Allosaurus_ from _Triceratops_. 
The so-called Age of Mammals—which began when the non-avian dinosaurs were wiped out—has been going on for about 66 million years. **Less time separates us from _Tyrannosaurus rex_ than separated _T. rex_ from _Stegosaurus_.**  Consider how much life has changed in the past 66 million years. Archaic mammals flourished and ultimately went extinct long before anything like the world’s modern fauna appeared. Saber-fanged, knobbly-headed herbivores such as _Uintatherium_, lemur-like primates called adapiforms, razor-jawed carnivores known as creodonts and many other strange forms proliferated and disappeared. Even lineages familiar to us today, such as horses, rhinos and elephants, evolved and diversified and are now represented by just remnants of what once existed.  The time between the last _Triceratops_ and now has seen radical evolutionary changes. Now think of the 83 million years between the Jurassic and Cretaceous titans. During that time, the first flowering plants bloomed; the fish-like ichthyosaurs disappeared as plesiosaurs and mosasaurs became the predominant predators of the seas; vast herds of hadrosaurs and ceratopsids occupied places once dominated by sauropods; tiny tyrant dinosaurs transformed into apex predators, and early birds established themselves in ever-greater variety alongside their dinosaurian kin. These are just a few highlights, and that is part of the wonder and frustration of tracking the history of life on earth. We are offered only glimpses of an ever-changing picture, and when viewed separately, it’s easy to forget how those snippets relate to each other. But when we can step back, and consider how all those snippets run together, the long and ever-changing history of life on our planet seems all the more fantastic.  Get the latest **Science** stories in your inbox.",
        "question": "Who lived closer in time: the Stegosaurus and the T-Rex, or the T-Rex and humans?",
        "predicted_answer": "The Stegosaurus and the T-Rex lived closer in time to each other compared to the T-Rex and humans.",
        "predicted_answer_feedback": """To assess the quality of the answer, we assign a score to each trait:
1. Relevancy - how well does the answer address the question: Very well. The answer chooses one of the options provided in the question that is more true. 10/10
2. Consistency - does the answer match what is said in the context: Not well. The answer directly contradicts what is stated in the context. The context explicitly states that \"Less time separates us from _Tyrannosaurus rex_ than separated _T. rex_ from _Stegosaurus_.\" 0/10
3. Brevity - does the answer only include the answer and nothing else: Yes, the answer only includes the relevant details to answer the question and nothing else. 10/10""",
        "refined_answer": "The T-Rex and humans lived closer in time compared to the T-Rex and the Stegosaurus."
    },
    {
        "context": "country, or at the same level of accuracy, the resulting numerical comparisons may create misleading conclusions. Furthermore, the addition of figures from all countries may not equal the world total.  Areas that form integral parts of sovereign states, such as the countries of the United Kingdom, are counted as part of the sovereign states concerned. Not included are other entities that are not sovereign states, such as the European Union,[a] and independent territories that do not have permanent populations, such as the Chagos Archipelago and various countries' claims to Antarctica.[2]  ## Sovereign states and dependencies by population  Note: A numbered rank is assigned to the 193 member states of the United Nations, plus the two observer states to the United Nations General Assembly. Dependent territories and constituent countries that are parts of sovereign states are not assigned a numbered rank. In addition, sovereign states with limited recognition are included, but not assigned a number rank.    ## See also  ## Explanatory notes  1. **^** The European Union is a _sui generis_ supranational union whose sovereign members delegate to it by treaty certain powers that are often exercised by sovereign states. Its combined population has been estimated at 447,319,916 on 1 January 2020, and it would be ranked 3rd if it were included in the list. It has 5.53% of the world's population — see \"Eurostat-Tables, Graphs and Maps Interface（TGM）table\". European Commission. 2. **^** According to UN estimates, India surpassed China by the end of Apr 2023.[4] 3. **^** Refers to Mainland China; excludes China's special administrative regions of Hong Kong and Macau, which returned to Chinese sovereignty in 1997 and 1999, respectively. 4. **^** Includes the population of the India-administered union territories of Jammu and Kashmir, and Ladakh. 5. **^** Includes the 50 states and the District of Columbia, but excludes the territories of the United States. 
6. **^** Includes the population of Pakistan-administered Azad Kashmir and Gilgit-Baltistan. 7. **^** Includes the Republic of Crimea and Sevastopol, administrative areas on the Crimean Peninsula occupied by Russia. The Ukrainian government and most of the world's other states consider the Crimean Peninsula part of Ukraine's territory. 8. **^** Includes the integral 18 regions of France (including 5 overseas departments and regions). Excludes France's 5 overseas collectivities: French Polynesia, Saint Barthélemy, Saint Martin, Saint Pierre and Miquelon, and Wallis and Futuna, and the sui generis collectivity of New Caledonia, which are shown separately. The French Southern and Antarctic Lands (an Antarctic territorial claim hosting only government officials and research station staff) and Clipperton Island (an uninhabited state private property of France) are not listed at all due to their extraordinary nature. 9. **^** Excludes the three British Crown Dependencies and the 14 British Overseas Territories, listed separately. Four British Overseas Territories are not listed due to their extraordinary nature. The four not listed are _British Antarctic Territory_ (an Antarctic territorial claim hosting only government officials and research station staff), the British Indian Ocean Territory (a military base), South Georgia and the South Sandwich Islands (hosts only government officials and research station staff), and the Sovereign Base Areas of Akrotiri and Dhekelia (a military base where permanent residency is limited to citizens of the Republic of Cyprus). 10. **^** Includes Zanzibar. 11. **^** Excludes the disputed territory of Western Sahara (Sahrawi Arab Democratic Republic). 12. **^** Excludes the Autonomous Republic of Crimea and the city of Sevastopol, Ukraine's _de jure_ territory which is occupied by Russia (however, includes the Donbas region, partly occupied by Russia). 13. 
**^** Excludes the external territories of Christmas Island, the Cocos (Keeling) Islands, and Norfolk Island.",
        "question": "What is the population of the country?",
        "predicted_answer": "Cannot be determined. The context provided does not specify the population of a particular country. It discusses various aspects of population counts, including sovereign states, dependencies, and territories, but does not give specific population numbers for individual countries without additional context.",
        "predicted_answer_feedback": """To assess the quality of the answer, we assign a score to each trait:
1. Relevancy - how well does the answer address the question: Very well. The answer \"Cannot be determined\" can be used to answer any question. 10/10
2. Consistency - does the answer match what is said in the context: Well. The context does not provide any insight into what country the question may be referring to so \"Cannot be determined\" is appropriate based on the given context. 10/10
3. Brevity - does the answer only include the answer and nothing else: No, everything after \"Cannot be determined\" can be omitted since that phrase already completely addresses the question. 5/10""",
        "refined_answer": "Cannot be determined"
    }
]

In [20]:
# System message for the feedback step: defines the three scored traits and the
# 'Cannot be determined' convention.
# NOTE(review): the text asks for a "scale of 1 to 10" while the rubric itself
# describes "0 out of 10" — confirm which range is intended before editing the prompt.
feedback_instruction = """We want to iteratively improve the provided answers. To help improve, score each answer on a scale of 1 to 10 on the traits:
1) Relevancy - how well does the answer address the question. 10 out of 10 means the answer directly responds to the question. 5 out of 10 means the answer partially responds to the question but not entirely. 0 out of 10 means the answer does not respond to the question.
2) Consistency - does the answer match what is said in the context. 10 out of 10 means the answer of the question is explicitly stated in the context. 5 out of 10 means the answer to the question is not supported or proven by the context. 0 out of 10 means the answer to the question contradicts what is written in the context. 
3) Brevity - does the answer only include the answer and nothing else. 10 out of 10 means the answer only contains the response to the question. 5 out of 10 means the answer contains the response to the question and a portion of the question. 0 out of 10 means the answer contains unnecessarily details that can be omitted.

Note that if the answer to the question cannot be determined from the context, the answer should say 'Cannot be determined'."""

# User-message layout for the feedback step. The placeholders {context},
# {question} and {predicted_answer} are filled from each example (and from each
# few-shot item); the trailing "Scores:" line cues the model's reply.
feedback_template = """Context:
{context}

Question:
{question}

Answer:
{predicted_answer}

Scores:
"""

# Template object whose .invoke(...).text is used when building feedback prompts.
feedback_prompt_template = PromptTemplate.from_template(feedback_template)

In [21]:
# System message for the refine step: the model must return only the improved
# answer (or the unchanged answer when it already scored perfectly).
refine_instruction = "Use the feedback to improve the answer to the question in such a way that obtains a perfect score for each evaluated trait. Output only the refined answer and nothing else. If the answer achieved a perfect score, simply return the initial answer. Note that if the answer cannot be determined from the context, only say 'Cannot be determined' and nothing else."

# User-message layout for the refine step. Besides {context}, {question} and
# {predicted_answer} it needs {predicted_answer_feedback}, so feedback must be
# present on every example before this template is rendered.
refine_template = """Context:
{context}

Question:
{question}

Answer:
{predicted_answer}

Feedback:
{predicted_answer_feedback}

Refined Answer:
"""

# Template object whose .invoke(...).text is used when building refine prompts.
refine_prompt_template = PromptTemplate.from_template(refine_template)

In [22]:
# 4-bit NF4 BitsAndBytes settings with double quantization and bfloat16 compute.
# NOTE(review): the `quantization_config=` argument is commented out in the
# model-loading cell, so this object is currently built but not used.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [23]:
print(f"LOADING IN MODEL: {MODEL_NAME}")
# Load the causal LM in float16, letting accelerate place it on the available
# device(s). The BitsAndBytes config is intentionally not passed (see the
# commented-out argument); presumably because the checkpoint is already AWQ-quantized.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    # quantization_config=quantization_config,
    torch_dtype=torch.float16,
    cache_dir="/kaggle/working/cache",
    device_map="auto",
    # use_sliding_window=False,
    # max_memory={"cuda:0": "16GB", "cuda:1": "16GB"},
    trust_remote_code=True
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Keep the pad token the checkpoint ships with. The previous hard-coded id 128001
# is the Llama-3 `<|end_of_text|>` id and is not a special token in the Qwen2.5
# vocabulary. Fall back to EOS only when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
# Decoder-only generation needs left padding so new tokens follow the prompt directly.
tokenizer.padding_side='left'

LOADING IN MODEL: Qwen/Qwen2.5-0.5B-Instruct-AWQ


config.json:   0%|          | 0.00/837 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/731M [00:00<?, ?B/s]

I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/



generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [24]:
# Progress banner only; generation goes through query_model, so no
# transformers pipeline object is created here.
print("STARTING PIPELINE")
# generator = pipeline(TASK, model=model, tokenizer = tokenizer)

STARTING PIPELINE


  and should_run_async(code)


In [25]:
# Self-refinement driver. Every iteration loads the latest answers, collects
# feedback on them, refines them, appends the new answer to `all_answers`, and
# writes its artefacts under FILEDIR_PREFIX<iter>/.
for iter_i in range(ITER_START, ITER_START + ITER_TRIALS):
    print(f"RUNNING ITERATION {iter_i}")
    filedir_prefix_saveto = f"{FILEDIR_PREFIX}{iter_i}/"
    # Saved feedback is only reused on the first iteration of this run.
    reuse_feedback = iter_i == ITER_START and SKIP_FEEDBACK

    # Pick the single input file for this iteration up front, so the limited-test
    # truncation below applies to whichever dataset is actually processed.
    if reuse_feedback:
        dataset_filepath = f"{DATASET_FILEPATH_PREFIX}{iter_i}/results_feedback.parquet"
    elif iter_i == ITER_START:
        # first iteration of the run: start from the previous iteration's uploaded results
        dataset_filepath = f"{DATASET_FILEPATH_PREFIX}{iter_i-1}/results.parquet"
    else:
        dataset_filepath = f"{FILEDIR_PREFIX}{iter_i-1}/results.parquet"

    dataset = load_dataset("parquet", data_files=dataset_filepath)

    if RUN_LIMITED_TEST:
        # Clamp so datasets with fewer than 150 rows can still be used for a test run.
        test_size = min(150, len(dataset['train']))
        dataset['train'] = dataset['train'].select(range(test_size))
        print(f"Running a test: computing {test_size} examples.")

    # feedback
    print("- Generating feedback...")
    if reuse_feedback:
        print('SKIPPED: Retrieved feedback responses from dataset.')
    else:
        dataset = generate_feedback(dataset,
                                    feedback_instruction,
                                    feedback_prompt_template,
                                    static_fs,
                                    filedir_prefix_saveto)

        dataset['train'].to_parquet(f"{filedir_prefix_saveto}results_feedback.parquet")
    # refine
    print("- Refining answer...")
    dataset = generate_refined(dataset,
                              refine_instruction,
                              refine_prompt_template,
                              static_fs,
                              filedir_prefix_saveto)

    print("- Adding to all answers...")
    dataset["train"] = dataset["train"].map(append_to_all_answers)
    dataset['train'].to_parquet(f"{filedir_prefix_saveto}results.parquet")

    print('Done.')

RUNNING ITERATION 1


Generating train split: 0 examples [00:00, ? examples/s]

- Generating feedback...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

/usr/local/lib/python3.10/dist-packages/langchain_core/load/serializable.py:289: PydanticDeprecatedSince211: Accessing this attribute on the instance is deprecated, and will be removed in Pydantic V3. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  field = inst.model_fields.get(key)
/usr/local/lib/python3.10/dist-packages/langchain_core/load/serializable.py:213: PydanticDeprecatedSince211: Accessing this attribute on the instance is deprecated, and will be removed in Pydantic V3. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  if k in self.model_fields and self.model_fields[k].exclude:


Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 249.5076072216034 secs. Estimated completion: 1122.7842324972153 secs [18.713070541620255 mins]
...processing in batches of 40
Done. Combining outputs


Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

- Refining answer...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

/usr/local/lib/python3.10/dist-packages/langchain_core/load/serializable.py:289: PydanticDeprecatedSince211: Accessing this attribute on the instance is deprecated, and will be removed in Pydantic V3. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  field = inst.model_fields.get(key)
/usr/local/lib/python3.10/dist-packages/langchain_core/load/serializable.py:213: PydanticDeprecatedSince211: Accessing this attribute on the instance is deprecated, and will be removed in Pydantic V3. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  if k in self.model_fields and self.model_fields[k].exclude:


Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 133.09249305725098 secs. Estimated completion: 598.9162187576294 secs [9.981936979293824 mins]
...processing in batches of 40
Done. Combining outputs
- Adding to all answers...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Done.
RUNNING ITERATION 2


Generating train split: 0 examples [00:00, ? examples/s]

- Generating feedback...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 269.0195949077606 secs. Estimated completion: 1210.5881770849228 secs [20.176469618082045 mins]
...processing in batches of 40
Done. Combining outputs


Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

- Refining answer...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 132.2480776309967 secs. Estimated completion: 595.1163493394852 secs [9.918605822324754 mins]
...processing in batches of 40
Done. Combining outputs
- Adding to all answers...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Done.
RUNNING ITERATION 3


Generating train split: 0 examples [00:00, ? examples/s]

- Generating feedback...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 267.7864627838135 secs. Estimated completion: 1205.0390825271606 secs [20.083984708786012 mins]
...processing in batches of 40
Done. Combining outputs


Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

- Refining answer...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 131.13803887367249 secs. Estimated completion: 590.1211749315262 secs [9.835352915525437 mins]
...processing in batches of 40
Done. Combining outputs
- Adding to all answers...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Done.
RUNNING ITERATION 4


Generating train split: 0 examples [00:00, ? examples/s]

- Generating feedback...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 267.05138993263245 secs. Estimated completion: 1201.731254696846 secs [20.028854244947432 mins]
...processing in batches of 40
Done. Combining outputs


Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

- Refining answer...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 131.98320126533508 secs. Estimated completion: 593.9244056940079 secs [9.89874009490013 mins]
...processing in batches of 40
Done. Combining outputs
- Adding to all answers...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Done.
RUNNING ITERATION 5


Generating train split: 0 examples [00:00, ? examples/s]

- Generating feedback...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 267.5585539340973 secs. Estimated completion: 1204.0134927034378 secs [20.066891545057295 mins]
...processing in batches of 40
Done. Combining outputs


Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

- Refining answer...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 131.16681933403015 secs. Estimated completion: 590.2506870031357 secs [9.837511450052261 mins]
...processing in batches of 40
Done. Combining outputs
- Adding to all answers...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Done.
RUNNING ITERATION 6


Generating train split: 0 examples [00:00, ? examples/s]

- Generating feedback...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 267.20536041259766 secs. Estimated completion: 1202.4241218566895 secs [20.040402030944826 mins]
...processing in batches of 40
Done. Combining outputs


Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

- Refining answer...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 132.4031822681427 secs. Estimated completion: 595.8143202066422 secs [9.930238670110702 mins]
...processing in batches of 40
Done. Combining outputs
- Adding to all answers...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Done.
RUNNING ITERATION 7


Generating train split: 0 examples [00:00, ? examples/s]

- Generating feedback...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 268.2963237762451 secs. Estimated completion: 1207.333456993103 secs [20.122224283218383 mins]
...processing in batches of 40
Done. Combining outputs


Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

- Refining answer...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 131.39350652694702 secs. Estimated completion: 591.2707793712616 secs [9.854512989521027 mins]
...processing in batches of 40
Done. Combining outputs
- Adding to all answers...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Done.
RUNNING ITERATION 8


Generating train split: 0 examples [00:00, ? examples/s]

- Generating feedback...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 267.89034056663513 secs. Estimated completion: 1205.506532549858 secs [20.091775542497636 mins]
...processing in batches of 40
Done. Combining outputs


Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

- Refining answer...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 131.4083960056305 secs. Estimated completion: 591.3377820253372 secs [9.855629700422288 mins]
...processing in batches of 40
Done. Combining outputs
- Adding to all answers...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Done.
RUNNING ITERATION 9


Generating train split: 0 examples [00:00, ? examples/s]

- Generating feedback...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 268.127206325531 secs. Estimated completion: 1206.5724284648895 secs [20.109540474414825 mins]
...processing in batches of 40
Done. Combining outputs


Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

- Refining answer...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 133.11418867111206 secs. Estimated completion: 599.0138490200043 secs [9.983564150333404 mins]
...processing in batches of 40
Done. Combining outputs
- Adding to all answers...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Done.
RUNNING ITERATION 10


Generating train split: 0 examples [00:00, ? examples/s]

- Generating feedback...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 267.3120536804199 secs. Estimated completion: 1202.9042415618896 secs [20.048404026031495 mins]
...processing in batches of 40
Done. Combining outputs


Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

- Refining answer...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 131.53273153305054 secs. Estimated completion: 591.8972918987274 secs [9.86495486497879 mins]
...processing in batches of 40
Done. Combining outputs
- Adding to all answers...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Done.
RUNNING ITERATION 11


Generating train split: 0 examples [00:00, ? examples/s]

- Generating feedback...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 268.25248169898987 secs. Estimated completion: 1207.1361676454544 secs [20.11893612742424 mins]
...processing in batches of 40
Done. Combining outputs


Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

- Refining answer...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

...processing in batches of 40
Processing 100 records took 132.26910519599915 secs. Estimated completion: 595.2109733819962 secs [9.920182889699936 mins]
...processing in batches of 40
Done. Combining outputs
- Adding to all answers...


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Done.
