In [110]:
import json
import os
import pprint
from tqdm import tqdm
from data_adaptor import DataAdaptor

In [None]:
# DO THIS make funtion for taking finetuning to alpaca
# run data_genrator but change adaptor
# custom function for alpaca finetuning in adaptor
# maybe don't use finetuning data in google drive, sets

In [111]:
N_EXAMPLES = -1
SPLIT = 'test'

# 2WikiMultihopQA
Authors only use question-answer information. The context is not provided.
> "we use only the question-answer pairs from these datasets, not any passages of relevant text that they contain. These datasets both contain 2-hop compositional questions sourced from facts that appear in Wikipedia articles." - Press, et al.

Authors do not use this dataset to measure compositionality gap which requires known sub-questions and answers to measure.
> "Note that the rest of this section shows that elicitive prompts improve performance but does not show that they narrow the compositionality gap since we lack sub-questions for datasets other than CC." - Press, et al.

In [112]:
from data_loaders import load_2WikiMultihopQA

In [113]:
# Load the training data
wiki_sample = load_2WikiMultihopQA(n_examples=N_EXAMPLES, split=SPLIT)

In [114]:
len(wiki_sample)

12576

In [115]:
# Print the first example in pretty format
print(json.dumps(wiki_sample[0], indent=4))

{
    "_id": "8a65901a0bdb11eba7f7acde48001122",
    "type": "compositional",
    "question": "Which country the director of film One Law For The Woman is from?",
    "context": [
        [
            "Dell Henderson",
            [
                "George Delbert \"Dell\" Henderson (July 5, 1877 \u2013 December 2, 1956) was a Canadian-American actor, director, and writer.",
                "He began his long and prolific film career in the early days of silent film."
            ]
        ],
        [
            "Inez Mee Boren",
            [
                "Inez Mee Boren( born November 2, 1880) was the director of the Woman's City Club of Oakland."
            ]
        ],
        [
            "John Farrell (businessman)",
            [
                "John Farrell is the director of YouTube in Latin America."
            ]
        ],
        [
            "Peter Levin",
            [
                "Peter Levin is an American director of film, television and theatre."
      

Let's look at the evidence required to answer the question. It's possible we create prompt examples using these evidences to insert into the fine-tuning set. For example,

**Examplar:**
```
Question: Are director of film Move (1970 Film) and director of film Méditerranée (1963 Film) from the same country?
Are follow up questions needed here: Yes.
Follow up: Who is the director of Move (1970 film)?
Intermediate answer: Stuart Rosenberg.
Follow up: Who is the director of Méditerranée (1963 film)?
Intermediate answer: Jean-Daniel Pollet
Follow up: What is the country of citizenship of Stuart Rosenberg?
Intermediate answer: American
Follow up: What is the country of citizenship of Jean-Daniel Pollet?
Intermediate answer: French
So the final answer is: no
```

In [116]:
wiki_sample[0]['supporting_facts']

[]

### Adapting to Self-Ask Examplar

In [117]:
wiki_adaptor = DataAdaptor(dataset="2WikiMultihopQA")

In [118]:
# wiki_examplars = wiki_adaptor.generate_examplars(wiki_sample, strategy="self-ask")
#for examplar in wiki_examplars:
 #   print(examplar)

### Adapting to Self-Ask Training Example
We can augment the target texts in the dataset with the self-ask rationale to fine-tune a language model to generate text with the self-ask rationale.

In [119]:
#wiki_training_examples = wiki_adaptor.generate_training_examples(wiki_sample, strategy="self-ask")
#for training_example in wiki_training_examples[:5]:
 #   print(json.dumps(training_example, indent=4))

In [120]:
# print(wiki_training_examples[0]["prompt"])
# print(wiki_training_examples[0]["target"])

### Direct Prompting Training Examples
Simply provide the facts and ask the question. No thought variable or rationale involved.

In [121]:
direct_training_examples = wiki_adaptor.generate_training_examples(
    wiki_sample,
    strategy="direct"
)
len(direct_training_examples)

Generating 2WikiMultihopQA direct training examples: 100%|██████████████████████████| 12576/12576 [00:00<00:00, 317255.70it/s]
Structuring 2WikiMultihopQA direct training examples: 100%|██████████████████████████| 12576/12576 [00:00<00:00, 12621.86it/s]


12576

In [122]:
print("--------- Augmented Prompt ---------")
print(direct_training_examples[0]["prompt"])
print("--------- Target ---------")
print(direct_training_examples[0]["target"])

--------- Augmented Prompt ---------

Question: Which country the director of film One Law For The Woman is from?
Answer:
--------- Target ---------



### Augment with In-Context Examplars and Self-Ask Rationale Targets
We can combine the two above to create an augmented fine-tuning dataset:
1. Prompt text has in-context examplars
2. Target text has the self-ask rationale

In [123]:
# training_examplars = wiki_examplars[:4]
# augmented_example = wiki_adaptor.generate_training_examples(
#     wiki_sample[0], 
#     strategy="self-ask", 
#     examplars=training_examplars
#     )[0]
# print("--------- Augmented Prompt ---------")
# print(augmented_example["prompt"])
# print("--------- Target ---------")
# print(augmented_example["target"])

In [124]:
# training_examplars = wiki_examplars[:2]
# augmented_examples = wiki_adaptor.generate_training_examples(
#     wiki_sample, 
#     strategy="self-ask",
#     examplars=training_examplars
#     )

# # look at token counts in prompt (context size)
# print("context size")
# for example in augmented_examples:
#     print(example["num_prompt_tokens"])

# # look at token counts
# print("total tokens")
# for example in augmented_examples:
#     print(example["num_tokens"])

In [125]:
# print(augmented_examples[1]["prompt"])

# Format for alpaca

In [127]:
example

{'prompt': '\nQuestion: Which country the director of film One Law For The Woman is from?\nAnswer:',
 'target': '',
 'num_prompt_tokens': 18,
 'num_target_tokens': 0,
 'num_tokens': 18}

In [126]:
alpaca_training_examples = []
#for example in direct_training_examples:
for example in tqdm(direct_training_examples, desc="Formatting examples for alpaca"):
    prompt_split = example['prompt'].split('\n\n')
    current_example = {'instruction': prompt_split[1],
        'input': prompt_split[0],
        'output': example['target']}
    alpaca_training_examples.append(current_example)

pprint.pprint(alpaca_training_examples[:5])

Formatting examples for alpaca:   0%|                                                               | 0/12576 [00:00<?, ?it/s]


IndexError: list index out of range

In [None]:
os.getcwd()

In [None]:
with open(f'data/Alpaca-LoRa/{SPLIT}_alpaca.json', 'w') as f:
    json.dump(alpaca_training_examples, f)