In [None]:
import json 
import os 
import gzip
from torch.utils.data import DataLoader
from torch.utils.data import IterableDataset
from datasets import load_dataset
from transformers import GPT2Tokenizer
from customTransformers import DecodeTransformer 
from utils.common import save_file_text, read_file_text

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
tokenizer = GPT2Tokenizer.from_pretrained(
    "gpt2",
    local_files_only=True
)
tokenizer.pad_token = tokenizer.eos_token

In [2]:
ds = load_dataset("roneneldan/TinyStories", split="train")

Mid Training QA + generative

In [3]:
REAL_NAMES = [
  "Tim",
  "Tom",
  "Sam",
  "Bob",
  "Ben",
  "Max",
  "Jack",
  "Leo",
  "Alex",
  "Anna",
  "Amy",
  "Emma",
  "Lily",
  "Lucy",
  "Mia",
  "Ella",
  "Sarah",
  "John",
  "Mary"
]
DATASET_PATH = "../CustomDatasets/story.json"

In [4]:
def generate_qa_from_story(story: str):
    qas = []

    sentences = story.split(".")
    sentences = [s.strip() for s in sentences if len(s.strip()) > 0]
    list = {}  

    for s in sentences:
        tokens = s.split()
        if len(tokens) < 3:
            continue 

        name = tokens[0]

        if name not in REAL_NAMES:
            continue

        if name.istitle():
            qas.append({
                "q": f"who is {name.lower()} ?",
                "a": s.strip() + "."
            })

        if "is" in tokens or "was" in tokens:
            qas.append({
                "q": f"what is {name.lower()} doing ?",
                "a": s.strip() + "."
            })

    return qas


def convert_tinystories(dataset, max_samples=50_000):
    output = []

    for ex in dataset:
        story = ex["text"].strip()
        qa = generate_qa_from_story(story)

        if len(qa) == 0:
            continue

        output.append({
            "story": story,
            "qa": qa
        })

        if len(output) >= max_samples:
            break

    return output

In [5]:
storyqa = convert_tinystories(ds, max_samples=100000)

In [7]:
# assert not os.path.exists(DATASET_PATH) , "Not"
if not os.path.exists(DATASET_PATH): 
    print("Packing")
    save_file_text(storyqa, DATASET_PATH)

Packing


In [8]:
storyqa_data = read_file_text(DATASET_PATH)

In [11]:
len(storyqa_data) , storyqa_data[:10]

(100000,
 [{'story': 'One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.',
   'qa': [{'q': 'who is lily ?',
     'a': 'Lily wanted to share the needle with her mom, so she could sew a button on her shirt.'},
    {'q': 'who is lily ?',
     'a': 'Lily went to her mom and said, "Mom, I found this needle.'}]},
  {'story': 'Once upon a time, ther

In [12]:
import json
import random

random.seed(42)

def normalize(text):
    return " ".join(text.strip().lower().split())

def build_sft(): 
    data = read_file_text(DATASET_PATH)

    sft = []
    seen_pairs = set()

    for ex in data:
        for qa in ex["qa"]:
            pair_key = (
                normalize(qa["q"]),
                normalize(qa["a"])
            )

            if pair_key in seen_pairs:
                continue

            seen_pairs.add(pair_key)
            sft.append({
                "prompt": qa["q"].strip(),
                "response": qa["a"].strip()
            })

    UNKNOWN_NAMES = [
        "billy", "alex", "john", "mark", "peter",
        "sarah", "lucas", "james", "emma2", "tom2"
    ]

    for name in UNKNOWN_NAMES:
        for template in [
            f"who is {name} ?",
            f"what is {name} doing ?",
            f"tell me about {name}"
        ]:
            sft.append({
                "prompt": template,
                "response": f"I don't know who {name.capitalize()} is."
            })

    random.shuffle(sft)
    save_file_text(sft, "sft.json")
    print(f"SFT samples: {len(sft)}")
    return sft


In [13]:
sftdata = build_sft()

SFT samples: 271779


In [14]:
len(storyqa_data) , len(sftdata)

(100000, 271779)

In [16]:
sftdata

[{'prompt': 'who is lily ?',
  'response': 'Lily and Tom were friends who liked to play with loops.'},
 {'prompt': 'who is lily ?',
  'response': 'Lily loved the beach because she could build sandcastles and look for seashells.'},
 {'prompt': 'who is alex ?',
  'response': 'Alex said he knew the perfect person to ask - his grandpa! He said his grandpa was really brave and he loved going on his raft.'},
 {'prompt': 'who is jack ?',
  'response': 'Jack felt so refreshed after his bath and Mum and Dad smiled, glad to have been so patient.'},
 {'prompt': 'who is amy ?',
  'response': 'Amy loved to play with her friends in the park.'},
 {'prompt': 'who is tim ?',
  'response': 'Tim picked up the crazy toy and started to play.'},
 {'prompt': 'who is jack ?',
  'response': 'Jack asked "Can we try it?" and Uncle Joe said "Yes of course!" They raced to the game and started playing.'},
 {'prompt': 'who is lily ?',
  'response': "Lily knew that the puppy didn't belong to anyone, so she decided to