### JSONL Dataset Checker
Upload your JSONL formatted dataset as "input.txt" and execute the following two code blocks to:
1. See how many training examples are in your dataset.
2. Filter and find prompt-completion pairs that are greater than 2048 tokens. Read more: https://docs.forefront.ai/forefront/master/key-concepts#tokens
3. Filter and find prompt-completion pairs that aren't formatted correctly.
4. Filter and find completions that don't start with a whitespace character (" "). Read more: https://docs.forefront.ai/forefront/guides/fine-tuning#prepare-training-data
5. Filter and find prompts that don't end in a common separator. Read more: https://docs.forefront.ai/forefront/guides/fine-tuning#prepare-training-data
6. Filter and find completions that don't end with "<|endoftext|>".
7. Split dataset into training and validation sets to use the validation examples as test prompts.


In [None]:
!pip3 install transformers

In [12]:
from transformers.utils.dummy_pt_objects import FlaubertWithLMHeadModel
import json
from transformers import GPT2TokenizerFast
from collections import Counter

# GPT-2 tokenizer; the filtering pass further down uses it to measure the
# token length of each prompt-completion pair.
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')

# Row counter, incremented once per line read by the filtering pass.
i = 0

# Tally of rejected rows per error category, all starting at zero.
num_dict = dict.fromkeys(
  ("too_long", "bad_examples", "no_whitespace", "no_separator", "no_end"),
  0,
)
# Number of test examples to remove from the training set
num_dict["test_prompts"] = 5

# Row numbers of the rejected examples: one independent list per error
# category (every key of num_dict except the test-prompt setting).
arr_dict = {category: [] for category in num_dict if category != "test_prompts"}

# Separator bookkeeping, filled in by the separator-detection pass.
separator = ""
separators = []
used_separators = {}

"""
Check for fixed separator.
"""
with open('input.txt') as fin:
  for line in fin:
    try:
      data = json.loads(line)
    except:
      continue
      
    prompt = data['prompt']
    separator = prompt.rsplit('\n', 1)[1]
    separators.append(separator)

used_separators = (Counter(separators))
max_value = max(used_separators.values())

if max_value < (i*0.9):
  print(f'Error: No fixed separator is used at the end of prompts. Used separators: {used_separators}')

max_key = [k for k, v in used_separators.items() if v == max_value]
fixed_separator = max_key[0]

"""
Filter and find training examples with common errors. Output filtered examples to train.txt and test_prompts.txt.
"""
with open('input.txt') as fin:
  with open('train.txt', 'w') as ftrain:
    with open('test_prompts.txt', 'w') as ftest_prompts:
      for line in fin:
        i += 1
        try:
          data = json.loads(line)
        except:
          num_dict['bad_examples'] += 1
          arr_dict['bad_examples'].append(i)
          continue
          
        prompt = data['prompt']
        completion = data['completion']
        full = f'{prompt}{completion}'

        # Filter prompts that don't end with the common separator
        separator = prompt.rsplit('\n', 1)[1]
        if separator != fixed_separator:
          num_dict['no_separator'] += 1
          arr_dict['no_separator'].append(i)
          continue

        # Filter completions that don't end with <|endoftext|>
        if not completion.endswith('<|endoftext|>'):
          num_dict['no_end'] += 1
          arr_dict['no_end'].append(i)
          continue

        # Filter completions that don't start with a whitespace
        if not completion.startswith(" "):
          num_dict['no_whitespace'] += 1
          arr_dict['no_whitespace'].append(i)
          continue

        # Filter prompt-completion pairs that are too long (2048 tokens or greater)
        length = len(tokenizer.encode(full))
        if length > 2047:
          num_dict['too_long'] += 1
          arr_dict['too_long'].append(i)
          continue

        # Filter prompt-completion pairs to use for test prompts
        if (i <= num_dict['test_prompts']):
          ftest_prompts.write(json.dumps({'prompt': prompt, 'completion': completion}))
          ftest_prompts.write('\n')
          continue

        ftrain.write(json.dumps({'prompt': prompt, 'completion': completion}))
        ftrain.write('\n')


filtered_examples = sum(num_dict.values()) - num_dict['test_prompts']
export_examples = i - filtered_examples
train_examples = export_examples - num_dict['test_prompts']

"""
Messages
"""
if i > 0:
  print(f'\n\nYour dataset contains {i} prompt-completion pairs.\n\n###\n\n')
else:
  print('Your dataset has no prompt-completion pairs.\n\n###\n\n')

if num_dict['bad_examples'] > 0:
  print(f'Error: {num_dict["bad_examples"]} prompt-completion pairs are formatted incorrectly. These are rows: {arr_dict["bad_examples"]}\n\n')
else:
  print('All prompt-completion pairs are formatted properly.\n\n')

if num_dict['too_long'] > 0:
  print(f'Error: {num_dict["too_long"]} prompt-completion pairs exceed 2048 tokens. These are rows: {arr_dict["too_long"]}. Read more: https://docs.forefront.ai/forefront/master/key-concepts#tokens \n\n')
else:
  print('There are no prompt-completions pairs that exceed 2048 tokens. Read more: https://docs.forefront.ai/forefront/master/key-concepts#tokens \n\n')

if num_dict['no_whitespace'] > 0:
  print(f'Error: {num_dict["no_whitespace"]} completions don\'t start with a whitespace character (" "). These are rows: {arr_dict["no_whitespace"]}. Read more: https://docs.forefront.ai/forefront/guides/fine-tuning#prepare-training-data \n\n')
else:
  print('All completions start with a whitespace character. Read more: https://docs.forefront.ai/forefront/guides/fine-tuning#prepare-training-data \n\n')

if num_dict['no_separator'] > 0:
  print(f'Error: {num_dict["no_separator"]} prompt don\'t end with the common separator. These are rows: {arr_dict["no_separator"]}. Fixed separator: "{fixed_separator}". Read more: https://docs.forefront.ai/forefront/guides/fine-tuning#prepare-training-data \n\n')
else:
  print(f'All prompts end with a common separator. Fixed separator: "{fixed_separator}".\n\n')

if num_dict['no_end'] > 0:
  print(f'Error: {num_dict["no_end"]} completions don\'t end with <\|endoftext\|>. These are rows: {arr_dict["no_end"]}. Read more: https://docs.forefront.ai/forefront/guides/fine-tuning#prepare-training-data \n\n###\n\n')
else:
  print('All completions end with "<|endoftext|>".\n\n\n###\n\n')

if num_dict['bad_examples'] > 0:
  print(f'{filtered_examples} prompt-completion pairs have been removed due to errors. {export_examples} prompt-completion pairs are being exported:\n\n\ntrain.txt: {train_examples} training examples\n\n\ntest_prompts.txt: {num_dict["test_prompts"]} test examples\n\n\nDownload train.txt and test_prompts.txt to start fine-tuning: https://docs.forefront.ai/forefront/guides/fine-tuning#train-a-new-fine-tuned-model \n\n\nIf you don\'t see the files after completion, refresh the page.')
else:
  print(f'{export_examples} prompt-completion pairs are being exported:\n\n\ntrain.txt: {train_examples} training examples\n\n\ntest_prompts.txt: {num_dict["test_prompts"]} test examples\n\n\nDownload train.txt and test_prompts.txt to start fine-tuning: https://docs.forefront.ai/forefront/guides/fine-tuning#train-a-new-fine-tuned-model \n\n\nIf you don\'t see the files after completion, refresh the page.')

Token indices sequence length is longer than the specified maximum sequence length for this model (1043 > 1024). Running this sequence through the model will result in indexing errors




Your dataset contains 9293 prompt-completion pairs.

###


Error: 6 prompt-completion pairs are formatted incorrectly. These are rows: [1, 2, 3, 4, 9292, 9293]


There are no prompt-completions pairs that exceed 2048 tokens. Read more: https://docs.forefront.ai/forefront/master/key-concepts#tokens 


All completions start with a whitespace character. Read more: https://docs.forefront.ai/forefront/guides/fine-tuning#prepare-training-data 


All prompts end with a common separator. Fixed separator: "Review:".


All completions end with "<|endoftext|>".


###


6 prompt-completion pairs have been removed due to errors. 9287 prompt-completion pairs are being exported:


train.txt: 9282 training examples


test_prompts.txt: 5 test examples


Download train.txt and test_prompts.txt to start fine-tuning: https://docs.forefront.ai/forefront/guides/fine-tuning#train-a-new-fine-tuned-model 


If you don't see the files after completion, refresh the page.
