### JSONL Dataset Checker
Upload your JSONL formatted dataset as "input.txt" and execute the following two code blocks to:
1. See how many training examples are in your dataset.
2. Filter and find prompt-completion pairs that are greater than 2048 tokens. Read more: https://docs.forefront.ai/forefront/master/key-concepts#tokens
3. Filter and find prompt-completion pairs that aren't formatted correctly.
4. Filter and find completions that don't start with a whitespace character (" "). Read more: https://docs.forefront.ai/forefront/guides/fine-tuning#prepare-training-data
5. Filter and find prompts that don't end in a common separator. Read more: https://docs.forefront.ai/forefront/guides/fine-tuning#prepare-training-data
6. Filter and find completions that don't end with "<|endoftext|>".
7. Split dataset into training and validation sets to use the validation examples as test prompts.


In [None]:
!pip3 install transformers

In [None]:
from transformers.utils.dummy_pt_objects import FlaubertWithLMHeadModel
import json
from transformers import GPT2TokenizerFast
from collections import Counter

"""
Upload your JSONL formatted dataset as input.txt and execute this code to:
1. See how many training examples are in your dataset.
2. Filter prompt-completion pairs that are greater than 2048 tokens.
3. Filter completions that don't start with a whitespace character (" ").
4. Filter prompts that don't end in a common separator.
5. Split your dataset into training and validation sets to use the validation examples as test prompts.
"""

# GPT-2 tokenizer, used further down to measure the token length of each example
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')

# Row counter for the dataset; incremented once per line while filtering
i = 0

# How many rows were rejected, per category
num_too_long = num_bad_examples = num_no_whitespace = 0
num_no_separator = num_no_end = 0

# Row numbers of the rejected rows, one independent list per category
arr_too_long, arr_bad_examples, arr_no_whitespace = [], [], []
arr_no_separator, arr_no_end = [], []

# Separator bookkeeping: every separator seen, their frequencies, and the current one
separators = list()
used_separators = dict()
separator = str()

# Number of test examples to be removed from the training set
num_test_prompts = 5

# Find the most commonly used separator.
# A prompt's separator is the text that follows its last newline character.
# Rows that are not valid JSON, or that carry no string "prompt", are skipped here.
num_prompts_seen = 0  # prompts inspected; denominator of the 90% check below
with open('input.txt') as fin:
  for line in fin:
    try:
      data = json.loads(line)
    except ValueError:
      # json.JSONDecodeError is a ValueError subclass: the row is not valid JSON
      continue

    prompt = data.get('prompt') if isinstance(data, dict) else None
    if not isinstance(prompt, str):
      # No usable prompt on this row, so it cannot contribute a separator
      continue

    num_prompts_seen += 1
    _, newline, separator = prompt.rpartition('\n')
    if not newline:
      # The prompt contains no newline, hence no separator to record
      # (rsplit('\n', 1)[1] would raise IndexError for such a prompt)
      continue
    separators.append(separator)

used_separators = Counter(separators)
if used_separators:
  max_value = max(used_separators.values())
  # On a tie the separator that appeared first in the file wins
  max_key = [k for k, v in used_separators.items() if v == max_value]
  common_separator = max_key[0]
else:
  # Nothing was collected (empty file, or no prompt contains a newline);
  # max() on an empty sequence would raise ValueError, so fall back explicitly
  max_value = 0
  max_key = []
  common_separator = ""

# Check that a separator is used at the end of prompts at least 90% of the time.
# The comparison uses the number of prompts inspected above, because the row
# counter `i` has not been advanced yet at this point in the script.
if max_value < (num_prompts_seen * 0.9):
  print(f'Error: No common separator is used at the end of prompts. Separators: {used_separators}')

# Filter the dataset row by row and export the surviving examples.
# `i` is the 1-based row number. A rejected row is counted, and its row number
# recorded, under the first check it fails. The first `num_test_prompts` rows
# that pass every check go to test_prompts.txt; all later ones go to train.txt.
num_test_written = 0  # examples written to test_prompts.txt so far
with open('input.txt') as fin, \
     open('train.txt', 'w') as ftrain, \
     open('test_prompts.txt', 'w') as ftest_prompts:
  for line in fin:
    i += 1
    try:
      data = json.loads(line)
    except ValueError:
      # json.JSONDecodeError is a ValueError subclass: the row is not valid JSON
      data = None

    # Filter rows that are not a JSON object with string "prompt" and "completion"
    prompt = data.get('prompt') if isinstance(data, dict) else None
    completion = data.get('completion') if isinstance(data, dict) else None
    if not isinstance(prompt, str) or not isinstance(completion, str):
      num_bad_examples += 1
      arr_bad_examples.append(i)
      continue

    full = f'{prompt}{completion}'

    # Filter prompts that don't end with the common separator.
    # A prompt without any newline has no separator at all.
    _, newline, separator = prompt.rpartition('\n')
    if not newline or separator != common_separator:
      num_no_separator += 1
      arr_no_separator.append(i)
      continue

    # Filter completions that don't end with <|endoftext|>
    if not completion.endswith('<|endoftext|>'):
      num_no_end += 1
      arr_no_end.append(i)
      continue

    # Filter completions that don't start with a whitespace
    if not completion.startswith(" "):
      num_no_whitespace += 1
      arr_no_whitespace.append(i)
      continue

    # Filter prompt-completion pairs that are too long (2048 tokens or greater)
    length = len(tokenizer.encode(full))
    if length > 2047:
      num_too_long += 1
      arr_too_long.append(i)
      continue

    # Split only after every check has passed, and count what was actually
    # written rather than relying on the row number: rejected rows near the top
    # of the file must not shrink the test set, and test prompts must be valid.
    example = json.dumps({'prompt': prompt, 'completion': completion})
    if num_test_written < num_test_prompts:
      ftest_prompts.write(example)
      ftest_prompts.write('\n')
      num_test_written += 1
    else:
      ftrain.write(example)
      ftrain.write('\n')

# Summarise the run: totals first, then one status line per check, then the export summary.
filtered_examples = num_bad_examples + num_too_long + num_no_whitespace + num_no_separator + num_no_end
total_examples = i - filtered_examples
# The test set can never hold more examples than survived filtering, so clamp
# it; otherwise a small dataset would report a negative number of training examples.
test_examples = min(num_test_prompts, total_examples)
train_examples = total_examples - test_examples

if i > 0:
  print(f'\n\nYour dataset contains {i} prompt-completion pairs.\n\n###\n\n')
else:
  print('Your dataset has no prompt-completion pairs.\n\n###\n\n')

if num_bad_examples > 0:
  print(f'Error: {num_bad_examples} prompt-completion pairs are formatted incorrectly. These are rows: {arr_bad_examples}\n\n')
else:
  print('All prompt-completion pairs are formatted properly.\n\n')

if num_too_long > 0:
  print(f'Error: {num_too_long} prompt-completion pairs exceed 2048 tokens. These are rows: {arr_too_long}. Read more: https://docs.forefront.ai/forefront/master/key-concepts#tokens \n\n')
else:
  print('There are no prompt-completions pairs that exceed 2048 tokens. Read more: https://docs.forefront.ai/forefront/master/key-concepts#tokens \n\n')

if num_no_whitespace > 0:
  print(f'Error: {num_no_whitespace} completions don\'t start with a whitespace character (" "). These are rows: {arr_no_whitespace}. Read more: https://docs.forefront.ai/forefront/guides/fine-tuning#prepare-training-data \n\n')
else:
  print('All completions start with a whitespace character. Read more: https://docs.forefront.ai/forefront/guides/fine-tuning#prepare-training-data \n\n')

if num_no_separator > 0:
  print(f'Error: {num_no_separator} prompt don\'t end with the common separator. These are rows: {arr_no_separator}. Common separator: "{common_separator}". Read more: https://docs.forefront.ai/forefront/guides/fine-tuning#prepare-training-data \n\n')
else:
  print(f'All prompts end with a common separator. Common separator: "{common_separator}".\n\n')

# Report the <|endoftext|> check with its own counter and row list (not the
# separator ones), and without backslashes in front of the pipes: "\|" is not
# a valid escape sequence, so the backslashes would be printed literally.
if num_no_end > 0:
  print(f'Error: {num_no_end} completions don\'t end with <|endoftext|>. These are rows: {arr_no_end}. Read more: https://docs.forefront.ai/forefront/guides/fine-tuning#prepare-training-data \n\n###\n\n')
else:
  print('All completions end with "<|endoftext|>".\n\n\n###\n\n')

export_summary = (
  f'{total_examples} prompt-completion pairs are being exported:\n\n\n'
  f'train.txt: {train_examples} training examples\n\n\n'
  f'test_prompts.txt: {test_examples} test examples\n\n\n'
  'Download train.txt and test_prompts.txt to start fine-tuning: https://docs.forefront.ai/forefront/guides/fine-tuning#train-a-new-fine-tuned-model \n\n\n'
  'If you don\'t see the files after completion, refresh the page.'
)

# Mention removals whenever any check rejected a row, not only badly formatted ones
if filtered_examples > 0:
  print(f'{filtered_examples} prompt-completion pairs have been removed due to errors. {export_summary}')
else:
  print(export_summary)