### Even a good model sometimes produces bad output, so this notebook filters out the worst cases: outputs dominated by special characters, containing overly long words, or showing excessive repetition.

In [1]:
import os
import json
import string
import re

# Locations used by the cleaning step: the folder whose JSON files get
# filtered, and the single JSON file that collects everything removed.
good_data_folder = "./good_data/"
bad_data_file = "./bad_data/extracted_bad_data.json"

# Accumulator for rejected {"input", "output"} records (starts empty).
bad_data = list()

In [2]:
def is_nonsensical(text, threshold):
    """Tell whether symbols make up too large a share of ``text``.

    A character counts as a symbol when it is neither alphanumeric
    (``str.isalnum``) nor a member of ``string.whitespace``. The share is
    measured against the full length of ``text``, whitespace included.

    Args:
        text: The string to inspect.
        threshold: Maximum tolerated symbol share (e.g. ``0.4`` for 40%).

    Returns:
        True if ``text`` is empty or its symbol share is strictly greater
        than ``threshold``; False otherwise.
    """
    symbol_total = 0
    for ch in text:
        if ch.isalnum() or ch in string.whitespace:
            continue
        symbol_total += 1

    size = len(text)
    if size == 0:
        # Nothing to judge: an empty output is treated as unusable.
        return True
    return symbol_total / size > threshold

def has_long_gibberish_sequence(text, max_gibberish_length):
    """Tell whether ``text`` carries too many non-plain characters overall.

    Every ASCII letter, ASCII digit and whitespace character is stripped
    from ``text``; whatever remains (punctuation, underscores, non-ASCII
    characters, ...) is counted. Note that this is the total across the
    whole string, not the length of one consecutive run.

    Args:
        text: The string to inspect.
        max_gibberish_length: Largest tolerated number of leftover characters.

    Returns:
        True if more than ``max_gibberish_length`` characters are left over.
    """
    plain_chars = re.compile(r'[a-zA-Z0-9\s]')
    leftovers = plain_chars.sub('', text)
    return max_gibberish_length < len(leftovers)

def has_long_word(text, max_word_length):
    """Tell whether any word in ``text`` is longer than allowed.

    A word is a maximal run of regex word characters (``\\w``: letters,
    digits and underscore) delimited by word boundaries.

    Args:
        text: The string to inspect.
        max_word_length: Largest tolerated word length.

    Returns:
        True as soon as one word exceeds ``max_word_length`` characters,
        False if none does (including when ``text`` has no words at all).
    """
    for found in re.finditer(r'\b\w+\b', text):
        if len(found.group()) > max_word_length:
            return True
    return False

def has_excessive_repetition(text, repetition_threshold):
    """Tell whether ``text`` repeats its words too much.

    The text is split on whitespace and the share of distinct tokens among
    all tokens is compared against ``1 - repetition_threshold / 100``.

    Args:
        text: The string to inspect.
        repetition_threshold: Tolerated repetition as a percentage
            (e.g. ``30`` means at least 70% of the tokens must be distinct).

    Returns:
        True if ``text`` has no tokens at all, or if the distinct-token share
        is strictly below the required minimum; False otherwise.
    """
    tokens = text.split()
    token_count = len(tokens)
    if token_count == 0:
        # Empty or whitespace-only output is treated as unusable.
        return True

    distinct_share = len(set(tokens)) / token_count
    required_share = (1 - repetition_threshold / 100)
    return distinct_share < required_share

def contains_long_special_character_sequence(text, max_sequence_length=8):
    """Tell whether ``text`` holds a long unbroken run of special characters.

    A special character here is anything that is not an ASCII letter, an
    ASCII digit or whitespace.

    Args:
        text: The string to inspect.
        max_sequence_length: Run length (in characters) at which a sequence
            is considered too long; runs of this length or more match.

    Returns:
        True if at least one run of ``max_sequence_length`` or more
        consecutive special characters occurs in ``text``.
    """
    run_pattern = ''.join((r'[^a-zA-Z0-9\s]{', str(max_sequence_length), ',}'))
    return re.search(run_pattern, text) is not None

def filter_nonsense(text, special_char_threshold=0.4, max_word_length=50, repetition_threshold=30, max_special_char_sequence=8, max_gibberish_length=100):
    """Run every individual quality check and report whether any one fires.

    The checks are evaluated in a fixed order and evaluation stops at the
    first one that rejects the text:

    1. ``is_nonsensical`` with ``special_char_threshold``
    2. ``has_long_word`` with ``max_word_length``
    3. ``has_excessive_repetition`` with ``repetition_threshold``
    4. ``contains_long_special_character_sequence`` with
       ``max_special_char_sequence``
    5. ``has_long_gibberish_sequence`` with ``max_gibberish_length``

    Returns:
        True if ``text`` should be discarded, False if it passed all checks.
    """
    return (
        is_nonsensical(text, special_char_threshold)
        or has_long_word(text, max_word_length)
        or has_excessive_repetition(text, repetition_threshold)
        or contains_long_special_character_sequence(text, max_special_char_sequence)
        or has_long_gibberish_sequence(text, max_gibberish_length)
    )

def process_files(good_data_folder, bad_data_file):
    """Strip rejected entries from every JSON file in a folder, in place.

    Each ``*.json`` file directly inside ``good_data_folder`` is expected to
    contain a JSON list of objects with ``"input"`` and ``"output"`` keys.
    An entry whose output is rejected by ``filter_nonsense`` is removed from
    its file and recorded as ``{"input": ..., "output": ...}`` in the
    module-level ``bad_data`` list. A missing or null output is treated as
    empty text. The whole ``bad_data`` list is then written to
    ``bad_data_file``.

    WARNING: source files are overwritten with the surviving entries. To
    avoid losing the removed entries, the destination directory is created
    before any file is touched, and the bad-data file is written even when
    processing stops early because of an error.

    Files that are not valid JSON, or whose top-level value is not a list,
    are reported and left untouched. Entries that cannot be checked (not an
    object, or an output that is not a string) are kept unchanged.

    Args:
        good_data_folder: Folder containing the JSON files to clean.
        bad_data_file: Path of the JSON file receiving the rejected records.

    Returns:
        None.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be
            read or written.
    """
    # List the folder before any side effect so a wrong path fails fast;
    # sorting makes the order of the recorded bad data reproducible.
    filenames = sorted(os.listdir(good_data_folder))

    # Create the destination directory up front. Otherwise a missing
    # directory would only surface after the source files were overwritten.
    bad_data_dir = os.path.dirname(bad_data_file)
    if bad_data_dir:
        os.makedirs(bad_data_dir, exist_ok=True)

    try:
        for filename in filenames:
            file_path = os.path.join(good_data_folder, filename)

            # Only regular *.json files are processed.
            if os.path.isdir(file_path) or not filename.endswith(".json"):
                continue

            with open(file_path, 'r', encoding='utf-8') as f:
                try:
                    data = json.load(f)
                except json.JSONDecodeError:
                    print(f"Could not decode JSON from {file_path}")
                    continue

            if not isinstance(data, list):
                # Unknown layout: do not rewrite something we do not understand.
                print(f"Skipping {file_path}: expected a JSON list of entries")
                continue

            filtered_data = []
            for entry in data:
                if not isinstance(entry, dict):
                    filtered_data.append(entry)
                    continue

                input_text = entry.get("input", "")
                output_text = entry.get("output", "")
                if output_text is None:
                    # An explicit null is handled like a missing output.
                    output_text = ""
                if not isinstance(output_text, str):
                    # The text checks only apply to strings; keep the entry.
                    filtered_data.append(entry)
                    continue

                if filter_nonsense(output_text):
                    bad_data.append({"input": input_text, "output": output_text})
                else:
                    filtered_data.append(entry)

            # Overwrite the original file with the surviving entries.
            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(filtered_data, f, ensure_ascii=False, indent=4)
    finally:
        # Always persist what was removed, even if a later file failed.
        with open(bad_data_file, 'w', encoding='utf-8') as f:
            json.dump(bad_data, f, ensure_ascii=False, indent=4)

        print(f"Filtered bad data saved to {bad_data_file}")

In [3]:
# Run the cleaning pass. Note: this rewrites the JSON files in the good-data
# folder in place and (re)writes the bad-data file.
process_files(good_data_folder=good_data_folder, bad_data_file=bad_data_file)

Filtered bad data saved to ./bad_data/extracted_bad_data.json
