In [None]:
from datasets import load_dataset
from utils import preprocess_function, encode_rare_chars
from tqdm import tqdm

In [None]:
# Root folder of the local WikiSQL copy; the loader is pointed at its `data` subfolder.
path = '../datasets/wikisql'
dataset = load_dataset(f"{path}/data")

In [None]:
# Apply `preprocess_function` batch-wise (2048 rows per call), then display the result.
preprocessed_dataset = dataset.map(
    preprocess_function,
    batched=True,
    batch_size=2048,
)
preprocessed_dataset

In [None]:
# Pull the three splits out of the preprocessed dataset in one go.
train_data, val_data, test_data = (
    preprocessed_dataset[split_name]
    for split_name in ("train", "validation", "test")
)
# Show the validation split as the cell output.
val_data

In [None]:
# Flatten the training split into one list: each sample contributes its
# input text followed by its label text.
text_data = [
    sample[field]
    for sample in tqdm(preprocessed_dataset["train"])
    for field in ('input_text', 'label_text')
]

# Write one stripped entry per line to a plain text file.
output_file = "tokenizer_training_data2.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(entry.strip() + "\n" for entry in text_data)

# One newline is appended per entry, so the number of inserted newlines
# equals the number of entries; also tally the '[SEP]' markers.
newline_count = len(text_data)
sep_count = sum(entry.count('[SEP]') for entry in text_data)

print(sep_count, newline_count)
print(f"Data written to {output_file}")

In [None]:
from collections import Counter
import json

file_path = 'tokenizer_training_data2.txt'

# Step 1: Read the whole corpus file into memory.
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Step 2: Count occurrences of each character.
char_counts = Counter(text)

# Correct for inserted newlines: `newline_count` comes from the cell that wrote
# this file (one "\n" appended per written entry). Subtracting it leaves only
# the newlines that were part of the entries themselves.
print(char_counts["\n"])
char_counts["\n"] -= newline_count
print(char_counts["\n"])

# Step 3: Separate characters and their frequencies.
characters = list(char_counts.keys())
frequencies = list(char_counts.values())

# Step 4: Characters ordered from most to least frequent.
most_common_chars = char_counts.most_common()

# Step 5: Calculate cumulative coverage (in percent) along that ordering.
total_characters = sum(char_counts.values())
if total_characters <= 0:
    # Fail with a clear message instead of a ZeroDivisionError further down.
    raise ValueError(f"No characters left to analyse in {file_path!r}")

cumulative_frequencies = []
cumulative_sum = 0

for _, freq in most_common_chars:
    cumulative_sum += freq
    cumulative_frequencies.append((cumulative_sum / total_characters) * 100)

# Step 6: Define a character coverage threshold (percent of all characters).
coverage_threshold = 99.85  # Adjust this value as needed

# Index of the last character needed to reach the threshold. If rounding keeps
# the final cumulative value just below the threshold, fall back to the last
# index (i.e. treat every character as covered) rather than raising StopIteration.
coverage_index = next(
    (i for i, freq in enumerate(cumulative_frequencies) if freq >= coverage_threshold),
    len(cumulative_frequencies) - 1,
)

# Set of characters within the coverage.
covered_chars = set(char for char, _ in most_common_chars[:coverage_index + 1])

print(covered_chars)
print(len(covered_chars))

# Step 7: Map every out-of-coverage character to a "[MAP]<i>[/MAP]" sequence.
# Covered characters get no entry in `mapping`.
# The characters are sorted before numbering: iterating a set of strings has no
# stable order across interpreter runs, which would assign different indices
# every time the kernel is restarted and make the saved mapping irreproducible.
out_of_coverage_chars = set(characters) - covered_chars
mapping = {
    char: f"[MAP]{i}[/MAP]"
    for i, char in enumerate(sorted(out_of_coverage_chars))
}

# Step 8: Build the reverse mapping (special sequence -> original character).
reverse_mapping = {v: k for k, v in mapping.items()}

In [None]:
mapping_file_path = 'mapping.json'
reverse_mapping_file_path = 'reverse_mapping.json'

# Persist both lookup tables as UTF-8 JSON, keeping non-ASCII characters
# readable (ensure_ascii=False) and using a 4-space indent.
for target_path, table in (
    (mapping_file_path, mapping),
    (reverse_mapping_file_path, reverse_mapping),
):
    with open(target_path, 'w', encoding='utf-8') as json_handle:
        json.dump(table, json_handle, ensure_ascii=False, indent=4)

In [None]:
def _encode_batch(batch):
    """Run `encode_rare_chars` on one batch using the notebook-level `mapping`."""
    return encode_rare_chars(batch, mapping)


# Apply the rare-character encoding batch-wise (2048 rows per call).
preprocessed_encoded_dataset = preprocessed_dataset.map(
    _encode_batch,
    batched=True,
    batch_size=2048,
)

In [None]:
# Inspect one training label before rare-character encoding.
raw_train_labels = preprocessed_dataset["train"]["label_text"]
raw_train_labels[46344]

In [None]:
# Inspect the same training label index in the encoded dataset.
encoded_train_labels = preprocessed_encoded_dataset["train"]["label_text"]
encoded_train_labels[46344]

In [None]:
# Gather the encoded training texts: input text then label text for every sample.
text_data = []
for sample in tqdm(preprocessed_encoded_dataset["train"]):
    text_data.extend((sample['input_text'], sample['label_text']))

# Write one stripped entry per line to a plain text file.
output_file = "encoded_tokenizer_training_data.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(f"{entry.strip()}\n" for entry in text_data)

# One newline is appended per entry; also tally the '[SEP]' and '[MAP]' markers.
newline_count = len(text_data)
sep_count = sum(entry.count('[SEP]') for entry in text_data)
map_count = sum(entry.count('[MAP]') for entry in text_data)

print(sep_count, map_count, newline_count)
print(f"Data written to {output_file}")