In [None]:
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
from datasets import load_dataset
from utils import preprocess_function, tokenize, filter_function
from transformers import T5Tokenizer, PreTrainedTokenizerFast, convert_slow_tokenizer
from lib.dbengine import DBEngine
from lib.query import Query
import re

In [None]:
# Directory holding the WikiSQL dataset, relative to this notebook.
path = '../datasets/wikisql/data'

# Load all splits once and keep direct handles to the two used below.
dataset = load_dataset(path)
train_data, val_data = dataset["train"], dataset["validation"]

In [None]:
# Build a fast tokenizer from the model file at tokenizers/sp_2k_bpe_1.model.
# (Alternative kept for reference, not used:
#  AutoTokenizer.from_pretrained("google/t5-efficient-tiny"))
slow_tokenizer = T5Tokenizer("tokenizers/sp_2k_bpe_1.model", legacy=False)
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=convert_slow_tokenizer.convert_slow_tokenizer(slow_tokenizer)
)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})


def _tokenize_batch(batch):
    """Tokenize one batch with no length limits and padding="do_not_pad"."""
    return tokenize(
        batch,
        tokenizer,
        input_max_length=None,
        output_max_length=None,
        padding="do_not_pad",
    )


# Preprocess first, then tokenize; the tokenization pass bypasses the on-disk cache.
preprocessed_dataset = dataset.map(preprocess_function, batched=True, batch_size=2048)
tokenized_dataset = preprocessed_dataset.map(
    _tokenize_batch,
    batched=True,
    batch_size=2048,
    load_from_cache_file=False,
)

In [None]:
# Per-split views of the tokenized data used by the analyses below.
tokenized_train_data, tokenized_val_data = (
    tokenized_dataset[split] for split in ("train", "validation")
)

In [None]:
def sequence_length_histograms(data):
    """Plot side-by-side histograms of input and label sequence lengths.

    Args:
        data: Object indexable by "input_ids" and "labels", each yielding an
            iterable of sequences that support ``len()``.

    For each panel, one bin is requested per integer in the observed length
    range and the mean length is marked with a dashed red line. Both panels
    share the y-axis, so only the left one carries a y-axis label.
    """
    # (column, bar colour, x-axis label, title, label the y-axis?)
    panels = (
        ("input_ids", 'blue', 'Input Sequence Length',
         'Histogram of Input Sequence Lengths', True),
        ("labels", 'green', 'Label Sequence Length',
         'Histogram of Label Sequence Lengths', False),
    )

    # Measure all sequences and derive the bin counts before any figure exists.
    lengths = {column: [len(seq) for seq in data[column]] for column, *_ in panels}
    bin_counts = {column: max(values) - min(values) + 1 for column, values in lengths.items()}

    _, axes = plt.subplots(1, 2, figsize=(12, 6), sharey=True)

    for ax, (column, colour, xlabel, title, with_ylabel) in zip(axes, panels):
        values = lengths[column]
        ax.hist(values, bins=bin_counts[column], alpha=0.7, color=colour, edgecolor='black')
        ax.axvline(np.mean(values), color='red', linestyle='dashed', linewidth=1, label='Mean Length')
        ax.set_xlabel(xlabel)
        if with_ylabel:
            ax.set_ylabel('Frequency')
        ax.set_title(title)
        ax.legend()

    # Render the finished figure.
    plt.tight_layout()
    plt.show()

In [None]:
# Length distributions for the tokenized training split before any filtering.
sequence_length_histograms(tokenized_train_data)

In [None]:
import pandas as pd

file_path = 'tokenizer_training_data.txt'

# Load the whole corpus into memory as one string.
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Tally every distinct character; keys and counts stay in first-seen order.
char_counts = Counter(text)
characters, frequencies = list(char_counts), list(char_counts.values())

# Overview plot: one unlabeled bar per distinct character.
bar_positions = range(len(characters))
plt.figure(figsize=(12, 6))
plt.bar(bar_positions, frequencies, color='skyblue')
plt.title('Character Frequency Histogram (Full)')
plt.xlabel('Character Index')
plt.ylabel('Frequency')
plt.tight_layout()  # keep labels inside the figure
plt.show()

# How many of the most frequent characters to tabulate (adjust as needed).
most_common_count = 80
most_common_chars = char_counts.most_common(most_common_count)

# Walk the ranking once, collecting the table columns together with the
# running share (in percent) of all characters covered so far.
total_characters = sum(char_counts.values())
cumulative_sum = 0
cumulative_frequencies = []
char_labels, char_freqs = [], []
for char, freq in most_common_chars:
    cumulative_sum += freq
    cumulative_frequencies.append((cumulative_sum / total_characters) * 100)
    char_labels.append(repr(char))  # repr keeps whitespace/control characters visible
    char_freqs.append(freq)

table_data = {
    "Character": char_labels,
    "Frequency": char_freqs,
    "Coverage (%)": [round(cov, 2) for cov in cumulative_frequencies],
}
table = pd.DataFrame(table_data)

# Emit the table as plain text without the index column.
print(f"Top {most_common_count} Most Common Characters and Cumulative Coverage:")
print(table.to_string(index=False))

In [None]:
def _lengths_and_order(sequences):
    """Return each sequence's length and the indices sorted by ascending length.

    The sort is stable, so equally long sequences keep their original order.
    """
    lengths = [len(seq) for seq in sequences]
    order = sorted(range(len(lengths)), key=lengths.__getitem__)
    return lengths, order


input_lengths, sorted_input_indices = _lengths_and_order(tokenized_train_data["input_ids"])
label_lengths, sorted_label_indices = _lengths_and_order(tokenized_train_data["labels"])

In [None]:
# Show the training sample with the 7th-shortest label sequence
# (sorted_label_indices is ordered by ascending label length; index 6 is an ad-hoc pick).
tokenized_train_data[sorted_label_indices[6]]

In [None]:
# Keep only the training samples for which filter_function (imported from utils)
# returns a truthy value, then display the resulting dataset summary.
filtered_train_data = tokenized_train_data.filter(lambda sample: filter_function(sample, tokenizer))
filtered_train_data

In [None]:
# Same length histograms as above, now for the filtered training split.
sequence_length_histograms(filtered_train_data)

In [None]:
# Database file for the WikiSQL training tables (path is relative to this notebook).
db_path = '../datasets/wikisql/tables/train/train.db'
# Engine used by the functions below to execute gold queries against those tables.
db_engine = DBEngine(db_path)

def empty_response_filter(batch):
    """Flag the samples whose gold SQL query returns ``[None]`` from the database.

    Written for ``Dataset.filter(..., batched=True)``: the returned mask keeps
    exactly those samples for which ``db_engine.execute_query`` yields ``[None]``.

    Args:
        batch: Mapping with parallel "table" and "sql" lists. Each sql entry
            has a "conds" mapping holding "column_index", "operator_index"
            and "condition" lists; each table entry has an "id".

    Returns:
        list[bool]: One flag per sample, True when the query result is ``[None]``.

    The batch is left untouched, so the function can be called repeatedly on
    the same batch object.
    """
    keep = []
    for table, sql in zip(batch["table"], batch["sql"]):
        conds = sql["conds"]
        # Query.from_dict receives the conditions as a list of
        # (column_index, operator_index, condition) tuples. Build that on a
        # shallow copy: overwriting sql["conds"] in place would corrupt the
        # caller's batch and make a second call fail with a TypeError.
        query_dict = dict(sql)
        query_dict["conds"] = list(zip(conds["column_index"], conds["operator_index"], conds["condition"]))
        query = Query.from_dict(query_dict)
        gold_result = db_engine.execute_query(table["id"], query)
        # NOTE(review): only [None] is treated as an empty response; confirm
        # whether execute_query can also return [] for queries without matches.
        keep.append(bool(gold_result == [None]))

    return keep

In [None]:
# Keep the training samples flagged by empty_response_filter (gold query result == [None])
# and display the resulting dataset summary.
wrong_data = train_data.filter(empty_response_filter, batched=True)
wrong_data

In [None]:
from copy import deepcopy

# NOTE: db_path and db_engine are assigned again here with the same values as in the
# earlier cell that defines empty_response_filter (presumably so this cell can be
# run on its own); a new DBEngine instance replaces the previous one.
db_path = '../datasets/wikisql/tables/train/train.db'
db_engine = DBEngine(db_path)

def fix_commas(batch):
    """Space out commas in text conditions of queries whose result is ``[None]``.

    For every sample whose gold query yields ``[None]`` from
    ``db_engine.execute_query``, each condition on a column of type 'text' is
    rewritten so that a comma which is not already preceded by whitespace and
    does not sit between two digits becomes ' ,'. Conditions are updated in
    place inside ``batch``.

    Args:
        batch: Mapping with parallel "sql" and "table" lists. Each sql entry
            has a "conds" mapping with "column_index", "operator_index" and
            "condition" lists; each table entry has "id" and "types".

    Returns:
        The same ``batch`` object, with the affected conditions rewritten.

    Prints one summary line per call. ``total`` counts the conditions of the
    samples that were inspected (those returning ``[None]``); ``replaced``
    counts how many of those conditions were actually changed, so it can never
    exceed ``total``.

    NOTE: a later cell in this notebook defines another ``fix_commas`` without
    the database check; whichever cell runs last wins.
    """
    # Comma not preceded by whitespace, and not enclosed by digits on both sides
    # (so thousands separators such as "1,000" are left alone).
    comma_pattern = r'(?<!\s)(?:(?<!\d),|,(?!\d))'

    replaced = 0
    total = 0
    for batch_idx, (sql, table) in enumerate(zip(batch["sql"], batch["table"])):
        conds = sql["conds"]
        # Query.from_dict receives the conditions as a list of tuples; a shallow
        # copy is enough because only the top-level "conds" entry is swapped out.
        query_dict = dict(sql)
        query_dict["conds"] = list(zip(conds["column_index"], conds["operator_index"], conds["condition"]))
        query = Query.from_dict(query_dict)
        gold_result = db_engine.execute_query(table["id"], query)
        if gold_result != [None]:
            continue

        for cond_idx, (column_idx, condition) in enumerate(zip(conds["column_index"], conds["condition"])):
            total += 1
            if table["types"][column_idx] != 'text':
                continue
            fixed_cond, count = re.subn(comma_pattern, ' ,', condition)
            batch["sql"][batch_idx]["conds"]["condition"][cond_idx] = fixed_cond
            # Count rewritten conditions, not individual commas, so the summary
            # compares like with like.
            if count:
                replaced += 1
    print(f'Replaced {replaced} out of {total} conditions.')
    return batch

# Apply fix_commas in batches of 9000 samples; fix_commas prints one summary line per batch.
maybe_fixed_data = train_data.map(fix_commas, batched=True, batch_size=9000)

In [None]:
def fix_commas(batch):
    """Space out commas in every condition that targets a 'text' column.

    A comma that is not already preceded by whitespace and does not sit
    between two digits is rewritten as ' ,'. Conditions are updated in place
    inside ``batch``.

    NOTE: this definition shadows the earlier ``fix_commas`` in this notebook,
    which only touches samples whose gold query returns ``[None]``; this
    variant performs no database lookup and processes every sample.

    Args:
        batch: Mapping with parallel "sql" and "table" lists. Each sql entry
            has a "conds" mapping with "column_index" and "condition" lists;
            each table entry has a "types" list indexed by column.

    Returns:
        The same ``batch`` object, with the affected conditions rewritten.

    Prints one summary line per call. ``total`` counts all conditions seen and
    ``replaced`` counts the conditions that were actually changed, so it can
    never exceed ``total``.
    """
    # Comma not preceded by whitespace, and not enclosed by digits on both sides
    # (so thousands separators such as "1,000" are left alone).
    comma_pattern = r'(?<!\s)(?:(?<!\d),|,(?!\d))'

    replaced = 0
    total = 0
    for batch_idx, (sql, table) in enumerate(zip(batch["sql"], batch["table"])):
        for cond_idx, (column, condition) in enumerate(zip(sql["conds"]["column_index"], sql["conds"]["condition"])):
            total += 1
            if table["types"][column] != 'text':
                continue
            fixed_cond, count = re.subn(comma_pattern, ' ,', condition)
            batch["sql"][batch_idx]["conds"]["condition"][cond_idx] = fixed_cond
            # Count rewritten conditions, not individual commas, so the summary
            # compares like with like.
            if count:
                replaced += 1
    print(f'Replaced {replaced} out of {total} conditions.')
    return batch

# Apply the fix_commas defined in this cell to the training split, overwriting any
# earlier maybe_fixed_data.
# NOTE(review): per the `datasets` documentation, batch_size=None with batched=True hands
# the whole split to the function as a single batch — confirm for the installed version.
maybe_fixed_data = train_data.map(fix_commas, batched=True, batch_size=None)

In [None]:
# Re-run the empty-response check on the comma-fixed data to see which samples
# still return [None], then display the resulting dataset summary.
wrong_fixed_data = maybe_fixed_data.filter(empty_response_filter, batched=True)
wrong_fixed_data

In [None]:
import pandas as pd

# Fetch each nested column once and reuse it for the derived fields.
table_column = wrong_fixed_data["table"]
sql_column = wrong_fixed_data["sql"]

# One row per sample that still returns [None] after the comma fix.
data = {
    "ID": [entry["id"] for entry in table_column],
    "Header": [entry["header"] for entry in table_column],
    "Question": wrong_fixed_data["question"],
    "SQL Statement": [entry["human_readable"] for entry in sql_column],
}

# Persist the overview for manual inspection; the index column is omitted.
df = pd.DataFrame(data)
df.to_csv('wrong_fixed_data.csv', index=False)

In [None]:
# Concatenate every input sequence into one flat token list, then count the distinct ids.
all_tokens = []
for seq in tokenized_train_data["input_ids"]:
    all_tokens.extend(seq)

unique_tokens = set(all_tokens)
print(f"Number of unique tokens: {len(unique_tokens)}")

In [None]:
# Count how often each token id occurs across all input sequences.
print("Calculating token frequencies...")
token_counts = Counter(all_tokens)

# Histogram over the per-token counts (one observation per distinct token),
# drawn with a logarithmic y-axis.
print("Plotting token frequency histogram...")
frequencies = list(token_counts.values())
hist_options = dict(bins=32, alpha=0.7, color='green', edgecolor='black', log=True)
plt.hist(frequencies, **hist_options)
plt.xlabel('Token Frequency')
plt.ylabel('Count (log scale)')
plt.title('Histogram of Token Frequencies')
plt.show()

In [None]:
# Most common tokens, printed as a fixed-width table of decoded token and count.
# The heading is derived from the rows actually shown, so it cannot drift from
# the requested size (the previous heading said 10 while 20 rows were printed).
TOP_N = 20
top_tokens = token_counts.most_common(TOP_N)

print(f"Top {len(top_tokens)} most common tokens:")
print(f"{'Token':<15}{'Count':<10}")
print("-" * 25)
for token, count in top_tokens:
    # Decode the token id back to text for display.
    print(f"{str(tokenizer.decode(token)):<15}{count:<10}")