# Check the SQL-Create-Context Dataset for Issues

In [1]:
import os

import evaluate
import pandas as pd
from datasets import load_dataset, DatasetDict
from huggingface_hub import login
from transformers import AutoTokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Authenticate with the Hugging Face Hub without storing a secret in the notebook:
# the token is read from the HF_TOKEN environment variable. An unset or empty
# variable is passed as None rather than as an empty string (an empty string is
# not a usable token); per the huggingface_hub docs, login() then asks for the
# token interactively.
login(token=os.environ.get("HF_TOKEN") or None)

In [2]:
# Each split of our DatasetDict is a percentage slice of the hub dataset's
# 'train' split: the first 80%, the slice from -20% to -10%, and the last 10%.
split_slices = {
    'train': 'train[:80%]',
    'validation': 'train[-20%:-10%]',
    'test': 'train[-10%:]',
}

dataset = DatasetDict({
    split_name: load_dataset("b-mc2/sql-create-context", split=slice_spec)
    for split_name, slice_spec in split_slices.items()
})

def format_dataset(example, *, max_context_chars=420):
    """Turn one raw dataset row into an 'input' prompt and a 'target' string.

    Args:
        example: Mapping with 'context', 'question' and 'answer' entries. The
            values are sliced and concatenated as strings.
        max_context_chars: Maximum number of leading characters of
            ``example['context']`` kept in the prompt. Defaults to 420, the
            value that used to be hard-coded. Pass ``None`` to keep the whole
            context. Keyword-only so that extra positional arguments can never
            be mistaken for it.

    Returns:
        A dict with two keys: 'input' (the schema prefix, the truncated
        context, and the question behind a 'translate to SQL: ' marker) and
        'target' (``example['answer']`` unchanged).
    """
    schema = example['context'][:max_context_chars]
    return {
        'input': 'schema: \n' + schema + '\n\ntranslate to SQL: ' + example['question'],
        'target': example['answer'],
    }

# Apply format_dataset to every split and drop the columns listed for the
# 'train' split, so that only the keys returned by format_dataset remain.
raw_column_names = dataset['train'].column_names
formatted_dataset = dataset.map(format_dataset, remove_columns=raw_column_names)

In [3]:
# Materialise each formatted split as a pandas DataFrame for the checks below.
train_df, validation_df, test_df = (
    pd.DataFrame(formatted_dataset[split_name].to_dict())
    for split_name in ('train', 'validation', 'test')
)

In [4]:
# Fully duplicated rows inside the TRAIN split; keep='first' leaves the first
# occurrence unflagged, so only the extra copies are counted.
duplicate_rows = train_df[train_df.duplicated(keep='first')]

# The messages name the train split, which is the frame actually inspected here
# (they previously said "test").
print(f"Found {len(duplicate_rows)} duplicate rows in the train dataset")
print(f"Duplicates represent {(len(duplicate_rows) / len(train_df) * 100):.2f}% of your train set")

Found 0 duplicate rows in the test dataset
Duplicates represent 0.00% of your test set


In [5]:
# Fully duplicated rows inside the VALIDATION split; keep='first' leaves the
# first occurrence unflagged, so only the extra copies are counted.
duplicate_rows = validation_df[validation_df.duplicated(keep='first')]

# The messages name the validation split, which is the frame actually inspected
# here (they previously said "test").
print(f"Found {len(duplicate_rows)} duplicate rows in the validation dataset")
print(f"Duplicates represent {(len(duplicate_rows) / len(validation_df) * 100):.2f}% of your validation set")

Found 0 duplicate rows in the test dataset
Duplicates represent 0.00% of your test set


In [6]:
# Flag every row of the test split that repeats an earlier row (the first
# occurrence is not flagged), then keep just the flagged rows.
is_repeated_row = test_df.duplicated(keep='first')
duplicate_rows = test_df[is_repeated_row]

print(f"Found {len(duplicate_rows)} duplicate rows in the test dataset")
print(f"Duplicates represent {(len(duplicate_rows) / len(test_df) * 100):.2f}% of your test set")

Found 0 duplicate rows in the test dataset
Duplicates represent 0.00% of your test set


In [7]:
# Leakage check between the train and test splits.
# The merged frames contain one row per (train row, test row) pair that shares
# the key column; they are kept for manual inspection of the offending pairs.
matching_targets = train_df.merge(test_df, on='target', suffixes=('_train', '_test'))
matching_inputs = train_df.merge(test_df, on='input', suffixes=('_train', '_test'))

# A merge is many-to-many, so its length is a number of pairs and can exceed the
# size of the test set. For the report, count each test row at most once: a test
# row is affected when its value also occurs somewhere in the train split.
test_rows_input_in_train = int(test_df['input'].isin(train_df['input']).sum())
test_rows_target_in_train = int(test_df['target'].isin(train_df['target']).sum())

print(f"Found {test_rows_input_in_train} rows with matching input values")
print(f"Found {test_rows_target_in_train} rows with matching target values")

print(f"This represents {(test_rows_input_in_train / len(test_df) * 100):.2f}% of the test set")
print(f"This represents {(test_rows_target_in_train / len(test_df) * 100):.2f}% of the test set")

Found 0 rows with matching input values
Found 0 rows with matching target values
This represents 0.00% of the test set
This represents 0.00% of the test set


In [8]:
# Leakage check between the validation and test splits.
# The merged frames contain one row per (validation row, test row) pair that
# shares the key column; they are kept for manual inspection.
# NOTE: the '_train' suffix marks columns that come from validation_df here; it
# is left unchanged so the resulting column names stay the same.
matching_targets = validation_df.merge(test_df, on='target', suffixes=('_train', '_test'))
matching_inputs = validation_df.merge(test_df, on='input', suffixes=('_train', '_test'))

# A merge is many-to-many, so its length is a number of pairs and can exceed the
# size of the test set. For the report, count each test row at most once: a test
# row is affected when its value also occurs somewhere in the validation split.
test_rows_input_in_validation = int(test_df['input'].isin(validation_df['input']).sum())
test_rows_target_in_validation = int(test_df['target'].isin(validation_df['target']).sum())

print(f"Found {test_rows_input_in_validation} rows with matching input values")
print(f"Found {test_rows_target_in_validation} rows with matching target values")

print(f"This represents {(test_rows_input_in_validation / len(test_df) * 100):.2f}% of the test set")
print(f"This represents {(test_rows_target_in_validation / len(test_df) * 100):.2f}% of the test set")

Found 0 rows with matching input values
Found 0 rows with matching target values
This represents 0.00% of the test set
This represents 0.00% of the test set


In [9]:
# Display the merged frame of rows whose 'input' matched; when cells run in order
# this is the validation-vs-test merge assigned in the preceding cell.
matching_inputs

Unnamed: 0,input,target_train,target_test
