In [None]:
! pip install datasets fuzzywuzzy dataset

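Download `dataset.py` and `utils.py` from the original paper authors' repository [here](https://github.com/tzshi/squall/tree/main/model) and place them in the same directory as this notebook (or elsewhere on the Python path) so that they can be imported below.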

In [None]:
import torch
from dataset import load_dataset, em_process

In [None]:
# Path to the SQUALL JSON file; handed to `load_dataset` when the data is loaded
DATASET_PATH = "squall.json"

#### Load and explore SQUALL dataset

In [None]:
# Load the SQUALL examples with the loader imported from the authors' `dataset.py`.
# The result is indexed by position below, and every example is read like a
# dict (keys used in this notebook: 'nl', 'nl_ralign', 'columns', 'nt').
train_data = load_dataset(DATASET_PATH)

In [None]:
# Inspect the first example to see which fields a record carries
train_data[0]

In [None]:
# Alignment annotation of the NLQ with typed SQL labels. The first element of
# every entry is the label type that BIO_LABEL_MAP is keyed on later
# (presumably one entry per NLQ token -- confirm against the data).

train_data[0]['nl_ralign']

In [None]:
# Tokenized NLQ (natural-language question) of the first example

train_data[0]['nl']

In [None]:
# Table-header columns associated with this NLQ. Each entry is itself indexable;
# only its first element is collected into `columns_list` later on.

train_data[0]['columns']

In [None]:
# A single column entry of the first example
train_data[0]['columns'][0]

### Transform into BIO format for NER task

In [None]:
# Number of leading examples taken from `train_data` for the sample dataset
SAMPLE_SIZE = 50

# Alignment type -> NER tag. "None" and "Keyword" are treated as outside any
# entity. Note that only "O" and B- tags appear as values, so a per-token
# lookup in this map never yields an I- tag.
BIO_LABEL_MAP = {
    **dict.fromkeys(("None", "Keyword"), "O"),
    "Column": "B-COLUMN",
    "Literal": "B-LITERAL",
}

#### Get tokenized nlq for 1st K rows

In [None]:
# Tokenized question ('nl') of each of the first SAMPLE_SIZE examples
nlq_list = [train_data[idx]['nl'] for idx in range(SAMPLE_SIZE)]

In [None]:
# Sanity check: one tokenized question per sampled example (SAMPLE_SIZE in total)
len(nlq_list)

In [None]:
# Preview the first five tokenized questions
nlq_list[:5]

#### Get column list for the same samples above

In [None]:
# For every sampled example, keep only the first element of each column entry
columns_list = [
    [col_entry[0] for col_entry in train_data[idx]['columns']]
    for idx in range(SAMPLE_SIZE)
]

In [None]:
# Sanity check: one column list per sampled example (SAMPLE_SIZE in total)
len(columns_list)

In [None]:
# Spot-check the column lists of samples 8 through 14
columns_list[8:15]

#### Get SQL type labels for each mention in tokenized NLQ

In [None]:
# One tag list per sampled example: the type (first element) of every
# 'nl_ralign' entry is translated to its NER tag through BIO_LABEL_MAP.
sql_labels = [
    [BIO_LABEL_MAP[alignment[0]] for alignment in train_data[idx]['nl_ralign']]
    for idx in range(SAMPLE_SIZE)
]

In [None]:
# Sanity check: one tag list per sampled example (SAMPLE_SIZE in total)
len(sql_labels)

In [None]:
# Preview the tag lists of the first five samples
sql_labels[:5]

#### Get question IDs

In [None]:
# Question identifier ('nt') of each of the first SAMPLE_SIZE examples
qids = [train_data[idx]['nt'] for idx in range(SAMPLE_SIZE)]

In [None]:
# Sanity check: one question ID per sampled example (SAMPLE_SIZE in total)
len(qids)

In [None]:
# Preview the first ten question IDs
qids[:10]

### Convert sample to datasets instance for model training

In [None]:
import pandas as pd

# `load_dataset` is aliased on import: the bare name is already bound to the
# SQUALL loader imported from the local `dataset.py`. Rebinding it here would
# make the earlier `train_data = load_dataset(DATASET_PATH)` cell call the
# Hugging Face loader instead if that cell were re-run after this one.
from datasets import load_dataset as hf_load_dataset, Dataset, DatasetDict
from datasets import Features, ClassLabel, Sequence, Value

In [None]:
# Column-oriented view of the sample: every key is a feature name and every
# value is the list built above, holding one item per sampled example.
squall_dict = dict(
    ner_tags=sql_labels,
    nl=nlq_list,
    nt=qids,
    columns=columns_list,
)

In [None]:
# Schema for the keys of `squall_dict`: the data type of each feature.
# The tag inventory also lists the I- tags, although BIO_LABEL_MAP only ever
# produces "O" and B- tags.
ner_tag_names = ['O', 'B-COLUMN', 'I-COLUMN', 'B-LITERAL', 'I-LITERAL']

squall_features = Features({
    # variable-length (length=-1) lists of strings
    "nl": Sequence(feature=Value(dtype='string'), length=-1),
    "columns": Sequence(feature=Value(dtype='string'), length=-1),
    # variable-length list of class labels drawn from `ner_tag_names`
    "ner_tags": Sequence(feature=ClassLabel(names=ner_tag_names), length=-1),
    "nt": Value(dtype='string'),
})

In [None]:
# Build the dataset from the column dictionary, applying the declared schema
# and marking it as the 'train' split.
squall_dataset = Dataset.from_dict(squall_dict, features=squall_features, split='train')

In [None]:
# Wrap the single split in a DatasetDict (despite the `_df` suffix, this is a
# DatasetDict and not a pandas DataFrame).
squall_dataset_df = DatasetDict({'train': squall_dataset})

In [None]:
# Display the DatasetDict that holds the 'train' split
squall_dataset_df

In [None]:
# First record of the 'train' split as stored in the dataset
squall_dataset_df['train'][0]

Notice that the `ner_tags` are automatically converted from the string NER labels provided in the dictionary to integers. Defining the features with `Sequence` and `ClassLabel` instances lets `datasets` know the data type of each field and perform the conversion wherever necessary.

In [None]:
# Save this dataset to disk
squall_dataset_df.save_to_disk("squall_sample_dataset")

# NER tags (original string labels) for the first 5 samples. This is kept as
# the final expression of the cell: a notebook cell only displays the value of
# its last expression, so placed above the save call it was never shown.
squall_dict['ner_tags'][:5]

### References
* SQUALL paper - https://arxiv.org/pdf/2010.11246
* https://github.com/tzshi/squall
* https://huggingface.co/docs/datasets/en/about_dataset_features
* https://huggingface.co/docs/datasets/v2.19.0/en/package_reference/main_classes#datasets.Sequence
* https://huggingface.co/docs/datasets/v2.19.0/en/package_reference/main_classes#datasets.Dataset