In [1]:
from pathlib import Path

import pandas as pd
from datasets import list_datasets
from datasets import list_datasets, load_dataset
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## download (or reuse the cached copy of) the dataset and load all of its splits
DATASET_NAME = 'wikisql'
dataset = load_dataset(DATASET_NAME)

Downloading data: 100%|██████████| 7.71M/7.71M [00:01<00:00, 5.84MB/s]
Downloading data: 100%|██████████| 3.63M/3.63M [00:02<00:00, 1.72MB/s]
Downloading data: 100%|██████████| 25.2M/25.2M [00:06<00:00, 4.13MB/s]
Generating test split: 100%|██████████| 15878/15878 [00:00<00:00, 112738.58 examples/s]
Generating validation split: 100%|██████████| 8421/8421 [00:00<00:00, 157508.04 examples/s]
Generating train split: 100%|██████████| 56355/56355 [00:00<00:00, 178846.25 examples/s]


In [3]:
## show the available splits with their feature names and row counts
dataset

DatasetDict({
    test: Dataset({
        features: ['phase', 'question', 'table', 'sql'],
        num_rows: 15878
    })
    validation: Dataset({
        features: ['phase', 'question', 'table', 'sql'],
        num_rows: 8421
    })
    train: Dataset({
        features: ['phase', 'question', 'table', 'sql'],
        num_rows: 56355
    })
})

In [7]:
## flatten every train example into one record: the question, its
## human-readable SQL, the condition values, and the source table
## (header, column types, rows)
train_set = [
  {
      'question': row['question'],
      'sql': row['sql']['human_readable'],
      'table_header': row['table']['header'],
      'table_header_types': row['table']['types'],
      'conds': row['sql']['conds']['condition'],
      'rows': row['table']['rows'],
  }
  for row in tqdm(dataset['train'])
]

100%|██████████| 56355/56355 [00:08<00:00, 6373.84it/s]


In [8]:
## flatten every validation example into one record: the question, its
## human-readable SQL, the condition values, and the source table
## (header, column types, rows)
validation_set = [
  {
      'question': row['question'],
      'sql': row['sql']['human_readable'],
      'table_header': row['table']['header'],
      'table_header_types': row['table']['types'],
      'conds': row['sql']['conds']['condition'],
      'rows': row['table']['rows'],
  }
  for row in tqdm(dataset['validation'])
]


100%|██████████| 8421/8421 [00:01<00:00, 4641.85it/s]


In [9]:
## flatten every test example into one record: the question, its
## human-readable SQL, the condition values, and the source table
## (header, column types, rows)
test_set = [
  {
      'question': row['question'],
      'sql': row['sql']['human_readable'],
      'table_header': row['table']['header'],
      'table_header_types': row['table']['types'],
      'conds': row['sql']['conds']['condition'],
      'rows': row['table']['rows'],
  }
  for row in tqdm(dataset['test'])
]


100%|██████████| 15878/15878 [00:01<00:00, 9244.72it/s]


In [10]:
## build one DataFrame per split from the collected record dicts
train_df = pd.DataFrame(train_set)
validation_df = pd.DataFrame(validation_set)
test_df = pd.DataFrame(test_set)

In [11]:
## write each split to its own CSV under the project data directory
## NOTE: list-valued columns (e.g. table_header, rows) are stored as text in
## the CSV and have to be parsed again when the files are read back
data_root = Path('~/ML_Projects/text-sql/data').expanduser()
for split_df, csv_path in (
    (train_df, data_root / 'train' / 'wikisql_train.csv'),
    (validation_df, data_root / 'validation' / 'wikisql_val.csv'),
    (test_df, data_root / 'test' / 'wikisql_test.csv'),
):
  # to_csv does not create missing directories, so make sure the target exists
  csv_path.parent.mkdir(parents=True, exist_ok=True)
  split_df.to_csv(csv_path, index=False)

(None, None, None)

In [12]:
## inspect the raw header lists (one list of column names per example)
train_df['table_header'].values


array([list(['State/territory', 'Text/background colour', 'Format', 'Current slogan', 'Current series', 'Notes']),
       list(['State/territory', 'Text/background colour', 'Format', 'Current slogan', 'Current series', 'Notes']),
       list(['State/territory', 'Text/background colour', 'Format', 'Current slogan', 'Current series', 'Notes']),
       ...,
       list(['Date', 'Time', 'Competition', 'Opponent', 'Ground', 'Score']),
       list(['Name', 'Years', 'Gender', 'Locality', 'Authority', 'Decile']),
       list(['Name', 'Circuit', 'Date', 'Winning driver', 'Winning constructor', 'Report'])],
      dtype=object)

In [17]:
## preview the train frame (pandas shows a truncated view of the rows)
train_df

Unnamed: 0,question,sql,table_header,table_header_types,conds,rows
0,Tell me what the notes are for South Australia,SELECT Notes FROM table WHERE Current slogan =...,"[State/territory, Text/background colour, Form...","[text, text, text, text, text, text]",[SOUTH AUSTRALIA],"[[Australian Capital Territory, blue/white, Ya..."
1,What is the current series where the new serie...,SELECT Current series FROM table WHERE Notes =...,"[State/territory, Text/background colour, Form...","[text, text, text, text, text, text]",[New series began in June 2011],"[[Australian Capital Territory, blue/white, Ya..."
2,What is the format for South Australia?,SELECT Format FROM table WHERE State/territory...,"[State/territory, Text/background colour, Form...","[text, text, text, text, text, text]",[South Australia],"[[Australian Capital Territory, blue/white, Ya..."
3,Name the background colour for the Australian ...,SELECT Text/background colour FROM table WHERE...,"[State/territory, Text/background colour, Form...","[text, text, text, text, text, text]",[Australian Capital Territory],"[[Australian Capital Territory, blue/white, Ya..."
4,how many times is the fuel propulsion is cng?,SELECT COUNT Fleet Series (Quantity) FROM tabl...,"[Order Year, Manufacturer, Model, Fleet Series...","[text, text, text, text, text, text]",[CNG],"[[1992-93, Gillig, Phantom (High Floor), 444-4..."
...,...,...,...,...,...,...
56350,What time was the match played with a score of...,SELECT Time FROM table WHERE Score = 3-2,"[Date, Time, Competition, Opponent, Ground, Sc...","[text, text, text, text, text, text]",[3-2],"[[17 July 2007, 15:30 GMT, Friendly Match, Chi..."
56351,On which ground did the team play Aston Villa?,SELECT Ground FROM table WHERE Opponent = asto...,"[Date, Time, Competition, Opponent, Ground, Sc...","[text, text, text, text, text, text]",[aston villa],"[[17 July 2007, 15:30 GMT, Friendly Match, Chi..."
56352,What kind of competition was it at San Siro at...,SELECT Competition FROM table WHERE Ground = s...,"[Date, Time, Competition, Opponent, Ground, Sc...","[text, text, text, text, text, text]","[san siro, 18:30 gmt]","[[17 July 2007, 15:30 GMT, Friendly Match, Chi..."
56353,What is the total number of decile for the red...,SELECT COUNT Decile FROM table WHERE Name = re...,"[Name, Years, Gender, Locality, Authority, Dec...","[text, text, text, text, text, real]",[redwood school],"[[Amesbury School, 1–6, Coed, Churton Park, St..."


In [20]:
## look at the first train example in full: question, SQL, and table header
train_df.at[0, 'question'], train_df.at[0, 'sql'], train_df.at[0, 'table_header']

('Tell me what the notes are for South Australia ',
 'SELECT Notes FROM table WHERE Current slogan = SOUTH AUSTRALIA',
 ['State/territory',
  'Text/background colour',
  'Format',
  'Current slogan',
  'Current series',
  'Notes'])

In [21]:
## sanity check: (rows, columns) of the train split
train_df.shape

(56355, 6)

In [22]:
## sanity check: (rows, columns) of the validation split
validation_df.shape

(8421, 6)

In [23]:
## sanity check: (rows, columns) of the test split
test_df.shape

(15878, 6)