In [1]:
import json
import re
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# read data from .json file

with open("plain_statement_5000.json") as f:
    json_data = json.load(f)

plain_sql = [item['sql'] for item in json_data]
plain_sql = [sql.lower() for sql in plain_sql]

In [3]:
# split data into tokens

pattern = r'[\s()\-,:;]'
string_literal_pattern = r"'([^']*)'"
placeholder = "<string>"

# replace content inside single quotes by <string>
plain_sql_ph = [re.sub(string_literal_pattern, placeholder, sql) for sql in plain_sql]

# split the statements with placeholder
tokenized_sql = [re.split(pattern, sql) for sql in plain_sql_ph]

# remove empty tokens
tokenized_sql = [token for token in tokenized_sql if token]

# replace string literals and numbers by placeholder
for sql in tokenized_sql:
    for i, token in enumerate(sql):
        if re.match(r'^[\'\"].*[\'\"]$', token):  # Check if token is a string literal
            sql[i] = '<string>'
        elif re.match(r'^[0-9]+(\.[0-9]+)?$', token):  # Check if token is a number
            sql[i] = '<number>'

In [9]:
# build the vocab
#vocab = set()
vocab = {}
for sql in tokenized_sql:
    '''
    for element in sql:
        if element in vocab.keys():
            vocab[element] += 1
        else:
            vocab[element] = 1
    '''
    vocab.update(sql)

# count vectorization
vectorizer = CountVectorizer(vocabulary=vocab)
X = vectorizer.fit_transform(plain_sql)

In [10]:
print(vocab)

{'select': 5000, 'sum': 3367, '"orders"."o_custkey"': 2335, '+': 3372, '"orders"."o_shippriority"': 613, '': 25229, 'from': 5000, '"lineitem"': 3673, 'join': 14927, '"partsupp"': 4069, 'on': 14927, '"lineitem"."l_partkey"': 3757, '=': 20847, '"partsupp"."ps_partkey"': 6701, 'and': 12147, '"lineitem"."l_suppkey"': 3729, '"partsupp"."ps_suppkey"': 7137, '"supplier"': 2816, '"supplier"."s_suppkey"': 3380, '"part"': 2369, '"part"."p_partkey"': 2930, '"orders"': 2666, '"lineitem"."l_orderkey"': 2981, '"orders"."o_orderkey"': 2951, '"nation"': 1652, '"supplier"."s_nationkey"': 2187, '"nation"."n_nationkey"': 2095, 'where': 5000, '>=': 3917, '<number>': 8699, '<=': 3838, 'count': 3337, '*': 3337, '"lineitem"."l_linenumber"': 591, '"lineitem"."l_tax"': 643, '"lineitem"."l_linestatus"': 214, '<string>': 5359, '"lineitem"."l_quantity"': 625, '!=': 3472, '"lineitem"."l_discount"': 641, '"customer"': 1695, '"customer"."c_custkey"': 2227, '"customer"."c_acctbal"': 550, '"partsupp"."ps_availqty"': 1