In [12]:
import json
import re
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# read data from .json file

with open("plain_statement_5000.json") as f:
    json_data = json.load(f)

plain_sql = [item['sql'] for item in json_data]
plain_sql = [sql.lower() for sql in plain_sql]

In [14]:
# split data into tokens

pattern = r'[\s()\-,:;]'
string_literal_pattern = r"'([^']*)'"
placeholder = "<string>"

# replace content inside single quotes by <string>
plain_sql_ph = [re.sub(string_literal_pattern, placeholder, sql) for sql in plain_sql]

# split the statements with placeholder
tokenized_sql = [re.split(pattern, sql) for sql in plain_sql_ph]

# remove empty tokens
tokenized_sql = [token for token in tokenized_sql if token]

# replace string literals and numbers by placeholder
for sql in tokenized_sql:
    for i, token in enumerate(sql):
        if re.match(r'^[\'\"].*[\'\"]$', token):  # Check if token is a string literal
            sql[i] = '<string>'
        elif re.match(r'^[0-9]+(\.[0-9]+)?$', token):  # Check if token is a number
            sql[i] = '<number>'

In [15]:
# build the vocab
vocab = set()
for sql in tokenized_sql:
    vocab.update(sql)

# count vectorization
vectorizer = CountVectorizer(vocabulary=vocab)
X = vectorizer.fit_transform(plain_sql)

In [16]:
print(vocab)

{'', 'sum', 'join', '=', '>=', '<=', 'on', 'from', 'where', 'count', '<string>', '*', 'and', '+', '!=', 'avg', 'select', '<number>'}
