In [1]:
from engine import client, data_explore, get_parser, generate_sql, restructure_query, parse_tree_to_sql, query_sql, query_nosql, SQLToMongoConverter
from warnings import filterwarnings
filterwarnings('ignore')


This package provides tools for converting natural language queries into SQL and NoSQL database queries 
for retrieving results. It operates without the use of learnable parameters, neural networks, or embeddings, 
focusing instead on rule-based natural language parsing.

Features:
1. Supports natural language descriptions of queries that involve combinations of:
   - `SELECT` statements
   - Table names
   - Column names
   - `WHERE`, `GROUP BY`, and `ORDER BY` clauses
   Column names can include aggregated operations such as `min`, `max`, `median`, `mean`, `stddev`, `variance`, 
   `count`, and `distinct`.

2. Enforces rules for SQL queries:
   - When `WHERE`, `GROUP BY`, or `ORDER BY` clauses are used, the keywords **must appear** explicitly 
     in their respective clauses.
   - Multi-word column names are enclosed in backticks (e.g., `` `column name` ``).
   - Aggregation parameters with multi-word descriptions are supported using formats like `minimum-of`, 
     `standard-devia

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1002)>


In [None]:
"""
SQL db -> JSONs -> schema -> SQL gen1
                          -> NL-SQL parser -> SQL gen2
"""

# End-to-end demo of the SQL branch sketched in the pipeline note above:
# natural-language query -> parse tree -> SQL string -> execution on SQLite.

# Presumably builds the schema description from the database directory
# (the "SQL db -> JSONs -> schema" stage of the note) — confirm in engine.
schema_description=data_explore('SQL/thrombosis_prediction')
query = "select sum of symptoms from examinations where the id > 3853710 grouped by people's thrombosis status" 
# get_parser returns a pair: the parser for the 'Examination' table and `corrs`,
# which is handed to restructure_query on the next line.
parser, corrs = get_parser(schema_description, 'Examination')
# Rewrites the free-form query using `corrs`; presumably this is the
# "Reconstructed Query" form seen in recorded outputs elsewhere — TODO confirm.
query = restructure_query(query, corrs)
# Plain whitespace tokenisation: a backtick-quoted multi-word column name is
# split into several tokens here.
tokens = query.split()
# Only the first parse is used; next() raises StopIteration if parse() yields nothing.
tree = next(parser.parse(tokens))
sql = parse_tree_to_sql(tree)
# NOTE(review): the return value is discarded and this is not the last statement
# of the cell, so anything shown must be printed by query_sql itself — confirm.
query_sql('SQL/thrombosis_prediction/thrombosis_prediction.sqlite', sql)


"""

NoSQL db -> JSONs -> schema -> SQL gen1 -> NoSQL gen1
                            -> NL-SQL parser -> SQL gen2 -> NoSQL gen2

"""


# End-to-end demo of the NoSQL branch sketched in the pipeline note above:
# natural-language query -> parse tree -> SQL string -> Mongo query -> execution.

# Presumably builds the schema description from the NoSQL database directory
# (the "NoSQL db -> JSONs -> schema" stage of the note) — confirm in engine.
schema_description=data_explore('NoSQL/formula_1')
query = "select maximum wins from constructorStandings where position placed is < 3 and grouped by constructorId" 
# get_parser returns a pair: the parser for 'constructorStandings' and `corrs`,
# which is handed to restructure_query on the next line.
parser, corrs = get_parser(schema_description, 'constructorStandings')
query = restructure_query(query, corrs)
# Plain whitespace tokenisation of the restructured query.
tokens = query.split()
# Only the first parse is used; next() raises StopIteration if parse() yields nothing.
tree = next(parser.parse(tokens))
sqlt = parse_tree_to_sql(tree)
# The generated SQL text is converted to a Mongo query before execution.
nosql = SQLToMongoConverter().convert_to_mongo(sqlt)
# NOTE(review): the return value is discarded; anything shown must be printed
# by query_nosql itself — confirm.
query_nosql('formula_1', nosql)

# Presumably a throwaway assignment so the cell ends without a displayed value — confirm.
_ = "end"

In [1]:
from test_cases import NoSQLTestCases, SQLTestCases

RULES FOR NLQ (NATURAL LANGUAGE QUERY)
*1 Always use KEYWORDS [where, order by, group by] in NL; slight deviations allowed
*2 Always join multiword operators with '-' [standard-deviation, minimum-of /etc]; other connectors allowed except space
*3 Where clauses don't support pure cat field comparisions 



[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1002)>


In [2]:
# Run every case exposed by the tester (`tester.t`) for the
# debit_card_specialization database. The tester object is called once per case
# and the collected return values are parked in `_` so nothing is displayed.
# NOTE(review): the recorded run failed with FileNotFoundError for
# 'SQL/debit_card_specialization'.
tester = SQLTestCases(db_name="debit_card_specialization")
cases = tester.t
_ = [tester(case) for case in cases]

FileNotFoundError: [Errno 2] No such file or directory: 'SQL/debit_card_specialization'

In [3]:
# Build a NoSQL tester for the california_schools database.
tester = NoSQLTestCases(db_name="california_schools")
# Not used further in this cell.
cases = tester.t
# Run one ad-hoc case, given as a (collection, natural-language query) pair.
# The query says `Country Name`; the recorded output shows it reconstructed
# as `County Name`.
tester(('frpm', "from frpm find all the `Country Name`"))

####################################################################################################
NL Query :  from frpm find all the `Country Name`
Reconstructed Query :  frpm find `County Name`
Parse Tree :  (S (TABLE frpm) (CLAUSE ) (CMD find) (COLUMN (CAT `County Name`)))
SQL :  SELECT `County Name` FROM frpm 
NoSQL :  frpm.aggregate([{'$project': {'County Name': 1}}])
Results :  [{'_id': ObjectId('67455a282a2b11d96a7923d7'), 'County Name': 'Alameda'}, {'_id': ObjectId('67455a282a2b11d96a7923d8'), 'County Name': 'Alameda'}, {'_id': ObjectId('67455a282a2b11d96a7923d9'), 'County Name': 'Alameda'}]


[{'_id': ObjectId('67455a282a2b11d96a7923d7'), 'County Name': 'Alameda'},
 {'_id': ObjectId('67455a282a2b11d96a7923d8'), 'County Name': 'Alameda'},
 {'_id': ObjectId('67455a282a2b11d96a7923d9'), 'County Name': 'Alameda'}]

In [5]:
# Print the grammar text stored on the tester. In the recorded output the
# multi-word column names appear as backtick-delimited, space-separated terminals.
print(tester.cfg)


    S -> CMD "*" TABLE CLAUSE | CMD COLUMN TABLE CLAUSE | CMD AGG COLUMN TABLE CLAUSE | CMD "*" CLAUSE TABLE | CMD COLUMN CLAUSE TABLE| CMD AGG COLUMN CLAUSE TABLE 
    CLAUSE -> "where" COLUMN NOP VALUE CLAUSE | "group" "by" COLUMN CLAUSE | "order" "by" COLUMN CLAUSE 
    CLAUSE -> 
    COLUMN -> CAT COLUMN | NUM COLUMN | NOMCAT COLUMN | ORDCAT COLUMN | CAT AGG COLUMN | NUM AGG COLUMN | NOMCAT AGG COLUMN | ORDCAT AGG COLUMN
    COLUMN -> CAT | NUM | NOMCAT | ORDCAT
    CAT -> "_id" | "`Academic" "Year`" | "CDSCode" | "`Charter" "Funding" "Type`" | "`Charter" "School" "Number`" | "`County" "Code`" | "`County" "Name`" | "`District" "Name`" | "`District" "Type`" | "`Educational" "Option" "Type`" | "`High" "Grade`" | "`Low" "Grade`" | "`NSLP" "Provision" "Status`" | "`School" "Code`" | "`School" "Name`" | "`School" "Type`"
    NUM -> "`Enrollment" "(Ages" "5-17)`" | "`Enrollment" "(K-12)`" | "`FRPM" "Count" "(Ages" "5-17)`" | "`FRPM" "Count" "(K-12)`" | "`Free" "Meal" "Count" "(Ages" "5-

In [2]:
# Run three ad-hoc natural-language queries against the Patient table of the
# thrombosis_prediction database.
# NOTE(review): the second positional argument (5) presumably limits how many
# result rows are shown — confirm against SQLTestCases.
tester = SQLTestCases("thrombosis_prediction", 5)

# Parked formula_1 cases (constructorStandings table), kept for reference only.
# The last one used to be left uncommented as a bare tuple expression, which
# was evaluated and thrown away; it is commented out like its siblings.
#("constructorStandings", "where points = 7 in constructorStandings, get the minimum position")
#("constructorStandings", "get me a summation of wins in `constructor standings` where position placed is > 3 grouped by constructorid"),
#("constructorStandings", "extract all distinct raceids happned in constructiorstandings")
#("constructorStandings", "get maximum points that is scored, grouped by constructorid from constructorstandings"),

# Each case is a (table, natural-language query) pair. The misspellings inside
# the queries are parser input and match the recorded output, so they are left as-is.
patient_cases = [
    ("Patient", "From the patient records, where sex of person is = 'M' get me his bitrthday"),
    ("Patient", "Get me the id of patients where their `first date` is < '1985-10-01' and group this by diagnosis results."),
    ("Patient", "In the ecords, find diagnosis of patients grouped by their sex."),
]

# Call the tester once per case; results are parked in `_` so the cell shows no repr.
_ = list(map(tester, patient_cases))


####################################################################################################
NL Query :  From the patient records, where sex of person is = 'M' get me his bitrthday




Reconstructed Query :  Patient where SEX = <string> get Birthday
Parse Tree :  (S
  (TABLE Patient)
  (CLAUSE
    where
    (COLUMN (CAT SEX))
    (NOP =)
    (VALUE <string>)
    (CLAUSE ))
  (CMD get)
  (COLUMN (CAT Birthday)))
SQL :  SELECT Birthday FROM Patient WHERE SEX = 'M'
Results :  [('1937-11-24',), ('1923-07-25',), ('1967-03-25',), ('1934-09-03',), ('1939-08-03',)]
####################################################################################################
NL Query :  Get me the id of patients where their `first date` is < '1985-10-01' and group this by diagnosis results.




Reconstructed Query :  get ID Patient where `First Date` < <string> group by Diagnosis
Parse Tree :  (S
  (CMD get)
  (COLUMN (NOMCAT ID))
  (TABLE Patient)
  (CLAUSE
    where
    (COLUMN (CAT `First Date`))
    (NOP <)
    (VALUE <string>)
    (CLAUSE group by (COLUMN (CAT Diagnosis)) (CLAUSE ))))
SQL :  SELECT ID FROM Patient WHERE `First Date` < '1985-10-01' GROUP BY Diagnosis
Results :  [(549080,), (1007813,), (1218725,), (1491832,), (1531448,)]
####################################################################################################
NL Query :  In the ecords, find diagnosis of patients grouped by their sex.
Reconstructed Query :  find Diagnosis Patient group by SEX
Parse Tree :  (S
  (CMD find)
  (COLUMN (CAT Diagnosis))
  (TABLE Patient)
  (CLAUSE group by (COLUMN (CAT SEX)) (CLAUSE )))
SQL :  SELECT Diagnosis FROM Patient GROUP BY SEX
Results :  [('SLE',), ('RA susp.',), ("Raynaud's phenomenon",)]



    S -> CMD "*" TABLE CLAUSE | CMD COLUMN TABLE CLAUSE | CMD AGG COLUMN TABLE CLAUSE | CMD "*" CLAUSE TABLE | CMD COLUMN CLAUSE TABLE| CMD AGG COLUMN CLAUSE TABLE | CLAUSE CMD COLUMN TABLE | CLAUSE CMD AGG COLUMN TABLE | CLAUSE CMD COLUMN TABLE CLAUSE | CLAUSE CMD AGG COLUMN TABLE CLAUSE | TABLE CMD COLUMN CLAUSE | TABLE CMD AGG COLUMN CLAUSE | TABLE CLAUSE CMD COLUMN | TABLE CLAUSE CMD AGG COLUMN | TABLE CLAUSE CMD COLUMN CLAUSE | TABLE CLAUSE CMD AGG COLUMN CLAUSE | CLAUSE CMD COLUMN CLAUSE TABLE | CLAUSE CMD AGG COLUMN CLAUSE TABLE | CLAUSE TABLE CMD COLUMN | CLAUSE TABLE CMD AGG COLUMN | CLAUSE TABLE CMD COLUMN CLAUSE | CLAUSE TABLE CMD AGG COLUMN CLAUSE
    CLAUSE -> "where" COLUMN NOP VALUE CLAUSE | "group" "by" COLUMN CLAUSE | "order" "by" COLUMN CLAUSE 
    CLAUSE -> 
    COLUMN -> CAT COLUMN | NUM COLUMN | NOMCAT COLUMN | ORDCAT COLUMN | CAT AGG COLUMN | NUM AGG COLUMN | NOMCAT AGG COLUMN | ORDCAT AGG COLUMN
    COLUMN -> CAT | NUM | NOMCAT | ORDCAT
    CAT -> "_id" | "name"

In [8]:
import re
def _split_ignore_quoticks(s):
    return re.findall(r"'[^']*'|`[^`]*`|\S+", s)


def _split_ignore_ticks(s):
    return re.findall(r"`[^`]*`|\S+", s)

# Sanity check: the backticked column name and the single-quoted value must
# each come back as one token.
_split_ignore_quoticks("Extract `Percent-(%)-Eligible-FRPM-(Ages-5-17)` where name = 'praneeth'")

['Extract',
 '`Percent-(%)-Eligible-FRPM-(Ages-5-17)`',
 'where',
 'name',
 '=',
 "'praneeth'"]

In [3]:
# Print the grammar text stored on the tester. In the recorded output the
# multi-word column names appear as single hyphen-joined terminals.
print(tester.cfg)


    S -> CMD "*" TABLE CLAUSE | CMD COLUMN TABLE CLAUSE | CMD AGG COLUMN TABLE CLAUSE | CMD "*" CLAUSE TABLE | CMD COLUMN CLAUSE TABLE| CMD AGG COLUMN CLAUSE TABLE 
    CLAUSE -> "where" COLUMN NOP VALUE CLAUSE | "group" "by" COLUMN CLAUSE | "order" "by" COLUMN CLAUSE 
    CLAUSE -> 
    COLUMN -> CAT COLUMN | NUM COLUMN | NOMCAT COLUMN | ORDCAT COLUMN | CAT AGG COLUMN | NUM AGG COLUMN | NOMCAT AGG COLUMN | ORDCAT AGG COLUMN
    COLUMN -> CAT | NUM | NOMCAT | ORDCAT
    CAT -> "_id" | "Academic-Year" | "CDSCode" | "Charter-Funding-Type" | "Charter-School-Number" | "County-Code" | "County-Name" | "District-Name" | "District-Type" | "Educational-Option-Type" | "High-Grade" | "Low-Grade" | "NSLP-Provision-Status" | "School-Code" | "School-Name" | "School-Type"
    NUM -> "Enrollment-(Ages-5-17)" | "Enrollment-(K-12)" | "FRPM-Count-(Ages-5-17)" | "FRPM-Count-(K-12)" | "Free-Meal-Count-(Ages-5-17)" | "Free-Meal-Count-(K-12)" | "Percent-(%)-Eligible-FRPM-(Ages-5-17)" | "Percent-(%)-Eligible

In [3]:
# Scratch check: int() rejects a numeric string that still carries its quote
# characters — the recorded run raised ValueError.
int("'123'")

ValueError: invalid literal for int() with base 10: "'123'"

In [4]:
# NOTE(review): the recorded run raised AttributeError — the tester bound at
# the time was a NoSQLTestCases instance, which had no `parser` attribute.
print(tester.parser)

AttributeError: 'NoSQLTestCases' object has no attribute 'parser'

In [7]:
# Display the tester's schema description. In the recorded output it is a dict
# keyed by table name, then by column name, with a list of per-column facts.
tester.schema_description

{'transactions_1k': {'Amount': [1000,
   'int',
   'num',
   'non-unique',
   'non-null',
   0,
   264,
   []],
  'CardID': [1000, 'int', 'cat', 'non-unique', 'non-null', 99745, 753582, []],
  'CustomerID': [1000,
   'int',
   'cat',
   'non-unique',
   'non-null',
   7626,
   49788,
   []],
  'Date': [1000,
   'str',
   'date',
   'non-unique',
   'non-null',
   '2012-08-23',
   '2012-08-24',
   []],
  'GasStationID': [1000, 'int', 'cat', 'non-unique', 'non-null', 48, 3869, []],
  'Price': [1000,
   'float',
   'num',
   'non-unique',
   'non-null',
   15.41,
   5762.49,
   []],
  'ProductID': [1000, 'int', 'cat', 'non-unique', 'non-null', 2, 23, []],
  'Time': [1000,
   'str',
   'time',
   'non-unique',
   'non-null',
   '03:53:00',
   '23:59:00',
   []],
  'TransactionID': [1000,
   'int',
   'ordinal-cat',
   'unique',
   'non-null',
   1,
   1000,
   []]},
 'products': {'Description': [200,
   'str',
   'cat',
   'non-unique',
   'non-null',
   -1,
   -1,
   ['Rucní zadání', 'Naf

In [5]:
# Scratch check: rebinding `a` to the union of two disjoint sets.
a = {4, 5, 6}
b = {1, 2, 3}

a = a | b
a

{1, 2, 3, 4, 5, 6}

In [4]:
import re
# Scratch check of the value-cleaning regex: skip leading non-alphanumeric
# characters, capture the run of word characters, whitespace and hyphens, ignore
# what follows, then turn hyphens into spaces.
# The pattern is a raw string because `\s` and `\w` are not valid escapes in a
# plain string literal (DeprecationWarning, SyntaxWarning on Python 3.12+).
# The pattern value itself is unchanged.
sme_label = re.match(r'[^a-zA-Z0-9\s-]*([\w\s-]+)[^a-zA-Z0-9\s-]*', "'SME'.").group(1).replace("-", " ")
sme_label

'SME'

In [5]:
from engine import STRING_STORE

# Display engine's STRING_STORE; the recorded output shows an empty list right
# after import.
STRING_STORE

RULES FOR NLQ (NATURAL LANGUAGE QUERY)
*1 Always use KEYWORDS [where, order by, group by] in NL; slight deviations allowed
*2 Always join multiword operators with '-' [standard-deviation, minimum-of /etc]; other connectors allowed except space
*3 Where clauses don't support pure cat field comparisions 



[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1002)>


[]

In [None]:
# Run every case exposed by the tester (`tester.t`) for thrombosis_prediction.
# The tester object is called once per case; the list of return values is the
# cell's last expression, so it is displayed after the per-case output.
tester = SQLTestCases('thrombosis_prediction')
cases = tester.t
[tester(case) for case in cases]

t


####################################################################################################
NL Query :  Get the average of ANA where Thrombosis = 1 from Examination.




Reconstructed Query :  get average ANA where Thrombosis = <number> Examination
Parse Tree :  (S
  (CMD get)
  (AGG average)
  (COLUMN (NOMCAT ANA))
  (CLAUSE
    where
    (COLUMN (NOMCAT Thrombosis))
    (NOP =)
    (VALUE <number>)
    (CLAUSE ))
  (TABLE Examination))
SQL :  SELECT AVG(ANA) FROM Examination WHERE Thrombosis = 1
Results :  [(639.2452830188679,)]
####################################################################################################
NL Query :  Retrieve the sum of ANA where Thrombosis = 2 in Examination.




Reconstructed Query :  retrieve summation ANA where Thrombosis = <number> Examination
Parse Tree :  (S
  (CMD retrieve)
  (AGG summation)
  (COLUMN (NOMCAT ANA))
  (CLAUSE
    where
    (COLUMN (NOMCAT Thrombosis))
    (NOP =)
    (VALUE <number>)
    (CLAUSE ))
  (TABLE Examination))
SQL :  SELECT SUM(ANA) FROM Examination WHERE Thrombosis = 2
Results :  [(27848,)]
####################################################################################################
NL Query :  Fetch unique values of Symptoms where Thrombosis = 3 from Examination.




Reconstructed Query :  fetch Symptoms where Thrombosis = <number> Examination
Parse Tree :  (S
  (CMD fetch)
  (COLUMN (CAT Symptoms))
  (CLAUSE
    where
    (COLUMN (NOMCAT Thrombosis))
    (NOP =)
    (VALUE <number>)
    (CLAUSE ))
  (TABLE Examination))
SQL :  SELECT Symptoms FROM Examination WHERE Thrombosis = 3
Results :  [('thrombocytepenia',), ('thrombocytopenia',), ('thrombocytopenia',)]
####################################################################################################
NL Query :  Fetch distinct Diagnosis values where Thrombosis = 2 in Examination.




Reconstructed Query :  fetch distinct Diagnosis where Thrombosis = <number> Examination
Parse Tree :  (S
  (CMD fetch)
  (AGG distinct)
  (COLUMN (CAT Diagnosis))
  (CLAUSE
    where
    (COLUMN (NOMCAT Thrombosis))
    (NOP =)
    (VALUE <number>)
    (CLAUSE ))
  (TABLE Examination))
SQL :  SELECT DISTINCT(Diagnosis) FROM Examination WHERE Thrombosis = 2
Results :  [(None,), ('SLE',), ('SLE+Psy',)]
####################################################################################################
NL Query :  Fetch the maximum value of GOT where GOT > 100 from Laboratory.




Reconstructed Query :  fetch maximum GOT where GOT > <number> Laboratory
Parse Tree :  (S
  (CMD fetch)
  (AGG maximum)
  (COLUMN (NOMCAT GOT))
  (CLAUSE
    where
    (COLUMN (NOMCAT GOT))
    (NOP >)
    (VALUE <number>)
    (CLAUSE ))
  (TABLE Laboratory))
SQL :  SELECT MAX(GOT) FROM Laboratory WHERE GOT > 100
Results :  [(21480,)]
####################################################################################################
NL Query :  Get the average of CPK where TG > 100 from Laboratory.




Reconstructed Query :  get average CPK where TG > <number> Laboratory
Parse Tree :  (S
  (CMD get)
  (AGG average)
  (COLUMN (NOMCAT CPK))
  (CLAUSE
    where
    (COLUMN (NOMCAT TG))
    (NOP >)
    (VALUE <number>)
    (CLAUSE ))
  (TABLE Laboratory))
SQL :  SELECT AVG(CPK) FROM Laboratory WHERE TG > 100
Results :  [(119.23266219239373,)]
####################################################################################################
NL Query :  Retrieve the average value of ALB where HGB > 12 from Laboratory.




Reconstructed Query :  retrieve average ALB where HGB > <number> Laboratory
Parse Tree :  (S
  (CMD retrieve)
  (AGG average)
  (COLUMN (NUM ALB))
  (CLAUSE
    where
    (COLUMN (NUM HGB))
    (NOP >)
    (VALUE <number>)
    (CLAUSE ))
  (TABLE Laboratory))
SQL :  SELECT AVG(ALB) FROM Laboratory WHERE HGB > 12
Results :  [(4.281120716270456,)]
####################################################################################################
NL Query :  Get the total count of RBC where T-BIL > 0.2 in Laboratory.




Reconstructed Query :  get count RBC where T-BIL > <number> Laboratory
Parse Tree :  (S
  (CMD get)
  (AGG count)
  (COLUMN (NUM RBC))
  (CLAUSE
    where
    (COLUMN (NUM T-BIL))
    (NOP >)
    (VALUE <number>)
    (CLAUSE ))
  (TABLE Laboratory))
SQL :  SELECT COUNT(RBC) FROM Laboratory WHERE T-BIL > 0.2
Results :  []
####################################################################################################
NL Query :  Fetch the range of UA values where CRP = 2 in Laboratory.




Reconstructed Query :  fetch RA UA where CRP = <number> Laboratory
Parse Tree :  (S
  (CMD fetch)
  (COLUMN (CAT RA) (COLUMN (NUM UA)))
  (CLAUSE
    where
    (COLUMN (CAT CRP))
    (NOP =)
    (VALUE <number>)
    (CLAUSE ))
  (TABLE Laboratory))
SQL :  SELECT RA, UA FROM Laboratory WHERE CRP = 2
Results :  [('-', 7.0), (None, 8.3), (None, 6.6)]
####################################################################################################
NL Query :  Retrieve the minimum value of ALP where ALP > 15 in Laboratory.




Reconstructed Query :  retrieve minimum ALP where ALP > <number> Laboratory
Parse Tree :  (S
  (CMD retrieve)
  (AGG minimum)
  (COLUMN (NOMCAT ALP))
  (CLAUSE
    where
    (COLUMN (NOMCAT ALP))
    (NOP >)
    (VALUE <number>)
    (CLAUSE ))
  (TABLE Laboratory))
SQL :  SELECT MIN(ALP) FROM Laboratory WHERE ALP > 15
Results :  [(22,)]
####################################################################################################
NL Query :  Fetch the total count of PatientID where SEX = 1 from Patient.




Reconstructed Query :  fetch count ID where SEX = <number> Patient
Parse Tree :  (S
  (CMD fetch)
  (AGG count)
  (COLUMN (NOMCAT ID))
  (CLAUSE
    where
    (COLUMN (CAT SEX))
    (NOP =)
    (VALUE <number>)
    (CLAUSE ))
  (TABLE Patient))
SQL :  SELECT COUNT(ID) FROM Patient WHERE SEX = 1
Results :  [(0,)]
####################################################################################################
NL Query :  Get all unique values of SEX where ID = 4060811 from Patient.




Reconstructed Query :  get SEX where ID = <number> Patient
Parse Tree :  (S
  (CMD get)
  (COLUMN (CAT SEX))
  (CLAUSE
    where
    (COLUMN (NOMCAT ID))
    (NOP =)
    (VALUE <number>)
    (CLAUSE ))
  (TABLE Patient))
SQL :  SELECT SEX FROM Patient WHERE ID = 4060811
Results :  [('F',)]
####################################################################################################
NL Query :  Fetch all non-null Description values where PatientID = 1124385 from Patient.




Reconstructed Query :  fetch Description where ID = <number> Patient
Parse Tree :  (S
  (CMD fetch)
  (COLUMN (CAT Description))
  (CLAUSE
    where
    (COLUMN (NOMCAT ID))
    (NOP =)
    (VALUE <number>)
    (CLAUSE ))
  (TABLE Patient))
SQL :  SELECT Description FROM Patient WHERE ID = 1124385
Results :  [('1997-07-31',)]


[[(639.2452830188679,)],
 [(27848,)],
 [('thrombocytepenia',), ('thrombocytopenia',), ('thrombocytopenia',)],
 [(None,), ('SLE',), ('SLE+Psy',)],
 [(21480,)],
 [(119.23266219239373,)],
 [(4.281120716270456,)],
 [],
 [('-', 7.0), (None, 8.3), (None, 6.6)],
 [(22,)],
 [(0,)],
 [('F',)],
 [('1997-07-31',)]]

In [5]:
import re 

# Scratch check of the value-cleaning regex on a hyphenated value: the
# surrounding quote and trailing period are dropped and the hyphen is kept.
# The pattern is a raw string because `\s` and `\w` are not valid escapes in a
# plain string literal (DeprecationWarning, SyntaxWarning on Python 3.12+).
# The pattern value itself is unchanged.
hyphenated_token = re.match(r'[^a-zA-Z0-9\s-]*([\w\s-]+)[^a-zA-Z0-9\s-]*', "'hello-john'.").group(1)
hyphenated_token

'hello-john'