In [76]:
import json
import sqlite3
import pandas as pd
import numpy as np
import requests
import tqdm
import re

In [63]:
# Arguments
db_path = '../Database/Aminer_Simplified-small.sqlite'
template_path = './templates.json'
url = 'openai-url'
api_key = 'your-api-key'
headers = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {api_key}'
}
prompt_path = './ZeroShotRewitePrompt.txt'

In [19]:
def get_tables_info(db_path) -> dict:

    tables_info = {}
    
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    
    # Fetch names of all tables
    cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cur.fetchall()

    # For each table, fetch column names and data types
    for table in tables:
        table_name = table[0]
        cur.execute(f"PRAGMA table_info({table_name});")
        columns_info = cur.fetchall()
        columns = []
        for column_info in columns_info:
            column_id, column_name, column_type, notnull, dflt_value, pk = column_info
            columns.append((column_name, column_type))
        tables_info[table_name] = columns
    
    conn.close()
    return tables_info

get_tables_info(db_path=db_path)

{'Venue': [('id', 'TEXT'), ('DisplayName', 'TEXT')],
 'Affiliation': [('id', 'TEXT'),
  ('DisplayName', 'TEXT'),
  ('type', 'TEXT'),
  ('url', 'TEXT')],
 'Author': [('id', 'TEXT'),
  ('name', 'TEXT'),
  ('org', 'TEXT'),
  ('position', 'TEXT'),
  ('n_pubs', 'INTEGER'),
  ('n_citation', 'INTEGER'),
  ('h_index', 'INTEGER')],
 'Paper': [('id', 'TEXT'),
  ('title', 'TEXT'),
  ('year', 'INTEGER'),
  ('n_citation', 'INTEGER'),
  ('page_start', 'TEXT'),
  ('page_end', 'TEXT'),
  ('lang', 'TEXT'),
  ('volume', 'TEXT'),
  ('doi', 'TEXT'),
  ('pdf', 'TEXT'),
  ('abstract', 'TEXT')],
 'Venue_Papers': [('venue_id', 'TEXT'), ('paper_id', 'TEXT')],
 'Paper_Keywords': [('paper_id', 'TEXT'), ('keyword', 'TEXT')],
 'Paper_Authors': [('paper_id', 'TEXT'),
  ('rank', 'INTEGER'),
  ('author_id', 'TEXT')],
 'Orgnization_Researchers': [('affiliation_name', 'TEXT'),
  ('author_id', 'TEXT')],
 'Researcher_Interests': [('author_id', 'TEXT'),
  ('tag', 'TEXT'),
  ('weight', 'INTEGER')]}

In [12]:
def get_database_prompt(db_path) -> str:

    stmt = ''

    conn = sqlite3.connect(db_path)
    cur = conn.cursor()

    # Fetch names of all tables
    cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cur.fetchall()

    # Fech create statements for all tables
    for table in tables:
        table_name = table[0]
        cur.execute(f"SELECT sql FROM sqlite_master WHERE type='table' AND name='{table_name}';")
        create_statement = cur.fetchone()[0]

        stmt += create_statement + '\n\n'

    conn.close()
    return stmt

print(get_database_prompt(db_path=db_path))

CREATE TABLE Venue(
  id TEXT, -- id
  DisplayName TEXT, -- name of the conferenece/joural
  PRIMARY KEY (id)
)

CREATE TABLE Affiliation(
  id TEXT, -- id
  DisplayName TEXT, -- name of the orgnization
  type TEXT, -- orgnization type
  url TEXT, -- link of the orgnization's homepage
  PRIMARY KEY (id)
)

CREATE TABLE Author(
  id TEXT, -- id
  name TEXT, -- name
  org TEXT, -- author's current orgnization
  position TEXT, -- position
  n_pubs INTEGER, -- number of paper publication
  n_citation INTEGER, -- number of total citation
  h_index INTEGER, -- h-index
  PRIMARY KEY (id)
)

CREATE TABLE Paper(
  id TEXT, -- id
  title TEXT, -- title
  year INTEGER, -- publication year
  n_citation INTEGER, -- number of citation
  page_start TEXT, -- start page on the publication
  page_end TEXT, -- end page on the publication
  lang TEXT, -- language
  volume TEXT, -- volume of the publicaiton
  doi TEXT, -- digital object unique identifier
  pdf TEXT, -- pdf view link of the paper
  abstract

In [44]:
# get sample data from a table
def get_sample_data(db_path, table_name, tables_info, limit=20) -> pd.DataFrame:

    # Extract sample data of each column from given table
    columns = [col[0] for col in tables_info[table_name]]

    sample_data = None
    for i, column in enumerate(columns):
        conn = sqlite3.connect(db_path)
        query = f"SELECT {column} FROM {table_name} WHERE {column} IS NOT NULL LIMIT {limit};" 
        df = pd.read_sql_query(query, conn)
        conn.close()

        # concate sample data of each column
        if i == 0:
            sample_data = df
        else:
            sample_data = pd.concat([sample_data, df], axis=1)

    return sample_data

get_sample_data(db_path=db_path, table_name='Affiliation', tables_info=get_tables_info(db_path), limit=5)

Unnamed: 0,id,DisplayName,type,url
0,5f0c1358faabae14decd6074,AMC Networks International Southern Europe,company,https://amcnetworks.es/
1,5f0c1358faabae14decd6086,Santa Paula High School,school,http://sphs.net
2,5f0c1358faabae14decd608d,Kewego,company,https://www.unibw.de/home
3,5f0c1371faabae14decd60b6,Bundeswehr University Munich,university,http://mvhs.nbed.nb.ca/
4,5f0c1372faabae14decd60de,Sherman High School (Texas),school,http://adams.sb.school/


In [58]:
def genertate_template_data(db_path, template_path, count = 10) -> dict :
    
    # Prepare tables info
    tables_info = get_tables_info(db_path)
    
    # Prepare sample data for each table
    tables_data = {}
    for table_name in tables_info.keys():
        tables_data[table_name] = get_sample_data(db_path, table_name, tables_info, limit=20)
    
    # Load template
    with open(template_path, 'r') as f:
        templates = json.load(f)

    # Generate template data
    np.random.seed(0)
    generated = []
    
    for _ in range(count) :
        
        template = np.random.choice(templates)
        t_col = len(template['type']) # column count required in template
        t_text_col = len([col for col in template['type'] if col == 'TEXT'])  # text column count required in template
        t_number_col = len([col for col in template['type'] if col == 'NUMBER']) # number column count required in template
        query = template['query']
        question = np.random.choice(template['question']) # a query may have multiple questions

        while True : # sample a table until it satisfies the template
            table_name = np.random.choice(list(tables_info.keys()))
            columns = [col[0] for col in tables_info[table_name]]
            text_cols = [col[0] for col in tables_info[table_name] if col[1] == 'TEXT']
            number_cols = [col[0] for col in tables_info[table_name] if col[1] == 'INTEGER' or col[1] == 'REAL']
            if len(columns) < t_col or len(text_cols) < t_text_col or len(number_cols) < t_number_col :
                continue
            break
        
        # Items to fill slots in the template
        slot_columns = []
        slot_values = []
        slot_str = ''

        # Select columns that meets datatype requirement
        if 'type' in template :
            typeres = template['type']
            for tp in typeres :
                if tp == 'TEXT' :
                    slot_columns.append(np.random.choice(text_cols))
                elif tp == 'NUMBER' :
                    slot_columns.append(np.random.choice(number_cols))
                elif tp == 'ANY' :
                    slot_columns.append(np.random.choice(columns))
        else :
            slot_columns = np.random.choice(columns, t_col, replace=False)

        # Select values for the selected columns
        if 'value_int' in template :
            valueres = template['value_int']
            for val in valueres :
                if val != 'COUNT' :
                    col_name = slot_columns[int(val)]
                    col_data = tables_data[table_name][col_name]
                    slot_values.append(np.random.choice(col_data))
                else : # the returned record is limited
                    slot_values.append(np.random.randint(1, 100))
        
        # Selct str for he selected columns
        if 'value_str' in template :
            col_name = slot_columns[int(template['value_str'][-1])] # one str restriction at most in our template
            col_data = tables_data[table_name][col_name]
            slot_str = np.random.choice(col_data)

        # replace slot with sampled items
        def replace(query) :
            query = query.replace('{TABLE}', table_name) # fill table name 
            query = query.replace('{COLUMN}', slot_columns[0]) # fill columns
            for i in range(len(slot_columns)-1) :
                query = query.replace('{COLUMN'+str(i+2)+'}', slot_columns[i+1]) 
            if len(slot_values) > 0 : # fill values
                query = query.replace('{VALUE_INT}', str(slot_values[0]))
                if len(slot_values) > 1 :
                    query = query.replace('{VALUE_INT2}', str(slot_str[1]))
            if slot_str != '' : # fill str
                query = query.replace('{VALUE_STR}', slot_str)
            return query
        
        #print('Table:', table_name)
        #print('Columns:', slot_columns)
        #print('Values:', slot_values)
        #print('Str:', slot_str)
        #print('Question:', question)
        #print('Query:', query)

        query = replace(query)
        question = replace(question)
        
        #print('Question:', question)
        #print('Query:', query)
        generated.append({'question': question, 'SQL': query})
    return generated
    
        
genertate_template_data(db_path=db_path, template_path=template_path, count = 10)

[{'question': 'Report the n_citation of all Paper for which n_citation does not equals to 0.',
  'SQL': 'SELECT n_citation FROM Paper WHERE n_citation != 0'},
 {'question': 'what are the two paper_id of highest rank?',
  'SQL': 'SELECT paper_id FROM Paper_Authors ORDER BY rank DESC LIMIT 2'},
 {'question': 'show the type with showing up fewer than 40 times.',
  'SQL': 'SELECT type FROM Affiliation GROUP BY type HAVING COUNT (*) < 40'},
 {'question': 'How many Researcher_Interests has a weight of more than 1 and a tag is visualization?',
  'SQL': 'SELECT COUNT (*) FROM Researcher_Interests WHERE weight > 1 AND tag = "visualization"'},
 {'question': 'what is the position and id of the Author with the top 5 smallest n_citation ?',
  'SQL': 'SELECT position , id FROM Author ORDER BY n_citation LIMIT 5'},
 {'question': 'What is the n_citation value that has the most occurance?',
  'SQL': 'SELECT n_citation FROM Paper GROUP BY n_citation ORDER BY count(*) DESC LIMIT 1'},
 {'question': 'list 

In [65]:
def filter_template_data(data, db_path) -> list :
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    filtered = []
    for item in data :
        try :
            cur.execute(item['SQL'])
            result = cur.fetchall()
            if len(result) > 0 :
                filtered.append(item)
        except Exception as e :
            print(e)
            print('Error in SQL:', item['SQL'])
    conn.close()
    return filtered

data = genertate_template_data(db_path=db_path, template_path=template_path, count = 10)
data = filter_template_data(data, db_path)
data

[{'question': 'Report the n_citation of all Paper for which n_citation does not equals to 0.',
  'SQL': 'SELECT n_citation FROM Paper WHERE n_citation != 0'},
 {'question': 'what are the two paper_id of highest rank?',
  'SQL': 'SELECT paper_id FROM Paper_Authors ORDER BY rank DESC LIMIT 2'},
 {'question': 'show the type with showing up fewer than 40 times.',
  'SQL': 'SELECT type FROM Affiliation GROUP BY type HAVING COUNT (*) < 40'},
 {'question': 'How many Researcher_Interests has a weight of more than 1 and a tag is visualization?',
  'SQL': 'SELECT COUNT (*) FROM Researcher_Interests WHERE weight > 1 AND tag = "visualization"'},
 {'question': 'what is the position and id of the Author with the top 5 smallest n_citation ?',
  'SQL': 'SELECT position , id FROM Author ORDER BY n_citation LIMIT 5'},
 {'question': 'What is the n_citation value that has the most occurance?',
  'SQL': 'SELECT n_citation FROM Paper GROUP BY n_citation ORDER BY count(*) DESC LIMIT 1'},
 {'question': 'list 

In [79]:
def extract(response:str) -> str : # extract response from formatted string
    text = ''
    try :
        text = re.search(r'\{(.+?)\}', response, re.DOTALL).groups(0)[-1] 
    except : 
        return ''
    return text

def rewrite_template_query(data, database_prompt, prompt_path) -> list :
    rewritten = []
    prompt_template = open(prompt_path, 'r').read()
    prompt_template = prompt_template.replace('SCHEMA_SLOT', database_prompt)
    for item in tqdm.tqdm(data) :
        question = item['question']
        query = item['SQL']
        params = {
                    "model": "gpt-3.5-turbo-16k",
                    "messages": [{"role":"user", "content":prompt_template.replace('NATURAL_LANGUAGE_QUESTION', question).replace('SQL_QUERY', query)}],
                    "temperature": 0.5,
        }
        print(params)
        response = requests.post(url, headers=headers, data=json.dumps(params))
        rewrite_question = extract(response.json().get('choices')[0].get('message').get('content'))
        print(question)
        print(response.json().get('choices')[0].get('message').get('content'))
        if rewrite_question != '' :
            rewritten.append({'question': rewrite_question, 'SQL': query})
    return rewritten

rewrite_template_query(data=data, database_prompt=get_database_prompt(db_path), prompt_path=prompt_path)

  0%|          | 0/10 [00:00<?, ?it/s]

{'model': 'gpt-3.5-turbo-16k', 'messages': [{'role': 'user', 'content': '[Task Description]\nYou have a SQLite database. Based on the Schema(with useful comments on columns), you generated plentiful question-query pairs with template, whose readability is terrible. Now, given a question-query pair, you are asked to rewrite the question to enhance its readability with the help of comments in Schema.\n\n[Schema]\n{CREATE TABLE Venue(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the conferenece/joural\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Affiliation(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the orgnization\n  type TEXT, -- orgnization type\n  url TEXT, -- link of the orgnization\'s homepage\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Author(\n  id TEXT, -- id\n  name TEXT, -- name\n  org TEXT, -- author\'s current orgnization\n  position TEXT, -- position\n  n_pubs INTEGER, -- number of paper publication\n  n_citation INTEGER, -- number of total citation\n  h_index INTEGER, -- 

 10%|█         | 1/10 [00:01<00:14,  1.60s/it]

Report the n_citation of all Paper for which n_citation does not equals to 0.
{show me the number of citations for all papers that have at least one citation.}
{'model': 'gpt-3.5-turbo-16k', 'messages': [{'role': 'user', 'content': '[Task Description]\nYou have a SQLite database. Based on the Schema(with useful comments on columns), you generated plentiful question-query pairs with template, whose readability is terrible. Now, given a question-query pair, you are asked to rewrite the question to enhance its readability with the help of comments in Schema.\n\n[Schema]\n{CREATE TABLE Venue(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the conferenece/joural\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Affiliation(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the orgnization\n  type TEXT, -- orgnization type\n  url TEXT, -- link of the orgnization\'s homepage\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Author(\n  id TEXT, -- id\n  name TEXT, -- name\n  org TEXT, -- author\'s current orgniz

 20%|██        | 2/10 [00:04<00:18,  2.34s/it]

what are the two paper_id of highest rank?
{which two papers have the highest rank?}
{'model': 'gpt-3.5-turbo-16k', 'messages': [{'role': 'user', 'content': '[Task Description]\nYou have a SQLite database. Based on the Schema(with useful comments on columns), you generated plentiful question-query pairs with template, whose readability is terrible. Now, given a question-query pair, you are asked to rewrite the question to enhance its readability with the help of comments in Schema.\n\n[Schema]\n{CREATE TABLE Venue(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the conferenece/joural\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Affiliation(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the orgnization\n  type TEXT, -- orgnization type\n  url TEXT, -- link of the orgnization\'s homepage\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Author(\n  id TEXT, -- id\n  name TEXT, -- name\n  org TEXT, -- author\'s current orgnization\n  position TEXT, -- position\n  n_pubs INTEGER, -- number of paper p

 30%|███       | 3/10 [00:06<00:15,  2.25s/it]

show the type with showing up fewer than 40 times.
{show me the types of organizations that appear less than 40 times in the database.}
{'model': 'gpt-3.5-turbo-16k', 'messages': [{'role': 'user', 'content': '[Task Description]\nYou have a SQLite database. Based on the Schema(with useful comments on columns), you generated plentiful question-query pairs with template, whose readability is terrible. Now, given a question-query pair, you are asked to rewrite the question to enhance its readability with the help of comments in Schema.\n\n[Schema]\n{CREATE TABLE Venue(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the conferenece/joural\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Affiliation(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the orgnization\n  type TEXT, -- orgnization type\n  url TEXT, -- link of the orgnization\'s homepage\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Author(\n  id TEXT, -- id\n  name TEXT, -- name\n  org TEXT, -- author\'s current orgnization\n  position TEXT, 

 40%|████      | 4/10 [00:08<00:12,  2.05s/it]

How many Researcher_Interests has a weight of more than 1 and a tag is visualization?
{how many researchers have an interest in visualization with a weight greater than 1?}
{'model': 'gpt-3.5-turbo-16k', 'messages': [{'role': 'user', 'content': '[Task Description]\nYou have a SQLite database. Based on the Schema(with useful comments on columns), you generated plentiful question-query pairs with template, whose readability is terrible. Now, given a question-query pair, you are asked to rewrite the question to enhance its readability with the help of comments in Schema.\n\n[Schema]\n{CREATE TABLE Venue(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the conferenece/joural\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Affiliation(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the orgnization\n  type TEXT, -- orgnization type\n  url TEXT, -- link of the orgnization\'s homepage\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Author(\n  id TEXT, -- id\n  name TEXT, -- name\n  org TEXT, -- author\'s c

 50%|█████     | 5/10 [00:10<00:10,  2.17s/it]

what is the position and id of the Author with the top 5 smallest n_citation ?
{what is the position and id of the top 5 authors with the fewest number of citations?}
{'model': 'gpt-3.5-turbo-16k', 'messages': [{'role': 'user', 'content': '[Task Description]\nYou have a SQLite database. Based on the Schema(with useful comments on columns), you generated plentiful question-query pairs with template, whose readability is terrible. Now, given a question-query pair, you are asked to rewrite the question to enhance its readability with the help of comments in Schema.\n\n[Schema]\n{CREATE TABLE Venue(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the conferenece/joural\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Affiliation(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the orgnization\n  type TEXT, -- orgnization type\n  url TEXT, -- link of the orgnization\'s homepage\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Author(\n  id TEXT, -- id\n  name TEXT, -- name\n  org TEXT, -- author\'s current

 60%|██████    | 6/10 [00:13<00:10,  2.52s/it]

What is the n_citation value that has the most occurance?
{what is the number of citations that appears most frequently in our data?}
{'model': 'gpt-3.5-turbo-16k', 'messages': [{'role': 'user', 'content': '[Task Description]\nYou have a SQLite database. Based on the Schema(with useful comments on columns), you generated plentiful question-query pairs with template, whose readability is terrible. Now, given a question-query pair, you are asked to rewrite the question to enhance its readability with the help of comments in Schema.\n\n[Schema]\n{CREATE TABLE Venue(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the conferenece/joural\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Affiliation(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the orgnization\n  type TEXT, -- orgnization type\n  url TEXT, -- link of the orgnization\'s homepage\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Author(\n  id TEXT, -- id\n  name TEXT, -- name\n  org TEXT, -- author\'s current orgnization\n  position TEXT, --

 70%|███████   | 7/10 [00:16<00:07,  2.40s/it]

list all information about Affiliation.
{show me all the information about organizations in our database.}
{'model': 'gpt-3.5-turbo-16k', 'messages': [{'role': 'user', 'content': '[Task Description]\nYou have a SQLite database. Based on the Schema(with useful comments on columns), you generated plentiful question-query pairs with template, whose readability is terrible. Now, given a question-query pair, you are asked to rewrite the question to enhance its readability with the help of comments in Schema.\n\n[Schema]\n{CREATE TABLE Venue(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the conferenece/joural\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Affiliation(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the orgnization\n  type TEXT, -- orgnization type\n  url TEXT, -- link of the orgnization\'s homepage\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Author(\n  id TEXT, -- id\n  name TEXT, -- name\n  org TEXT, -- author\'s current orgnization\n  position TEXT, -- position\n  n_pubs INTEGER

 80%|████████  | 8/10 [00:18<00:04,  2.27s/it]

What is the average and maximum year of each Paper?
{what is the average and maximum publication year for each paper?}
{'model': 'gpt-3.5-turbo-16k', 'messages': [{'role': 'user', 'content': '[Task Description]\nYou have a SQLite database. Based on the Schema(with useful comments on columns), you generated plentiful question-query pairs with template, whose readability is terrible. Now, given a question-query pair, you are asked to rewrite the question to enhance its readability with the help of comments in Schema.\n\n[Schema]\n{CREATE TABLE Venue(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the conferenece/joural\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Affiliation(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the orgnization\n  type TEXT, -- orgnization type\n  url TEXT, -- link of the orgnization\'s homepage\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Author(\n  id TEXT, -- id\n  name TEXT, -- name\n  org TEXT, -- author\'s current orgnization\n  position TEXT, -- position\n  n_

 90%|█████████ | 9/10 [00:21<00:02,  2.74s/it]

Find n_pubs and id of the top 3 n_pubs.
{what are the number of paper publications and their IDs for the top 3 authors with the highest number of paper publications?}
{'model': 'gpt-3.5-turbo-16k', 'messages': [{'role': 'user', 'content': '[Task Description]\nYou have a SQLite database. Based on the Schema(with useful comments on columns), you generated plentiful question-query pairs with template, whose readability is terrible. Now, given a question-query pair, you are asked to rewrite the question to enhance its readability with the help of comments in Schema.\n\n[Schema]\n{CREATE TABLE Venue(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the conferenece/joural\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Affiliation(\n  id TEXT, -- id\n  DisplayName TEXT, -- name of the orgnization\n  type TEXT, -- orgnization type\n  url TEXT, -- link of the orgnization\'s homepage\n  PRIMARY KEY (id)\n)\n\nCREATE TABLE Author(\n  id TEXT, -- id\n  name TEXT, -- name\n  org TEXT, -- author\'s current

100%|██████████| 10/10 [00:24<00:00,  2.44s/it]

For every paper_id in Paper_Keywords, return the paper_id, its number of distinct paper_id as well as the total count of Paper_Keywords within that group.
{for each paper, how many distinct keywords does it have and what is the total count of keywords for that paper?}





[{'question': 'show me the number of citations for all papers that have at least one citation.',
  'SQL': 'SELECT n_citation FROM Paper WHERE n_citation != 0'},
 {'question': 'which two papers have the highest rank?',
  'SQL': 'SELECT paper_id FROM Paper_Authors ORDER BY rank DESC LIMIT 2'},
 {'question': 'show me the types of organizations that appear less than 40 times in the database.',
  'SQL': 'SELECT type FROM Affiliation GROUP BY type HAVING COUNT (*) < 40'},
 {'question': 'how many researchers have an interest in visualization with a weight greater than 1?',
  'SQL': 'SELECT COUNT (*) FROM Researcher_Interests WHERE weight > 1 AND tag = "visualization"'},
 {'question': 'what is the position and id of the top 5 authors with the fewest number of citations?',
  'SQL': 'SELECT position , id FROM Author ORDER BY n_citation LIMIT 5'},
 {'question': 'what is the number of citations that appears most frequently in our data?',
  'SQL': 'SELECT n_citation FROM Paper GROUP BY n_citation O