In [1]:
import json
import csv
import pymysql
import sqlite3
import time
import re
import os
import signal
from collections import defaultdict

In [2]:
!pwd

/home/hemilp/CSE_291_Virt/CSE-291-ML-for-Systems/code


### Extracting queries

In [3]:
sqlite_dbs = ['car_1', 'wine_1', 'student_1', 'inn_1', 'flight_1', 'formula_1', 'restaurants']
# sqlite_dbs = ['car_1', 'wine_1', 'student_1', 'inn_1', 'flight_1', 'formula_1']
mysql_dbs = ['advising', 'atis', 'geography', 'imdb']

independent_dbs = ['advising', 'atis', 'geography', 'restaurants', 'imdb']
spider_dbs = set(['car_1', 'wine_1', 'student_1', 'inn_1', 'flight_1', 'formula_1'])

json_data_dir = '../data/json/'
csv_data_dir = '../data/csv/'

In [4]:
def get_queries(db):
    # spider.json contains query for 6 different dbs
    spider = False
    if db == "spider":
        spider = True
        
    queryFile = os.path.join(json_data_dir, f'{db}.json')
                             
    # Open the JSON file and load its contents
    with open(queryFile, "r") as file:
        data = json.load(file)

    query_dict = defaultdict(list)
    for d in data:
        variables = d['variables']
        sql_queries = d['sql']

        if spider and d["sentences"][0]["database"] not in spider_dbs:
            continue

        # replacing each occurrence of a particular variable in any of the queries
        for sql_query in sql_queries:
            query = sql_query
            for var in variables:
                query = query.replace(var['name'], var['example'])
                
            if spider:
                query = re.sub(r'\( .+\.\* \)', '( * )', query)
                query_dict[d["sentences"][0]["database"]].append(query)
            else:
                query_dict[db].append(query)
    
    return query_dict

In [5]:
# Get queries for dbs in spider.json
spider_query_dict = get_queries('spider')
for db in spider_dbs:
    query_list = spider_query_dict[db]
    with open(os.path.join(csv_data_dir, f'{db}.csv'), 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['query'])
        print(db, len(query_list))
        for query in query_list:
            writer.writerow([query])

car_1 45
wine_1 41
student_1 27
formula_1 40
inn_1 36
flight_1 48


In [6]:
# Get queries for independent dbs
# for db in independent_dbs:
for db in ['yelp']:
    query_list = get_queries(db)[db]
    with open(os.path.join(csv_data_dir, f'{db}.csv'), 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['query'])
        print(db, len(query_list))
        for query in query_list:
            writer.writerow([query])

yelp 122


In [7]:
all_queries = []

# all_queries.extend(get_queries("./atis.json"))
queries = dict()

import os
for filename in os.listdir('./data/json'):
    if filename.endswith('spider.json'):
        all_queries.extend(get_queries(f'./data/json/{filename}'))
        queries[filename] = get_queries(f'./data/json/{filename}')

    
print(len(all_queries))

FileNotFoundError: [Errno 2] No such file or directory: './data/json'

In [14]:
len(all_queries)
for filename in os.listdir('.'):
    if filename.endswith('spider.json'):
        print(f'{filename}: {len(queries[filename])}')

spider.json: 40


In [15]:
s = set()
with open('./spider.json', 'r') as f:
    data = json.load(f)

for i in range(len(data)):
    sentences = data[i]["sentences"]
    for sentence in sentences:
        s.add(sentence["database"])

print(s)
print(len(s))

{'customers_and_addresses', 'manufacturer', 'medicine_enzyme_interaction', 'climbing', 'customers_campaigns_ecommerce', 'geo', 'department_store', 'scholar', 'riding_club', 'device', 'club_1', 'department_management', 'cre_Theme_park', 'wedding', 'behavior_monitoring', 'journal_committee', 'train_station', 'game_injury', 'flight_2', 'product_catalog', 'company_employee', 'county_public_safety', 'phone_market', 'hr_1', 'assets_maintenance', 'sakila_1', 'race_track', 'tracking_orders', 'small_bank_1', 'college_1', 'orchestra', 'hospital_1', 'performance_attendance', 'customer_complaints', 'company_1', 'pets_1', 'tracking_software_problems', 'workshop_paper', 'company_office', 'apartment_rentals', 'machine_repair', 'flight_1', 'music_2', 'customers_card_transactions', 'local_govt_and_lot', 'shop_membership', 'party_people', 'network_1', 'wta_1', 'election', 'document_management', 'poker_player', 'scientist_1', 'museum_visit', 'products_for_hire', 'movie_1', 'news_report', 'game_1', 'schoo

In [4]:
with open('./data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['query'])
    for query in all_queries:
        writer.writerow([query])

NameError: name 'all_queries' is not defined

### Obtaining execution time

At this point, we have all the queries. Now, we write a script to run the queries one by one and store their execution times in the CSV file.

In [8]:
runtime_data_dir = "../data/runtime"

if not os.path.exists(runtime_data_dir):
    os.makedir(runtime_data_dir)

In [14]:
# excute sqlite queries
with open(os.path.join(runtime_data_dir, f'query-runtime-spider.csv'), 'w', newline='') as outfile:
    query_writer = csv.writer(outfile)
    
    for i, db in enumerate(spider_dbs):
        sqlite_file = f'./data/db/{db}.sqlite'

        connection = sqlite3.connect(sqlite_file)

        cursor = connection.cursor()

        with open(os.path.join(csv_data_dir, f'{db}.csv'), newline='') as csvfile:
            query_reader = csv.reader(csvfile)
            rows = list(query_reader)

        # Add header for the "running time" column
        if i == 0:
            rows[0].append('runtime (ms)')
            rows[0].append('db')
            rows[0].append('engine')
            query_writer.writerows(rows[0:1])

        for j, row in enumerate(rows[1:]):
            start_time = time.time()
            sql_query = row[0]
            try:
                cursor.execute(sql_query)
            except:
                print(f'Line {j+2}: {sql_query}')
                continue
            cursor.connection.commit()
            end_time = time.time()
            running_time = (end_time - start_time) * 1000
            row.append(running_time)
            row.append(db)
            row.append('sqlite')
            
            cursor.execute("PRAGMA cache_size=0;")
            cursor.connection.commit()

            query_writer.writerow(row)

        cursor.close()
        connection.close()

In [12]:
!brew services start mysql

[34m==>[0m [1mSuccessfully started `mysql` (label: homebrew.mxcl.mysql)[0m


In [13]:
# To create local mysql instance
# brew services start mysql
# mysql -u root
# DROP DATABASE IF EXISTS {db};
# CREATE DATABASE {db};
# USE {db};
# source /path/to/{db}.sql
# \q
# brew services stop mysql
# excute mysql queries

# Define a function to handle the timeout
def timeout_handler(signum, frame):
    raise TimeoutError("Query execution timed out")

# Set the signal alarm to trigger
signal.signal(signal.SIGALRM, timeout_handler)
    
# with open(os.path.join(runtime_data_dir, f'query-runtime-spider.csv'), 'w', newline='') as outfile:
with open(os.path.join(runtime_data_dir, f'query-runtime-yelp.csv'), 'w', newline='') as outfile:
    query_writer = csv.writer(outfile)
#     for i, db in enumerate(['car_1', 'student_1', 'inn_1', 'formula_1', 'restaurants']):
    for i, db in enumerate(['yelp']):
        connection = pymysql.connect(
            host='localhost',
            user='root',
            password='',
            database=db
        )
        
        cursor = connection.cursor()
        
        cursor.execute("SET sql_mode=(SELECT REPLACE(@@sql_mode,'ONLY_FULL_GROUP_BY',''));")
        cursor.connection.commit()
                
        with open(os.path.join(csv_data_dir, f'{db}.csv'), newline='') as csvfile:
            query_reader = csv.reader(csvfile)
            row = next(query_reader)
            if i == 0:
                row.append('runtime (ms)')
                row.append('db')
                row.append('engine')
                query_writer.writerows([row])

            for j, row in enumerate(query_reader):
                signal.alarm(60)  # Timeout after 60 seconds
                
                start_time = time.time()
                sql_query = row[0]
                try:
                    cursor.execute(sql_query)
                    cursor.connection.commit()
                except TimeoutError:
                    print(f'Line {j+2}: TimeoutError')
                    continue
                except:
                    print(f'Line {j+2}: {sql_query}')
                    continue
                finally:
                    signal.alarm(0)
                    connection.ping(reconnect=True) # attempt to reconnect
                
                end_time = time.time()
                running_time = (end_time - start_time) * 1000
                row.append(running_time)
                row.append(db)
                row.append('mysql')

                query_writer.writerow(row)
                
        cursor.close()
        connection.close()

Line 50: SELECT NEIGHBORHOODalias0.NAME FROM BUSINESS AS BUSINESSalias0 , NEIGHBORHOOD AS NEIGHBORHOODalias0 , REVIEW AS REVIEWalias0 , USER AS USERalias0 WHERE NEIGHBORHOODalias0.BUSINESS_ID = BUSINESSalias0.BUSINESS_ID AND REVIEWalias0.BUSINESS_ID = BUSINESSalias0.BUSINESS_ID AND USERalias0.NAME = "Michelle" AND USERalias0.USER_ID = REVIEWalias0.USER_ID ;
Line 84: SELECT COUNT( DISTINCT ( REVIEWalias0.TEXT ) ) , REVIEWalias0.MONTH FROM BUSINESS AS BUSINESSalias0 , REVIEW AS REVIEWalias0 , USER AS USERalias0 WHERE REVIEWalias0.BUSINESS_ID = BUSINESSalias0.BUSINESS_ID AND USERalias0.NAME = "Michelle" GROUP BY REVIEWalias0.MONTH ;


In [14]:
!brew services stop mysql

Stopping `mysql`... (might take a while)
[34m==>[0m [1mSuccessfully stopped `mysql` (label: homebrew.mxcl.mysql)[0m


In [5]:
# Merge all the runtime files
dbs = ['advising', 'atis', 'geography', 'restaurants', 'imdb', 'spider']

output_file = os.path.join(runtime_data_dir, f'query-runtime-all.csv')

# Open the output CSV file in write mode
with open(output_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)

    # Iterate over each input file
    for i, db in enumerate(dbs):
        input_file = os.path.join(runtime_data_dir, f'query-runtime-{db}.csv')
        # Open the input CSV file
        with open(input_file, 'r', newline='') as infile:
            reader = csv.reader(infile)
            
            header = next(reader)
            if i == 0:
                writer.writerow(header)
                
            # Iterate over each row in the input file
            for row in reader:
                # Write the row to the output CSV file
                writer.writerow(row)

print("Concatenation complete")

Concatenation complete


In [35]:
import pandas as pd
df = pd.read_csv(os.path.join(runtime_data_dir, f'query-runtime-all.csv'))
print(df.head)
print(df.isnull().any())

<bound method NDFrame.head of                                                   query  runtime (ms)  \
0     SELECT DISTINCT COURSEalias0.ADVISORY_REQUIREM...      8.092165   
1     SELECT DISTINCT COURSEalias0.DEPARTMENT , COUR...      0.772238   
2     SELECT DISTINCT COURSEalias0.DEPARTMENT , COUR...      0.530958   
3     SELECT COUNT( * ) > 0 FROM COURSE AS COURSEali...     92.988014   
4     SELECT DISTINCT COURSEalias0.DEPARTMENT , COUR...      1.093864   
...                                                 ...           ...   
1910  SELECT MAX( RESULTSalias0.FASTESTLAPSPEED ) , ...      1.104116   
1911  SELECT AVG( RESULTSalias0.FASTESTLAPSPEED ) , ...      1.092196   
1912  SELECT COUNT( * ) , DRIVERSalias0.DRIVERID , D...     16.340971   
1913  SELECT COUNT( * ) , DRIVERSalias0.DRIVERID FRO...     15.832901   
1914  SELECT DRIVERSalias0.DRIVERID , DRIVERSalias0....     17.425776   

             db  engine  
0      advising   mysql  
1      advising   mysql  
2      advising