In [177]:
import sqlite3 as sql
from contextlib import closing
import pandas as pd
import numpy as np
import time as tm
import pickle
import os
import csv
import importlib
import execute  # Import module without specifying the function
importlib.reload(execute)  # Force reload
import index  # Import module without specifying the function
importlib.reload(index) # Force reload
from execute import my_execute
from index import my_index

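The sqlite3 connection context manager only commits or rolls back the transaction; it does not close the connection (yet). We therefore wrap the connection in contextlib.closing, an elegant workaround proposed by erlendaasland:\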
https://discuss.python.org/t/implicitly-close-sqlite3-connections-with-context-managers/33320/3

In [156]:
def safe_tran( db_name, query ):
  """Run one SQL statement against an SQLite database and return its rows.

  A new connection is opened for every call and is always closed on exit
  (contextlib.closing), even when the statement raises. The connection is
  closed without an explicit commit.

  Parameters
  ----------
  db_name : database location, passed unchanged to sqlite3.connect
  query : SQL text to execute

  Returns
  -------
  pandas.DataFrame with one column per result column. A statement that
  produces no result set (e.g. DDL) yields an empty DataFrame.
  """
  with closing( sql.connect( db_name ) ) as conn:
    cur = conn.execute( query )
    # cursor.description is None for statements that return no rows;
    # iterating over it would raise a TypeError
    if cur.description is None:
      return pd.DataFrame()
    cols = [ col[0] for col in cur.description ]
    # Materialise the rows while the connection is still open
    return pd.DataFrame.from_records( cur, columns = cols )


# SQLite database file that is queried for the reference ("gold") results
db_name = "public.db"

def get_gold_results( query ):
  """Run `query` on the database named by the global `db_name` and return the resulting DataFrame."""
  return safe_tran( db_name, query )

In [185]:
def make_sqlite_query( clause ):
  """Build the SELECT statement that corresponds to a clause.

  `clause` is a sequence of predicates, each a sequence of string tokens
  (e.g. [ 'year', '>=', '1955' ]). The tokens of a predicate are joined
  with single spaces and the predicates are combined with AND.
  """
  conditions = []
  for pred in clause:
    conditions.append( ' '.join( pred ) )
  return "SELECT id FROM tbl WHERE " + " AND ".join( conditions )

def eval_results( clause, disk, idx_stat, verbose = True ):
  """Score the index-based answer to one clause against the reference DB.

  Parameters
  ----------
  clause : list of predicates (each a list of string tokens) describing the query
  disk : array of ids addressed by disk location; it is indexed with an
    array of locations, so it must support NumPy-style fancy indexing
  idx_stat : index object, handed unchanged to my_execute
  verbose : when True (the default) print the rows on which the two answers
    disagree and, if the score is not 1, the clause itself. Pass False to
    silence this diagnostic output.

  Returns
  -------
  ( t_idx, t_seek, t_read, score )
    t_idx  : seconds (time.perf_counter) spent inside my_execute
    t_seek : number of disk locations skipped over while visiting the
             returned locations in the order given, wrapping around the
             end of the disk when a location is not ahead of the previous one
    t_read : number of disk locations returned
    score  : agreement with the reference answer, rounded to 2 decimals
  """
  # Execute the query on an actual DB
  df_gold = get_gold_results( make_sqlite_query( clause ) )

  # Execute the query using the index and time it
  tic = tm.perf_counter()
  diskloc_list = my_execute( clause, idx_stat )
  toc = tm.perf_counter()
  t_idx = toc - tic

  # Clamp the returned locations to the valid range of the disk so that
  # a faulty index cannot read outside of it
  diskloc_list = np.minimum( np.maximum( diskloc_list, 0 ), len( disk ) - 1 )

  # Find the seek and read time required to retrieve records from the virtual disk
  t_read = len( diskloc_list )
  if t_read == 0:
    # Nothing was read, hence nothing had to be skipped either
    t_seek = 0
  else:
    diffs = diskloc_list[ 1: ] - diskloc_list[ :-1 ]
    # Take care of cases where we need to loop back to reach a record
    diffs[ diffs <= 0 ] += len( disk )
    t_seek = diffs.sum()
    # Every step is at least 1 after the wrap-around correction
    assert( t_seek >= t_read - 1 )
    # Moving to the adjacent location costs no seek time
    t_seek -= t_read - 1

  # Get hold of the tuples chosen by the index from the virtual disk
  response_stu = disk[ diskloc_list ] if t_read > 0 else []
  df_stu = pd.DataFrame( response_stu, columns = [ "id" ] )

  # Rename columns just to be safe so as to enable merging
  df_stu = df_stu.rename( columns = dict( zip( df_stu.columns, df_gold.columns ) ) )

  union = pd.merge( df_gold, df_stu, how = "outer", indicator = True )
  inter = pd.merge( df_gold, df_stu, how = "inner", indicator = True )

  if verbose:
    # Rows present in only one of the two answers
    print( union[ union[ '_merge' ] != 'both' ] )

  if len( df_gold ) > 0:
    # Intersection over union; len( diskloc_list ) acts as a lower bound
    # on the denominator
    score = round( len( inter ) / max( len( diskloc_list ), len( union ) ), 2 )
  else:
    # The gold response itself is empty: penalize a non-empty response by the index
    score = round( 1 / ( 1 + len( diskloc_list ) ), 2 )

  if verbose and score != 1:
    print( clause )

  return t_idx, t_seek, t_read, score

In [173]:
# Number of trials over which the measurements are averaged
n_trials = 1

# Running totals, all starting from zero
# Plain Python numbers
t_build = idx_size = t_idx = score = 0
# NumPy 64-bit integer counters
disk_size, t_seek, t_read = np.int64( 0 ), np.int64( 0 ), np.int64( 0 )

In [174]:
# Read the data to be indexed: each CSV row becomes an ( int, str, int ) tuple
# newline = '' is what the csv module asks for, so that it can interpret
# line endings (including newlines inside quoted fields) itself
with open( "public.csv", 'r', newline = '' ) as csvfile:
  reader = csv.reader( csvfile )
  # csv.reader yields [] for blank lines; skip them instead of failing on row[ 0 ]
  tuples = [ ( int( row[ 0 ] ), row[ 1 ], int( row[ 2 ] ) ) for row in reader if row ]

# Create proper predicates out of CSV data
def make_predicates( tok_list ):
  """Split a flat list of tokens into predicates of three tokens each.

  Parameters
  ----------
  tok_list : list of tokens read from one CSV row; consecutive groups of
    three tokens form one predicate

  Returns
  -------
  list of three-element lists, one per predicate, in row order

  Raises
  ------
  ValueError if the row is empty or its length is not a multiple of three.
  (Such rows used to yield None, which only failed later and far away
  from the offending row.)
  """
  n_tok = len( tok_list )
  if n_tok == 0 or n_tok % 3 != 0:
    raise ValueError( f"expected a non-empty multiple of 3 tokens, got {n_tok}: {tok_list!r}" )
  return [ tok_list[ i : i + 3 ] for i in range( 0, n_tok, 3 ) ]

# Read the clauses that will constitute the evaluation queries
# newline = '' is what the csv module asks for, so that it can interpret
# line endings (including newlines inside quoted fields) itself
with open( "clauses.csv", 'r', newline = '' ) as csvfile:
  reader = csv.reader( csvfile )
  # csv.reader yields [] for blank lines; skip them so that they do not
  # end up as bogus clauses in the list
  c_list = [ make_predicates( row ) for row in reader if row ]

In [178]:
# Start every run of this cell from zero, so that re-running it does not
# pile a second batch of measurements on top of the previous one
t_build = idx_size = t_idx = score = 0
disk_size, t_seek, t_read = np.int64( 0 ), np.int64( 0 ), np.int64( 0 )

for t in range( n_trials ):
  # Time the index construction, including the conversion of the disk to an array
  tic = tm.perf_counter()
  disk, idx_stat = my_index( tuples )
  disk = np.array( disk )
  toc = tm.perf_counter()
  t_build += toc - tic

  disk_size += len( disk )

  # The size of the pickled index file is what counts as the index size
  dump_path = f"idx_dump_{t}.pkl"
  with open( dump_path, "wb" ) as outfile:
    pickle.dump( idx_stat, outfile, protocol=pickle.HIGHEST_PROTOCOL )

  idx_size += os.path.getsize( dump_path )

  # Evaluate every clause against this build of the index and add up the results
  for clause in c_list:
    t_i, t_s, t_r, scr = eval_results( clause, disk, idx_stat )
    t_idx += t_i
    t_seek += t_s
    t_read += t_r
    score += scr

(491533, 'cuodrij', 1964)
(252307, 'cyadraf', 1964)
(307789, 'dheofoi', 1964)
(254497, 'dyesyaej', 1964)
(425114, 'fajij', 1964)
(381553, 'fasau', 1964)
(384256, 'fasu', 1964)
(175989, 'febag', 1964)
(172821, 'fijaf', 1964)
(227749, 'fojub', 1964)
(190627, 'fosuj', 1964)
(187456, 'jachuuf', 1964)
(238686, 'jajae', 1964)
(482278, 'jeughaij', 1964)
(132314, 'jufiel', 1964)
(226700, 'laujou', 1964)
(467830, 'lodyoj', 1964)
(119014, 'lolef', 1964)
(452210, 'roju', 1964)
(454323, 'suicil', 1964)
(323044, 'traenaf', 1964)
(398507, 'baefe', 1965)
(454982, 'bhejuc', 1965)
(341328, 'bocuj', 1965)
(146959, 'bolo', 1965)
(140274, 'ceujai', 1965)
(363102, 'dhifar', 1965)
(449908, 'drujof', 1965)
(243202, 'druula', 1965)
(381823, 'dyufol', 1965)
(258944, 'fecaj', 1965)
(451728, 'fijos', 1965)
(287572, 'foasyes', 1965)
(368177, 'fogib', 1965)
(153204, 'fuja', 1965)
(266325, 'gyaofuis', 1965)
(455098, 'jaofo', 1965)
(147420, 'jeghi', 1965)
(335299, 'jenof', 1965)
(284109, 'joji', 1965)
(477813, 'khuf

In [161]:
# Average the accumulated totals into new names instead of dividing them in
# place: with in-place division every re-run of this cell divided the totals
# again (score by len( c_list ) each time), silently changing the report
avg_t_build = t_build / n_trials
avg_disk_size = disk_size / n_trials
avg_idx_size = idx_size / n_trials
avg_t_idx = t_idx / n_trials
avg_t_seek = t_seek / n_trials
avg_t_read = t_read / n_trials
# The score is additionally averaged over the clauses
avg_score = score / n_trials / len( c_list )

print( avg_t_build, avg_disk_size, avg_idx_size, avg_t_idx, avg_t_seek, avg_t_read, avg_score )

3.9317062999980408 200000.0 20372311.0 0.03958410004997859 131476090.0 1400713.0 0.5567000000000001


0.22813676666313162 300000.0 2861000.0 3.787527566673816 6400303.0 1400713.0 1.0
0.21858643333447011 300000.0 2861000.0 3.9164780333449016 6400303.0 1400713.0 1.0


1. Disk size is 300000 because ids are stored 3 times: sorted by id, by name, by year
2. Index size is high because we have stored entire tuples in idx_stat returned by my_index()
3. Score is 1.0 indicating 100% accuracy

# ----------------- END ------------------



The cells below are for testing individual clauses.

In [179]:
# Single clause to evaluate on its own: name LIKE 'je%' AND year >= 1955
ind_clause = [
  [ 'name', 'LIKE', "'je%'" ],
  [ 'year', '>=', '1955' ],
]

In [186]:
import sqlite3 as sql
from contextlib import closing
import pandas as pd
import numpy as np
import time as tm
import pickle
import os
import csv
import importlib
import execute  # Import module without specifying the function
importlib.reload(execute)  # Force reload
import index  # Import module without specifying the function
importlib.reload(index) # Force reload
from execute import my_execute
from index import my_index

# Reset every accumulator so that the printed figures describe this clause only
t_build = idx_size = t_idx = score = 0
disk_size, t_seek, t_read = np.int64( 0 ), np.int64( 0 ), np.int64( 0 )

# Evaluate the single clause; `disk` and `idx_stat` must already exist in the kernel
t_i, t_s, t_r, scr = eval_results( ind_clause , disk, idx_stat )
t_idx, t_seek, t_read, score = t_idx + t_i, t_seek + t_s, t_read + t_r, score + scr

print( t_build, disk_size, idx_size, t_idx, t_seek, t_read, score )

{1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024}
          id      _merge
0     100018   left_only
1     100046   left_only
2     100117   left_only
3     100154   left_only
4     100193  right_only
...      ...         ...
9833  499678   left_only
9834  499691   left_only
9835  499758   left_only
9836  499827   left_only
9837  499993   left_only

[9685 rows x 2 columns]
[['name', 'LIKE', "'je%'"], ['year', '>=', '1955']]
0 0 0 0.0006703000035486184 6796339 4948 0.02


The cells below inspect the disk and the disk locations returned by my_execute.

In [21]:
# Display the virtual disk array (NumPy abbreviates the repr of a large array)
disk

array([491533, 252307, 307789, ..., 194617, 142226, 105687])

In [22]:
# Ask the index for the disk locations matching an equality predicate on name
disk_locations2 = my_execute([['name', '=', "'jafif'"]], idx_stat)
# Print the raw locations so they can be checked by eye
print("Query 2 Disk Locations:", disk_locations2)

[['name', '=', "'jafif'"]]
j
a
f
i
f
Query 2 Disk Locations: [141896, 141897, 141898, 141899, 141900, 141901, 141902, 141903, 141904, 141905, 141906, 141907, 141908, 141909, 141910]
