**Upper Bound - LP**

In [None]:
!pip install pulp
!pip install gurobipy

import gurobipy as gp
from gurobipy import GRB
from typing import List

def get_ub_two_dims(seen: List[List[int]], row_sum: List[int], col_sum: List[int]) -> int:
    """Return an LP-based upper bound on the conflict count of a partially seen table.

    The model completes the observed counts ``seen`` with non-negative
    (fractional) unseen counts so that every row reaches exactly
    ``row_sum[i]`` and no column exceeds ``col_sum[j]``, while minimizing the
    sum of the per-row maxima. The conflict is ``sum(row_sum)`` minus that
    sum, so the minimum yields the largest conflict such a completion can have.

    Args:
        seen: ``n x m`` matrix of counts observed so far.
        row_sum: required total of each of the ``n`` rows.
        col_sum: capacity of each of the ``m`` columns.

    Returns:
        The bound, rounded down to an integer.

    Raises:
        RuntimeError: if Gurobi does not report an optimal solution.
    """
    import math

    n, m = len(row_sum), len(col_sum)

    # Totals of what has already been observed (constants in the constraints).
    seen_row_total = [sum(seen[i][j] for j in range(m)) for i in range(n)]
    seen_col_total = [sum(seen[i][j] for i in range(n)) for j in range(m)]

    model = gp.Model("Upper_Bound_Problem")
    model.setParam("OutputFlag", 0)

    # Variables: unseen mass per cell and the maximum of each completed row
    matrix = model.addVars(n, m, name="u", lb=0.0)
    row_max = model.addVars(n, name="rmax", lb=0.0)

    # Objective: minimize the sum of row maxima
    model.setObjective(gp.quicksum(row_max[i] for i in range(n)), GRB.MINIMIZE)

    # Row constraints: unseen + seen equals the row sum
    for i in range(n):
        model.addConstr(
            gp.quicksum(matrix[i, j] for j in range(m)) + seen_row_total[i] == row_sum[i]
        )

    # Column constraints: unseen + seen stays within the column sum
    for j in range(m):
        model.addConstr(
            gp.quicksum(matrix[i, j] for i in range(n)) + seen_col_total[j] <= col_sum[j]
        )

    # Row max constraints: rmax dominates every completed cell of its row
    for i in range(n):
        for j in range(m):
            model.addConstr(row_max[i] >= matrix[i, j] + seen[i][j])

    model.optimize()

    if model.Status != GRB.OPTIMAL:
        raise RuntimeError("Gurobi did not find an optimal solution.")

    total_row_max = sum(row_max[i].X for i in range(n))

    # Solver values carry floating-point noise; a plain int() would turn
    # 2.9999999 into 2. Floor only after absorbing that noise.
    tolerance = 1e-6
    return int(math.floor(sum(row_sum) - total_row_max + tolerance))



**Lower Bound - Mixed LP**

In [None]:
import gurobipy as gp
from gurobipy import GRB
from typing import List

def get_lb_two_dims(seen: List[List[int]], row_sum: List[int], col_sum: List[int]) -> int:
    """Return a lower bound on the conflict count of a partially seen table.

    The model completes the observed counts ``seen`` with non-negative unseen
    counts (rows reach exactly ``row_sum[i]``, columns stay within
    ``col_sum[j]``) and maximizes the sum of the per-row maxima. The conflict
    is ``sum(row_sum)`` minus that sum, so the maximum yields the smallest
    conflict the model allows.

    Args:
        seen: ``n x m`` matrix of counts observed so far.
        row_sum: required total of each of the ``n`` rows.
        col_sum: capacity of each of the ``m`` columns.

    Returns:
        The bound rounded down to an integer, or 0 when ``seen`` or
        ``row_sum`` is empty or the solver does not reach optimality.
    """
    import math

    # Nothing to model: fall back to the trivial bound.
    if not (seen and row_sum):
        return 0

    n = len(row_sum)
    m = len(col_sum)

    row_sum_seen = list(map(sum, seen))
    col_sum_seen = list(map(sum, zip(*seen)))
    row_max_seen = list(map(max, seen))

    # Mass still missing from every row / still available in every column.
    diff_row_sum = [row_sum[i] - row_sum_seen[i] for i in range(n)]
    diff_col_sum = [col_sum[j] - col_sum_seen[j] for j in range(m)]

    # Largest value any single cell of row i could reach on its own.
    upper_bound_row_sum = [
        max(seen[i][j] + min(diff_row_sum[i], diff_col_sum[j]) for j in range(m))
        for i in range(n)
    ]

    model = gp.Model("Lower_Bound_Problem")
    model.setParam('OutputFlag', 0)

    matrix = model.addVars(n, m, name="unseen", lb=0.0)
    row_max = model.addVars(n, name="row_max", lb=0.0)
    # NOTE(review): no vtype is given, so these are continuous variables and
    # the big-M linking below is only an LP relaxation. The notebook section
    # is titled "Mixed LP"; if a true mixed-integer model is intended this
    # should probably be vtype=GRB.BINARY -- confirm before changing results.
    selected = model.addVars(n, m, name="selected", lb=0.0)

    for i in range(n):
        model.addConstr(row_max[i] <= upper_bound_row_sum[i])
        model.addConstr(gp.quicksum(matrix[i, j] for j in range(m)) + row_sum_seen[i] == row_sum[i])
        # exactly one unit of "selection" is spread over the cells of a row
        model.addConstr(gp.quicksum(selected[i, j] for j in range(m)) == 1)

    for j in range(m):
        model.addConstr(gp.quicksum(matrix[i, j] for i in range(n)) + col_sum_seen[j] <= col_sum[j])

    for i in range(n):
        for j in range(m):
            model.addConstr(row_max[i] >= seen[i][j] + matrix[i, j])
            # Big-M link: when selected[i, j] == 1 the row maximum is pinned
            # to this cell; otherwise the slack term relaxes the constraint.
            big_m = diff_row_sum[i] + row_max_seen[i] - seen[i][j]
            model.addConstr(
                row_max[i] <= seen[i][j] + matrix[i, j] + big_m * (1 - selected[i, j])
            )

    model.setObjective(gp.quicksum(row_max[i] for i in range(n)), GRB.MAXIMIZE)
    model.optimize()

    if model.Status != GRB.OPTIMAL:
        # Best effort: without an optimal solution keep the trivial bound.
        return 0

    total_row_max = sum(row_max[i].X for i in range(n))

    # Solver values carry floating-point noise; a plain int() would turn
    # 2.9999999 into 2. Floor only after absorbing that noise.
    tolerance = 1e-6
    return int(math.floor(sum(row_sum) - total_row_max + tolerance))

**Lower Bound - Disregard Column Sum**

In [None]:
import copy
from typing import List, Set, Tuple

def lb_disregardColSum_2D(seen: List[List[int]], row_sum_orig: List[int], col_sum_orig: List[int])-> int:
  """Return a cheap lower bound on the conflict count without solving an LP.

  For every row the best possible maximum is estimated cell by cell as
  ``seen[i][j] + min(remaining column capacity, remaining row mass)``.
  Rows are evaluated independently, so the same remaining column capacity
  may be counted for several rows; this over-estimates the row maxima and
  therefore under-estimates the conflict.

  Args:
    seen: ``n x m`` matrix of counts observed so far.
    row_sum_orig: total of each of the ``n`` rows (not modified).
    col_sum_orig: total of each of the ``m`` columns (not modified).

  Returns:
    ``sum(row_sum_orig)`` minus the sum of the optimistic row maxima. A table
    without columns contributes a row maximum of 0 for every row.
  """
  n = len(row_sum_orig)
  m = len(col_sum_orig)

  # What is still missing per row / still available per column.
  remaining_row = [row_sum_orig[i] - sum(seen[i][j] for j in range(m)) for i in range(n)]
  remaining_col = [col_sum_orig[j] - sum(seen[i][j] for i in range(n)) for j in range(m)]

  conflict = sum(row_sum_orig)
  for i in range(n):
    # default=0 keeps the function defined when there are no columns
    best_in_row = max(
      (seen[i][j] + min(remaining_col[j], remaining_row[i]) for j in range(m)),
      default=0,
    )
    conflict -= best_in_row

  return conflict

**Exact Top-K Ranking**

In [None]:
import time

def calculateExactTopk(LB, UB):
  """Return the prefix of the ranking that the bounds already determine.

  An attribute is ranked next when its upper bound is strictly smaller than
  the lower bound of every other attribute that is still unranked. The
  process stops as soon as no remaining attribute satisfies this.

  Args:
    LB: mapping attribute -> lower bound.
    UB: mapping attribute -> upper bound (its keys define the candidates).

  Returns:
    The ranked attributes, each converted with ``int()``.
  """
  ranking = []
  pending = set(UB.keys())

  while pending:
    for candidate in list(pending):
      ceiling = UB[candidate]
      rivals = (LB[other] for other in pending if other != candidate)
      if all(ceiling < floor for floor in rivals):
        ranking.append(candidate)
        pending.remove(candidate)
        break
    else:
      # a full pass found no separable attribute: nothing more is certain
      break

  return [int(name) for name in ranking]

**Actual Top-K**

In [None]:
def calculateActualTopk(file_name):
  """Rank all attributes of a CSV file by their exact conflict count.

  Every column except the last one is paired with the column named "y".
  For each pair a contingency table of value co-occurrences is built, and
  the conflict is the number of records that do not fall into the most
  frequent "y" value of their attribute value.

  Args:
    file_name: path of a CSV file with a header row.

  Returns:
    A tuple ``(actualTopk, actualTopkDict)``: the attribute names converted
    with ``int()`` in ascending order of conflict, and a dict
    ``{attribute name: conflict}`` in the same order.
  """
  frame = pd.read_csv(file_name, header = 0)
  target = "y"
  conflict_by_attribute = {}

  # last column in dataset is the Z attribute
  for attribute in frame.columns[:-1].tolist():
    pair = frame[[attribute, target]]

    a_values = np.unique(pair[attribute]).tolist()
    z_values = np.unique(pair[target]).tolist()
    a_position = {value: position for position, value in enumerate(a_values)}
    z_position = {value: position for position, value in enumerate(z_values)}

    # counts[a][z] = number of records with that value combination
    counts = [[0] * len(z_values) for _ in a_values]
    for record in pair.values.tolist():
      counts[a_position[record[0]]][z_position[record[1]]] += 1

    # per row: everything outside the dominant cell is in conflict
    conflict_by_attribute[attribute] = sum(sum(row) - max(row) for row in counts)

  ranked = sorted(conflict_by_attribute.items(), key=lambda item: item[1])
  actualTopk = [int(name) for name, _ in ranked]
  actualTopkDict = dict(ranked)

  return actualTopk, actualTopkDict

In [None]:
import pandas as pd
import numpy as np

def calculateActualG3Error(file_name, dataInterval, stopAtIndex):
  """Write the exact conflict count of every attribute at each checkpoint.

  Records are consumed in file order. For every column except the last one
  (paired with the column named "y") the conflict of the records read so far
  is stored at every ``dataInterval``-th record and at the final record.
  The results are written to 'ActualG3Error.xlsx': an 'Index' column with the
  1-based number of records read, followed by one column per attribute.

  Args:
    file_name: path of a CSV file with a header row.
    dataInterval: distance (in records) between two checkpoints.
    stopAtIndex: records with a 0-based index above this value are ignored.
  """
  actualErrorFile = 'ActualG3Error.xlsx'

  # Start from an empty workbook; it is re-read and extended per attribute.
  pd.DataFrame().to_excel(actualErrorFile, index=False)

  indexWritten = False

  mainDF = pd.read_csv(file_name, header = 0)
  # last column in dataset is the Z attribute
  A_name_list = mainDF.columns[:-1].tolist()
  Z_name = "y"

  for A_name in A_name_list:

    oldActualDF = pd.read_excel(actualErrorFile)

    df = mainDF[[A_name, Z_name]]

    a_indices = np.unique(df[A_name]).tolist()
    a_indices_map = dict(zip(a_indices, range(len(a_indices))))

    z_indices = np.unique(df[Z_name]).tolist()
    z_indices_map = dict(zip(z_indices, range(len(z_indices))))

    # matrix[a][z] = number of records read so far with that combination
    matrix = [[0] * len(z_indices) for _ in a_indices]

    dataset = df.values.tolist()
    lastPosition = len(dataset)

    checkpoints = []
    errors = []

    for dataIDX, data in enumerate(dataset):

      if dataIDX > stopAtIndex:
        break

      matrix[a_indices_map[data[0]]][z_indices_map[data[1]]] += 1

      if dataIDX % dataInterval == 0 or dataIDX+1 == lastPosition:
        # The conflict is only needed at checkpoints, so the table is only
        # scanned here instead of after every single record.
        checkpoints.append(dataIDX+1)
        errors.append(sum(sum(row) - max(row) for row in matrix))

    dfIndex = pd.DataFrame({'Index': checkpoints})
    dfActual = pd.DataFrame({A_name: errors})

    # the shared 'Index' column is written once, with the first attribute
    if not indexWritten:
      oldActualDF = pd.concat([oldActualDF, dfIndex], axis=1, ignore_index=False)
      indexWritten = True
    dfActual = pd.concat([oldActualDF, dfActual], axis=1, ignore_index=False)

    dfActual.to_excel(actualErrorFile, index=False)

**Calculating Upper and Lower Bounds**

In [None]:
def removeSingleMarginals(calc_matrix, row_sum, col_sum):
  """Drop every row whose row sum is exactly 1 and return the reduced problem.

  The counts already observed in a dropped row are subtracted from the
  column sums. The inputs are left untouched; copies are returned.

  Args:
    calc_matrix: matrix of counts observed so far (list of rows).
    row_sum: total of each row.
    col_sum: total of each column.

  Returns:
    ``(matrix, row_sum, col_sum)`` without the dropped rows.
  """
  singletons = {position for position, total in enumerate(row_sum) if total == 1}

  reduced_cols = col_sum[:]
  for position in sorted(singletons, reverse=True):
    for column, observed in enumerate(calc_matrix[position]):
      reduced_cols[column] -= observed

  kept_matrix = [
    row[:] for position, row in enumerate(calc_matrix) if position not in singletons
  ]
  kept_rows = [
    total for position, total in enumerate(row_sum) if position not in singletons
  ]

  return kept_matrix, kept_rows, reduced_cols

**Algorithm 1: Exact LP**

In [None]:
import os
import time
import numpy as np
import pandas as pd

def calculateBoundsMLP(file_name, dataInterval, stopAtIndex):
  """Stream the dataset and track LP upper / mixed-LP lower bounds per attribute.

  For every column except the last one, records are consumed in file order
  into a table of (attribute value, "y" value) counts. At each checkpoint
  (every `dataInterval`-th record and the final record) the bounds are
  recomputed with `get_ub_two_dims` and `get_lb_two_dims` and merged with the
  previous checkpoint, so the reported upper bound never increases and the
  reported lower bound never decreases.

  Side effects: (re)creates 'UBResults.xlsx' and 'MLP_LBResults.xlsx' and
  rewrites them after every attribute. 'ExUBTimeLP.xlsx' and 'ExLBTimeLP.xlsx'
  are also rewritten after every attribute, so they end up holding the
  timings of the last attribute only.

  Args:
    file_name: path of a CSV file with a header row.
    dataInterval: distance (in records) between two checkpoints.
    stopAtIndex: records with a 0-based index above this value are ignored.

  Returns:
    (lpUBDict, lpLBDict, boundsTimeLP). The first two map the 0-based record
    index of a checkpoint to {attribute name: bound}. The third maps the same
    index to the seconds spent computing both bounds; it is overwritten by
    each attribute, so it holds the values of the last attribute.
  """

  UBResultFile = 'UBResults.xlsx'
  LB1ResultFile = 'MLP_LBResults.xlsx'

  # start both result workbooks empty; one column per attribute is appended
  df_empty = pd.DataFrame()
  df_empty.to_excel(UBResultFile, index=False)

  df_empty = pd.DataFrame()
  df_empty.to_excel(LB1ResultFile, index=False)

  mainDF = pd.read_csv(file_name, header = 0)
  # last column in dataset is the Z attribute
  A_name_list = mainDF.columns[:-1].tolist()

  # flips to 1 once the shared 'Index' column has been written
  isIndex = 0

  lpUBDict = {}
  lpLBDict = {}
  boundsTimeLP = {}

  for A_name in A_name_list:

    # results accumulated for the attributes processed so far
    oldUBDF = pd.read_excel(UBResultFile)
    oldLB1DF = pd.read_excel(LB1ResultFile)

    Z_name = "y"
    df = mainDF[[A_name, Z_name]]

    # map every distinct value to its row / column position in the table
    a_indices = np.unique(df[A_name])
    a_indices = a_indices.tolist()
    a_indices_map = dict(zip(a_indices, range(len(a_indices))))

    z_indices = np.unique(df[Z_name])
    z_indices = z_indices.tolist()
    z_indices_map = dict(zip(z_indices, range(len(z_indices))))

    dataset = df.values.tolist()
    row_sum = [0]*len(a_indices)
    col_sum = [0]*len(z_indices)

    # marginals are taken from the complete dataset, before streaming starts
    for data in dataset:
      row_index = a_indices_map[data[0]]
      col_index = z_indices_map[data[1]]
      row_sum[row_index] += 1
      col_sum[col_index] += 1

    # op1 keeps [records read, running UB, running LB] per checkpoint
    op1 = []
    calc_matrix = []

    for i in a_indices:
      calc_matrix.append([0]*len(z_indices))

    indexDict = {'Index': []}
    ubdataDict = {A_name: []}
    lb1dataDict = {A_name: []}

    ubTimeDictLP = {}
    lbTimeDictLP = {}

    for index,data in enumerate(dataset):

      if index > stopAtIndex:
          break

      # the record is now "seen"
      row_index = a_indices_map[data[0]]
      col_index = z_indices_map[data[1]]
      calc_matrix[row_index][col_index] += 1

      # checkpoint: every dataInterval-th record and the last record
      if index % dataInterval == 0 or index == len(dataset)-1:

        if index not in lpUBDict:
          lpUBDict[index] = {}

        if index not in lpLBDict:
          lpLBDict[index] = {}

        # rows with a row sum of 1 are dropped before solving (not timed)
        mat, rowS, colS = removeSingleMarginals(calc_matrix, row_sum, col_sum)

        startTimeLP = time.time()

        startUBTimeLP = time.time()
        ubValue = get_ub_two_dims(mat, rowS, colS)
        stopUBTimeLP = time.time()

        startLBTimeLP = time.time()
        lb1Value = get_lb_two_dims(mat, rowS, colS)
        stopLBTimeLP = time.time()

        stopTimeLP = time.time()

        boundsTimeLP[index] = stopTimeLP - startTimeLP

        ubTimeDictLP[index] = stopUBTimeLP - startUBTimeLP
        lbTimeDictLP[index] = stopLBTimeLP - startLBTimeLP

        if not op1:
          op1.append([index+1, ubValue, lb1Value])
          u1 = ubValue
          l1 = lb1Value

        else:
          # keep the tightest bounds seen so far
          op1.append([index+1, min(ubValue, op1[-1][1]), max(lb1Value, op1[-1][2])])
          # op1[-1] is the entry appended on the previous line, so these
          # simply read back the running bounds
          u1 = min(ubValue, op1[-1][1])
          l1 = max(lb1Value, op1[-1][2])

        lpUBDict[index][A_name] = u1
        lpLBDict[index][A_name] = l1

        indexDict['Index'].append(index+1)
        ubdataDict[A_name].append(u1)
        lb1dataDict[A_name].append(l1)

    dfIndex = pd.DataFrame(indexDict)
    dfUB = pd.DataFrame(ubdataDict)
    dfLB1 = pd.DataFrame(lb1dataDict)

    # the 'Index' column is only added together with the first attribute
    if isIndex == 0:
      oldUBDF = pd.concat([oldUBDF, dfIndex], axis=1, ignore_index=False)
      oldLB1DF = pd.concat([oldLB1DF, dfIndex], axis=1, ignore_index=False)
      isIndex = 1

    dfUB = pd.concat([oldUBDF, dfUB], axis=1, ignore_index=False)
    dfLB1 = pd.concat([oldLB1DF, dfLB1], axis=1, ignore_index=False)

    dfUB.to_excel(UBResultFile, index=False)
    dfLB1.to_excel(LB1ResultFile, index=False)

    # per-checkpoint solver timings of the current attribute
    df2 = pd.DataFrame(list(ubTimeDictLP.items()), columns=['Index', 'Time'])
    df3 = pd.DataFrame(list(lbTimeDictLP.items()), columns=['Index', 'Time'])

    df2.to_excel('ExUBTimeLP.xlsx', index=False)
    df3.to_excel('ExLBTimeLP.xlsx', index=False)

  return lpUBDict, lpLBDict, boundsTimeLP

**Algorithm 2: Exact DCS**

In [None]:
import os
import time
import numpy as np
import pandas as pd

def calculateBoundsDCS(file_name, dataInterval, stopAtIndex):
  """Stream the dataset and track LP upper / DCS lower bounds per attribute.

  For every column except the last one, records are consumed in file order
  into a table of (attribute value, "y" value) counts. At each checkpoint
  (every `dataInterval`-th record and the final record) the bounds are
  recomputed with `get_ub_two_dims` and `lb_disregardColSum_2D` and merged
  with the previous checkpoint, so the reported upper bound never increases
  and the reported lower bound never decreases.

  Side effects: (re)creates 'UBResults.xlsx' and 'DCS_LBResults.xlsx' and
  rewrites them after every attribute. 'ExUBTimeDCS.xlsx' and
  'ExLBTimeDCS.xlsx' are also rewritten after every attribute, so they end up
  holding the timings of the last attribute only.

  Args:
    file_name: path of a CSV file with a header row.
    dataInterval: distance (in records) between two checkpoints.
    stopAtIndex: records with a 0-based index above this value are ignored.

  Returns:
    (dcsUBDict, dcsLBDict, boundsTimeDCS). The first two map the 0-based
    record index of a checkpoint to {attribute name: bound}. The third maps
    the same index to the seconds spent computing both bounds; it is
    overwritten by each attribute, so it holds the values of the last
    attribute.
  """

  UBResultFile = 'UBResults.xlsx'
  LB2ResultFile = 'DCS_LBResults.xlsx'

  # start both result workbooks empty; one column per attribute is appended
  df_empty = pd.DataFrame()
  df_empty.to_excel(UBResultFile, index=False)

  df_empty = pd.DataFrame()
  df_empty.to_excel(LB2ResultFile, index=False)

  mainDF = pd.read_csv(file_name, header = 0)
  # last column in dataset is the Z attribute
  A_name_list = mainDF.columns[:-1].tolist()

  # flips to 1 once the shared 'Index' column has been written
  isIndex = 0

  dcsUBDict = {}
  dcsLBDict = {}
  boundsTimeDCS = {}

  for A_name in A_name_list:

    # results accumulated for the attributes processed so far
    oldUBDF = pd.read_excel(UBResultFile)
    oldLB2DF = pd.read_excel(LB2ResultFile)

    Z_name = "y"
    df = mainDF[[A_name, Z_name]]

    # map every distinct value to its row / column position in the table
    a_indices = np.unique(df[A_name])
    a_indices = a_indices.tolist()
    a_indices_map = dict(zip(a_indices, range(len(a_indices))))

    z_indices = np.unique(df[Z_name])
    z_indices = z_indices.tolist()
    z_indices_map = dict(zip(z_indices, range(len(z_indices))))

    dataset = df.values.tolist()
    row_sum = [0]*len(a_indices)
    col_sum = [0]*len(z_indices)

    # marginals are taken from the complete dataset, before streaming starts
    for data in dataset:
      row_index = a_indices_map[data[0]]
      col_index = z_indices_map[data[1]]
      row_sum[row_index] += 1
      col_sum[col_index] += 1

    # op1 keeps [records read, running UB, running LB] per checkpoint
    op1 = []
    calc_matrix = []

    for i in a_indices:
      calc_matrix.append([0]*len(z_indices))

    indexDict = {'Index': []}
    ubdataDict = {A_name: []}
    lb2dataDict = {A_name: []}

    ubTimeDictDCS = {}
    lbTimeDictDCS = {}

    for index,data in enumerate(dataset):

      if index > stopAtIndex:
          break

      # the record is now "seen"
      row_index = a_indices_map[data[0]]
      col_index = z_indices_map[data[1]]
      calc_matrix[row_index][col_index] += 1

      # checkpoint: every dataInterval-th record and the last record
      if index % dataInterval == 0 or index == len(dataset)-1:

        if index not in dcsUBDict:
          dcsUBDict[index] = {}

        if index not in dcsLBDict:
          dcsLBDict[index] = {}

        # rows with a row sum of 1 are dropped before solving (not timed)
        mat, rowS, colS = removeSingleMarginals(calc_matrix, row_sum, col_sum)

        startTimeDCS = time.time()

        startUBTimeDCS = time.time()
        ubValue = get_ub_two_dims(mat, rowS, colS)
        stopUBTimeDCS = time.time()

        startLBTimeDCS = time.time()
        lb2Value = lb_disregardColSum_2D(mat, rowS, colS)
        stopLBTimeDCS = time.time()

        stopTimeDCS = time.time()

        boundsTimeDCS[index] = stopTimeDCS - startTimeDCS

        ubTimeDictDCS[index] = stopUBTimeDCS - startUBTimeDCS
        lbTimeDictDCS[index] = stopLBTimeDCS - startLBTimeDCS

        if not op1:
          op1.append([index+1, ubValue, lb2Value])
          u1 = ubValue
          l2 = lb2Value
        else:
          # keep the tightest bounds seen so far
          op1.append([index+1, min(ubValue, op1[-1][1]), max(lb2Value, op1[-1][2])])
          # op1[-1] is the entry appended on the previous line, so these
          # simply read back the running bounds
          u1 = min(ubValue, op1[-1][1])
          l2 = max(lb2Value, op1[-1][2])

        dcsUBDict[index][A_name] = u1
        dcsLBDict[index][A_name] = l2

        indexDict['Index'].append(index+1)
        ubdataDict[A_name].append(u1)
        lb2dataDict[A_name].append(l2)

    dfIndex = pd.DataFrame(indexDict)
    dfUB = pd.DataFrame(ubdataDict)
    dfLB2 = pd.DataFrame(lb2dataDict)

    # the 'Index' column is only added together with the first attribute
    if isIndex == 0:
      oldUBDF = pd.concat([oldUBDF, dfIndex], axis=1, ignore_index=False)
      oldLB2DF = pd.concat([oldLB2DF, dfIndex], axis=1, ignore_index=False)
      isIndex = 1

    dfUB = pd.concat([oldUBDF, dfUB], axis=1, ignore_index=False)
    dfLB2 = pd.concat([oldLB2DF, dfLB2], axis=1, ignore_index=False)

    dfUB.to_excel(UBResultFile, index=False)
    dfLB2.to_excel(LB2ResultFile, index=False)

    # per-checkpoint timings of the current attribute
    df4 = pd.DataFrame(list(ubTimeDictDCS.items()), columns=['Index', 'Time'])
    df5 = pd.DataFrame(list(lbTimeDictDCS.items()), columns=['Index', 'Time'])

    df4.to_excel('ExUBTimeDCS.xlsx', index=False)
    df5.to_excel('ExLBTimeDCS.xlsx', index=False)

  return dcsUBDict, dcsLBDict, boundsTimeDCS

**Score Calculation - P@K/NDCG**

In [None]:
import numpy as np
import pandas as pd

def excelToDict(file_path):
  """Load an Excel sheet into a dict keyed by its 'Index' column.

  Args:
    file_path: path of a workbook readable by ``pd.read_excel`` that
      contains a column named 'Index'.

  Returns:
    ``{index value: {column name: cell value}}`` with one entry per sheet
    row; the inner dict holds every column except 'Index'. Rows that share an
    index value overwrite each other (the last one wins).
  """
  sheet = pd.read_excel(file_path)
  value_columns = [name for name in sheet.columns if name != 'Index']

  return {
    record['Index']: {name: record[name] for name in value_columns}
    for _, record in sheet.iterrows()
  }

# Precision at k score
def precision_at_k(y_true, y_pred, k):
  """Return the fraction of the top predictions that are truly in the top-k.

  ``k`` is first capped at ``len(y_pred)``; both rankings are then cut to
  that many items, and the number of predicted items that also occur in the
  cut true ranking is divided by the capped ``k``.

  Args:
    y_true: the true ranking, best first.
    y_pred: the predicted ranking, best first (may be shorter than ``k``).
    k: requested cut-off.

  Returns:
    A float in [0, 1]; 0.0 when ``y_pred`` is empty or ``k`` is not positive.
  """
  # a non-positive cut-off used to divide by zero (k == 0) or produce a
  # negative "precision" (k < 0); there is nothing to score in that case
  if not y_pred or k <= 0:
    return 0.0

  k = min(k, len(y_pred))
  relevant_pool = y_true[:k]  # sliced once instead of once per prediction
  hits = sum(1 for item in y_pred[:k] if item in relevant_pool)

  return hits/k

def checkEqualAFDs(topkDict):
  """Assign every key a dense rank so that equal values share one rank.

  Keys are ordered by ascending value; the rank starts at 0 and increases by
  one each time the value differs from the previous one.

  Args:
    topkDict: mapping key -> comparable value.

  Returns:
    ``{key: rank}`` in ascending order of value.
  """
  ordered = sorted(topkDict.items(), key=lambda item: item[1])

  rank_by_key = {}
  rank = -1
  previous = None

  for name, score in ordered:
    if score != previous:
      rank += 1
    rank_by_key[name] = rank
    previous = score

  return rank_by_key

# ndcg - true relevance score
def calcRelevanceScore(topkDict, fixedIndexDict, k):
  """Turn conflict values into relevance scores in the range [0, 1].

  Every value is normalized as ``1 - value / max value`` (0 when the maximum
  is 0) and stored at the dense rank that `checkEqualAFDs` assigns to its
  key, so keys with equal values write to the same slot.

  Args:
    topkDict: mapping attribute -> conflict value.
    fixedIndexDict: only its length is used, as the length of the result.
    k: not used by this function.

  Returns:
    A list of ``len(fixedIndexDict)`` scores; slots no key maps to stay 0.
  """
  relevance = [0] * len(fixedIndexDict)

  # NOTE(review): the incoming fixedIndexDict positions are ignored in favour
  # of the tie-aware ranks computed here -- confirm this is intended.
  slot_by_key = checkEqualAFDs(topkDict)

  if not topkDict:
    return relevance

  largest = max(topkDict.values())
  for name, score in topkDict.items():
    gain = 1-(score/largest) if largest != 0 else 0
    if name in slot_by_key:
      relevance[slot_by_key[name]] = gain

  return relevance

# ndcg - predicted relevance score
def adjustRelevanceScore(predTopk, trueTopk, true_rs):
  """Look up the true relevance of every predicted item.

  Args:
    predTopk: predicted ranking, best first; cut to ``len(trueTopk)`` items.
    trueTopk: true ranking, best first.
    true_rs: relevance scores aligned with ``trueTopk``.

  Returns:
    A list of ``len(trueTopk)`` scores: the relevance of each predicted item
    (0 for items that are not in ``trueTopk``), padded with zeros.
  """
  relevance_of = dict(zip(trueTopk, true_rs))
  width = len(trueTopk)

  scores = [relevance_of.get(item, 0) for item in predTopk[:width]]
  # positions without a prediction count as irrelevant
  scores.extend([0] * (width - len(scores)))

  return scores

# dcg score
def dcg_at_k(relevance_scores, k):
  """Return the discounted cumulative gain of the first ``k`` scores.

  Each score ``r`` at 0-based position ``i`` contributes
  ``(2**r - 1) / log2(i + 2)``.

  Args:
    relevance_scores: relevance of the ranked items, best position first.
    k: number of leading positions to include (capped at the list length).

  Returns:
    The accumulated gain; 0.0 when no position is included.
  """
  total = 0.0
  depth = min(k, len(relevance_scores))

  for position in range(depth):
    gain = 2**relevance_scores[position] - 1
    discount = np.log2(position + 2)
    total += gain / discount

  return total

# ndcg score
def ndcg_at_k(pred_rel, true_rel, k):
  """Return the normalized discounted cumulative gain at ``k``.

  Args:
    pred_rel: relevance scores in predicted order.
    true_rel: true relevance scores; sorted descending to form the ideal order.
    k: cut-off passed to `dcg_at_k`.

  Returns:
    DCG of the prediction divided by the DCG of the ideal order, or 0 when
    the ideal DCG is not positive.
  """
  achieved = dcg_at_k(pred_rel, k)
  best_possible = dcg_at_k(sorted(true_rel, reverse=True), k)

  if best_possible > 0:
    return achieved/best_possible
  return 0

**Generate Results**

In [None]:
import time
import pandas as pd
from sklearn.metrics import ndcg_score

# Main Method: makes calls to all algorithms for bound calculation
def calculatePatKandNDCG(file_name, dataInterval, rows, kList, stopAtIndex, fn, sortedStr):
  """Run both bound algorithms for every k and plot P@K, NDCG and timings.

  For each k in `kList` the bounds are recomputed from scratch with
  `calculateBoundsMLP` and `calculateBoundsDCS`; at every checkpoint the
  ranking prefix implied by the bounds is scored against the actual ranking
  with precision@k and NDCG@k. The per-k elapsed time is recorded at the
  first checkpoint where both scores equal 1.0 (no entry is stored for a k
  that never reaches that point).

  Args:
    file_name: path of the CSV dataset.
    dataInterval: distance (in records) between two checkpoints.
    rows: number of records, forwarded to the plotting functions.
    kList: k values to evaluate.
    stopAtIndex: records with a 0-based index above this value are ignored.
    fn: dataset label forwarded to the plotting functions.
    sortedStr: sort-order label forwarded to the plotting functions.

  Returns:
    None. Results are produced as side effects of the called functions.
  """

  # actual top-k
  actualTopk, actualTopkDict = calculateActualTopk(file_name)
  calculateActualG3Error(file_name, dataInterval, stopAtIndex)
  fixedIndexAFDDict = {key: idx for idx, key in enumerate(actualTopkDict.keys())}

  # time vs k graphs
  lpTimeVsKDict = {}
  dcsTimeVsKDict = {}

  for k in kList:

    # Algorithm 1: Exact LP
    # the timer covers the bound computation plus the scoring below
    startKTimeLP = time.time()
    patkGraphLP, ndcgkGraphLP, = [], []

    mainUBDictLP, mainLBDictLP, lpTimeVsDataDict = calculateBoundsMLP(file_name, dataInterval, stopAtIndex)
    keys = mainUBDictLP.keys()

    isTimedLP = False

    # one iteration per checkpoint (keys are 0-based record indices)
    for key in keys:

      ubdataDictLP = mainUBDictLP[key]
      lbdataDictLP = mainLBDictLP[key]
      attr = list(ubdataDictLP.keys())

      exactTopk_mlp = calculateExactTopk(lbdataDictLP, ubdataDictLP)
      patkLP = precision_at_k(actualTopk, exactTopk_mlp, k=k)
      patkGraphLP.append(patkLP)

      actualTopk_rs  = calcRelevanceScore(actualTopkDict, fixedIndexAFDDict, k)
      mlpTopk_rs = adjustRelevanceScore(exactTopk_mlp, actualTopk[:k], actualTopk_rs[:k])
      ndcgatkLP = ndcg_at_k(mlpTopk_rs, actualTopk_rs, k=k)
      ndcgkGraphLP.append(ndcgatkLP)

      # stop the clock at the first checkpoint with a perfect ranking
      if not isTimedLP:
        if patkLP==1.0 and ndcgatkLP==1.0:
          stopKTimeLP = time.time()
          lpTimeVsKDict[k] = stopKTimeLP - startKTimeLP
          isTimedLP = True

    # Algorithm 2: Exact DCS
    # the timer covers the bound computation plus the scoring below
    startKTimeDCS = time.time()
    patkGraphDCS, ndcgkGraphDCS, = [], []

    mainUBDictDCS, mainLBDictDCS, dcsTimeVsDataDict = calculateBoundsDCS(file_name, dataInterval, stopAtIndex)
    keys = mainUBDictDCS.keys()

    isTimedDCS = False

    # one iteration per checkpoint (keys are 0-based record indices)
    for key in keys:

      ubdataDictDCS = mainUBDictDCS[key]
      lbdataDictDCS = mainLBDictDCS[key]
      attr = list(ubdataDictDCS.keys())

      exactTopk_dcs = calculateExactTopk(lbdataDictDCS, ubdataDictDCS)
      patkDCS = precision_at_k(actualTopk, exactTopk_dcs, k=k)
      patkGraphDCS.append(patkDCS)

      actualTopk_rs  = calcRelevanceScore(actualTopkDict, fixedIndexAFDDict, k)
      dcsTopk_rs = adjustRelevanceScore(exactTopk_dcs, actualTopk[:k], actualTopk_rs[:k])
      ndcgatkDCS = ndcg_at_k(dcsTopk_rs, actualTopk_rs, k=k)
      ndcgkGraphDCS.append(ndcgatkDCS)

      # stop the clock at the first checkpoint with a perfect ranking
      if not isTimedDCS:
        if patkDCS==1.0 and ndcgatkDCS==1.0:
          stopKTimeDCS = time.time()
          dcsTimeVsKDict[k] = stopKTimeDCS - startKTimeDCS
          isTimedDCS = True

    # p@k and ndcg graphs
    generatePatK(patkGraphLP, patkGraphDCS, rows, dataInterval, k, stopAtIndex, fn, sortedStr)
    generateNDCGK(ndcgkGraphLP, ndcgkGraphDCS, rows, dataInterval, k, stopAtIndex, fn, sortedStr)

  # clocktime graphs
  generateKClocktimeGraph(lpTimeVsKDict, dcsTimeVsKDict, kList, fn, sortedStr)
  # the per-checkpoint timings used here come from the last k of the loop
  generateDataClocktimeGraph(lpTimeVsDataDict, dcsTimeVsDataDict, rows, dataInterval, stopAtIndex, fn, sortedStr)

  return None

**Precision at k Graph**

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def generatePatK(patkGraph_lb1, patkGraph_lb2, rows, dataInterval, k, stopAtIndex, fn, sortedStr):
  """Save a grouped bar chart of precision@k against the share of data read.

  Args:
    patkGraph_lb1: precision values of the 'ExLP' series, one per checkpoint.
    patkGraph_lb2: precision values of the 'ExDCS' series, one per checkpoint.
    rows: total number of records (used to convert checkpoints to percent).
    dataInterval: distance (in records) between two checkpoints.
    k: cut-off, used in the file name.
    stopAtIndex: last record index covered by the checkpoints.
    fn: dataset label, used in the file name.
    sortedStr: sort-order label, used in the file name.

  The figure is written to 'P@K=<k>_<fn>_<sortedStr>.png' and then closed.
  """
  checkpoints = range(0, round(stopAtIndex) + 1, dataInterval)
  percent_read = [round((checkpoint / rows) * 100) for checkpoint in checkpoints]

  width = 0.25
  slots = np.arange(len(percent_read))

  # (bar positions, heights, legend label, hatch pattern, face colour)
  series = (
    (slots, patkGraph_lb1, 'ExLP', 'o', 'tan'),
    (slots + width, patkGraph_lb2, 'ExDCS', '|', 'indianred'),
  )
  for positions, heights, label, hatch, face in series:
    plt.bar(positions, heights, width, label=label, hatch=hatch, facecolor=face, edgecolor='black')

  plt.xlabel('Percent of Data Read', fontsize=16, fontweight='bold')
  plt.ylabel('Precision', fontsize=16, fontweight='bold')

  tick_labels = [f'{round(percent, 0)}' for percent in percent_read]
  plt.xticks(slots + width * 1.75, tick_labels, fontsize=16)
  plt.yticks(fontsize=15)

  plt.grid(True, linestyle='--', linewidth=0.5)
  plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15), ncol=2, prop={'weight': 'bold', 'size': 14})

  target = ''.join(['P@K=', str(k), '_', fn, '_', sortedStr, '.png'])
  plt.savefig(target, dpi=300, bbox_inches='tight')
  plt.close()

**NDCG Graph**

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def generateNDCGK(ndcgkGraph_lb1, ndcgkGraph_lb2, rows, dataInterval, k, stopAtIndex, fn, sortedStr):
  """Save a grouped bar chart of NDCG@k against the share of data read.

  Args:
    ndcgkGraph_lb1: NDCG values of the 'ExLP' series, one per checkpoint.
    ndcgkGraph_lb2: NDCG values of the 'ExDCS' series, one per checkpoint.
    rows: total number of records (used to convert checkpoints to percent).
    dataInterval: distance (in records) between two checkpoints.
    k: cut-off, used in the file name.
    stopAtIndex: last record index covered by the checkpoints.
    fn: dataset label, used in the file name.
    sortedStr: sort-order label, used in the file name.

  The figure is written to 'NDCG@K=<k>_<fn>_<sortedStr>.png' and then closed.
  """
  checkpoints = range(0, round(stopAtIndex) + 1, dataInterval)
  percent_read = [round((checkpoint / rows) * 100) for checkpoint in checkpoints]

  width = 0.25
  slots = np.arange(len(percent_read))

  # (bar positions, heights, legend label, hatch pattern, face colour)
  series = (
    (slots, ndcgkGraph_lb1, 'ExLP', 'o', 'tan'),
    (slots + width, ndcgkGraph_lb2, 'ExDCS', '|', 'indianred'),
  )
  for positions, heights, label, hatch, face in series:
    plt.bar(positions, heights, width, label=label, hatch=hatch, facecolor=face, edgecolor='black')

  plt.xlabel('Percent of Data Read', fontsize=16, fontweight='bold')
  plt.ylabel('NDCG', fontsize=16, fontweight='bold')

  tick_labels = [f'{round(percent, 0)}' for percent in percent_read]
  plt.xticks(slots + width * 1.75, tick_labels, fontsize=15)
  plt.yticks(fontsize=15)

  plt.grid(True, linestyle='--', linewidth=0.5)
  plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15), ncol=2, prop={'weight': 'bold', 'size': 14})

  target = ''.join(['NDCG@K=', str(k), '_', fn, '_', sortedStr, '.png'])
  plt.savefig(target, dpi=300, bbox_inches='tight')
  plt.close()

**K vs Clocktime Graph**

In [None]:
import matplotlib.pyplot as plt

def generateKClocktimeGraph(lpTimeVsKDict, dcsTimeVsKDict, kList, fn, sortedStr):
  """Save a line chart of elapsed time against k for both algorithms.

  Args:
    lpTimeVsKDict: mapping k -> seconds for the 'ExLP' series.
    dcsTimeVsKDict: mapping k -> seconds for the 'ExDCS' series.
    kList: k values shown on the x axis.
    fn: dataset label, used in the file name.
    sortedStr: sort-order label, used in the file name.

  A k without a recorded time is drawn as a gap instead of aborting the plot.
  The figure is written to 'ExKvsClocktime_<fn>_<sortedStr>.png' and closed.
  """
  missing = float('nan')

  x = kList
  # Look the times up per k: a dict may lack some k, and relying on its
  # iteration order/length would misalign or crash the plot.
  y1 = [lpTimeVsKDict.get(k, missing) for k in kList]
  y2 = [dcsTimeVsKDict.get(k, missing) for k in kList]

  plt.figure(figsize=(14, 8))
  plt.plot(x, y1, marker='v', linestyle='-', color='tan', label='ExLP', linewidth=5)
  plt.plot(x, y2, marker='^', linestyle='-', color='indianred', label='ExDCS', linewidth=5)

  plt.xticks(x, fontsize=35)

  # NaN compares unequal to itself, which filters out the missing entries
  measured = [seconds for seconds in y1 + y2 if seconds == seconds]
  if measured:
    y_min = min(measured) - 0.05
    y_max = max(measured) + 0.05
    yticks = np.round(np.linspace(y_min, y_max, num=5), 3)
    plt.yticks(yticks, fontsize=35)
  else:
    # nothing was measured: keep the default tick positions
    plt.yticks(fontsize=35)

  plt.xlabel('k', fontsize=34, fontweight='bold')
  plt.ylabel('Time (sec)', fontsize=34, fontweight='bold')

  plt.grid(True, linestyle='--', linewidth=0.5)
  plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.20), ncol=2, prop={'weight': 'bold', 'size': 35})

  plt.savefig('ExKvsClocktime'+'_'+fn+'_'+sortedStr+'.png', dpi=300, bbox_inches='tight')
  plt.close()

**Data Read vs Clocktime Graph**

In [None]:
import matplotlib.pyplot as plt

def generateDataClocktimeGraph(lpdsTimeVsDataDict, ipfTimeVsDataDict, rows, dataInterval, stopAtIndex, fn, sortedStr):
  """Save a line chart of elapsed time against the share of data read.

  Args:
    lpdsTimeVsDataDict: timings of the 'ExLP' series; its values are plotted
      in iteration order.
    ipfTimeVsDataDict: timings of the 'ExDCS' series; its values are plotted
      in iteration order.
    rows: total number of records (used to convert checkpoints to percent).
    dataInterval: distance (in records) between two checkpoints.
    stopAtIndex: last record index covered by the checkpoints.
    fn: dataset label, used in the file name.
    sortedStr: sort-order label, used in the file name.

  The figure is written to 'ExDatavsClocktime_<fn>_<sortedStr>.png' and closed.
  """
  checkpoints = range(0, round(stopAtIndex) + 1, dataInterval)
  percent_read = [round((checkpoint / rows) * 100) for checkpoint in checkpoints]

  lp_times = list(lpdsTimeVsDataDict.values())
  dcs_times = list(ipfTimeVsDataDict.values())

  plt.figure(figsize=(14, 8))
  # (timings, marker, colour, legend label)
  series = (
    (lp_times, 'v', 'tan', 'ExLP'),
    (dcs_times, '^', 'indianred', 'ExDCS'),
  )
  for timings, marker, colour, label in series:
    plt.plot(percent_read, timings, marker=marker, linestyle='-', color=colour, label=label, linewidth=5)

  plt.xticks(percent_read, fontsize=35)

  # pad the y range slightly so the extreme points do not sit on the frame
  lowest = min(min(lp_times), min(dcs_times)) - 0.001
  highest = max(max(lp_times), max(dcs_times)) + 0.001
  plt.yticks(np.round(np.linspace(lowest, highest, num=6), 4), fontsize=35)

  plt.xlabel('Data Read (%)', fontsize=34, fontweight='bold')
  plt.ylabel('Time (sec)', fontsize=34, fontweight='bold')

  plt.grid(True, linestyle='--', linewidth=0.5)
  plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.20), ncol=2, prop={'weight': 'bold', 'size': 35})

  target = ''.join(['ExDatavsClocktime', '_', fn, '_', sortedStr, '.png'])
  plt.savefig(target, dpi=300, bbox_inches='tight')
  plt.close()

**Dataset Sorting (Key/Non-Key)**

In [None]:
import pandas as pd

def sortDataset(file_name, attr):
  """Write a copy of a CSV file sorted by one column and return its path.

  Args:
    file_name: path of the CSV file to sort.
    attr: name of the column to sort by, or '0' to keep the original order.

  Returns:
    ``file_name`` itself when ``attr`` is '0'; otherwise the path of the
    sorted copy, which is named 'Sorted_X<attr>_<original file name>' and
    placed in the same directory as the input file.
  """
  import os

  # '0' means "no sorting": nothing has to be read or written
  if attr == '0':
    return file_name

  df_sorted = pd.read_csv(file_name).sort_values(by=attr)

  # Prefix only the file name; prefixing the whole path would produce an
  # invalid location for inputs that live in a sub-directory.
  folder, base_name = os.path.split(file_name)
  sortedFN = os.path.join(folder, 'Sorted_X'+attr+'_'+base_name)
  df_sorted.to_csv(sortedFN, index=False)

  return sortedFN

**Start Point** (Define parameters and run code)

In [None]:
def getDatasetParam(datasetName):
    """Return the parameter dict of one of the predefined datasets.

    Args:
        datasetName: one of 'dataset1' ... 'dataset4'.

    Returns:
        A dict with the keys 'fn' (label), 'rows' (record count), 'interval'
        (checkpoint distance) and 'fileName' (CSV file name).

    Raises:
        ValueError: if ``datasetName`` is not a known dataset.
    """
    # (dataset key, label, rows, interval)
    catalogue = (
        ('dataset1', 'SyntheticData1', 100000, 10000),
        ('dataset2', 'SyntheticData2', 100000, 10000),
        ('dataset3', 'SyntheticData3', 160000, 16000),
        ('dataset4', 'SyntheticData4', 48000, 4800),
    )
    dataset_config = {
        key: {'fn': label, 'rows': rows, 'interval': interval, 'fileName': label + '.csv'}
        for key, label, rows, interval in catalogue
    }

    params = dataset_config.get(datasetName)
    if params is None:
        raise ValueError(f"{datasetName} not found.")
    return params

def AFDStartPoint():
  """Run the complete experiment with the settings hard-coded below.

  Uses 'dataset1', sorts it by the column 'y', evaluates k = 3, 5 and 7 and
  reads up to ``rows`` records. All results are produced as side effects of
  `calculatePatKandNDCG`.
  """
  # define dataset parameters
  params = getDatasetParam('dataset1')
  fn, rows, interval, fileName = (
    params[field] for field in ('fn', 'rows', 'interval', 'fileName')
  )

  # define sorting order
  sortedStr = '(Sorted By Z Attribute)'
  data = sortDataset(fileName, 'y')

  # define k values
  kList = [3, 5, 7]
  # define limit of records read
  stopAtIndex = 1*rows

  # pass the parameters and generate results
  calculatePatKandNDCG(data, interval, rows, kList, stopAtIndex, fn, sortedStr)

In [None]:
# Start point: runs the whole experiment with the parameters hard-coded in
# AFDStartPoint() (dataset, sort column, k values, record limit).
AFDStartPoint()