In [1]:
import pandas as pd
import sqlite3
import re
import json
import itertools
import random
import time
import os

def getConnAndCursor():
    conn = sqlite3.connect('test.db')
    cursor = conn.cursor()
    return conn, cursor

conn, cursor = getConnAndCursor()

def aggregate(setOfVariables, tableName, aggregationVariable):
    newTableName = tableName + "_a"
    setOfVariablesWithoutAggregation = setOfVariables.copy()
    setOfVariablesWithoutAggregation.remove(aggregationVariable)
    setOfVariables.remove(aggregationVariable)
    setOfVariables.append(aggregationVariable)
    #print(setOfVariables)

    if len(setOfVariablesWithoutAggregation) == 0:
        return tableName

    createQuery = f'CREATE TABLE IF NOT EXISTS {newTableName} ({", ".join(setOfVariablesWithoutAggregation)} TEXT, {aggregationVariable} TEXT)'
    #aggregateQuery = f'SELECT {aggregationVariable}, GROUP_CONCAT({", ".join(setOfVariables)}) as {aggregationVariable} FROM {tableName} GROUP BY {aggregationVariable}'
    aggregateQuery = f'INSERT INTO {newTableName} SELECT {", ".join(setOfVariablesWithoutAggregation)}, GROUP_CONCAT({aggregationVariable}) FROM {tableName} GROUP BY {", ".join(setOfVariablesWithoutAggregation)}'
    
    cursor.execute(createQuery)
    cursor.execute(aggregateQuery)
    try:
        cursor.execute(f"DROP TABLE {tableName}")
    except:
        pass

    cursor.execute(f'SELECT * FROM {newTableName}')
    #print(cursor.fetchall())
    
    conn.commit()
    return newTableName

def propagate(setOfVariables, tableName, propagationVariable):
    newTableName = tableName + "_p"
    setOfVariablesWithoutPropagation = setOfVariables.copy()
    setOfVariablesWithoutPropagation.remove(propagationVariable[1])
    setOfVariablesWithoutPropagation.remove(propagationVariable[0])
    setOfVariablesWithoutPropagation.insert(0, propagationVariable[0])

    setOfVariables.remove(propagationVariable[1])
    setOfVariables.remove(propagationVariable[0])
    setOfVariables.insert(0, propagationVariable[0])
    #print(setOfVariables)

    createQuery = f'CREATE TABLE IF NOT EXISTS {newTableName} ({", ".join(setOfVariablesWithoutPropagation)} TEXT)'
    setOfVariablesWithoutPropagation.remove(propagationVariable[0])

    #'SELECT \'(\' || grouped_R || \') (\' || grouped_S || \')\' FROM (SELECT grouped_R, GROUP_CONCAT(S) as grouped_S FROM (SELECT S, GROUP_CONCAT(R) as grouped_R FROM dnf GROUP BY S) GROUP BY grouped_R)'
    if len(setOfVariablesWithoutPropagation) == 0:
        propagateQuery = f'INSERT INTO {newTableName} SELECT \'(\' || {propagationVariable[0]} || \')*(\' || {propagationVariable[1]} || \')\' FROM {tableName}'
    else:
        propagateQuery = f'INSERT INTO {newTableName} SELECT \'(\' || {propagationVariable[0]} || \')*(\' || {propagationVariable[1]} || \')\' as {propagationVariable[0]}, {", ".join(setOfVariablesWithoutPropagation)} TEXT FROM {tableName}'

    cursor.execute(createQuery)
    cursor.execute(propagateQuery)
    try:
        #cursor.execute(f"DROP TABLE {tableName}")
        pass
    except:
        pass

    cursor.execute(f'SELECT * FROM {newTableName}')
    #print(cursor.fetchall())
    
    conn.commit()
    return newTableName

def setUp(DNF, tableName, conn, cursor, setOfVariables):

    try:
        #cursor.execute("DROP TABLE dnf")
        pass
    except:
        pass


    columns = ', '.join([f'{var} TEXT' for var in setOfVariables])
    create_table_sql = f'CREATE TABLE IF NOT EXISTS {tableName} ({columns})'

    attempt = 0
    while attempt < 3:
        try:
            cursor.execute(create_table_sql)
            break
        except Exception as e:
            print(e)
            attempt += 1
            print("Problem creating table, retrying in 1 second with attempt number " + str(attempt))
            time.sleep(1)
            
    
    placeholders = ', '.join(['?' for _ in setOfVariables])
    insert_sql = f'INSERT INTO "{tableName}" ({", ".join(setOfVariables)}) VALUES ({placeholders})'

    for clause in DNF:
        cursor.execute(insert_sql, clause)

    conn.commit()
    return tableName

def process_signature(original_signature, tableName, setOfVariables):
    def resolve_inner_signature(inner_signature):       
        current_var = ""
        NewTableName = tableName
        #print(inner_signature)

        pattern = r'([A-Z])\*'
        matches = re.findall(pattern, inner_signature)

        for match in matches:
            #print(match + " aggregate ")
            NewTableName = aggregate(setOfVariables, NewTableName, match)
            inner_signature = inner_signature.replace(match + "*", match)

        for part in inner_signature:
            if part:
                if current_var:
                    #print(current_var + part + " propagate ")
                    NewTableName = propagate(setOfVariables, NewTableName, [current_var, part])
                    inner_signature = inner_signature.replace(part, '')
                else:
                    current_var = part
        return inner_signature, NewTableName
    
    signature = original_signature

    while '(' in signature:
        innermost_parentheses = re.search(r'\([^()]*\)', signature)
        signature = innermost_parentheses.group(0)[1:-1]
                
    innermost_signature, tableName = resolve_inner_signature(signature)
    original_signature = original_signature.replace(signature, innermost_signature)

    if '(' in original_signature:
        original_signature = re.sub(r'\((\w)\)', r'\1', original_signature)
        #print(original_signature)
        original_signature, tableName = process_signature(original_signature, tableName, setOfVariables)
        return original_signature, tableName
    else:    
        return original_signature, tableName

def factorisation(DNF, tableNameInput, setOfVariables, signature):

    #conn, cursor = getConnAndCursor()
    
    tableName = setUp(DNF, tableNameInput, conn, cursor, setOfVariables)

    signature, tableName = process_signature(signature, tableName, setOfVariables)

    cursor.execute(f'SELECT * FROM {tableName}')
    rows = cursor.fetchall()
    
    #factorised_string = re.sub(r'\((\w\w|\w\w\w)\)', r'\1', rows[0][0])
    #cursor.close()
    #conn.close()

    return rows #factorised_string


def parse_boolean_formula(formula):
    def parse_subformula(subformula):
        # Check if subformula is a single operand
        if not any(op in subformula for op in '*,'):
            return subformula.strip()
        
        result = {'operator': None, 'subformula': []}
        
        # Determine the main operator at this level
        if '*' in subformula:
            result['operator'] = 'and'
        elif ',' in subformula:
            result['operator'] = 'or'
        
        parts = []
        balance = 0
        current_part = []
       
        for char in subformula:
            if char == '(':
                balance += 1
            elif char == ')':
                balance -= 1
            if (char == '*' and balance == 0 and result['operator'] == 'and'):
                parts.append(''.join(current_part).strip())
                current_part.clear()
            else:
                current_part.append(char)

        parts.append(''.join(current_part).strip())
        
        for part in parts:
            if part.startswith('(') and part.endswith(')'):
                result['subformula'].append(parse_formula(part[1:-1]))
            else:
                result['subformula'].append(part)
        
        return result
    
    def parse_formula(formula):
        formula = formula.strip()
               
        #if formula.startswith('(') and formula.endswith(')'):
        #    return parse_formula(formula[1:-1])

        # Split the main formula by top-level commas
        subformulas = []
        balance = 0
        last_split = 0
        for i, char in enumerate(formula):
            if char == '(':
                balance += 1
            elif char == ')':
                balance -= 1
            elif char == ',' and balance == 0:
                subformulas.append(formula[last_split:i].strip())
                last_split = i + 1
        subformulas.append(formula[last_split:].strip())
       
        if len(subformulas) == 1:
            return parse_subformula(subformulas[0])
        else:
            return {
                'operator': 'or',
                'subformula': [parse_subformula(sub) for sub in subformulas]
            }
    
    return parse_formula(formula)

def reduceDepth(formula):
    parentOperator = formula["operator"]

    isFinalDepth = True

    for clause in formula["subformula"]:
        try:
            clause["operator"]
            isFinalDepth = False
            break
        except:
            continue

    if isFinalDepth:
        return formula
        
    for index, clause in enumerate(formula["subformula"]):
        try:
            if clause["operator"] == parentOperator:
                formula["subformula"].pop(index)
                formula["subformula"] += clause["subformula"]
                reduceDepth(formula)
            else:
                formula["subformula"][index] = reduceDepth(clause)  

        except:
            continue


    return formula

def generate_best_Signature(df, listOfVariables, run):
    signatureSet = []
    permutations = list(itertools.permutations(listOfVariables))
    lenPermutations = len(permutations)
    if lenPermutations > 100:
        for i in range(5000):
            p = permutations[random.randint(0, lenPermutations-1)]
            signature = "(" + "*(".join(p[:-1]) + "*" + p[-1] + ")*"*(len(p)-1)
            if signature in signatureSet:
                    i=i-1
            else:
                signatureSet.append(signature)
    else:
        for p in permutations:
            signature = "(" + "*(".join(p[:-1]) + "*" + p[-1] + ")*"*(len(p)-1)
            if signature not in signatureSet:
                signatureSet.append(signature)

    factorisedSize = {}
    for index, signature in enumerate(signatureSet):
        print(signature)
        tableName = f"dnf_{run}_signature_{index}"
        
        rows = factorisation(df['DNF'], tableName, listOfVariables.copy(), signature)
        length_factorised = 0
        for row in rows:
            for clause in row:
                length_factorised += len(clause)

        factorisedSize[signature] = length_factorised

    best_signature = min(factorisedSize.items(), key=lambda x: x[1])[0]
    return best_signature

def create_factorsied_formula(DNF, listOfVariables, signature, index):
    tablename = f"dnf_{index}"
    rows = factorisation(DNF, tablename, listOfVariables, signature)
    factorised_rows = []
    for row in rows:
        factorised_rows.append(re.sub(r'\(([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})\)', r'\1', row[0]))

    factorised_string = ",".join(factorised_rows)
    parsed_boolean_formula = parse_boolean_formula(factorised_string)
    factorised_formula = reduceDepth(parsed_boolean_formula)
    return factorised_formula


In [None]:
variable_name_list = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]

with open("data/hardInstancesJsonList/hard_instances.json") as fp:
    JsonObjectList = json.load(fp)

i = 0
for JsonObject in JsonObjectList:
    filename = JsonObject["query_name"]
    print(filename)

    for index, instance in enumerate(JsonObject["instances"]):
        print(index)
        #if not os.path.isfile("data/"+ filename + str(index) + "_factorised.json"):
        i=i+1

        DNF = instance["dnf"]
        if len(DNF) == 0:
            continue

        DNFJson = {"DNF": DNF}

        listOfVariables = variable_name_list[:len(DNF[0])]
        best_signature, factorsed_size_dict = generate_best_Signature(DNFJson, listOfVariables, i)

        factorised_formula = create_factorsied_formula(DNF, listOfVariables, best_signature, i)

        with open("data/hardInstancesFactorized/" + filename + str(index) + "_factorised.json", 'w', encoding='utf-8') as f:
            json.dump((factorised_formula, best_signature, factorsed_size_dict), f, ensure_ascii=False, indent=4)

In [2]:
variable_name_list = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]

#directory = os.fsencode("dnfs_list")

i=0
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    i=i+1

    print("Current File: " + filename)

    if filename.endswith(".json"):
        print("check")
        with open("dnfs_list/" + filename) as fp:
            JsonObject = json.load(fp)
        try:
            JsonObject['Signature']
        except:
            try:
                first = True
                DNFList = JsonObject['DNF']
                factorised_formula_list = []
                for DNF in DNFList:
                    if first:
                        if len(DNF) == 0:
                            continue
                        listOfVariables = variable_name_list[:len(DNF[0])]
                        print("check2a")
                        best_signature = generate_best_Signature(JsonObject, listOfVariables, i)
                        print("check2b")
                        first = False
                        print("Best signature:" + best_signature)
                        JsonObject['Signature'] = best_signature
                
                        with open("dnfs_list/" + filename, 'w', encoding='utf-8') as f:
                            json.dump(JsonObject, f, ensure_ascii=False, indent=4)

                    print("check3")
                    factorised_formula = create_factorsied_formula(DNF, listOfVariables, best_signature, i+1)
                    factorised_formula_list.append(factorised_formula)
                    print("check4")
                with open("dnfs_factorised/" + filename.replace("_dnf.json", "_factorised.json"), 'w', encoding='utf-8') as f:
                    json.dump(factorised_formula_list, f, ensure_ascii=False, indent=4)
            except:
                continue



Current File: 10a_keyword_keyword_dnf.json
check
check2a
(A*B)*
Current File: 10b_publication_year_dnf.json
check
check2a
(A*B)*
Current File: 11a_conference_name_dnf.json
check
Current File: 11a_keyword_keyword_dnf.json
check
Current File: 11b_conference_name_dnf.json
check
Current File: 11b_keyword_keyword_dnf.json
check
check2a
(A*(B*C)*)*
Current File: 11d_conference_name_dnf.json
check
check2a
(A*(B*C)*)*
Current File: 11d_domain_name_dnf.json
check
check2a
(A*(B*C)*)*
Current File: 11e_conference_name_dnf.json
check
check2a
(A*(B*C)*)*
Current File: 11e_domain_name_dnf.json
check
Current File: 11f_conference_name_dnf.json
check
check2a
(A*(B*C)*)*
Current File: 11f_domain_name_dnf.json
check
Current File: 12a_conference_name_dnf.json
check
Current File: 12c_conference_name_dnf.json
check
check2a
(A*(D*(C*(B*E)*)*)*)*
Current File: 12c_keyword_keyword _dnf.json
check
check2a
(C*(D*(B*(A*E)*)*)*)*
Current File: 1a_conference_name_dnf.json
check
check2a
(E*(C*(B*(D*A)*)*)*)*
Current