In [None]:

import json, subprocess
import re, ast, io, os
import shutil
from subprocess import Popen
from os import listdir
from os.path import isdir, join
import warnings
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import time





# Names of the files and folders that ABCT reads from / writes into a KG folder.
fileName = dict(
    PST="true.txt",
    PSF="false.txt",
    Owl="input.owl",            # the input KG
    Orig="original",
    ABOX="triples.txt",
    TBOX="rules.txt",
    # EC="equClass.txt",
    Repairs="meta.txt",
    Proofs="proofs.txt",
    NewRepairs="newRepair.txt",
    ES="entScores",
    Solutions="solutions",      # folder storing the fault-free KGs
    Archive="archive",          # folder of faulty KGs that are too expensive to repair further
)

# Position (counted from 1) associated with each section of the repair (meta) file.
repFileStructure = {
    section: position
    for position, section in enumerate(
        ["Heuristics", "ProtectList", "TheoryGraph", "RsBanned", "RsApplied", "RsLen"],
        start=1,
    )
}

# Main entry point for applying ABCT, whose source code is in ABCPath, to the knowledge graph stored in the folder KGsDir.
# The preferred structure is given by the true set (PosDir) and the false set (NegDir).
# The input files live in KGsDir, e.g. ABCPath/data.

def _prolog_args(paths):
    """Join paths so they can be placed between the outer quotes of a Prolog goal."""
    return "\", \"".join(paths)


def _run_swipl(ABCPath, script, goal):
    """Load script with SWI-Prolog inside ABCPath, run goal and return the decoded stdout."""
    proc = subprocess.run(["swipl", "-l", script, "-g", goal],
                          timeout=None, cwd=ABCPath, stdout=subprocess.PIPE)
    return proc.stdout.decode("utf-8")


def _entrenchment_scores(ABCPath, paths):
    """Run entrenchment.pl on paths and return the two scores printed on its last two lines."""
    output = _run_swipl(ABCPath, "entrenchment.pl", "main(\"" + _prolog_args(paths) + "\")")
    ES1, ES2 = output.split("\n")[-3:-1]
    return ES1, ES2


def _kg_folder_name(cost, ES1, ES2, serialNum, InsuffNum, IncompNum, faultNum):
    """Build the folder name cost_-ES1_(negated ES2)_serial_insuff_incomp_faults of a KG."""
    # ES2 is stored with its sign flipped: prefix '-' when non-negative, strip it otherwise.
    es2Part = '-' + ES2 if float(ES2) >= 0 else ES2[1:]
    return "_".join([str(cost), '-' + ES1, es2Part, str(serialNum),
                     InsuffNum, IncompNum, str(faultNum)])


def _reset_dir(path):
    """Make sure path exists as an empty directory."""
    # An existing folder is wiped AND recreated, so that a later shutil.move()
    # into it nests the moved folder instead of renaming it to path.
    if os.path.exists(path):
        shutil.rmtree(path, ignore_errors=True)
    os.makedirs(path, exist_ok=True)


def _applied_repair_count(lastLine):
    """Read the number of applied repairs from the last line of a repair file."""
    # Take all leading digits, so that counts of 10 or more are not cut to one digit.
    match = re.match(r"\d+", lastLine)
    if match:
        return int(match.group(0))
    return int(lastLine[0])


def _init_original_kg(ABCPath, KGsDir, PosDir, NegDir, SerialNum, log):
    """Build the 'original' KG folder, detect its faults and return the path of its ranked copy."""
    InpOWLDir = os.path.join(KGsDir, fileName["Owl"])
    InpRuleDir = os.path.join(KGsDir, fileName["TBOX"])
    repairDir = os.path.join(KGsDir, fileName["Repairs"])

    # generate the first candidate folder
    KGOrig = os.path.join(KGsDir, fileName["Orig"])
    os.mkdir(KGOrig)

    # copy the input meta file to KGOrig, or start from six empty sections
    if os.path.exists(repairDir):
        shutil.copyfile(repairDir, os.path.join(KGOrig, fileName["Repairs"]))
    else:
        with open(os.path.join(KGOrig, fileName["Repairs"]), 'w') as f:
            f.write('[].\n'*6)

    TripleDir = os.path.join(KGOrig, fileName["ABOX"])
    TripleRel = os.path.join(KGOrig, "relevantTriples")
    RuleDir = os.path.join(KGOrig, fileName["TBOX"])
    ProofDir = os.path.join(KGOrig, fileName["Proofs"])
    entDir = os.path.join(KGOrig, fileName["ES"])

    # if there is no owl, copy triples.txt to original/triples.txt
    if not os.path.exists(InpOWLDir):
        shutil.copyfile(os.path.join(KGsDir, fileName["ABOX"]), TripleDir)
    else:
        # get the input KG from the owl file
        jsld2prolog2(ABCPath, InpOWLDir, TripleDir, TripleRel)
    shutil.copyfile(InpRuleDir, RuleDir)
    log.write("--------------- Finish jsld2prolog2---------------")

    # create the files that do not exist yet
    for f in [TripleDir, TripleRel, RuleDir, ProofDir, repairDir, entDir]:
        open(f, "a+").close()

    # fault detection, whose proofs are written to ProofDir
    inputFaultDet = _prolog_args([TripleDir, RuleDir, PosDir, NegDir, ProofDir])
    InsuffNum, IncompNum = fault_det(ABCPath, inputFaultDet)
    faultNum = int(InsuffNum) + int(IncompNum)

    # calculate entrenchment scores
    ES1, ES2 = _entrenchment_scores(ABCPath, [TripleDir, RuleDir, ProofDir, entDir])

    # keep KGOrig untouched and search on a copy named after the fault numbers
    KG_Nunbers = _kg_folder_name(faultNum, ES1, ES2, SerialNum, InsuffNum, IncompNum, faultNum)
    KGinp = os.path.join(KGsDir, KG_Nunbers)
    shutil.copytree(KGOrig, KGinp)
    return KGinp


def _load_repair_plans(ABCPath, KGCurDir, RuleDir, repairNewF, repairGroupNew):
    """Return the repair plans to apply: max sets of commuting plans if available, else single plans."""
    repairNewF2 = os.path.join(KGCurDir, "repairPlansSets.txt")
    repPlanList = None
    try:
        goal = "repCombineFile(\"" + _prolog_args([repairGroupNew, RuleDir, repairNewF2]) + "\")"
        if "success" in _run_swipl(ABCPath, "repairGen.pl", goal):
            with open(repairNewF2, 'r') as repNewF:
                repPlanList = repNewF.readlines()
    except (OSError, subprocess.SubprocessError, UnicodeError) as err:
        print("\nWarning: computing max sets of commuting repair plans failed: " + str(err))

    if repPlanList is None:
        # Fall back whenever the combination step did not succeed, not only when it
        # raised, so the plan list is never unbound or left over from another KG.
        print("\nWarning: no max sets of commuting repair plans are found. Will apply repair plans one by one.\n")
        with open(repairNewF, "r") as my_file:
            repPlanList = my_file.readlines()
    return repPlanList


def _abct_search(ABCPath, KGsDir, log):
    """Run the repair search in KGsDir; return False if an unrepairable theory stops it early."""
    # name the input files of true and false sets
    PosDir = os.path.join(KGsDir, fileName["PST"])
    NegDir = os.path.join(KGsDir, fileName["PSF"])

    # initialise the serial number of KGs
    SerialNum = 0
    KGinp = _init_original_kg(ABCPath, KGsDir, PosDir, NegDir, SerialNum, log)

    # initialise the folders for output
    SolArcDir = os.path.join(KGsDir, fileName["Solutions"])
    _reset_dir(SolArcDir)
    AbandonDir = os.path.join(KGsDir, fileName["Archive"])
    _reset_dir(AbandonDir)

    sorted_files = [KGinp]
    i = 0
    costLimit = 0  # 0 means that no solution has been found yet
    timelogPath = os.path.join(KGsDir, "timelog.txt")
    open(timelogPath, "w").close()   # start from an empty time log

    while i < len(sorted_files):
        start_time = time.time()
        # backup SerialNum so that whether a new KG is found can be checked
        SerialNumBack = SerialNum
        log.write('\n current cost limit is  '+ str(costLimit) +'\n')
        log.write('\n current SerialNum is  '+ str(SerialNum) +'\n')

        KGCurDir = sorted_files[i]
        print(KGCurDir)
        proofDir = os.path.join(KGCurDir, fileName["Proofs"])

        if not os.path.exists(proofDir):
            warnings.warn("A faulty theory without proof files.")
            i += 1      # move on; without this the loop would never terminate
            continue

        # The proofs file must consist of exactly three lines (sufficiency,
        # insufficiency and incompatibility proofs); anything else raises ValueError.
        with open(proofDir) as proofFile:
            _suffProofs, _insuffProofs, _incompProofs = proofFile.read().split("\n")

        # the file that records the applied repair plans by appending
        repairFile = os.path.join(KGCurDir, fileName["Repairs"])
        RuleDir = os.path.join(KGCurDir, fileName["TBOX"])
        TripleDir = os.path.join(KGCurDir, fileName["ABOX"])
        # files that receive the newly found repair plans and their groups
        repairNewF = os.path.join(KGCurDir, "repairPlans.txt")
        repairGroupNew = os.path.join(KGCurDir, "repairPlanGroups.txt")
        open(repairNewF, "a").close()
        open(repairGroupNew, "a").close()

        # Step 4 find repair plans
        # 4.1 generate all possible repair plans
        inpRepairGen = [proofDir, PosDir, NegDir, TripleDir, RuleDir, repairFile, repairNewF, repairGroupNew]
        repGenInfo = _run_swipl(ABCPath, "repairGen.pl", "main(\"" + _prolog_args(inpRepairGen) + "\")")
        print(repGenInfo.split("\n"))

        # if the repair cannot be generated, give up the whole search
        if "success" not in repGenInfo:
            warnings.warn("A faulty theory that cannot be repaired has been found.")
            log.write('\n' + KGCurDir+ " cannot be repaired.\n")
            return False

        # 4.2 try to compute max sets of commuting repair plans
        repPlanList = _load_repair_plans(ABCPath, KGCurDir, RuleDir, repairNewF, repairGroupNew)

        # Step 5 apply each repair plan to the current KG to generate new KGs
        for repPlanTem in tqdm(repPlanList):
            repPlan = repPlanTem.split(".")[0]
            log.write('\n step 5: '+ repPlan + '\n')
            # temporary sub-folder for the KG obtained by applying this repair plan
            KGNewTem = os.path.join(KGCurDir, "repair")
            os.makedirs(KGNewTem)
            repTripleDir = os.path.join(KGNewTem, fileName["ABOX"])
            repRuleDir = os.path.join(KGNewTem, fileName["TBOX"])
            repairFileNew = os.path.join(KGNewTem, fileName["Repairs"])
            for f in [repTripleDir, repRuleDir, repairFileNew]:
                open(f, "a+").close()

            inputAppRep = [repPlan, TripleDir, RuleDir, repairFile, repTripleDir, repRuleDir, repairFileNew]
            appResult = _run_swipl(ABCPath, "repairApply.pl", "main(\"" + _prolog_args(inputAppRep) + "\")")

            # if the plan cannot be applied, report it and try the next one
            if "success" not in appResult:
                shutil.rmtree(KGNewTem)
                print("Error: repair plan" + repPlan +" cannot be applied to " + KGCurDir )
                log.write("\n Error: repair plan: " + repPlan +" cannot be applied to " + KGCurDir + "\n")
                continue

            log.write("\n success in applying " + repPlan +" to " + KGCurDir + "\n")
            with open(timelogPath, "a+") as timelog:
                timelog.write(str(time.time() - start_time)+"\n")
            SerialNum += 1    # a new KG has been generated

            # new repair file = old file without its last three lines + the new lines
            # (the number three depends on repFileStructure)
            with open(repairFile, "r") as oldRepF:
                first = oldRepF.readlines()[:-3]
            with open(repairFileNew, "r") as newRepF:
                second = newRepF.readlines()
            with open(repairFileNew, "w") as repFwrite:
                repFwrite.write(''.join(first + second))

            # TODO: calculate equivalence classes
            # detect the remaining faults of the new KG
            proofNewDir = os.path.join(KGNewTem, fileName["Proofs"])
            open(proofNewDir, 'w').close()
            entDirNew = os.path.join(KGNewTem, fileName["ES"])
            open(entDirNew, 'w').close()

            inpFaultDet = _prolog_args([repTripleDir, repRuleDir, PosDir, NegDir, proofNewDir])
            InsuffNum, IncompNum = fault_det(ABCPath, inpFaultDet)
            faultNumNew = int(InsuffNum) + int(IncompNum)
            # cost = remaining fault number + number of applied repairs
            costNew = faultNumNew + _applied_repair_count(second[-1])
            log.write('\n faultNumNew is '+ str(faultNumNew) + '\n')

            # calculate entrenchment scores
            ES1New, ES2New = _entrenchment_scores(
                ABCPath, [repTripleDir, repRuleDir, proofNewDir, entDirNew])

            # move the new KG next to the others, named after its cost and fault numbers
            KGDirNew = os.path.join(KGsDir, _kg_folder_name(
                costNew, ES1New, ES2New, SerialNum, InsuffNum, IncompNum, faultNumNew))
            os.rename(KGNewTem, KGDirNew)

            if faultNumNew == 0:
                log.write('\n find a solution, named '+ KGDirNew + '\n')
                if costLimit == 0:
                    # the first solution fixes the cost limit of the search
                    costLimit = costNew
                    shutil.move(KGDirNew, SolArcDir)
                elif costNew > costLimit:
                    # a solution beyond the cost limit is archived
                    shutil.move(KGDirNew, AbandonDir)
                else:
                    shutil.move(KGDirNew, SolArcDir)

        # archive the explored KG and every KG whose cost has reached the limit
        shutil.move(KGCurDir, AbandonDir)
        for f in listdir(KGsDir):
            if not os.path.isdir(join(KGsDir, f)):
                continue
            if len(f.split('_')) > 3 and costLimit != 0:
                if int(f.split('_')[0]) >= costLimit:
                    shutil.move(join(KGsDir, f), AbandonDir)

        # if at least one repaired KG was produced, re-rank the faulty KGs and start over
        if SerialNumBack < SerialNum:
            sorted_files = sorted(
                join(KGsDir, f) for f in listdir(KGsDir)
                if os.path.isdir(join(KGsDir, f)) and len(f.split('_')) > 3)
            # if there is no faulty KG left, terminate
            if not sorted_files:
                break
            log.write("\n current faulty KGs to repiar include: ")
            log.write('\n'.join(sorted_files) + '\n')
            i = 0
        else:
            i += 1

    return True


def abct(ABCPath, KGsDir):
    """Repair the knowledge graph stored in KGsDir with the ABCT Prolog code in ABCPath.

    KGsDir is expected to contain the true/false sets, the rules and either
    input.owl or triples.txt (see fileName). All existing sub-folders of
    KGsDir are deleted first. Faulty KGs are kept as sub-folders of KGsDir whose
    names start with their cost; fault-free KGs are moved to the solutions
    folder and explored or too expensive ones to the archive folder. Progress
    is written to KGsDir/log.txt and KGsDir/timelog.txt.

    Args:
        ABCPath: folder holding the Prolog sources; used as the working
            directory of every swipl call.
        KGsDir: data folder of the KG to repair; created if missing.

    Returns:
        The running time in seconds, or None when the search stops early
        because a faulty theory for which no repair can be generated is found.
    """
    abcStart = time.time()

    # initialise the main data folder: if it exists, remove all its sub-folders
    if not os.path.exists(KGsDir):
        os.makedirs(KGsDir)
    else:
        for entry in listdir(KGsDir):
            subFolder = join(KGsDir, entry)
            if os.path.isdir(subFolder):
                shutil.rmtree(subFolder, ignore_errors=True)

    # the context manager closes the log on every exit path
    with open(os.path.join(KGsDir, "log.txt"), "w") as log:
        log.write("start process.\n")

        if not _abct_search(ABCPath, KGsDir, log):
            return None

        log.write("--------------- Finish Reapir Generation---------------")
        print("--------------- Finish Reapir Generation---------------")

        runningTime = time.time() - abcStart
        log.write("\n Running time is "+ str(runningTime) +" seconds.")

    print("Running time is %s  seconds." %runningTime)

    return runningTime

def fault_det(ABCPath, inputFiles):
    """Run faultDet.pl on inputFiles and return the two fault counts it prints.

    inputFiles is the file paths already joined with '", "', i.e.
    '", "'.join([TripleDir, RuleDir, PosDir, NegDir, ProofDir]); it is wrapped
    into main("...") and passed to swipl, which is started in ABCPath.

    Returns the tuple (InsuffNum, IncompNum) of strings taken from the last two
    complete lines of the Prolog output.
    """
    goal = "main(\"" + inputFiles + "\")"
    completed = subprocess.run(
        ["swipl", "-l", "faultDet.pl", "-g", goal],
        timeout=None,
        cwd=ABCPath,
        stdout=subprocess.PIPE,
    )
    outputLines = completed.stdout.decode("utf-8").split("\n")
    # the text after the final newline is dropped; the two lines before it are the counts
    InsuffNum, IncompNum = outputLines[-3:-1]
    return InsuffNum, IncompNum





_FRAGMENT_RE = re.compile(r"#(.*)")


def _local_name(value):
    """Return the text after the first '#' of value, or str(value) when it has no '#'."""
    text = str(value)
    found = _FRAGMENT_RE.search(text)
    return found.group(1) if found else text


def jsld2prolog2(ABCPath, JsonFile, TripleF, TripleRel):
    """Convert the ABox in JSON-LD file JsonFile into the Prolog format used by ABCT.

    Every (key, value) pair of every entity other than "@id" becomes one
    assertion [+["key", ["subject"], ["object"]]]; list values give one
    assertion per element. Subjects, keys and objects are shortened to the part
    after '#' when they contain one.

    Args:
        ABCPath: folder holding util.pl; working directory of the swipl call.
        JsonFile: path of the JSON-LD input, a list of entity objects.
        TripleF: output file receiving all assertions, separated by ".\\n".
        TripleRel: file for the relevant assertions; it is only created/emptied
            here, because extracting the relevant ABox is not implemented yet.

    Raises:
        KeyError: if an entity has no "@id".
    """
    with open(JsonFile, 'r') as inpJ:
        inputData = json.load(inpJ)

    outdata = []
    for Entity in inputData:
        # the id of the entity is the subject of all triples from this entity
        subject1 = _local_name(Entity["@id"])
        for key, value in Entity.items():
            if key == "@id":
                continue
            key1 = _local_name(key)
            objects = value if isinstance(value, list) else [value]
            for obj in objects:
                outdata.append(
                    "[+[\"" + key1 + "\", [\"" + subject1 + "\"], [\"" + _local_name(obj) + "\"]]]")

    with open(TripleF, "w") as allAboxF:
        allAboxF.write(".\n".join(outdata) + ".")

    # TODO: compute the relevant ABox (util.pl relAxiom) and extract the rules;
    # until then the relevant-triples file is left empty.
    open(TripleRel, "w").close()

    # both files are closed at this point, so Prolog can rewrite them
    inpArg = "\", \"".join([TripleF, TripleRel])
    subprocess.run(["swipl", "-l", "util.pl", "-g", "removeQuote([\"" + inpArg + "\"])"],
                   timeout=None, cwd=ABCPath, stdout=subprocess.PIPE)

    return

# 
def CreatTheory(dataDir, m, n):
    """Write a synthetic input theory with m bfd and n isis sessions into dataDir.

    The entities eid_1..eid_m are bfd sessions, each depending on the previous
    id (eid_0 for the first one). For j in m+1..m+n an isis session with id
    eid_(m+j) and label isis(j) is added, depending on eid_(j-m).
    dataDir is created if missing; otherwise all its sub-folders are removed.
    The files input.owl, rules.txt, false.txt, true.txt and meta.txt are
    (re)written.

    Args:
        dataDir: folder receiving the generated input files.
        m: number of bfd sessions.
        n: number of isis sessions.
    """
    inputTheory = []

    for i in range(1, m+1):
        inputTheory.append({
            "@id": "eid_" + str(i),
            "label": "bfd" + str(i),
            "type": "bfd_session",
            "dependOn": "eid_" + str(i-1),
        })

    for j in range(m+1, m+n+1):
        inputTheory.append({
            "@id": "eid_" + str(m+j),
            "label": "isis" + str(j),
            "type": "isis_session",
            "dependOn": "eid_" + str(j-m),
        })

    if os.path.exists(dataDir):
        # if the folder exists, remove all its sub-folders
        for entry in listdir(dataDir):
            subFolder = join(dataDir, entry)
            if os.path.isdir(subFolder):
                shutil.rmtree(subFolder, ignore_errors=True)
    else:
        os.makedirs(dataDir)

    contents = {
        'input.owl': json.dumps(inputTheory, indent = 4),
        'rules.txt': "[-[dependOn, vble(x), vble(y)],-[type, vble(x), [bfd_session]],-[type, vble(y), [bfd_session]]].",
        'false.txt': '[].',
        'true.txt': '[].',
        'meta.txt': '[].\n[[+[type,_,_]], [-_,-_,-_]].\n'+ '[].\n'*4,
    }
    # each file is closed (and therefore flushed) as soon as it is written
    for name, text in contents.items():
        with open(os.path.join(dataDir, name), 'w', encoding='utf-8') as outFile:
            outFile.write(text)

    print('Complete creating input theories')


def testRunTime(path, sizes=range(10, 11)):
    """Measure and plot the running time of abct on generated theories.

    For every x in sizes a theory with x bfd and x isis sessions is created in
    path/data_x and repaired with abct. The (size, time) pairs are written to
    running_log.txt in the current working directory, one tab-separated pair
    per line, and plotted with matplotlib.

    Args:
        path: folder of the ABC code, passed to abct as ABCPath; the data
            folders are created inside it.
        sizes: iterable of theory sizes to benchmark; defaults to size 10 only.
    """
    timeInfo = []

    for x in sizes:
        dataDir = os.path.join(path, 'data_' + str(x))
        CreatTheory(dataDir, x, x)
        # named "elapsed" so that the time module is not shadowed
        elapsed = abct(path, dataDir)
        timeInfo.append((x, elapsed))

    # file.write() needs a string, so the records are formatted line by line
    with open('running_log.txt', 'w') as f:
        f.writelines("%s\t%s\n" % (x, elapsed) for (x, elapsed) in timeInfo)

    # abct returns None when the search is aborted; such runs cannot be plotted
    finished = [(x, elapsed) for (x, elapsed) in timeInfo if elapsed is not None]
    plt.plot([x for (x, _) in finished], [y for (_, y) in finished])
    plt.show()

# if __name__ == '__main__':
#     abct()


In [None]:
# Notebook cell: run the running-time benchmark on a local checkout of the ABC code.
# NOTE(review): the absolute path is machine-specific; adjust it before running elsewhere.
testRunTime('/Users/xueli/Documents/code/treat-abc-main/')