In [1]:
import os
import shutil

from datetime import datetime as dt

import fitz
import unidecode

import voila
import ipywidgets

from pylatexenc.latexwalker import LatexWalker, LatexEnvironmentNode
from pylatexenc.latex2text import LatexNodes2Text

from whoosh.index import create_in
from whoosh.index import open_dir
from whoosh.index import EmptyIndexError
from whoosh.fields import *
from whoosh.qparser import QueryParser
from whoosh.qparser import MultifieldParser
from whoosh.analysis import LanguageAnalyzer
from whoosh.analysis import StemmingAnalyzer


def analyzeText(text):
    """Normalise ``text`` by running it through two Whoosh analyzers in turn.

    The text is first tokenised with ``LanguageAnalyzer("pt")``; the resulting
    tokens are re-joined and tokenised again with ``StemmingAnalyzer()``.

    Returns the final tokens as one string in which every token is followed
    by a single space (so a non-empty result ends with a space, and the
    result is ``""`` when no token is produced). A separator line and the
    result are also printed.
    """
    portuguese = LanguageAnalyzer("pt")
    stemmer = StemmingAnalyzer()

    # Read ``.text`` while iterating: the string is copied out of each token
    # as soon as it is yielded, exactly one token at a time.
    firstPass = "".join(tok.text + " " for tok in portuguese(text))
    secondPass = "".join(tok.text + " " for tok in stemmer(firstPass))

    print("-----------------------------------------------------------------------------")
    print(secondPass)
    return secondPass

class WhooshHandler:
    """Create, populate and delete the Whoosh index stored in ``indexFolder``.

    Exercises are expected on disk as
    ``<...>/<name containing "question">/version_1/true_or_false_question.pdf``.
    Only the PDF text is indexed at the moment.
    """

    # Characters removed from every extracted text by __removeSpecialChar.
    specialCharacters = ['´', '¸', '˜', '`', '^']
    indexFolder = None
    directory = None
    questionFolders = None

    def __init__(self, indexFolder):
        """Remember the folder that holds (or will hold) the index.

        Raises:
            AssertionError: if ``indexFolder`` is None, not a string, or empty.
        """
        assert indexFolder is not None, 'Index folder name cannot be None'
        assert isinstance(indexFolder,str), 'Index folder name must be a string'
        assert indexFolder , 'Index folder name cannot be empty'

        self.indexFolder = indexFolder

#----------------------------------------------------------------------------

    def createSchema(self):
        """Create a new index in ``self.indexFolder`` and return it.

        The folder is created when missing. ``create_in`` is called
        unconditionally, so call this once per index, before ``writeSchema``.
        """
        schema = Schema(path=ID(stored=True, unique=True),
                        content=TEXT(lang = "pt"),
                        date=DATETIME(stored=True),
                        tags=KEYWORD(stored=True),
                        nOcurrences=NUMERIC(numtype=int,bits=64,stored=True))

        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.indexFolder, exist_ok=True)

        return create_in(self.indexFolder, schema)

#----------------------------------------------------------------------------

    def __decodePdf(self, path):
        """Return the text of every page of the PDF at ``path``, cleaned.

        Pages are joined with a newline; a document without pages yields "".
        """
        doc = fitz.open(path)
        try:
            pages = []
            for page in doc:
                # PyMuPDF renamed getText() to get_text(); support both.
                extract = getattr(page, "get_text", None) or page.getText
                pages.append(extract())
        finally:
            doc.close()
        return self.__removeSpecialChar("\n".join(pages))


    def __decodeTex(self,path):
        """Return the plain-text, ASCII-folded content of the LaTeX file at ``path``."""
        # Read as UTF-8 directly instead of decoding with the platform default
        # and re-encoding, which fails for many non-ASCII characters.
        with open(path, "r", encoding="utf-8") as texFile:
            doc = " ".join(texFile.readlines())
        plain = LatexNodes2Text().latex_to_text(doc)
        return self.__removeSpecialChar(unidecode.unidecode(plain))

    def __decodePy(self,path):
        """Return the cleaned content of the file at ``path``, or None if it is not a file."""
        if not os.path.isfile(path):
            return None
        with open(path, 'r') as pyFile:
            return self.__removeSpecialChar(pyFile.read())

    def __removeSpecialChar(self,text):
        """Return ``text`` with every entry of ``specialCharacters`` removed."""
        for sChar in self.specialCharacters:
            text = text.replace(sChar, '')

        return str(text)

#------------------------------------------------------------------------------

    def deleteIndex(self, name):
        """Delete the index folders for ``name`` and unregister it.

        ``name`` must appear as a line of ``IndexFiles.txt`` (looked up in the
        current working directory). On success the folders ``name`` and
        ``name + "_HomeWork"`` are removed and the line is dropped from the file.

        Returns:
            ``[deleted, message]`` where ``deleted`` is a bool.
        """
        if not os.path.exists("IndexFiles.txt"):
            return [False, "No valid Indexes exist."]

        with open("IndexFiles.txt","r") as f:
            indexListFile = f.readlines()

        if name not in (line.strip("\n") for line in indexListFile):
            return [False, "Please insert a valid Index name."]

        try:
            shutil.rmtree(name)
            shutil.rmtree(name+"_HomeWork")
        except OSError:
            # Only filesystem failures are expected here; anything else is a
            # programming error and should surface instead of being hidden.
            return [False, "Error ocurred. Please verify that the desired Index directory exists."]

        with open("IndexFiles.txt", "w") as f:
            f.writelines(line for line in indexListFile if line.strip("\n") != name)

        return [True, "Index successfully deleted."]

    def writeSchema(self, directory, occurrences = 0):
        """Index every exercise found below ``directory``.

        Every sub-directory whose name contains ``"question"`` is reported; it
        is indexed when it has a ``version_1`` child directory.

        Args:
            directory: root of the tree to walk.
            occurrences: forwarded to the per-exercise writer, which currently
                does not use it.

        Returns:
            ``(ok, message)``; ``ok`` is False when the index cannot be opened
            or when no exercise was indexed.
        """
        try:
            ix = open_dir(self.indexFolder)
        except EmptyIndexError:
            return False,("The index provided does not exist, make sure you add it before using it and do not "
                    +"delete it manually")

        addedDirs = ""
        indexedAny = False

        for root, dirs, _files in os.walk(directory):
            for d in dirs:
                if "question" not in d:
                    continue
                questionDir = os.path.join(root, d)
                addedDirs += questionDir + ", "
                versionDir = os.path.join(questionDir, "version_1")
                if os.path.exists(versionDir):
                    indexedAny = self.__writeSchema2(ix, versionDir, occurrences) or indexedAny

        if not indexedAny:
            # Same (bool, message) shape as the other return paths.
            return False, ("Unfortunately it was not possible to find any exercises in the directory provided!"
                    +"Make sure the exercises contain the following format"
                    +" ...\\question_x\\version_x\\true_or_false_question.x")
        return True,("The following exercise folders were found and added to the index:"+addedDirs)


    def __writeSchema2(self, ix, path, occurrences):
        """Add the exercise PDF in ``path`` to ``ix`` or bump a matching entry.

        The analysed PDF text is searched in the ``content`` field. If there is
        a hit, the best one is rewritten with ``nOcurrences`` incremented;
        otherwise a new document is added with ``nOcurrences=1``.

        Only the PDF is read: the ``.tex`` and ``.py`` files are not indexed.

        Returns:
            True once the change has been committed.
        """
        pathPdf = os.path.join(path, "true_or_false_question.pdf")
        # Do all fallible reading/analysis before the writer takes the index lock.
        finalPdfText = analyzeText(self.__decodePdf(pathPdf))
        dateNow = dt.now()

        # The writer context commits on success and cancels (releasing the
        # lock) on error; the searcher is closed before the commit happens.
        with ix.writer() as writer, ix.searcher() as searcher:
            parsed = QueryParser("content",schema=ix.schema).parse(finalPdfText)
            result = searcher.search(parsed, limit=1)
            if not result.is_empty():
                hit = result[0]
                print(hit)
                writer.update_document(path=hit["path"], content=finalPdfText, date=hit["date"],
                                       tags=hit["tags"], nOcurrences=hit["nOcurrences"]+1)
            else:
                writer.add_document(path=pathPdf, content=finalPdfText, date=dateNow, tags='pdf',nOcurrences=1)

        return True

        


#----------------------------------------------------------------------------
#----------------------------------------------------------------------------

class Searcher:
    """Query and prune an existing Whoosh index."""

    specialCharacters = ['´', '¸', '˜', '`', '^']
    ix = None
    indexFolder = None

    def __init__(self, indexFolder):
        """Open the index stored in ``indexFolder``.

        Raises:
            AssertionError: if ``indexFolder`` is None, not a string, or empty.
        """
        assert indexFolder is not None, 'Index folder name cannot be None'
        assert isinstance(indexFolder,str), 'Index folder name must be a string'
        assert indexFolder , 'Index folder name cannot be empty'

        self.indexFolder = indexFolder
        self.ix = open_dir(indexFolder)

    def deleteEntry(self, doc_path):
        """Remove the documents of the exercise folder ``doc_path`` from the index.

        The paths ``<doc_path>/version_1/<file>`` for the exercise's pdf, py
        and tex files are deleted by exact ``path`` term.

        Returns:
            ``[deleted, message]``; ``deleted`` is True only when at least one
            document was actually removed.
        """
        versionDir = os.path.join(doc_path, "version_1")
        fileNames = ("true_or_false_question.pdf", "program.py", "true_or_false_question.tex")

        try:
            # Reuse the index opened in __init__. The writer context commits on
            # success and cancels (releasing the lock) if anything raises.
            with self.ix.writer() as writer:
                # delete_by_term reports how many documents it removed.
                removed = sum(
                    writer.delete_by_term('path', os.path.join(versionDir, fileName)) or 0
                    for fileName in fileNames
                )
        except Exception:
            # Best effort, as before: report the failure instead of raising.
            return [False, "Please verify the document exists."]

        if not removed:
            return [False, "Please verify the document exists."]
        return [True, "Document successfully deleted."]

    def parser(self, keyword, docType = 'all'):
        """Search the index for ``keyword``.

        Args:
            keyword: free text; it is ASCII-folded and analysed before parsing.
            docType: ``'all'`` to search the content only, otherwise a value
                appended to the query as a ``tags:`` clause.

        Returns:
            A list of ``[path, tags]`` pairs, or None when nothing matches.
        """
        keyword = analyzeText(unidecode.unidecode(keyword))

        with self.ix.searcher() as searcher:
            if docType == 'all':
                query = QueryParser("content", schema=self.ix.schema).parse(keyword)
            else:
                query = MultifieldParser(["content", "tags"], schema=self.ix.schema).parse("content:"+keyword+" tags:"+docType)

            results = searcher.search(query)
            if results.is_empty():
                return None

            # Stored fields must be read while the searcher is still open.
            return [[hit["path"], hit["tags"]] for hit in results]


In [5]:
# Root of the exercise tree. Set the QUESTIONS_DIR environment variable to
# point the notebook at another location without editing this cell.
QUESTIONS_DIR = os.environ.get(
    "QUESTIONS_DIR", r"C:\Users\ASUS\Desktop\Semestre6\Projeto\perguntas"
)

WH = WhooshHandler("indexTeste2")
# WH.createSchema()  # run once to create the index before the first writeSchema call
# `occurrences` is an unused integer parameter, so the default is used
# instead of passing a boolean.
WH.writeSchema(QUESTIONS_DIR)

# Example search once an index is populated:
# S = Searcher("index")
# result = S.parser("programas","all")
# print(result)


-----------------------------------------------------------------------------
resolv sistem equaco lin incognit metod adica orden gauss 2s 3t 2t 3t indiqu verdadeir fal 
<Hit {'path': 'C:\\Users\\ASUS\\Desktop\\Semestre6\\Projeto\\perguntas\\matematica\\linear_equations_system_3x3_unique_solution\\question\\version_1\\true_or_false_question.pdf', 'date': datetime.datetime(2020, 8, 6, 15, 40, 45, 626874), 'nOcurrences': 3, 'tags': 'pdf'}>
-----------------------------------------------------------------------------
resolv sistem equaco lin incognit metod adica orden gauss 8c 13 8d 9e 2c 4e 4d 4c indiqu verdadeir fal 
<Hit {'path': 'C:\\Users\\ASUS\\Desktop\\Semestre6\\Projeto\\perguntas\\matematica\\linear_equations_system_3x3_unique_solution\\question_old.20200522_trab1_with_bug\\question\\version_1\\true_or_false_question.pdf', 'date': datetime.datetime(2020, 8, 6, 15, 31, 37, 458532), 'nOcurrences': 3, 'tags': 'pdf'}>
------------------------------------------------------------------