In [11]:
import os
import fitz
import unidecode
import shutil

from datetime import datetime as dt
from dateutil.relativedelta import relativedelta

import voila
import ipywidgets

from pylatexenc.latexwalker import LatexWalker, LatexEnvironmentNode
from pylatexenc.latex2text import LatexNodes2Text

from whoosh.index import create_in
from whoosh.index import open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser
from whoosh.qparser import MultifieldParser
from whoosh.analysis import LanguageAnalyzer
from whoosh.analysis import StemmingAnalyzer
from whoosh.index import EmptyIndexError

def analyzeText(text):
    """Run *text* through Whoosh's ``LanguageAnalyzer("pt")``.

    Returns the text of every token the analyzer yields, in order, each one
    followed by a single space. The result is therefore either the empty
    string (no tokens) or a string ending in a trailing space.
    """
    analyzer = LanguageAnalyzer("pt")
    return "".join(token.text + " " for token in analyzer(text))

class WhooshHandler:
    """Create, populate and delete the Whoosh indexes used for exercise search.

    Every handler works on two sibling index folders: ``indexFolder`` (the
    exercise documents) and ``indexFolder + "_HomeWork"``. The names of all
    created indexes are tracked, one per line, in ``IndexFiles.txt`` in the
    current working directory.
    """

    # Stand-alone accent marks stripped from extracted text before indexing.
    specialCharacters = ['´', '¸', '˜', '`', '^']
    indexFolder = None
    homeWorkIndex = None
    directory = None
    questionFolders = None

    def __init__(self, indexFolder):
        """Remember the index folder name and derive the homework index name.

        Raises:
            AssertionError: if ``indexFolder`` is None, not a string, or empty.
        """
        assert indexFolder is not None, 'Index folder name cannot be None'
        assert isinstance(indexFolder, str), 'Index folder name must be a string'
        assert indexFolder, 'Index folder name cannot be empty'

        self.indexFolder = indexFolder
        self.homeWorkIndex = self.indexFolder + "_HomeWork"

    def createSchema(self):
        """Create both index folders and register the index in IndexFiles.txt.

        NOTE: ``create_in`` is called unconditionally, so per the Whoosh
        documentation an index already present in either folder is replaced
        by a new, empty one.

        Returns:
            tuple: ``(ix, hix, created)`` - the exercise index, the homework
            index, and True when ``IndexFiles.txt`` had to be created.
        """
        schema = Schema(path=ID(stored=True, unique=True),
                        content=TEXT(stored=True, lang="pt"),
                        date=DATETIME(stored=True, sortable=True),
                        tags=KEYWORD(stored=True),
                        nOccurrences=NUMERIC(stored=True, sortable=True))
        homeWorkSchema = Schema(number=ID(stored=True, unique=True),
                                content=TEXT(stored=True))

        registryCreated = not os.path.exists("IndexFiles.txt")
        if registryCreated:
            with open("IndexFiles.txt", "w"):
                pass

        with open("IndexFiles.txt", "r") as registry:
            registered = [line.replace('\n', '') for line in registry]
        if self.indexFolder not in registered:
            with open("IndexFiles.txt", "a") as registry:
                registry.write(self.indexFolder + '\n')

        os.makedirs(self.indexFolder, exist_ok=True)
        os.makedirs(self.homeWorkIndex, exist_ok=True)

        ix = create_in(self.indexFolder, schema)
        hix = create_in(self.homeWorkIndex, homeWorkSchema)

        return ix, hix, registryCreated

    def __decodePdf(self, path):
        """Return the text of every page of the PDF at ``path``, cleaned.

        An empty document yields an empty string.
        """
        doc = fitz.open(path)
        try:
            pages = []
            for page in doc:
                # PyMuPDF renamed getText() to get_text(); support both so the
                # code runs on old and new releases.
                extract = getattr(page, "get_text", None) or page.getText
                pages.append(extract())
        finally:
            doc.close()
        return self.__removeSpecialChar("".join(pages))

    def __decodeTex(self, path):
        """Return the LaTeX file at ``path`` as cleaned, ASCII-folded plain text.

        The file is read as UTF-8; if that fails it is re-read as ISO-8859-1,
        which accepts any byte sequence.
        """
        try:
            with open(path, "r", encoding="utf-8") as texFile:
                lines = texFile.readlines()
        except UnicodeDecodeError:
            with open(path, "r", encoding="ISO-8859-1") as texFile:
                lines = texFile.readlines()

        plain = LatexNodes2Text().latex_to_text(" ".join(lines))
        return self.__removeSpecialChar(unidecode.unidecode(plain))

    def __decodePy(self, path):
        """Return the cleaned contents of ``path``, or None if it is not a file."""
        if not os.path.isfile(path):
            return None
        with open(path, 'r') as source:
            return self.__removeSpecialChar(source.read())

    def __removeSpecialChar(self, text):
        """Return ``text`` with every entry of ``specialCharacters`` removed."""
        for sChar in self.specialCharacters:
            text = text.replace(sChar, '')
        return str(text)

    def deleteIndex(self, name):
        """Delete the index ``name`` from disk and from IndexFiles.txt.

        Only names listed in ``IndexFiles.txt`` are deleted. The matching
        ``name + "_HomeWork"`` folder is removed when present; its absence is
        not an error, so the registry never keeps an entry whose main folder
        is already gone.

        Returns:
            list: ``[deleted, message]``.
        """
        if not os.path.exists("IndexFiles.txt"):
            return [False, "No valid Indexes exist."]

        with open("IndexFiles.txt", "r") as registry:
            indexListFile = registry.readlines()

        if name not in (line.strip("\n") for line in indexListFile):
            return [False, "Please insert a valid Index name."]

        try:
            shutil.rmtree(name)
        except OSError:
            return [False, "Error ocurred. Please verify that the desired Index directory exists."]
        shutil.rmtree(name + "_HomeWork", ignore_errors=True)

        with open("IndexFiles.txt", "w") as registry:
            registry.writelines(line for line in indexListFile
                                if line.strip("\n") != name)

        return [True, "Index successfully deleted."]

    def writeSchema(self, directory, occurrences=0):
        """Index every exercise folder found below ``directory``.

        An exercise folder is any directory whose name contains "question"
        and that has a ``version_1`` sub-folder holding the exercise files.

        Args:
            directory: root folder that is walked recursively.
            occurrences: forwarded to the per-folder indexer, which currently
                does not use it (the counter is derived from the index).

        Returns:
            tuple: ``(ok, message)``. On success the message lists the folders
            that were actually indexed.
        """
        try:
            ix = open_dir(self.indexFolder)
        except EmptyIndexError:
            return False, ("The index provided does not exist, make sure you add it before using it and do not "
                    + "delete it manually")

        indexedDirs = []
        for root, dirs, _files in os.walk(directory):
            for d in dirs:
                if "question" not in d:
                    continue
                questionDir = os.path.join(root, d)
                # The trailing "" keeps a final separator so file names can be
                # appended directly to the version path.
                versionDir = os.path.join(questionDir, "version_1", "")
                if os.path.exists(versionDir) and self.__writeSchema2(ix, versionDir, occurrences):
                    indexedDirs.append(questionDir)

        if not indexedDirs:
            return False, ("Unfortunately it was not possible to find any exercises in the directory provided!"
                    + "Make sure the exercises contain the following format"
                    + " ...\\question_x\\version_x\\true_or_false_question.x")
        return True, ("The following exercise folders were found and added to the index:"
                      + "".join(folder + ", " for folder in indexedDirs))

    def __writeSchema2(self, ix, path, occurrences):
        """Index the PDF, optional program and LaTeX source found in ``path``.

        ``path`` must end with a path separator. If an indexed PDF already
        matches the text of this PDF, that document's ``nOccurrences`` is
        incremented instead of adding a second copy.

        Returns:
            bool: True when the folder was indexed, False when the PDF or the
            LaTeX file is missing (nothing is written in that case).
        """
        # Imported here because the file's import block does not provide Term.
        from whoosh.query import Term

        pathPdf = path + "true_or_false_question.pdf"
        pathPy = path + "program.py"
        pathTex = path + "true_or_false_question.tex"
        if not (os.path.isfile(pathPdf) and os.path.isfile(pathTex)):
            return False

        dateNow = dt.now()
        textPy = self.__decodePy(pathPy)
        textTex = self.__decodeTex(pathTex)
        finalPdfText = analyzeText(self.__decodePdf(pathPdf))

        # Look for an already indexed PDF with the same content. The filter
        # keeps "tex"/"py" documents, which store no counter, out of the match.
        with ix.searcher() as searcher:
            parsed = QueryParser("content", schema=ix.schema).parse(finalPdfText)
            hits = searcher.search(parsed, filter=Term("tags", "pdf"))
            previous = None if hits.is_empty() else hits[0].fields()

        # The writer context commits on success and cancels on error, so the
        # index write lock is always released.
        with ix.writer() as writer:
            if previous is not None:
                writer.update_document(path=previous["path"], content=finalPdfText,
                                       date=previous["date"], tags=previous["tags"],
                                       nOccurrences=previous.get("nOccurrences", 0) + 1)
            else:
                writer.update_document(path=pathPdf, content=finalPdfText,
                                       date=dateNow, tags='pdf', nOccurrences=1)

            # update_document replaces by the unique "path" field, so indexing
            # the same folder twice does not create duplicate entries.
            if textPy is not None:
                writer.update_document(path=pathPy, content=analyzeText(textPy),
                                       date=dateNow, tags='py')
                textTex += textPy
            writer.update_document(path=pathTex, content=analyzeText(textTex),
                                   date=dateNow, tags='tex')

        return True

    def addHomeWork(self, questionsPath):
        """Re-index the given question folders as part of a homework.

        Args:
            questionsPath: iterable of question folder paths (the folders that
                contain ``version_1``).
        """
        ix = open_dir(self.indexFolder)
        hix = open_dir(self.homeWorkIndex)

        # NOTE(review): nothing is added to the homework index yet; the writer
        # is kept so the (empty) commit of the original code still happens.
        with hix.writer(), ix.searcher() as searcher:
            for path in questionsPath:
                questionPath = os.path.join(path, "version_1", "true_or_false_question.pdf")
                # Exact lookup on the unique "path" field; a query parser would
                # misread the ":" and "\" characters of a file path.
                stored = searcher.document(path=questionPath)
                occurrences = (stored.get("nOccurrences", 0) + 1) if stored else 0
                # NOTE(review): writeSchema only indexes sub-folders whose name
                # contains "question"; confirm that passing a question folder
                # itself is the intended call.
                self.writeSchema(path, occurrences)

#----------------------------------------------------------------------------

class Searcher:
    """Search and prune the documents of an existing Whoosh index folder."""

    specialCharacters = ['´', '¸', '˜', '`', '^']
    indexFolder = None

    def __init__(self, indexFolder):
        """Remember the index folder name.

        Raises:
            AssertionError: if ``indexFolder`` is None, not a string, or empty.
        """
        assert indexFolder is not None, 'Index folder name cannot be None'
        assert isinstance(indexFolder, str), 'Index folder name must be a string'
        assert indexFolder, 'Index folder name cannot be empty'

        self.indexFolder = indexFolder

    def deleteEntry(self, doc_path):
        """Remove the pdf/py/tex documents of one question folder from the index.

        Args:
            doc_path: question folder path (the folder that contains
                ``version_1``).

        Returns:
            list: ``[deleted, message]``; ``deleted`` is True only when at
            least one document was actually removed.
        """
        try:
            ix = open_dir(self.indexFolder)
        except EmptyIndexError:
            return [False, ("The index provided does not exist, make sure you add it before using it and do not "
                    + "delete it manually")]

        failure = [False, "Please verify the document exists."]
        versionDir = os.path.join(doc_path, "version_1", "")
        fileNames = ("true_or_false_question.pdf", "program.py", "true_or_false_question.tex")

        writer = ix.writer()
        try:
            # delete_by_term reports how many documents it removed.
            removed = sum(writer.delete_by_term('path', versionDir + fileName) or 0
                          for fileName in fileNames)
        except Exception:
            # Release the index write lock before reporting the failure.
            writer.cancel()
            return failure

        try:
            if removed:
                writer.commit()
            else:
                writer.cancel()
        except Exception:
            return failure

        return [True, "Document successfully deleted."] if removed else failure

    def parser(self, keyword, docType='all', sortType="None", fromDate="all time"):
        """Search the index and return the path and tag of every hit.

        Args:
            keyword: free text searched in the ``content`` field; an empty
                string adds no content restriction.
            docType: ``'all'`` or a value of the ``tags`` field to filter on.
            sortType: ``"By Date"``, ``"By Number of ocurrences"``, or anything
                else for the default ordering.
            fromDate: ``"all time"``, ``"this year"``, ``"this month"`` or
                ``"this week"``.

        Returns:
            ``(True, [[path, tag], ...])`` on success, otherwise a
            ``False``/message pair. ``search`` is called without ``limit``, so
            Whoosh's default result limit applies.
        """
        try:
            ix = open_dir(self.indexFolder)
        except EmptyIndexError:
            return [False, ("The index provided does not exist, make sure you add it before using it and do not "
                    + "delete it manually")]

        keyword = analyzeText(unidecode.unidecode(keyword))
        parseQuery = "" if keyword == "" else "content:" + keyword

        if fromDate != "all time":
            spans = {"this year": relativedelta(years=1),
                     "this month": relativedelta(months=1),
                     "this week": relativedelta(weeks=1)}
            if fromDate not in spans:
                return False, "Unsupported date filter: " + repr(fromDate)
            today = dt.now()
            date = today - spans[fromDate]
            parseQuery = (parseQuery + " " + u"date:[" + date.strftime("%Y%m%d")
                          + " to " + today.strftime("%Y%m%d") + "]")

        searchFields = ["content", "date"]
        if docType != 'all':
            searchFields.append("tags")
            parseQuery = parseQuery + " tags:" + docType

        sortField = {"By Date": "date",
                     "By Number of ocurrences": "nOccurrences"}.get(sortType)

        with ix.searcher() as searcher:
            query = MultifieldParser(searchFields, schema=ix.schema).parse(parseQuery)
            if sortField is None:
                results = searcher.search(query)
            else:
                results = searcher.search(query, sortedby=sortField, reverse=True)

            if results.is_empty():
                return False, ("Não foram encontrados resultados com estes parâmetros de pesquisa")

            # Stored fields must be read while the searcher is still open.
            resultArray = [[hit["path"], hit["tags"]] for hit in results]

        return True, resultArray


In [3]:
# Manual driver cell: create the "teste" index (and its "_HomeWork" sibling),
# then index every exercise folder found below the given directory.
# The directory is a machine-specific absolute path; change it before running
# this cell on another machine.
WH = WhooshHandler("teste")
WH.createSchema()
WH.writeSchema(r"C:\Users\ASUS\Desktop\Semestre6", 0)
#WH.addHomeWork(["D:\Faculdade\Projeto\Projeto-Final\exercicios\question_1",
#               "D:\Faculdade\Projeto\Projeto-Final\exercicios\question_3"])


# Disabled examples of the remaining entry points (search, entry deletion and
# index deletion), kept for reference.
# S = Searcher("index2")
# S.deleteEntry(r"C:\Users\ASUS\Desktop\Semestre6\Projeto\perguntas_teste\question_1")
# result = S.parser("","pdf", "all time")
# print(result)

# WH.deleteIndex("index")




(True,
 'The following exercise folders were found and added to the index:C:\\Users\\ASUS\\Desktop\\Semestre6\\Projeto\\perguntas\\matematica\\linear_equations_system_3x3_unique_solution\\question, C:\\Users\\ASUS\\Desktop\\Semestre6\\Projeto\\perguntas\\matematica\\linear_equations_system_3x3_unique_solution\\question_old.20200522_trab1_with_bug, C:\\Users\\ASUS\\Desktop\\Semestre6\\Projeto\\perguntas\\matematica\\linear_equations_system_3x3_unique_solution\\question_old.20200522_trab1_with_bug\\question, C:\\Users\\ASUS\\Desktop\\Semestre6\\Projeto\\perguntas\\python\\cor_rgb\\question, C:\\Users\\ASUS\\Desktop\\Semestre6\\Projeto\\perguntas\\python\\imagem\\question, C:\\Users\\ASUS\\Desktop\\Semestre6\\Projeto\\perguntas_teste\\question_1, C:\\Users\\ASUS\\Desktop\\Semestre6\\Projeto\\perguntas_teste\\question_2, ')