# In [11]:
# -*- coding: utf-8 -*-
from datetime import datetime as dt
import fitz  # PyMUPDF
import os
import unidecode
import re
import voila
import ipywidgets
from pylatexenc.latexwalker import LatexWalker, LatexEnvironmentNode
from pylatexenc.latex2text import LatexNodes2Text

from whoosh.index import create_in
from whoosh.fields import *
from whoosh.analysis import StemmingAnalyzer
from whoosh.index import create_in
from whoosh.qparser import QueryParser
from whoosh.qparser import MultifieldParser
from whoosh.index import open_dir
from whoosh.analysis import LanguageAnalyzer
from whoosh.analysis import FancyAnalyzer
from whoosh.analysis import StemmingAnalyzer
from whoosh.lang.porter import stem
from whoosh.query import variations

 


"""
    A classe LocalSearch representa um motor de pesquisa capaz de pesquisar por exercícios de
    Python em ficheiros .pdf, .tex e .py numa diretoria local.

    Os ficheiros devem estar dentro de uma diretoria com a seguinte estrutura:
    question_x/version_x/true_or_false_question.pdf, sendo x o número da questão

    O nome dos ficheiros deve ser o seguinte:
    Pdf -> true_or_false_question.pdf
    Latex -> true_or_false_question.tex
    Python -> program.py
    -----------------------------------------------------------------------------------------
    Parametros:
        path : str
            string com o caminho para a diretoria que contem as várias questões

    ------------------------------------------------------------------------------------------
    Metodos Publicos:
        searchKeyword(self, keyword, docType='all')

    Metodos Privados:
        __decodePy(self, path)
        __decodeTex(self, path)
        __decodePdf(self, path)
        __removeSpecialChar(self, text)

"""
class LocalSearch:
    """Feed Python exercises stored as .pdf, .tex and .py files into the index.

    Expected layout under ``path`` (``x`` is the question number)::

        question_x/version_1/true_or_false_question.pdf
        question_x/version_1/true_or_false_question.tex
        question_x/version_1/program.py

    Only the ``version_1`` sub-folder of each question is read.

    Parameters:
        path : str
            Path of the directory that contains the question folders.

    Public methods:
        searchKeyword(self, keyword, docType='all')

    Private methods:
        __decodePdf(self, path)
        __decodeTex(self, path)
        __decodePy(self, path)
        __removeSpecialChar(self, text)
    """

    # Class-level defaults; both are overwritten per instance in __init__.
    directory = None
    # Stand-alone accent marks stripped by __removeSpecialChar.
    specialCharacters = ['´', '¸', '˜', '`', '^']
    questionFolders = None

    def __init__(self, path):
        """Remember ``path`` and list the question folders directly below it.

        Raises:
            AssertionError: if ``path`` is None, not a string, or empty.
            OSError: if ``path`` cannot be listed.
        """
        assert path is not None, 'None path'
        assert isinstance(path, str), 'Path must be a string'
        assert path, 'Path must be not empty'
        self.directory = path
        # Keep only sub-directories: a stray file next to the question folders
        # would otherwise be treated as a question and break indexing.
        self.questionFolders = [
            entry for entry in os.listdir(self.directory)
            if os.path.isdir(os.path.join(self.directory, entry))
        ]

    def __decodePdf(self, path):
        """Return the text of every page of the PDF at ``path``, concatenated.

        An empty string is returned for a document without pages.
        """
        doc = fitz.open(path)
        try:
            pages = []
            for page in doc:
                # Newer PyMuPDF releases name this method get_text; older ones
                # only provide the camelCase getText.
                extract = getattr(page, "get_text", None) or page.getText
                pages.append(extract())
        finally:
            doc.close()
        return "".join(pages)

    def __decodeTex(self, path):
        """Return the plain-text rendering of the UTF-8 LaTeX file at ``path``."""
        # Decode as UTF-8 up front instead of reading with the locale encoding
        # and re-encoding through ISO-8859-1, which only works on some locales.
        with open(path, "r", encoding="utf-8") as tex_file:
            source = tex_file.read()
        return LatexNodes2Text().latex_to_text(source)

    def __decodePy(self, path):
        """Return the full source text of the Python file at ``path``."""
        with open(path, "r", encoding="utf-8") as py_file:
            return py_file.read()

    def __removeSpecialChar(self, text):
        """Return ``text`` with every entry of ``specialCharacters`` removed."""
        for sChar in self.specialCharacters:
            text = text.replace(sChar, '')
        return text

    def searchKeyword(self, keyword, docType='all'):
        """Add the pdf, py and tex file of every question folder to the index.

        For each folder the three files under ``version_1`` are decoded and
        passed to the module-level ``writeSchema`` with the tags ``"pdf"``,
        ``"py"`` and ``"tex"``. The tex entry is indexed together with the
        program source so a query on the statement also matches its code.

        ``keyword`` and ``docType`` are accepted but not used here; nothing
        is returned.
        """
        for question in self.questionFolders:
            version_dir = os.path.join(self.directory, question, "version_1")

            pathPdf = os.path.join(version_dir, "true_or_false_question.pdf")
            writeSchema(pathPdf, self.__decodePdf(pathPdf), "pdf")

            pathPy = os.path.join(version_dir, "program.py")
            textPy = self.__decodePy(pathPy)
            writeSchema(pathPy, textPy, "py")

            pathTex = os.path.join(version_dir, "true_or_false_question.tex")
            writeSchema(pathTex, self.__decodeTex(pathTex) + textPy, "tex")
            

def SearchWidget(event):
    """Button click handler: run the search and print the result in ``out``.

    Reads the module-level widgets ``text`` and ``dropDown`` for the keyword
    and document type, and the module-level ``LS`` search object.

    Parameters:
        event: the click event supplied by the button; not used.
    """
    out.clear_output()
    with out:
        matches = LS.searchKeyword(text.value, dropDown.value)
        # searchKeyword may return None; joining None would raise TypeError,
        # so treat a missing result as "no matches".
        print("\n".join(matches or []))

def createSchema():
    """Create a fresh Whoosh index in the local ``index`` directory.

    The schema stores ``path`` and ``date`` with each document; ``content``
    is searchable text configured for Portuguese and ``tags`` holds the
    document type keyword.

    Returns:
        The index object produced by ``create_in``.
    """
    schema = Schema(path=ID(stored=True),
                    content=TEXT(lang="pt"),
                    date=DATETIME(stored=True),
                    tags=KEYWORD)

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("index", exist_ok=True)
    return create_in("index", schema)

def writeSchema(path, text, tag):
    """Add one document to the index stored in the ``index`` directory.

    ``text`` is first run through the Portuguese language analyzer and the
    resulting tokens are then run through the default stemming analyzer; the
    space-joined result is what gets indexed as ``content``.

    Parameters:
        path: value stored in the ``path`` field of the document.
        text: raw document text to analyze and index.
        tag: value for the ``tags`` field (document type).
    """
    ix = open_dir("index")
    portuguese = LanguageAnalyzer("pt")
    # Named "stemmer" so the module-level ``stem`` import is not shadowed.
    stemmer = StemmingAnalyzer()

    normalized = " ".join(token.text for token in portuguese(text))
    stemmed = " ".join(token.text for token in stemmer(normalized))

    # As a context manager the writer commits on success and cancels on
    # error, so a failed add_document does not leave the index locked.
    with ix.writer() as writer:
        writer.add_document(path=path,
                            content=stemmed,
                            date=dt.now(),
                            tags=tag)


    
def searchIndex(text, docType):
    """Search the ``index`` directory for each analyzed word of ``text``.

    ``text`` is tokenized with the Portuguese language analyzer and the token
    list is printed. Each token is then queried separately, combined with
    ``tags:<docType>``; for every hit the document number, score and stored
    fields are printed. Nothing is returned.

    Parameters:
        text: free-text query.
        docType: tag value the matching documents must carry.
    """
    ix = open_dir("index")
    pt = LanguageAnalyzer("pt")
    ptText = [token.text for token in pt(text)]
    print(ptText)

    # The parser does not depend on the word, so build it once.
    parser = MultifieldParser(["content", "tags"], schema=ix.schema)

    with ix.searcher() as searcher:
        for word in ptText:
            query = parser.parse("content:" + word + " tags:" + docType)
            results = searcher.search(query)
            # A word without hits simply prints nothing; the remaining words
            # are still searched instead of aborting the whole query.
            for rank, (docnum, score) in enumerate(results.items()):
                print(docnum, score)
                print(results.fields(rank))


# Output area that SearchWidget clears and prints into.
out = ipywidgets.Output()
# One-off setup, currently disabled: create the index, then feed every
# question folder of the given directory into it.
# NOTE(review): SearchWidget reads a global ``LS``; with the lines below
# commented out, a button click fails with NameError unless LS is defined
# elsewhere (e.g. in another cell).
# createSchema()
# LS = LocalSearch("C:\\Users\\ASUS\\Desktop\\Semestre6\\Projeto\\perguntas_teste")
# LS.searchKeyword("")
# Ad-hoc query against the existing index; runs every time this code executes.
searchIndex("integer programas argumentar", "pdf")

# Search form: keyword box, search button and document-type selector.
text = ipywidgets.Text(placeholder = 'keyword')
button = ipywidgets.Button(description='Search', icon='search')
button.on_click(SearchWidget)

# NOTE(review): LocalSearch.searchKeyword tags documents as "pdf", "py" and
# "tex"; the options 'all', 'python' and 'latex' do not match any of those
# tags — confirm how they are meant to be mapped.
dropDown = ipywidgets.Dropdown(
    options=['all','pdf', 'python', 'latex'],
    value='all',
    description='Type:',
    disabled=False,
)

# ``display`` is not imported in this file; presumably the notebook-provided
# IPython display function — TODO confirm.
display(text,dropDown,button)
display(out)


# --- Captured cell output ---
# ['integ', 'program', 'argument']
# 1 2.268091843710586
# {'date': datetime.datetime(2020, 7, 20, 16, 39, 33, 519343), 'path': 'C:\\Users\\ASUS\\Desktop\\Semestre6\\Projeto\\perguntas_teste\\question_1\\version_1\\true_or_false_question.pdf'}
# 4 2.268091843710586
# {'date': datetime.datetime(2020, 7, 20, 16, 39, 33, 588166), 'path': 'C:\\Users\\ASUS\\Desktop\\Semestre6\\Projeto\\perguntas_teste\\question_2\\version_1\\true_or_false_question.pdf'}
#
#
# Text(value='', placeholder='keyword')
#
# Dropdown(description='Type:', options=('all', 'pdf', 'python', 'latex'), value='all')
#
# Button(description='Search', icon='search', style=ButtonStyle())
#
# Output()