In [1]:
import os, sys
import logging

# Directories that hold the project's local modules, resolved to absolute
# paths relative to the notebook's current working directory.
idx_path = os.path.abspath("./src/indexer")
model_path = os.path.abspath("./src/model")
srch_path = os.path.abspath("./src/searcher")
srch_build_path = os.path.abspath("./src/search_processor")

# Check every directory on its own: guarding all four appends behind the
# presence of idx_path alone would skip the others whenever idx_path happens
# to be on sys.path already. Re-running the cell adds no duplicates.
for _module_dir in (idx_path, model_path, srch_path, srch_build_path):
    if _module_dir not in sys.path:
        sys.path.append(_module_dir)

In [2]:
# Send INFO-and-above records both to a log file and to the notebook output.
log_filename = os.path.join('logs', 'log.log')

# FileHandler opens its file as soon as it is constructed, so the target
# directory has to exist first; otherwise a fresh checkout without 'logs/'
# fails with FileNotFoundError.
os.makedirs(os.path.dirname(log_filename), exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler()])

In [3]:
import nltk
# Fetch the NLTK 'stopwords' and 'punkt' packages. A package that is already
# installed and current is reported as up-to-date in the cell output.
# NOTE(review): presumably these are needed by the project's text-processing
# modules imported further down -- confirm against src/.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to C:\Users\Ruan
[nltk_data]     Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Ruan
[nltk_data]     Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
from indexador import index_files
from vector_model import build_vector_model, calculate_tf_idf
from buscador import search_from_config
from search_processor import build_query_file

In [5]:
build_query_file("configs/PC.CFG")
# This builds our query file.
# In the logged run below, the configuration points at the XML input
# ./data/input/cfquery.xml and produces ./result/busca_cfquery.csv (queries)
# and ./result/esperados.csv (expected results).

2024-06-23 19:45:46,049 - INFO - Parsing configuration file: configs/PC.CFG
2024-06-23 19:45:46,052 - INFO - Input query file added: "./data/input/cfquery.xml"
2024-06-23 19:45:46,057 - INFO - Output query file added: "./result/busca_cfquery.csv"
2024-06-23 19:45:46,058 - INFO - Expected Result file set to: ./result/esperados.csv
2024-06-23 19:45:46,063 - INFO - Configuration file parsed successfully. Input Query file: ./data/input/cfquery.xml, Output Query files: ./result/busca_cfquery.csv,Expected Result file: ./result/esperados.csv
2024-06-23 19:45:46,064 - INFO - Started processing XML file: ./data/input/cfquery.xml
2024-06-23 19:45:46,286 - INFO - Query file generated: ./result/busca_cfquery.csv
2024-06-23 19:45:46,287 - INFO - Results file generated: ./result/esperados.csv
2024-06-23 19:45:46,288 - INFO - Finished processing.


In [6]:
index = index_files("configs/GLI.CFG")
## This builds the inverted list. I named it "index", since it is essentially an index.
## It is used later to retrieve the text and to compute the "direction" of a whole document.

2024-06-23 19:45:46,305 - INFO - Script execution started
2024-06-23 19:45:46,308 - INFO - Started reading config file
2024-06-23 19:45:46,311 - INFO - Finished reading config file
2024-06-23 19:45:46,314 - INFO - Started reading data from ./data/xmls/cf74.xml
2024-06-23 19:45:46,449 - INFO - Finished reading data from ./data/xmls/cf74.xml
2024-06-23 19:45:46,463 - INFO - Started reading data from ./data/xmls/cf75.xml
2024-06-23 19:45:46,505 - INFO - Finished reading data from ./data/xmls/cf75.xml
2024-06-23 19:45:46,507 - INFO - Started reading data from ./data/xmls/cf76.xml
2024-06-23 19:45:46,584 - INFO - Finished reading data from ./data/xmls/cf76.xml
2024-06-23 19:45:46,589 - INFO - Started reading data from ./data/xmls/cf77.xml
2024-06-23 19:45:46,662 - INFO - Finished reading data from ./data/xmls/cf77.xml
2024-06-23 19:45:46,666 - INFO - Started reading data from ./data/xmls/cf78.xml
2024-06-23 19:45:46,694 - INFO - Finished reading data from ./data/xmls/cf78.xml
2024-06-23 19:

In [7]:
build_vector_model("configs/VECTOR_MODEL.CFG", calculate_tf_idf)
## You can change the metric here. Pass a second argument, "metric = function", where the function receives only the index.
## By default, TF-IDF with the adjustment (0.5 + 0.5*tf) * idf is used.

2024-06-23 19:45:47,991 - INFO - Vectorial model execution started
2024-06-23 19:45:47,997 - INFO - Started reading index config file
2024-06-23 19:45:48,058 - INFO - Finished reading index config file
2024-06-23 19:45:48,067 - INFO - Started loading index from ./result/inverse_list.csv
2024-06-23 19:45:48,800 - INFO - Finished loading index from ./result/inverse_list.csv
2024-06-23 19:45:48,801 - INFO - Checking if METRIC file ./result/vector_model.csv exists
2024-06-23 19:45:48,802 - INFO - METRIC file ./result/vector_model.csv not found. Calculating METRIC scores.
2024-06-23 19:45:48,805 - INFO - Started calculating Metric for Analysis... (default is tf-idf)
2024-06-23 19:45:52,915 - INFO - Finished calculating metric.
2024-06-23 19:45:52,916 - INFO - Started writing METRIC scores to ./result/vector_model.csv
2024-06-23 19:46:11,333 - INFO - Finished writing METRIC scores to ./result/vector_model.csv
2024-06-23 19:46:11,334 - INFO - Vectorial model execution finished
2024-06-23 19:4

In [8]:
search_from_config("configs/BUSCA.CFG", index)
## Runs the search and writes everything to the result file set in the config.
## NOTE(review): this note originally named "./data/output/resultados.csv", but the logged run below reports "./result/resultados.csv".
## Only the 10 highest-ranked results are kept -- the collection has 1215 documents, so the file would get very cluttered with many results.
## Query numbers run from 1 to 100, skipping 93 (it is unknown why 93 is missing).
## Originally the time of each query was printed; it was only a matter of microseconds, and it cluttered the screen massively.

2024-06-23 19:46:11,523 - INFO - Parsing configuration file: configs/BUSCA.CFG
2024-06-23 19:46:11,535 - INFO - Model file set to: ./result/vector_model.csv
2024-06-23 19:46:11,536 - INFO - Query file added: "./result/busca_cfquery.csv"
2024-06-23 19:46:11,537 - INFO - Result file set to: ./result/resultados.csv
2024-06-23 19:46:11,539 - INFO - Configuration file parsed successfully. Model file: ./result/vector_model.csv, Query files: ['./result/busca_cfquery.csv'], Result file: ./result/resultados.csv
2024-06-23 19:46:11,539 - INFO - Loading vector model from file: ./result/vector_model.csv
2024-06-23 19:46:16,138 - INFO - Vector model loaded successfully with 9141 words and 1215 documents
2024-06-23 19:46:16,164 - INFO - Reading query file: ./result/busca_cfquery.csv
2024-06-23 19:46:16,221 - INFO - Query file './result/busca_cfquery.csv' read successfully. Number of Queries: 99
2024-06-23 19:46:16,687 - INFO - Results sorted for all queries in file './result/busca_cfquery.csv'. Tota