In [1]:
import os, sys
import logging

# Directories that hold the project's local modules, resolved to absolute
# paths relative to the current working directory.
idx_path = os.path.abspath("./src/indexer")
model_path = os.path.abspath("./src/model")
srch_path = os.path.abspath("./src/searcher")
srch_build_path = os.path.abspath("./src/search_processor")

# Check every directory on its own. Guarding all four appends behind a test
# of idx_path alone would skip the other three whenever idx_path is already
# present, and would duplicate them whenever it is not.
for module_dir in (idx_path, model_path, srch_path, srch_build_path):
    if module_dir not in sys.path:
        sys.path.append(module_dir)

In [2]:
# Send INFO-and-above records both to logs/log.log and to a stream handler
# (whose output shows up under the notebook cells).
log_filename = os.path.join('logs', 'log.log')
# FileHandler does not create missing parent directories, so make sure the
# 'logs' folder exists first; otherwise this cell raises FileNotFoundError
# on a fresh checkout.
os.makedirs(os.path.dirname(log_filename), exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler()])

In [3]:
from indexador import index_files
from vector_model import build_vector_model, calculate_tf_idf
from buscador import search_from_config
from search_processor import build_query_file

In [4]:
# Build our query file: the XML query collection is processed and the
# resulting queries are written to the CSV path below.
xml_file_name = "./data/input/cfquery.xml"
csv_file_name = "./data/input/busca_cfquery.csv"
build_query_file(xml_file_name, csv_file_name)

2024-06-23 04:35:17,276 - INFO - Started processing XML file: ./data/input/cfquery.xml
2024-06-23 04:35:17,393 - INFO - query file generated: ./data/input/busca_cfquery.csv
2024-06-23 04:35:17,395 - INFO - Finished processing.


In [5]:
# Build the inverted list. It is called "index" because it is essentially an index.
# It is used later to retrieve the text and to compute the "direction" of a whole document.
gli_config_path = "configs/GLI.CFG"
index = index_files(gli_config_path)

2024-06-23 04:35:17,408 - INFO - Script execution started
2024-06-23 04:35:17,409 - INFO - Started reading config file
2024-06-23 04:35:17,410 - INFO - Finished reading config file
2024-06-23 04:35:17,411 - INFO - Started reading data from ./data/xmls/cf74.xml
2024-06-23 04:35:17,452 - INFO - Finished reading data from ./data/xmls/cf74.xml
2024-06-23 04:35:17,456 - INFO - Started reading data from ./data/xmls/cf75.xml
2024-06-23 04:35:17,493 - INFO - Finished reading data from ./data/xmls/cf75.xml
2024-06-23 04:35:17,497 - INFO - Started reading data from ./data/xmls/cf76.xml
2024-06-23 04:35:17,576 - INFO - Finished reading data from ./data/xmls/cf76.xml
2024-06-23 04:35:17,580 - INFO - Started reading data from ./data/xmls/cf77.xml
2024-06-23 04:35:17,637 - INFO - Finished reading data from ./data/xmls/cf77.xml
2024-06-23 04:35:17,646 - INFO - Started reading data from ./data/xmls/cf78.xml
2024-06-23 04:35:17,766 - INFO - Finished reading data from ./data/xmls/cf78.xml
2024-06-23 04:

In [6]:
# The metric can be changed here: pass a second argument, "metric = function",
# where the function receives only the index. By default, TF-IDF with the
# adjustment (0.5 + 0.5*tf) * idf is used.
vector_model_config_path = "configs/VECTOR_MODEL.CFG"
build_vector_model(vector_model_config_path, calculate_tf_idf)

2024-06-23 04:35:18,485 - INFO - Vectorial model execution started
2024-06-23 04:35:18,486 - INFO - Started reading index config file
2024-06-23 04:35:18,489 - INFO - Finished reading index config file
2024-06-23 04:35:18,489 - INFO - Started loading index from ./data/index/inverse_list.csv
2024-06-23 04:35:18,893 - INFO - Finished loading index from ./data/index/inverse_list.csv
2024-06-23 04:35:18,894 - INFO - Checking if METRIC file ./data/index/vector_model.csv exists
2024-06-23 04:35:28,600 - INFO - Loaded METRIC scores from ./data/index/vector_model.csv
2024-06-23 04:35:28,601 - INFO - Skipping METRIC calculation as results already exist.
2024-06-23 04:35:28,602 - INFO - Vectorial model execution finished
2024-06-23 04:35:28,603 - INFO - Total execution time: 0:00:10.117101


In [7]:
# Run the search and write everything to the file "./data/output/resultados.csv".
# Only the top 10 results are kept -- the collection has 1215 documents, and the
# file would get too cluttered with many results.
search_config_path = "configs/BUSCA.CFG"
search_from_config(search_config_path, index)

2024-06-23 04:35:28,887 - INFO - Parsing configuration file: configs/BUSCA.CFG
2024-06-23 04:35:28,889 - INFO - Model file set to: ./data/index/vector_model.csv
2024-06-23 04:35:28,891 - INFO - Query file added: "./data/input/busca_cfquery.csv"
2024-06-23 04:35:28,895 - INFO - Result file set to: ./data/output/resultados.csv
2024-06-23 04:35:28,897 - INFO - Configuration file parsed successfully. Model file: ./data/index/vector_model.csv, Query files: ['./data/input/busca_cfquery.csv'], Result file: ./data/output/resultados.csv
2024-06-23 04:35:28,898 - INFO - Loading vector model from file: ./data/index/vector_model.csv
2024-06-23 04:35:31,696 - INFO - Vector model loaded successfully with 9141 words and 1215 documents
2024-06-23 04:35:31,740 - INFO - Reading query file: ./data/input/busca_cfquery.csv
2024-06-23 04:35:31,819 - INFO - Query file './data/input/busca_cfquery.csv' read successfully. Number of Queries: 99
2024-06-23 04:35:32,106 - INFO - Results sorted for all queries in f