### Importação das bibliotecas

In [1]:
import pandas as pd
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import RSLPStemmer
nltk.download('stopwords')
nltk.download('rslp')
import matplotlib.pyplot as plt
import heapq
import time

# Realizando a leitura do dataset e tokenização

db = pd.read_csv("results.csv")

documents = db['text']
stopwords = stopwords.words("portuguese")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rubens.sousa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\rubens.sousa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping stemmers\rslp.zip.


### Questão 01) Execute o algoritmo ilustrado na Fig. 5.8 do livro texto (pag. 157) para gerar um índice similar o mostrado na Fig. 5.4 (pag. 134). Guarde o índice em disco em formato csv. 

In [2]:
toker = RegexpTokenizer(r'\b[A-zÀ-ú\d\-\']+')
inverted_list = {}
n = 0

for document in documents:
    n += 1
    tokens = toker.tokenize(document.lower())
    for token in set(tokens):
        if (token not in stopwords and len(token) >= 2):
            count = (n, tokens.count(token))
            if (token not in inverted_list.keys()):
                  inverted_list[token] = []
                  inverted_list[token].append(count)
            else:
                  inverted_list[token].append(count)

In [3]:
inverted_index_df = pd.DataFrame.from_dict(inverted_list, orient="index")
inverted_index_df.to_csv('data.csv')
inverted_index = inverted_list

### Implemente as abordagens de processamento de consulta documento-por-vez e termo-por-vez (Fig. 5.16 e 5.18). (2 pts)

### Dê evidências de que sua implementação está correta

### Document at Time

In [4]:
def document_at_time(query, inverted_index, k):
    inverted_lists = []

    r = []
    for word in query.split(" "):
        if word in inverted_index.keys():
            inverted_lists.append(inverted_index[word])
    for document in range(1, len(documents)+1):
        sd = 0
        for inverted_list in inverted_lists:
            for i in inverted_list:
                if (i[0] == document):
                    sd += i[1]
                    break
        if (sd != 0):
          heapq.heappush(r, (sd, document))
    return heapq.nlargest(k, r)
document_at_time("política", inverted_index, 10)

[(5, 40),
 (4, 132),
 (4, 44),
 (3, 54),
 (3, 52),
 (3, 19),
 (2, 129),
 (2, 47),
 (2, 46),
 (2, 25)]

### Term at time

In [5]:
def term_at_time(query, inverted_index, k):
    
    a = {}
    inverted_lists = []
    r = []
    for word in query.split(" "):
        if word in inverted_index.keys():
            inverted_lists.append(inverted_index[word])
    for inverted_list in inverted_lists:
        for post in inverted_list:
            d = post[0]
            freq = post[1]
            if (d in a.keys()):
                a[d] += freq
            else:
                a[d] = freq
    for (d, ad) in a.items():
        sd = ad
        heapq.heappush(r, (sd, d))
    return heapq.nlargest(k, r)
term_at_time("política", inverted_index, 10)

[(5, 40),
 (4, 132),
 (4, 44),
 (3, 54),
 (3, 52),
 (3, 19),
 (2, 129),
 (2, 47),
 (2, 46),
 (2, 25)]

In [6]:
queries = ["política", "presidente", "educação", "ministério", "empresa"]

### Execute as 5 consultas em cada algoritmo retornando os top-10 documentos (parâmetro k do algoritmo) 

In [7]:
results_document = []
results_term = []
time_results_document = []
time_results_term = []
k = 10

for query in queries:
  init_document = time.time()
  results_document.append(document_at_time(query, inverted_index, k))
  end_document = time.time()
  time_results_document.append(end_document - init_document)
  
  init_term = time.time()
  results_term.append(term_at_time(query, inverted_index, k))
  end_term = time.time()
  time_results_term.append(end_term - init_term)

queries_df = pd.DataFrame()
queries_df['query'] = queries
queries_df['document_at_a_time'] = results_document
queries_df['term_at_a_time'] = results_term
queries_df['compare'] = queries_df.document_at_a_time == queries_df.term_at_a_time
queries_df

Unnamed: 0,query,document_at_a_time,term_at_a_time,compare
0,política,"[(5, 40), (4, 132), (4, 44), (3, 54), (3, 52),...","[(5, 40), (4, 132), (4, 44), (3, 54), (3, 52),...",True
1,presidente,"[(9, 3), (6, 41), (5, 6), (4, 129), (4, 98), (...","[(9, 3), (6, 41), (5, 6), (4, 129), (4, 98), (...",True
2,educação,"[(4, 6), (3, 69), (3, 34), (2, 114), (2, 11), ...","[(4, 6), (3, 69), (3, 34), (2, 114), (2, 11), ...",True
3,ministério,"[(3, 34), (3, 1), (2, 129), (2, 123), (2, 114)...","[(3, 34), (3, 1), (2, 129), (2, 123), (2, 114)...",True
4,empresa,"[(2, 54), (2, 53), (2, 52), (1, 127), (1, 108)...","[(2, 54), (2, 53), (2, 52), (1, 127), (1, 108)...",True


### Compare os tempos médios de execução e uso de memória de cada algoritmo

In [8]:
queries_tempo_df = pd.DataFrame()
queries_tempo_df['tempo_médio_document_at_a_time'] = time_results_document
queries_tempo_df['tempo_médio_term_at_a_time'] = time_results_term
queries_tempo_df

Unnamed: 0,tempo_médio_document_at_a_time,tempo_médio_term_at_a_time
0,0.000997,0.0
1,0.000997,0.001001
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


### Implemente uma das versões de consulta conjuntiva (AND) 

In [9]:
def document_and(query, inverted_index, k):
  
    query_indexes = []
    rank = []
  
    for word in query.split(" "):
        if word in inverted_index.keys():
            query_indexes.append(inverted_index[word])
  
    all_query = [item for sublist in query_indexes for item in sublist]
    all_query.sort()
  
    for i in range(len(all_query)):
        doc_score = 0
    d = all_query.pop()
    for index in all_query:
        if index[0] == d[0]:
            doc_score += index[1] + d[1]
        if doc_score != 0:
            heapq.heappush(rank, (doc_score, d[0]))
        
    return heapq.nlargest(k, rank)

In [10]:
queries = ["política", "presidente", "educação", "ministério", "empresa"]

In [11]:
results_document_and = []
k = 10

for query in queries:
    results_document_and.append(document_and(query, inverted_index, k))


queries_df = pd.DataFrame()
queries_df['query'] = queries
queries_df['AND'] = results_document_and
queries_df

Unnamed: 0,query,AND
0,política,[]
1,presidente,[]
2,educação,[]
3,ministério,[]
4,empresa,[]
