# Trabalho Prático - Sistema de recuperação textual 

#### SCC0282 - Recuperção de Informação 

##### Alunos:
Pedro Afonso Fazio Michalichem - 10734196 <br> Rafael Silva - 7564023 <br> Ricardo Atakiama - 10262482 <br> 

#### Para este trabalho foram utilizadas os seguintes pacotes da biblioteca nltk:
- 'tokenize' para transformação do texto em tokens
- 'corpus' para remoção de stop words, 
- 'stem' para steeming (radicalização)

#### - xmltodict 
- para trabalhar com xml


In [6]:
import glob
import re
import os
import sys
import math
import numpy as np
import nltk
import xmltodict

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ricar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ricar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Flags para controle dos modos de pesquisa

In [2]:
SHOULD_ENABLE_STOP_WORDS=True
SHOULD_ENABLE_STEMMING=True

In [3]:
def remove_special_characters(text):
    regex = re.compile('[^a-zA-Z0-9\s]')
    text_returned = re.sub(regex, '', text)
    return text_returned

In [4]:
def finding_all_unique_words_and_freq(dict_global, words):
    for word in set(words):
        if word in dict_global.keys():
            dict_global[word] += words.count(word)
        else:
            dict_global[word] = words.count(word)

In [5]:
en_stops = set(stopwords.words('english'))
    
def remove_stop_words(all_words):
    words = []
    for word in all_words: 
        if word not in en_stops:
            words.append(word)
    return words

In [6]:
ps = PorterStemmer()

def stem_words(words):
    return [ps.stem(w) for w in words]

In [7]:
def prepare_words_from_text(text):
    text = remove_special_characters(text)
    words = word_tokenize(text.lower())
    
    if SHOULD_ENABLE_STOP_WORDS:
        words = remove_stop_words(words)
    if SHOULD_ENABLE_STEMMING:
        words = stem_words(words)
    
    return words

In [8]:
def process_file(words_index, global_dictionary, filename):
    with open(filename) as fd:
        doc = xmltodict.parse(fd.read())
        text = doc['DOC']['TEXT']
        index = os.path.basename(filename)
        if isinstance(text, str):
            words = prepare_words_from_text(text)
            finding_all_unique_words_and_freq(global_dictionary, words)
            words_index[index] = words
        else:
            print(index + " is empty")
            words_index[index] = []

In [9]:
def intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))

In [10]:
class Node:
    def __init__(self, docId, freq):
        self.freq = freq
        self.doc = docId
        self.next = None
    
    def __str__(self):        
        return 'doc:' + str(self.doc) + ', freq:' + str(self.freq)

class LinkedList:
    def __init__(self):
        self.head = None
        self.tail = None
        self.n_docs = 0
    
    def print_list(self):
        aux = self.head
        while aux:
            print(aux)
            aux = aux.next
    
    def get_doclist(self):
        l = []
        aux = self.head
        while aux:
            l.append([aux.doc, aux.freq])
            aux = aux.next
        return l
    
    def add_doc(self, doc, freq):
        node = Node(doc, freq)        
        if self.head == None:
            self.head = node        
        else:
            self.tail.next = node
        self.tail = node
        self.n_docs += 1

# Terms Database

A seção a seguir é responsável por construir o banco de dados por termos

In [11]:
# root = "../FIRE2010/en.doc.2010/TELEGRAPH_UTF8"
root = "../FIRE2010/en.doc.2010/TELEGRAPH_UTF8/2006_utf8/nation"

global_dictionary = {}
words_index = {}
processed_files = 0

for path, subdirs, files in os.walk(root):
    print("processing " + path)
    for name in files:
        filename = os.path.join(path, name)
        if filename.endswith(".utf8"):
            processed_files += 1
            process_file(words_index, global_dictionary, filename)

processing ../FIRE2010/en.doc.2010/TELEGRAPH_UTF8/2006_utf8/nation
1060408_nation_story_6073156.utf8 is empty


In [12]:
docs_id_list = [*words_index]

In [13]:
unique_words_all = sorted(set(global_dictionary.keys()))
linked_list_data = {}

for word in unique_words_all:
    linked_list_data[word] = LinkedList()

for doc in words_index.keys():
    words = words_index[doc]
    for word in set(words):
        linked_list_data[word].add_doc(doc, words.count(word))

# Queries

A seção a seguir é responsável por carregar o arquivo com as queries

In [14]:
queries_filename = "../FIRE2010/en.topics.76-125.2010.txt"

queries = {}

with open(queries_filename) as fd:
    doc = xmltodict.parse(fd.read())
    for query in doc['topics']['top']: 
        queries[query['num']] = {
            'title': query['title'],
            'desc': query['desc'],
            'narr': query['narr'],
            '@lang': query['@lang']
        }

# Probabilistic Model

In [15]:
def probabilistic(query):
    query_words = prepare_words_from_text(query)
    answer = {}
    for doc in words_index.keys():
        words = words_index[doc]
        common_words = intersection(query_words, words)
        score = 0
        for ki in common_words:
            score += math.log10((len(docs_id_list)+0.5)/(linked_list_data[ki].n_docs+0.5))
        answer[doc] = score
    return answer


res = probabilistic(queries['76']['title'])

In [27]:
# for key in res.keys():
#     if int(res[key]) > 0:
#         print(key, res[key])

# Vector Model

In [16]:
m = np.zeros((len(unique_words_all), len(docs_id_list)))

for i in range(len(unique_words_all)):
    word = unique_words_all[i]
    postings = linked_list_data[word].get_doclist()
    ni = linked_list_data[word].n_docs
    for node in postings:
        docID = docs_id_list.index(node[0])
        freq = node[1]
        m[i, docID] = freq
#     print(word, m[i])

In [17]:
m = np.zeros((len(unique_words_all), len(docs_id_list)))
for i in range(len(unique_words_all)):
    word = unique_words_all[i]
    postings = linked_list_data[word].get_doclist()
    ni = linked_list_data[word].n_docs
    idf = math.log2(len(docs_id_list)/ni)
    for node in postings:
        docID = docs_id_list.index(node[0])
        freq = node[1]
        m[i, docID] = (1 + math.log2(freq))*idf
#     print(word, m[i])

In [18]:
norm = np.sum(m**2, axis=0)
norm = [math.sqrt(norm[i]) for i in range(len(norm))]
# print(norm)

In [19]:
def vector_model(query):
    query_words = prepare_words_from_text(query)
    q_vector = np.zeros(len(unique_words_all))
    for i in range(len(unique_words_all)):
        word = unique_words_all[i]
        if word in query_words:
            ni = linked_list_data[word].n_docs
            idf = math.log2(len(docs_id_list)/ni)
            q_vector[i] = (1 + math.log2(query_words.count(word)))*idf
    return q_vector
    
    
q_vector = vector_model(queries['76']['title'])
# print(q_vector)

In [25]:
ranking = np.zeros(len(docs_id_list))
for j in range(len(docs_id_list)):
    ranking[j] = np.dot(m[:,j], q_vector) / norm[j]
#     print(docs_id_list[j], ranking[j])

  ranking[j] = np.dot(m[:,j], q_vector) / norm[j]


# Ground Truth

In [22]:
def open_file(filename):
    file = open(filename, "r")
    text = file.read()
    file.close()
    return text

In [23]:
gt_filename = "../FIRE2010/en.qrels.76-125.2010.txt"
gt_file = open_file(gt_filename)

def parse_gt(line):
    data = line.strip().split(' ')
    return {
        'index': data[0],
        'file': data[2],
        'relevant': data[3]
    }

with open(gt_filename) as fd:
    content = fd.readlines()
    print(parse_gt(content[0]))
#     for line in content:

{'index': '76', 'file': '1040901_nation_story_3702283.utf8', 'relevant': '0'}
