In [1]:
import glob
import re
import os
import sys
import math
import numpy as np
import nltk
import xmltodict

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/rafael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rafael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def remove_special_characters(text):
    regex = re.compile('[^a-zA-Z0-9\s]')
    text_returned = re.sub(regex, '', text)
    return text_returned

In [3]:
def finding_all_unique_words_and_freq(dict_global, words):
    for word in set(words):
        if word in dict_global.keys():
            dict_global[word] += words.count(word)
        else:
            dict_global[word] = words.count(word)

In [4]:
en_stops = set(stopwords.words('english'))
    
def remove_stop_words(all_words):
    words = []
    for word in all_words: 
        if word not in en_stops:
            words.append(word)
    return words

In [5]:
def process_file(words_index, global_dictionary, filename):
    with open(filename) as fd:
        doc = xmltodict.parse(fd.read())
        text = doc['DOC']['TEXT']
        if not isinstance(text, str):
            print(filename)
            return
        text = remove_special_characters(text)
        words = word_tokenize(text)
        words = [word.lower() for word in words]
        words = remove_stop_words(words)
        finding_all_unique_words_and_freq(global_dictionary, words)
        index = os.path.basename(filename)
        words_index[index] = words

In [6]:
def intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))

In [7]:
class Node:
    def __init__(self, docId, freq):
        self.freq = freq
        self.doc = docId
        self.next = None
    
    def __str__(self):        
        return 'doc:' + str(self.doc) + ', freq:' + str(self.freq)

class LinkedList:
    def __init__(self):
        self.head = None
        self.tail = None
        self.n_docs = 0
    
    def print_list(self):
        aux = self.head
        while aux:
            print(aux)
            aux = aux.next
    
    def get_doclist(self):
        l = []
        aux = self.head
        while aux:
            l.append([aux.doc, aux.freq])
            aux = aux.next
        return l
    
    def add_doc(self, doc, freq):
        node = Node(doc, freq)        
        if self.head == None:
            self.head = node        
        else:
            self.tail.next = node
        self.tail = node
        self.n_docs += 1

# Terms Database

A seção a seguir é responsável por construir o banco de dados por termos

In [8]:
# root = "../FIRE2010/en.doc.2010/TELEGRAPH_UTF8"
root = "../FIRE2010/en.doc.2010/TELEGRAPH_UTF8/2006_utf8/nation"

global_dictionary = {}
words_index = {}
processed_files = 0

for path, subdirs, files in os.walk(root):
    print("processing " + path)
    for name in files:
        filename = os.path.join(path, name)
        if filename.endswith(".utf8"):
            processed_files += 1
            process_file(words_index, global_dictionary, filename)
            
processed_files = len(words_index)

processing ../FIRE2010/en.doc.2010/TELEGRAPH_UTF8/2006_utf8/nation
../FIRE2010/en.doc.2010/TELEGRAPH_UTF8/2006_utf8/nation/1060408_nation_story_6073156.utf8


In [9]:
docs_id_list = [*words_index]

In [10]:
unique_words_all = sorted(set(global_dictionary.keys()))
linked_list_data = {}

for word in unique_words_all:
    linked_list_data[word] = LinkedList()

for doc in words_index.keys():
    words = words_index[doc]
    for word in set(words):
        linked_list_data[word].add_doc(doc, words.count(word))

# Queries

A seção a seguir é responsável por carregar o arquivo com as queries

In [11]:
queries_filename = "../FIRE2010/en.topics.76-125.2010.txt"

queries = {}

with open(queries_filename) as fd:
    doc = xmltodict.parse(fd.read())
    for query in doc['topics']['top']: 
        queries[query['num']] = {
            'title': query['title'],
            'desc': query['desc'],
            'narr': query['narr'],
            '@lang': query['@lang']
        }

# Probabilistic Model

In [12]:
def probabilistic(query):
    query = remove_special_characters(query)
    query = word_tokenize(query.lower())

    answer = {}
    for doc in words_index.keys():
        words = words_index[doc]
        common_words = intersection(query, words)
        score = 0
        for ki in common_words:
            score += math.log10((processed_files+0.5)/(linked_list_data[ki].n_docs+0.5))
        answer[doc] = score
    return answer


res = probabilistic(queries['76']['title'])

# Vector Model

In [13]:
m = np.zeros((len(unique_words_all), len(docs_id_list)))

for i in range(len(unique_words_all)):
    word = unique_words_all[i]
    postings = linked_list_data[word].get_doclist()
    ni = linked_list_data[word].n_docs
    for node in postings:
        docID = docs_id_list.index(node[0])
        freq = node[1]
        m[i, docID] = freq
#     print(word, m[i])

In [14]:
m = np.zeros((len(unique_words_all), len(docs_id_list)))
for i in range(len(unique_words_all)):
    word = unique_words_all[i]
    postings = linked_list_data[word].get_doclist()
    ni = linked_list_data[word].n_docs
    idf = math.log2(processed_files/ni)
    for node in postings:
        docID = docs_id_list.index(node[0])
        freq = node[1]
        m[i, docID] = (1 + math.log2(freq))*idf
#     print(word, m[i])

In [15]:
norm = np.sum(m**2, axis=0)
norm = [math.sqrt(norm[i]) for i in range(len(norm))]
# print(norm)

In [16]:
def vector_model(query):
    query = remove_special_characters(query)
    query = word_tokenize(query)
    query = [q.lower() for q in query]
    q_vector = np.zeros(len(unique_words_all))
    for i in range(len(unique_words_all)):
        word = unique_words_all[i]
        if word in query:
            ni = linked_list_data[word].n_docs
            idf = math.log2(processed_files/ni)
            q_vector[i] = (1 + math.log2(query.count(word)))*idf
    return q_vector
    
    
q_vector = vector_model(queries['76']['title'])
# print(q_vector)

In [17]:
ranking = np.zeros(processed_files)
for j in range(processed_files):
    ranking[j] = np.dot(m[:,j], q_vector) / norm[j]
#     print(docs_id_list[j], ranking[j])

IndexError: index 6146 is out of bounds for axis 1 with size 6146

# Ground Truth

In [None]:
def open_file(filename):
    file = open(filename, "r")
    text = file.read()
    file.close()
    return text

In [None]:
gt_filename = "../FIRE2010/en.qrels.76-125.2010.txt"
gt_file = open_file(gt_filename)

def parse_gt(line):
    data = line.strip().split(' ')
    return {
        'index': data[0],
        'file': data[2],
        'relevant': data[3]
    }

with open(gt_filename) as fd:
    content = fd.readlines()
    print(parse_gt(content[0]))
#     for line in content: