In [79]:
import glob
import re
import os
import sys
import math
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/rafael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [45]:
def remove_special_characters(text):
    regex = re.compile('[^a-zA-Z0-9\s]')
    text_returned = re.sub(regex, '', text)
    return text_returned

In [46]:
def finding_all_unique_words_and_freq(dict_global, words):
    for word in set(words):
        if word in dict_global.keys():
            dict_global[word] += words.count(word)
        else:
            dict_global[word] = words.count(word)

In [47]:
def open_file(filename):
    file = open(filename, "r")
    text = file.read()
    file.close()
    return text

In [48]:
def process_file(words_index, global_dictionary, filename):
    text = remove_special_characters(open_file(filename))
    words = word_tokenize(text)
    words = [word.lower() for word in words]
    finding_all_unique_words_and_freq(global_dictionary, words)
    index = os.path.basename(filename)
    words_index[index] = words
    

In [61]:
def intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))

In [56]:
class Node:
    def __init__(self, docId, freq):
        self.freq = freq
        self.doc = docId
        self.next = None
    
    def __str__(self):        
        return 'doc:' + str(self.doc) + ', freq:' + str(self.freq)

class LinkedList:
    def __init__(self):
        self.head = None
        self.tail = None
        self.n_docs = 0
    
    def print_list(self):
        aux = self.head
        while aux:
            print(aux)
            aux = aux.next
    
    def get_doclist(self):
        l = []
        aux = self.head
        while aux:
            l.append(aux.doc)
            aux = aux.next
        return l
    
    def add_doc(self, doc, freq):
        node = Node(doc, freq)        
        if self.head == None:
            self.head = node        
        else:
            self.tail.next = node
        self.tail = node
        self.n_docs += 1

# Terms Database

A seção a seguir é responsável por construir o banco de dados por termos

In [137]:
# root = "../FIRE2010/en.doc.2010/TELEGRAPH_UTF8"
root = "../FIRE2010/en.doc.2010/TELEGRAPH_UTF8/2006_utf8/nation"

global_dictionary = {}
words_index = {}
processed_files = 0

for path, subdirs, files in os.walk(root):
    print("processing " + path)
    for name in files:
        filename = os.path.join(path, name)
        if filename.endswith(".utf8"):
            processed_files += 1
            process_file(words_index, global_dictionary, filename)

print(len(words_index))

processing ../FIRE2010/en.doc.2010/TELEGRAPH_UTF8/2006_utf8/nation
6147


In [134]:
unique_words_all = sorted(set(global_dictionary.keys()))
linked_list_data = {}

for word in unique_words_all:
    linked_list_data[word] = LinkedList()

for doc in words_index.keys():
    words = words_index[doc]
    for word in set(words):
        linked_list_data[word].add_doc(doc, words.count(word))

# Queries

A seção a seguir é responsável por carregar o arquivo com as queries

In [108]:
queries_filename = "../FIRE2010/en.topics.76-125.2010.txt"

queries_file = open_file(queries_filename)

In [109]:
import xmltodict

queries = {}

with open(queries_filename) as fd:
    doc = xmltodict.parse(fd.read())
    for query in doc['topics']['top']: 
        queries[query['num']] = {
            'title': query['title'],
            'desc': query['desc'],
            'narr': query['narr'],
            '@lang': query['@lang']
        }

# Probabilistic Model

In [138]:
def probabilistic(query):
    query = remove_special_characters(query)
    query = word_tokenize(query.lower())

    answer = {}
    for doc in words_index.keys():
        words = words_index[doc]
        common_words = intersection(query, words)
        score = 0
        for ki in common_words:
            score += math.log10((processed_files+0.5)/(linked_list_data[ki].n_docs+0.5))
        answer[doc] = score
    return answer


res = probabilistic(queries['76']['title'])
print(res)

{'1061112_nation_index.utf8': 0.026203151236883458, '1060310_nation_index.utf8': 7.065145320656676e-05, '1060601_nation_story_6296584.utf8': 0.026203151236883458, '1060526_nation_story_6271957.utf8': 0.7348166036017689, '1060513_nation_story_6218417.utf8': 0.7348166036017689, '1061126_nation_story_7052028.utf8': 0.026203151236883458, '1060227_nation_story_5901256.utf8': 0.026203151236883458, '1061008_nation_story_6843156.utf8': 0.026203151236883458, '1061020_nation_story_6894819.utf8': 0.026203151236883458, '1060313_nation_story_5962270.utf8': 0.7348166036017689, '1061116_nation_story_7008357.utf8': 0.026203151236883458, '1060303_nation_story_5920317.utf8': 0.026203151236883458, '1060829_nation_story_6671185.utf8': 0.026203151236883458, '1060904_nation_story_6694631.utf8': 0.7348166036017689, '1060227_nation_story_5900447.utf8': 0.026203151236883458, '1060130_nation_story_5780873.utf8': 0.026203151236883458, '1060210_nation_story_5828158.utf8': 0.026203151236883458, '1060614_nation_sto

# Vetorial Model

# Ground Truth

In [122]:
gt_filename = "../FIRE2010/en.qrels.76-125.2010.txt"
gt_file = open_file(gt_filename)

def parse_gt(line):
    data = line.strip().split(' ')
    return {
        'index': data[0],
        'file': data[2],
        'relevant': data[3]
    }

with open(gt_filename) as fd:
    content = fd.readlines()
    print(parse_gt(content[0]))
#     for line in content:

{'index': '76', 'file': '1040901_nation_story_3702283.utf8', 'relevant': '0'}
