In [2]:
%matplotlib inline
#%load_ext autoreload
#%autoreload 2
%reload_ext autoreload
import numpy as np
import matplotlib.pyplot as plt
import math, sys, os
from numpy.random import randn

In [146]:
# loading methods
def get_text_files(cwd = os.getcwd().replace("/nlp_nlu","/data/text/gutenberg")):
    """Return the paths of all ``.txt`` files directly inside ``cwd``.

    Parameters
    ----------
    cwd : str
        Directory to scan. The default is built from ``os.getcwd()`` with
        ``"/nlp_nlu"`` replaced by ``"/data/text/gutenberg"``; it is evaluated
        once, when this function is defined, not on every call.

    Returns
    -------
    list of str
        ``cwd + "/" + name`` for every entry whose name ends in ``".txt"``,
        sorted by name so that positional indexing of the result picks the
        same file on every run.
    """
    # os.listdir returns entries in arbitrary order, so sort for reproducibility.
    return [cwd + "/" + f for f in sorted(os.listdir(cwd)) if f.endswith(".txt")]


def clean_line(line):
    """Drop newlines and basic punctuation from ``line`` and lower-case it.

    The characters removed are the newline and ``. , ; : / ! [ ] ?``.
    Everything else, including spaces, is kept in its original position.
    """
    unwanted = frozenset("\n.,;:/![]?")
    kept = [ch for ch in line if ch not in unwanted]
    return "".join(kept).lower()


def get_local_filename(file_path):
    """Return the text after the last ``/`` in ``file_path`` with every
    occurrence of ``".txt"`` removed from it."""
    base_name = file_path.rpartition("/")[2]
    return base_name.replace(".txt", "")


def tokenize_line(line, doc_name = False):
    """Turn one raw line into a list of ``(key, 1)`` pairs.

    The line is first passed through ``clean_line`` and then split on single
    spaces, so consecutive spaces produce empty-string pieces.

    Parameters
    ----------
    line : str
        Raw line of text.
    doc_name : str or False
        When truthy, each key is the tuple ``(doc_name, piece)``; otherwise
        the key is the piece itself.

    Returns
    -------
    list
        One ``(key, 1)`` pair per piece, or ``[]`` when the cleaned line is empty.
    """
    cleaned = clean_line(line)
    if not cleaned:
        return []
    pieces = cleaned.split(" ")
    if not doc_name:
        return [(piece, 1) for piece in pieces]
    return [((doc_name, piece), 1) for piece in pieces]


def tokenize_text_file(file_path, with_doc_name = False):
    """Tokenize every line of a text file into ``(key, 1)`` pairs.

    Parameters
    ----------
    file_path : str
        Path of the file to read.
    with_doc_name : bool
        When true, the name returned by ``get_local_filename(file_path)`` is
        passed to ``tokenize_line`` so that each key also carries the document
        name; otherwise ``False`` is passed and keys are plain words.

    Returns
    -------
    list
        The concatenation of ``tokenize_line`` results for all lines, in file order.
    """
    doc_name = get_local_filename(file_path) if with_doc_name else False
    tokens = []
    # Read-only mode is all that is needed (the file is never written), and the
    # context manager closes the handle even if tokenizing raises.
    with open(file_path, "r") as text_file:
        for line in text_file:
            tokens.extend(tokenize_line(line, doc_name))

    return tokens

# Reserved keys stored next to the words in each per-document hash:
# the first holds the running total of counted words, the second the document name.
# NOTE(review): a document that contains either string as a word would collide
# with these entries — confirm the corpus cannot produce them.
total_count_key = "_total"
doc_name_key = "_document_name"

def word_count_hash_from_text_file(file_path):
    """Count how often each word occurs in a text file.

    Each line is cleaned with ``clean_line`` and split on single spaces;
    lines that are empty after cleaning are skipped.

    Parameters
    ----------
    file_path : str
        Path of the file to read.

    Returns
    -------
    dict
        Maps each word to its count. Two extra entries are stored under
        ``total_count_key`` (total number of words counted) and
        ``doc_name_key`` (result of ``get_local_filename(file_path)``).
    """
    wch = {total_count_key: 0,
           doc_name_key: get_local_filename(file_path)}

    # Read-only mode is all that is needed (the file is never written), and the
    # context manager guarantees the handle is closed.
    with open(file_path, "r") as text_file:
        for line in text_file:
            line = clean_line(line)
            if len(line) == 0:
                continue
            for word in line.split(" "):
                wch[word] = wch.get(word, 0) + 1
                wch[total_count_key] += 1

    return wch

def word_frequency_hash_from_text_file(file_path):
    """Build the word-count hash for a file and convert counts to relative frequencies.

    Every entry whose value is an ``int`` is replaced by ``count / total``,
    where ``total`` is the value stored under ``total_count_key``. The
    ``total_count_key`` entry itself and any non-``int`` entries are left
    untouched.
    """
    frequencies = word_count_hash_from_text_file(file_path)
    denominator = float(frequencies[total_count_key])
    # Snapshot the keys first; only values of existing keys are rewritten below.
    for key in list(frequencies):
        value = frequencies[key]
        if key != total_count_key and type(value) == int:
            frequencies[key] = value / denominator
    return frequencies
    

In [148]:
# load data
fp = get_text_files()
tokens = tokenize_text_file(fp[1],True)
wch = word_count_hash_from_text_file(fp[1])

# Sanity check: both loaders should see the same number of words in the file.
# The parenthesised single-argument form prints the same on Python 2 and Python 3.
print(len(tokens) == wch[total_count_key])

True


## Boolean queries

In [149]:
def get_inverted_index_and_counts(file_paths):
    """Build an inverted index and the per-document word-count hashes.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the text files to index.

    Returns
    -------
    tuple
        ``(index, wchs)`` where ``index`` maps each word to the set of
        document names containing it (its posting list) and ``wchs`` maps each
        document name to the hash returned by
        ``word_count_hash_from_text_file``.
    """
    index = {}
    wchs = {}
    for file_path in file_paths:
        wch = word_count_hash_from_text_file(file_path)
        doc_name = wch[doc_name_key]

        for w in wch:
            # The total and the document name are bookkeeping entries of the
            # hash, not words, so they must not receive posting lists.
            if w == total_count_key or w == doc_name_key:
                continue
            index.setdefault(w, set()).add(doc_name)  # add doc to posting list
        wchs[doc_name] = wch

    return (index, wchs)

In [150]:
# Build the word -> document-names posting lists and the per-document count hashes for every text file.
inverted_index, wchs = get_inverted_index_and_counts(get_text_files())

In [151]:
## WF-IDF

In [152]:
def word_frequency_inverse_document_frequency(file_paths):
    """Weight each word's in-document frequency by its inverse document frequency.

    The weight of a word in a document is
    ``frequency * (n_docs / number_of_documents_containing_the_word)``;
    the ratio is used as-is, with no logarithm applied.

    Parameters
    ----------
    file_paths : list of str
        Paths of the text files to process; its length is used as ``n_docs``.

    Returns
    -------
    tuple
        ``(index, docs_to_wfidf)`` where ``index`` maps each word to the set
        of document names containing it and ``docs_to_wfidf`` maps each
        document name to its hash with the word frequencies replaced by the
        weights. The entries under ``total_count_key`` and ``doc_name_key``
        are kept unchanged in each hash.
    """
    index = {}
    docs_to_wfidf = {}
    for file_path in file_paths:
        wfh = word_frequency_hash_from_text_file(file_path)
        doc_name = wfh[doc_name_key]
        for w in wfh:
            # The total and the document name are bookkeeping entries of the
            # hash, not words, so they must not receive posting lists.
            if w == total_count_key or w == doc_name_key:
                continue
            index.setdefault(w, set()).add(doc_name)
        docs_to_wfidf[doc_name] = wfh

    n_docs = float(len(file_paths))
    for wfh in docs_to_wfidf.values():
        # Snapshot the keys; only values of existing keys are rewritten.
        for word in list(wfh):
            freq = wfh[word]
            # Only word frequencies are floats; the bookkeeping entries are not.
            if type(freq) == float:
                #           word frequency * inverse document frequency
                wfh[word] = freq * (n_docs / len(index[word]))
    return (index, docs_to_wfidf)

In [153]:
# Rebinds `inverted_index` (set earlier from get_inverted_index_and_counts) and builds the per-document weights.
inverted_index, wfidf = word_frequency_inverse_document_frequency(get_text_files())

In [155]:
# Pick one indexed word by position and show its weight in every document that contains it.
# list(...) keeps the positional lookup working where dict.keys() is not indexable (Python 3).
word = list(inverted_index.keys())[3]
print(word + ": ")
for doc in inverted_index[word]:
    # .get only reads the weight; setdefault would insert a 0.0 entry if the word were missing.
    print(doc + " : " + str(wfidf[doc].get(word, 0.0)))
    

amplification: 
edgeworth-parents : 0.000101303437563


In [139]:
# NOTE(review): this cell's execution count (139) is lower than the preceding cell's (155) and the
# value shown below differs from the word printed there — looks like a stale, out-of-order run; re-run to confirm.
word

'gentlemen--of'