In [6]:
%matplotlib inline
#%load_ext autoreload
#%autoreload 2
%reload_ext autoreload
import numpy as np
import matplotlib.pyplot as plt
import math, sys, os
from numpy.random import randn

#### loading methods

In [11]:

def get_text_files(cwd = os.getcwd().replace("/information_retrieval","/data/text/gutenberg")):
    paths = []
    for f in os.listdir(cwd):
        if f.endswith(".txt"):
            paths.append(cwd + "/" + f )
    return paths


def clean_line(line):
    remove_chs = ["\n",".",",",";",":","/","!","[","]","?"]
    for ch in remove_chs:
        line = line.replace(ch,"")
    return line.lower()


def get_local_filename(file_path): 
    return file_path.split("/")[-1].replace(".txt","")


def tokenize_line(line, doc_name = False):
    line = clean_line(line)
    if len(line) == 0: 
        return []
    if doc_name:
        return [((doc_name,word),1) for word in line.split(" ")]
    else:
        return [(word,1) for word in line.split(" ")]


def tokenize_text_file(file_path, with_doc_name = False):
    tokens = []
    if with_doc_name:
        doc_name = get_local_filename(file_path)
    else:
        doc_name = False
    tokens = []
    for line in open(file_path,"r+"):
        tokens.extend(tokenize_line(line, doc_name))
    
    return tokens

total_count_key = "_total"
doc_name_key = "_document_name"

def word_count_hash_from_text_file(file_path):
    wch = {total_count_key: 0, 
           doc_name_key: get_local_filename(file_path)}
    
    for line in open(file_path,"r+"):
        line = clean_line(line)
        if len(line) == 0: 
            continue
        for word in line.split(" "):
            if word in wch:
                wch[word] += 1
            else:
                wch[word] = 1
            wch[total_count_key] += 1
    
    return wch

def word_frequency_hash_from_text_file(file_path):
    wch = word_count_hash_from_text_file(file_path)
    total = float(wch[total_count_key])
    for word, count in wch.items():
        if word == total_count_key:
            continue
        if not type(count) == int:
            continue
        wch[word] = count / total
    return wch
    

## The Inverted Index

In [65]:
def get_inverted_index_and_counts(file_paths):
    index = {}
    wchs = {}
    for file_path in file_paths:
        wch = word_count_hash_from_text_file(file_path)
        
        for w in wch.keys():
            if w in index:
                index[w].add(wch[doc_name_key]) # add doc to posting list
            else:
                index[w] = set([wch[doc_name_key]]) # create new word key and doc to posting list
        wchs[wch[doc_name_key]] = wch
    
    return (index, wchs)


In [40]:
inverted_index, wchs = get_inverted_index_and_counts(get_text_files())
# a query method wrapper around the index
docs_with = lambda word: inverted_index[word]

In [41]:
len(inverted_index)

75191

### Query processing

#### By term

In [60]:
word0 = "cooking"
word1 = "dollar"
word2 = "dime"
word3 = "peak"
word4 = "mountain"
word5 = "climb"
word6 = "whale"

print "Search for: '%s'" % word0
print docs_with(word0)

print ""
print "Search for: '%s'" % word1
print docs_with(word1)

print ""
print "Search for: '%s'" % word2
print docs_with(word2)

print ""
print "Search for: '%s'" % word3
print docs_with(word3)

print ""
print "Search for: '%s'" % word6
print docs_with(word6)

Search for: 'cooking'
set(['whitman-leaves', 'chesterton-ball', 'melville-moby_dick', 'bryant-stories'])

Search for: 'dollar'
set(['whitman-leaves', 'chesterton-ball', 'melville-moby_dick'])

Search for: 'dime'
set(['whitman-leaves'])

Search for: 'peak'
set(['whitman-leaves', 'melville-moby_dick', 'chesterton-thursday', 'chesterton-brown'])

Search for: 'whale'
set(['melville-moby_dick', 'shakespeare-hamlet', 'bible-kjv', 'chesterton-ball', 'bryant-stories', 'whitman-leaves'])


#### Using set operations we can handle more complicated queries

In [63]:
print "query: '%s and %s' " % (word0, word1)
print docs_with(word0).intersection(docs_with(word1))
print ""
print "query: '%s and %s and not %s' " % (word0, word1, word2)
print docs_with(word0).intersection(docs_with(word1)).difference(docs_with(word2))
print ""
print "query: '%s or %s and not %s' " % (word0, word3, word6)
print docs_with(word0).union(docs_with(word3)).difference(docs_with(word6))

query: 'cooking and dollar' 
set(['whitman-leaves', 'chesterton-ball', 'melville-moby_dick'])

query: 'cooking and dollar and not dime' 
set(['chesterton-ball', 'melville-moby_dick'])

query: 'cooking or peak and not whale' 
set(['chesterton-thursday', 'chesterton-brown'])


We can of course optimize this futher by ordering the posting lists by, ascending, term frequency, and intelligently ordering the set operations.

## Skip pointers

In [None]:
class Posting_Node:
    
    def __init__(self, document_id, next = None):
        self.documentId = documentId
        self.skip_ponter = None
        self.next = next
        
    def update_skip_pointer(self, n_nodes):
        pointee = self
        for x in range(n_nodes):
            pointee = pointee.next
        self.skip_ponter = pointee
    
    def append(self, document_id):
        new = Posting_Node(document_id, None)
        self.next = new
        return new
    # implicit ordering

def add_to_new_posting_list(l, document_id):
    if l == None:
        return Posting_Node(document_id)
    else:
        return l.append(document_id)

def skip_advance(p_lower, p_upper):
    if (not p_lower.skip_ponter == None) and (p_lower.skip_ponter.document_id < p_upper.document_id):
        return skip_advance(p_lower.skip_ponter, p_upper)
    elif (not p_lower.skip_ponter == None) and (p_lower.skip_ponter.document_id == p_upper.document_id):
        return p_lower.skip_ponter
    else:
        return p_lower.next
    
def intersect(p1, p2):
    """
    p1: set of Postings, head of a singly linked list
    p2: set of postings, head of a singly linked list
    
    we use the skip pointers to collect the intersection of the two in 
    """
    intersect_list = None
    while (not p1.next == Node) and (not p2.next == Node):
        if p1.document_id == p2.document_id:
            intersect_list = add_to_new_posting_list(intersect_list, p1.document_id)
            p1 = p1.next
            p2 = p2.next
        elif p1.document_id < p2.document_id:
            p1 = skip_advance(p1, p2)
        else:
            p2 = skip_advance(p2, p1)
    
    return intersect_list

