# A Simple Boolean Retrieval System

In [1]:
from functools import total_ordering, reduce
import csv
import re

## Postings

In [13]:
"""
COMMENTS ON CODE: 
Given a class defining one or more 
rich comparison ordering methods, 
this class decorator @total_ordering supplies the rest. 
This simplifies the effort involved in specifying all
of the possible rich comparison operations:

The class must define one of __lt__(), __le__(),
__gt__(), or __ge__(). 
In addition, the class should supply an __eq__() 
method.
"""

@total_ordering
class Posting:
    """A single entry of a posting list, identified by a document ID.

    Postings compare, order and hash by their document ID, so they can be
    sorted, merged and stored in sets or used as dict keys.
    """

    def __init__(self, docID):
        """Create a posting that refers to document ``docID``."""
        self._docID = docID

    def get_from_corpus(self, corpus):
        """Return the document this posting refers to.

        ``corpus`` is indexed with the stored document ID
        (``corpus[docID]``).
        """
        return corpus[self._docID]

    def __eq__(self, other):
        # Returning NotImplemented (instead of failing on ``other._docID``)
        # lets Python fall back to its default handling for foreign types.
        if not isinstance(other, Posting):
            return NotImplemented
        return self._docID == other._docID

    def __gt__(self, other):
        if not isinstance(other, Posting):
            return NotImplemented
        return self._docID > other._docID

    def __hash__(self):
        # Defining __eq__ alone would make the class unhashable; hash on the
        # same value that equality uses.
        return hash(self._docID)

    def __repr__(self):
        return str(self._docID)

## Posting Lists

In [23]:
class PostingList:
    """An ordered list of postings.

    ``intersection`` and ``union`` walk both lists with two cursors and
    therefore assume that each list is sorted in ascending order and free of
    duplicates.
    """

    def __init__(self):
        """Create an empty posting list."""
        self._postings = []

    @classmethod
    def from_docID(cls, docID):
        """Build a posting list holding a single posting for ``docID``."""
        plist = cls()
        plist._postings = [Posting(docID)]
        return plist

    @classmethod
    def from_posting_list(cls, postingList):
        """Wrap an existing list of postings.

        The list is stored as-is (no copy is made).
        """
        plist = cls()
        plist._postings = postingList
        return plist

    def merge(self, other):
        """Append the postings of ``other`` to this list, in place.

        Leading postings of ``other`` that are equal to the last posting of
        this list are skipped, so merging the same document twice does not
        create a duplicate entry.
        """
        if not self._postings:
            # Nothing to de-duplicate against; previously this raised
            # IndexError on ``self._postings[-1]``.
            self._postings += other._postings
            return
        i = 0
        last = self._postings[-1]
        while i < len(other._postings) and last == other._postings[i]:
            i += 1
        self._postings += other._postings[i:]

    def intersection(self, other):
        """Return a new PostingList with the postings present in both lists."""
        intersection = []
        i = 0
        j = 0
        while i < len(self._postings) and j < len(other._postings):
            if self._postings[i] == other._postings[j]:
                intersection.append(self._postings[i])
                i += 1
                j += 1
            elif self._postings[i] < other._postings[j]:
                i += 1
            else:
                j += 1
        return PostingList.from_posting_list(intersection)

    def union(self, other):
        """Return a new PostingList with the postings present in either list."""
        union = []
        i = 0
        j = 0
        while i < len(self._postings) and j < len(other._postings):
            if self._postings[i] == other._postings[j]:
                union.append(self._postings[i])
                i += 1
                j += 1
            elif self._postings[i] < other._postings[j]:
                union.append(self._postings[i])
                i += 1
            else:
                union.append(other._postings[j])
                j += 1
        # At most one of the lists still has unread postings; copy that tail
        # starting from where its cursor stopped.
        union.extend(self._postings[i:])
        union.extend(other._postings[j:])
        return PostingList.from_posting_list(union)

    def get_from_corpus(self, corpus):
        """Return the list of documents referenced by the postings."""
        return [posting.get_from_corpus(corpus) for posting in self._postings]

    def __repr__(self):
        return ', '.join(map(str, self._postings))

## Terms

In [28]:
class ImpossibleMergeError(Exception):
    """Raised when two Term objects with different term text are merged."""


@total_ordering
class Term:
    """A dictionary entry: a term string together with its posting list.

    Terms are compared and ordered by their term string only.
    """

    def __init__(self, term, docID):
        # A freshly created term occurs in exactly one document.
        self.posting_list = PostingList.from_docID(docID)
        self.term = term

    def merge(self, other):
        """Merge the posting list of ``other`` into this term, in place.

        Raises ImpossibleMergeError when the two objects stand for different
        terms.
        """
        if not (self.term == other.term):
            raise ImpossibleMergeError
        self.posting_list.merge(other.posting_list)

    def __eq__(self, other):
        theirs = other.term
        return self.term == theirs

    def __gt__(self, other):
        theirs = other.term
        return self.term > theirs

    def __repr__(self):
        postings_text = repr(self.posting_list)
        return self.term + ": " + postings_text


## Inverted Index

In [41]:
# Matches every character that is not a word character, whitespace or a
# hyphen. Inside a character class only the leading '^' negates; the extra
# carets in the previous pattern were literal and left '^' in the text.
_NON_TOKEN_CHARS = re.compile(r'[^\w\s-]')


def normalize(text):
    """Return ``text`` lower-cased and stripped of punctuation.

    Word characters, whitespace and hyphens are kept; every other character
    is removed.
    """
    no_punctuation = _NON_TOKEN_CHARS.sub('', text)
    return no_punctuation.lower()

def tokenize(movie):
    """Return the list of normalized, whitespace-separated tokens of a movie.

    Only ``movie.description`` is read; it is passed through ``normalize``
    before being split.
    """
    cleaned = normalize(movie.description)
    tokens = cleaned.split()
    return tokens

class InvertedIndex:
    """Dictionary of terms, each carrying the posting list of its documents.

    ``from_corpus`` stores the terms sorted by term text, and lookups rely on
    that order to binary-search the dictionary.
    """

    def __init__(self):
        """Create an empty index."""
        self._dictionary = []

    @classmethod
    def from_corpus(cls, corpus):
        """Build an index from an iterable of documents.

        Each document gets its position in ``corpus`` as document ID and is
        split into tokens with ``tokenize``. Progress is printed every 1000
        documents.
        """
        intermediate_dict = {}
        for docID, document in enumerate(corpus):
            for token in tokenize(document):
                term = Term(token, docID)
                known = intermediate_dict.get(token)
                if known is None:
                    intermediate_dict[token] = term
                else:
                    # Same token seen before: extend its posting list.
                    known.merge(term)
            if docID % 1000 == 0:
                print("ID: " + str(docID))
        idx = cls()
        idx._dictionary = sorted(intermediate_dict.values())
        return idx

    def __getitem__(self, key):
        """Return the posting list stored for the term ``key``.

        Raises KeyError (carrying ``key``) when the term is not indexed.
        """
        entries = self._dictionary
        lo, hi = 0, len(entries)
        try:
            # Leftmost binary search over the term-sorted dictionary.
            while lo < hi:
                mid = (lo + hi) // 2
                if entries[mid].term < key:
                    lo = mid + 1
                else:
                    hi = mid
        except TypeError:
            # A key that cannot be ordered against the stored terms cannot
            # be present either.
            raise KeyError(key) from None
        if lo < len(entries) and entries[lo].term == key:
            return entries[lo].posting_list
        raise KeyError(key)

    def __repr__(self):
        return "A dictionary with " + str(len(self._dictionary)) + " terms"

## Reading the Corpus

In [34]:
class MovieDescription:
    """A document of the corpus: a movie title plus its plot description."""

    def __init__(self, title, description):
        # Both values are stored unchanged as public attributes.
        self.title, self.description = title, description

    def __repr__(self):
        # A movie is displayed by its title only.
        shown = self.title
        return shown
    

def read_movie_description():
    """Read the movie corpus from the ``MovieSummaries`` directory.

    The metadata file maps the first column of each row to the third one
    (used here as the movie title); the plot summaries file holds that same
    key in its first column and the description in its second. Summaries
    whose key has no entry in the metadata file are skipped.

    Returns a list of MovieDescription objects, in the order of the
    summaries file.
    """
    filename = 'MovieSummaries/plot_summaries.txt'
    movie_names_file = 'MovieSummaries/movie.metadata.tsv'

    # newline='' is what the csv module asks for so that it can handle line
    # endings itself. The explicit encoding makes the result independent of
    # the platform default.
    # NOTE(review): assumes both files are UTF-8 encoded; confirm against
    # the dataset.
    with open(movie_names_file, 'r', encoding='utf-8', newline='') as csv_file:
        movie_names = csv.reader(csv_file, delimiter='\t')
        names_table = {name[0]: name[2] for name in movie_names}

    corpus = []
    with open(filename, 'r', encoding='utf-8', newline='') as csv_file:
        descriptions = csv.reader(csv_file, delimiter='\t')
        for desc in descriptions:
            try:
                title = names_table[desc[0]]
            except KeyError:
                # No metadata for this summary: leave it out of the corpus.
                continue
            corpus.append(MovieDescription(title, desc[1]))
    return corpus

## Putting It All Together

In [58]:
class IRsystem:
    """Boolean retrieval system: a corpus together with its inverted index."""

    def __init__(self, corpus, index):
        """Store the corpus and an index that was built over it."""
        self._corpus = corpus
        self._index = index

    @classmethod
    def from_corpus(cls, corpus):
        """Build the inverted index for ``corpus`` and return the system."""
        index = InvertedIndex.from_corpus(corpus)
        return cls(corpus, index)

    def answer_query(self, words):  # e.g. ['cats', 'batman']
        """Return the documents that contain every word of ``words``.

        Each word is normalized, looked up in the index, and the resulting
        posting lists are intersected. An empty query returns an empty list.
        Whatever the index raises for an unknown word (KeyError for
        InvertedIndex) propagates to the caller.
        """
        norm_words = [normalize(word) for word in words]
        if not norm_words:
            # reduce() without an initial value fails on an empty sequence.
            return []
        postings = (self._index[word] for word in norm_words)
        plist = reduce(lambda x, y: x.intersection(y), postings)
        return plist.get_from_corpus(self._corpus)
    
    
def query(ir, text):
    """Run a whitespace-separated query against ``ir`` and print each hit.

    ``text`` is split on whitespace and handed to ``ir.answer_query``; every
    returned item is printed on its own line.
    """
    for hit in ir.answer_query(text.split()):
        print(hit)

In [42]:
# Load the list of MovieDescription objects from the MovieSummaries files.
corpus = read_movie_description()

In [43]:
# Build the inverted index; from_corpus prints a progress line every 1000 documents.
idx = InvertedIndex.from_corpus(corpus)

ID: 0
ID: 1000
ID: 2000
ID: 3000
ID: 4000
ID: 5000
ID: 6000
ID: 7000
ID: 8000
ID: 9000
ID: 10000
ID: 11000
ID: 12000
ID: 13000
ID: 14000
ID: 15000
ID: 16000
ID: 17000
ID: 18000
ID: 19000
ID: 20000
ID: 21000
ID: 22000
ID: 23000
ID: 24000
ID: 25000
ID: 26000
ID: 27000
ID: 28000
ID: 29000
ID: 30000
ID: 31000
ID: 32000
ID: 33000
ID: 34000
ID: 35000
ID: 36000
ID: 37000
ID: 38000
ID: 39000
ID: 40000
ID: 41000
ID: 42000


In [44]:
# Show the posting list (document IDs) stored for the term 'batman'.
idx['batman']

334, 2990, 3463, 3519, 3545, 5510, 6854, 7105, 7358, 8467, 9503, 10360, 10727, 10933, 12458, 12492, 12967, 13095, 14199, 17366, 18875, 19381, 19675, 20598, 20808, 21070, 22147, 24393, 24484, 24658, 25866, 30601, 31272, 31508, 33213, 33638, 35356, 35980, 37238, 37389, 38152, 39092, 40499, 40596, 40821

In [48]:
# Reuse the index built above rather than re-indexing via IRsystem.from_corpus.
ir = IRsystem(corpus, idx)

In [59]:
# Print the movies whose description contains both words; each word is normalized (lower-cased) first.
query(ir, "frodo Gandalf")

The Lord of the Rings: The Fellowship of the Ring
The Lord of the Rings
The Hunt for Gollum
The Return of the King
Date Movie
The Lord of the Rings: The Two Towers
The Lord of the Rings: The Return of the King


In [60]:
# Print the movies whose description contains both 'luke' and 'yoda'.
query(ir, "luke yoda")

Star Wars Episode V: The Empire Strikes Back
Something, Something, Something Dark Side
Return of the Ewok
Star Wars Episode III: Revenge of the Sith
Star Wars Episode VI: Return of the Jedi
It's a Trap!
