Skip to content

Commit

Permalink
start to create a real indexer
Browse files Browse the repository at this point in the history
  • Loading branch information
SupermanScott committed Feb 11, 2012
1 parent ddcd0f6 commit 5a2062a
Showing 1 changed file with 32 additions and 2 deletions.
34 changes: 32 additions & 2 deletions indexing/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
import readability.readability as readability
import urllib
import re
import pymongo

# Crude tag matcher: a '<', then as few non-'<' characters as possible, an
# optional '/', and the closing '>'. It only matches the tag markup itself,
# so text inside elements such as <script> or <style> is left behind, and a
# '>' inside an attribute value ends the match early.
# Probably should leverage lxml or BeautifulSoup (BS) to really strip tags
# and get the text.
# NOTE(review): presumably used with .sub() to remove tags -- the call site
# is not visible in this view; confirm.
STRIP_HTML = re.compile(r'<[^<]*?/?>')

class Indexer(object):
class DocumentFetcher(object):
"""
Maps a given url to a dictionary of fields and raw text
"""
Expand Down Expand Up @@ -49,7 +50,7 @@ def get_document(self):

# Read-only property backed by get_document. The description must be passed
# as doc=: property()'s second positional parameter is fset, so a positional
# string would be installed as the setter (assignment would then fail with
# "'str' object is not callable") and the description would be lost.
document = property(get_document, doc="Document at the provided url")

def to_dictionary(self)
def to_dictionary(self):
"""
Returns itself as a dictionary
"""
Expand All @@ -59,3 +60,32 @@ def to_dictionary(self)
summary=self.summary,
content=self.content,
)

class Indexer(object):
    """
    Defines how to save the documents to Mongo.

    An instance holds a handle to a single Mongo collection
    (``self.collection``) that is meant to back the search index.
    """

    def __init__(self, server='localhost', port=27017, database_name='arya',
                 collection_name='index'):
        """
        Connect to mongo and select the proper collection.

        server, port -- address of the Mongo server to connect to.
        database_name, collection_name -- database and collection that the
        resulting ``self.collection`` handle points at.
        """
        # pymongo.Connection is deprecated in favour of MongoClient and is
        # gone from PyMongo 3; prefer the new class and only fall back to
        # Connection on installs that predate MongoClient.
        client_class = getattr(pymongo, 'MongoClient', None)
        if client_class is None:
            client_class = pymongo.Connection
        connection = client_class(server, port)
        self.collection = connection[database_name][collection_name]

    # @TODO provide a way to add a config object for how the fields are
    # tokenized etc.

    def add_document(self, document):
        """
        Adds a document to the search index.

        document -- a mapping (or anything ``dict()`` accepts) of field
        names to values. It is copied into a plain dict so the caller's
        object is never modified.

        NOTE(review): persistence is not implemented yet -- the normalised
        document is only echoed to stdout and nothing is written to
        ``self.collection``.
        """
        doc = dict(document)
        # Function-call form prints the same text on Python 2 and 3.
        print(doc)

    def index_url(self, url):
        """
        Index a provided url.

        Fetches the page through DocumentFetcher, converts it to a
        dictionary and hands that to add_document.
        """
        self.add_document(DocumentFetcher(url).to_dictionary())

0 comments on commit 5a2062a

Please sign in to comment.