Browse files

add optional use of stemmer and double metaphone algorithm

  • Loading branch information...
1 parent 31c4385 commit cffb5ba3125c0e411ff65090403ecdd099a9cd7d @Rafiot committed Aug 22, 2011
Showing with 63 additions and 10 deletions.
  1. +15 −3 README
  2. +48 −7 redis_search.py
View
18 README
@@ -1,22 +1,34 @@
# Context
* You have a directory with stuff and you can extract strings from this stuff
- using the command "strings".
+ using the command "strings" (ie. your stuff is not compressed...).
* You are lazy and don't want to open all these files to see what they contain.
* You know that some of these files contain some particular words (ex: a name)
* You have a lot of memory available (at least the same size as all the files
you want to search in)
# Requirements
-* Linux
+* Linux / strings
* redis (http://redis.io/, https://github.com/antirez/redis)
* redis-py (https://github.com/andymccurdy/redis-py)
+
+# Optional requirements
+
* ipython (optional but I like it)
+* pystemmer library (used if installed on the system)
+
+::
+ sudo easy_install pystemmer
+
+* fuzzy library (used if installed on the system)
+
+::
+ sudo easy_install fuzzy
# Usage
* Run a redis server
-* Edit directory in strings_generator.py
* Run strings_generator.py in an interactive python shell (like ipython)
+ with the directory you want to search in as argument
View
55 redis_search.py
@@ -17,6 +17,10 @@
The blog post discussing the development of this Gist:
http://dr-josiah.blogspot.com/2010/07/building-search-engine-using-redis-and.html
+Improved by Raphael Vinot August 06 2011:
+- Optional usage of the Porter Stemmer algorithm
+- Optional usage of the Double Metaphone algorithm
+
'''
import collections
@@ -27,6 +31,20 @@
import redis
+try:
+ import Stemmer
+ use_stem = True
+ print("Use Porter Stemmer algorithm")
+except ImportError:
+ use_stem = False
+
+try:
+ import fuzzy
+ use_metaphone = True
+ print("Use the Double Metaphone algorythm")
+except ImportError:
+ use_metaphone = False
+
NON_WORDS = re.compile("[^a-z0-9' ]")
# stop words pulled from the below url
@@ -61,11 +79,18 @@ def get_index_keys(content, add=True):
words = NON_WORDS.sub(' ', content.lower()).split()
words = [word.strip("'") for word in words]
words = [word for word in words
- if word not in STOP_WORDS and len(word) > 1]
- # Apply the Porter Stemmer here if you would like that functionality.
+ if word not in STOP_WORDS and len(word) > 1]
- # Apply the Metaphone/Double Metaphone algorithm by itself, or after
- # the Porter Stemmer.
+ if use_stem:
+ stemmer = Stemmer.Stemmer('english')
+ words = stemmer.stemWords(words)
+
+ if use_metaphone:
+ dmeta = fuzzy.DMetaphone()
+ import itertools
+ w = []
+ [ w.extend(list(itertools.chain(dmeta(word)))) for word in words]
+ words = filter (lambda a: a != None, w)
if not add:
return words
@@ -158,8 +183,8 @@ def idf(count):
class TestIndex(unittest.TestCase):
def test_index_basic(self):
- t = ScoredIndexSearch('unittest', 'dev.ad.ly')
- t.connection.delete(*t.connection.keys('unittest:*'))
+ t = ScoredIndexSearch('unittest')
+ t.connection.delete(t.connection.keys('unittest:*'))
t.add_indexed_item(1, 'hello world')
t.add_indexed_item(2, 'this world is nice and you are really special')
@@ -175,5 +200,21 @@ def test_index_basic(self):
t.search('hello really special nice world'),
([('2', 0.75), ('1', 0.5)], 2))
+def stress_test(filename):
+ t = ScoredIndexSearch('stress_test')
+ t.connection.flushdb()
+ f = open(filename)
+ data = ""
+ for line in f:
+ data += " " + line
+ f.close()
+ t.add_indexed_item(1, data)
+
if __name__ == '__main__':
- unittest.main()
+ import cProfile
+ cProfile.run("stress_test('test_file')", 'stress_test_output_porter2_1')
+ import pstats
+ p = pstats.Stats('stress_test_output_porter2')
+ p.sort_stats('time').print_stats(10)
+ p = pstats.Stats('stress_test_output_porter2_1')
+ p.sort_stats('time').print_stats(10)

0 comments on commit cffb5ba

Please sign in to comment.