In [1]:
# Move the working directory two levels up before the imports in the next cell run.
# NOTE(review): the recorded output suggests this lands in the gensim checkout root — confirm
# the notebook is launched from a directory two levels below it.
% cd ../..

/home/jayant/projects/gensim


In [2]:
import logging
import time
import timeit

from collections import defaultdict, Counter

from gensim.models.word2vec import Text8Corpus
from gensim.models import counters
from gensim.models import fast_counter
from gensim.models import fast_unigram_counter

In [9]:
# Location of the corpus file that the text8 benchmark cell reads.
# NOTE(review): this is an absolute, machine-specific path — adjust it before re-running elsewhere.
corpus_file = '/home/jayant/projects/text8'

# Long listing with human-readable sizes, to confirm the file exists and show how big it is.
! ls -hl $corpus_file

-rw-r--r-- 1 jayant develop 96M Jun  9  2006 /home/jayant/projects/text8


In [10]:
# Configure root logging at INFO level; every record is prefixed with its
# timestamp, originating thread name and severity.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s'
)

In [11]:
def run_dict(sents):
    """Count words with a plain ``dict`` and an explicit membership test.

    `sents` is an iterable of sentences, each an iterable of hashable words.
    Returns a ``dict`` mapping every word seen to its number of occurrences.
    """
    counts = {}
    for sentence in sents:
        for token in sentence:
            # First sighting starts the tally at one; later sightings bump it.
            if token not in counts:
                counts[token] = 1
            else:
                counts[token] += 1
    return counts

def run_dict_try(sents):
    """Count words with a plain ``dict``, relying on ``KeyError`` for unseen words.

    `sents` is an iterable of sentences, each an iterable of hashable words.
    Returns a ``dict`` mapping every word seen to its number of occurrences.
    """
    counts = {}
    for sentence in sents:
        for token in sentence:
            try:
                # Optimistic path: assume the word already has an entry.
                counts[token] += 1
            except KeyError:
                # Unseen word: the increment failed, so create the entry.
                counts[token] = 1
    return counts

def run_defaultdict(sents):
    """Count words with a ``defaultdict(int)``, so unseen words start at zero.

    `sents` is an iterable of sentences, each an iterable of hashable words.
    Returns the ``defaultdict`` mapping every word seen to its number of occurrences.
    """
    counts = defaultdict(int)
    for sentence in sents:
        for token in sentence:
            counts[token] += 1
    return counts

def run_counter_collections(sents):
    """Count words with ``collections.Counter``, updating once per sentence.

    `sents` is an iterable of sentences, each an iterable of hashable words.
    Returns the ``Counter`` holding the accumulated word counts.
    """
    word_counts = Counter()
    for sentence in sents:
        word_counts.update(sentence)
    return word_counts

def run_counter_cython(sents):
    """Run ``counters.CounterCython`` over `sents` in one ``update_sents`` call.

    Returns the populated counter object itself, not a plain ``dict``.
    """
    cython_counter = counters.CounterCython()
    cython_counter.update_sents(sents)
    return cython_counter

def run_counter_cython_defaultdict(sents):
    """Run ``counters.CounterCythonDefaultDict`` over `sents` in one ``update_sents`` call.

    Returns the populated counter object itself, not a plain ``dict``.
    """
    cython_counter = counters.CounterCythonDefaultDict()
    cython_counter.update_sents(sents)
    return cython_counter

def run_counter_cython_unordered_map(sents):
    """Run ``counters.CounterCythonUnorderedMap`` over `sents` in one ``update_sents`` call.

    Returns the populated counter object itself, not a plain ``dict``.
    """
    cython_counter = counters.CounterCythonUnorderedMap()
    cython_counter.update_sents(sents)
    return cython_counter

def run_fastcounter_cython(sents):
    """Run ``fast_unigram_counter.FastCounterCython`` over `sents` in one ``update`` call.

    Returns the populated counter object itself, not a plain ``dict``.
    """
    fast_counter_obj = fast_unigram_counter.FastCounterCython()
    fast_counter_obj.update(sents)
    return fast_counter_obj

def run_fastcounter_preshed(sents):
    """Run ``fast_unigram_counter.FastCounterPreshed`` over `sents` in one ``update`` call.

    Returns the populated counter object itself, not a plain ``dict``.
    """
    fast_counter_obj = fast_unigram_counter.FastCounterPreshed()
    fast_counter_obj.update(sents)
    return fast_counter_obj

In [18]:
def benchmark_count_method(count_method, sents, test_word, test_count):
    """Time one counting function and sanity-check the counts it produced.

    Parameters
    ----------
    count_method : callable
        Invoked as ``count_method(sents)``; the object it returns must support ``.get(word)``.
    sents : iterable
        Handed to `count_method` unchanged.
    test_word : object
        Key looked up in the returned counts to validate the run.
    test_count : object
        Value that ``counts.get(test_word)`` is required to equal.

    Returns
    -------
    float
        Seconds spent inside ``count_method(sents)``; the same value that is printed.

    Raises
    ------
    AssertionError
        If the count reported for `test_word` differs from `test_count`.

    """
    # Wrap in a tuple so a tuple-valued `count_method` cannot be misread as format arguments.
    print('Benchmarking %s' % (count_method,))

    # `timeit.default_timer` is the clock the standard library uses for benchmarking;
    # unlike `time.time()` it is not meant to double as a calendar clock.
    start = timeit.default_timer()
    counts = count_method(sents)
    time_taken = timeit.default_timer() - start

    # Look the word up once, and raise explicitly so the check survives `python -O`.
    actual_count = counts.get(test_word)
    if actual_count != test_count:
        raise AssertionError(
            'expected count for "%s": %s, actual: %s' % (test_word, test_count, actual_count))

    print('Time taken: %.2fs' % time_taken)
    return time_taken

In [19]:
# Benchmark every counting implementation on the text8 corpus. A new
# Text8Corpus is constructed for each method, and each run is validated
# against the expected count of 5678 for 'word'.
for method in (
        run_dict,
        run_dict_try,
        run_defaultdict,
        run_counter_collections,
        run_counter_cython,
        run_counter_cython_defaultdict,
        run_counter_cython_unordered_map,
        run_fastcounter_cython,
        run_fastcounter_preshed,
):
    text8_sents = Text8Corpus(corpus_file)
    benchmark_count_method(method, text8_sents, 'word', 5678)

Benchmarking <function run_dict at 0x7f6c0b6eb0c8>
Time taken: 3.78s
Benchmarking <function run_dict_try at 0x7f6c0ccaab90>
Time taken: 3.50s
Benchmarking <function run_defaultdict at 0x7f6c0ccaa758>
Time taken: 3.32s
Benchmarking <function run_counter_collections at 0x7f6c0ccaa668>
Time taken: 5.26s
Benchmarking <function run_counter_cython at 0x7f6c0ccaa938>
Time taken: 3.10s
Benchmarking <function run_counter_cython_defaultdict at 0x7f6c0ccaaa28>
Time taken: 3.08s
Benchmarking <function run_counter_cython_unordered_map at 0x7f6c0ccaa6e0>
Time taken: 2.51s
Benchmarking <function run_fastcounter_cython at 0x7f6c0ccaaaa0>
Time taken: 2.69s
Benchmarking <function run_fastcounter_preshed at 0x7f6c0ccaa8c0>
Time taken: 1.49s


In [20]:
# Repeat the benchmark on the text9 file, read through the same Text8Corpus
# class. A new corpus object is constructed for each method, and each run is
# validated against the expected count of 26358 for 'word'.
text9_corpus_file = '/home/jayant/projects/text9'
for method in (
        run_dict,
        run_dict_try,
        run_defaultdict,
        run_counter_collections,
        run_counter_cython,
        run_counter_cython_defaultdict,
        run_counter_cython_unordered_map,
        run_fastcounter_cython,
        run_fastcounter_preshed,
):
    sents = Text8Corpus(text9_corpus_file)
    benchmark_count_method(method, sents, 'word', 26358)

Benchmarking <function run_dict at 0x7f6c0b6eb0c8>
Time taken: 27.67s
Benchmarking <function run_dict_try at 0x7f6c0ccaab90>
Time taken: 24.94s
Benchmarking <function run_defaultdict at 0x7f6c0ccaa758>
Time taken: 24.19s
Benchmarking <function run_counter_collections at 0x7f6c0ccaa668>
Time taken: 38.53s
Benchmarking <function run_counter_cython at 0x7f6c0ccaa938>
Time taken: 22.56s
Benchmarking <function run_counter_cython_defaultdict at 0x7f6c0ccaaa28>
Time taken: 22.18s
Benchmarking <function run_counter_cython_unordered_map at 0x7f6c0ccaa6e0>
Time taken: 18.31s
Benchmarking <function run_fastcounter_cython at 0x7f6c0ccaaaa0>
Time taken: 19.46s
Benchmarking <function run_fastcounter_preshed at 0x7f6c0ccaa8c0>
Time taken: 10.86s
