In [1]:
# Move the working directory two levels up before the gensim imports below run.
# NOTE(review): presumably this makes the in-tree gensim checkout importable — confirm.
% cd ../..

/home/jayant/Projects/gensim


In [2]:
import logging
import time

from collections import defaultdict, Counter

from gensim.models.word2vec import Text8Corpus
from gensim.models import counters
from gensim.models import fast_counter

In [3]:
# Location of the text8 corpus that the benchmark cell reads.
# NOTE(review): this is an absolute, machine-specific path; adjust it before re-running elsewhere.
corpus_file = '/home/jayant/Projects/notebooks/text8'

# Shell out to list the corpus file (long format, human-readable size).
! ls -hl $corpus_file

-rw-rw-r-- 1 jayant jayant 71M Jun 26 07:32 /home/jayant/Projects/notebooks/text8


In [4]:
# Configure root logging at INFO level; each record shows timestamp, thread name, level and message.
logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)

In [5]:
def run_dict(sents):
    """Count words with a plain dict, using an explicit membership test.

    Walks every sentence in `sents` and every word inside it. A word already
    present in the dict has its count incremented; a new word is inserted
    with a count of 1. Returns the resulting word -> count dict.
    """
    word_counts = {}
    for sentence in sents:
        for token in sentence:
            # The `in` check is the idiom this variant is meant to measure.
            if token in word_counts:
                word_counts[token] += 1
            else:
                word_counts[token] = 1
    return word_counts

def run_dict_try(sents):
    """Count words with a plain dict, relying on KeyError for unseen words.

    Walks every sentence in `sents` and every word inside it. The increment is
    attempted first; if the word is not yet a key, the resulting KeyError is
    caught and the word is inserted with a count of 1. Returns the
    word -> count dict.
    """
    word_counts = {}
    for sentence in sents:
        for token in sentence:
            # Try-first / recover-on-KeyError is the idiom this variant measures.
            try:
                word_counts[token] += 1
            except KeyError:
                word_counts[token] = 1
    return word_counts

def run_defaultdict(sents):
    """Count words with a `defaultdict(int)`.

    Walks every sentence in `sents` and increments the entry for each word;
    missing entries start from the `int` default of 0. Returns the
    defaultdict mapping word -> count.
    """
    word_counts = defaultdict(int)
    for sentence in sents:
        for token in sentence:
            word_counts[token] += 1
    return word_counts

def run_counter_collections(sents):
    """Count words with `collections.Counter`, updating once per sentence.

    Each sentence in `sents` is passed whole to `Counter.update`. Returns the
    Counter mapping word -> count.
    """
    word_counts = Counter()
    for sentence in sents:
        word_counts.update(sentence)
    return word_counts

def run_counter_cython(sents):
    """Count words with `counters.CounterCython`.

    Creates a fresh `CounterCython`, feeds it all of `sents` through a single
    `update_sents` call, and returns the counter object.
    """
    cython_counter = counters.CounterCython()
    cython_counter.update_sents(sents)
    return cython_counter

def run_counter_cython_defaultdict(sents):
    """Count words with `counters.CounterCythonDefaultDict`.

    Creates a fresh `CounterCythonDefaultDict`, feeds it all of `sents` through
    a single `update_sents` call, and returns the counter object.
    """
    cython_counter = counters.CounterCythonDefaultDict()
    cython_counter.update_sents(sents)
    return cython_counter

def run_counter_cython_unordered_map(sents):
    """Count words with `counters.CounterCythonUnorderedMap`.

    Creates a fresh `CounterCythonUnorderedMap`, feeds it all of `sents`
    through a single `update_sents` call, and returns the counter object.
    """
    cython_counter = counters.CounterCythonUnorderedMap()
    cython_counter.update_sents(sents)
    return cython_counter

def run_fastcounter(sents):
    """Count words with `fast_counter.FastCounter`.

    The counter is built with a `doc2items` callback that yields each item of
    a document one by one. All of `sents` is then passed to a single `update`
    call and the counter object is returned.
    """
    def iter_items(document):
        # Generator handed to FastCounter: emit the document's items unchanged.
        for entry in document:
            yield entry

    fast = fast_counter.FastCounter(doc2items=iter_items)
    fast.update(sents)
    return fast

def run_fastcounter_cython(sents):
    """Count words with `fast_counter.FastCounterCython`.

    Creates a fresh `FastCounterCython`, passes all of `sents` to a single
    `update` call, and returns the counter object.
    """
    fast = fast_counter.FastCounterCython()
    fast.update(sents)
    return fast

def run_fastcounter_preshed(sents):
    """Count words with `fast_counter.FastCounterPreshed`.

    Creates a fresh `FastCounterPreshed`, passes all of `sents` to a single
    `update` call, and returns the counter object.
    """
    fast = fast_counter.FastCounterPreshed()
    fast.update(sents)
    return fast

In [6]:
def benchmark_count_method(count_method, sents, expected_word='word', expected_count=4336):
    """Time one counting function and sanity-check its result.

    Parameters
    ----------
    count_method : callable
        Called as `count_method(sents)`; must return an object exposing
        `.get(key)`.
    sents : iterable
        Passed straight through to `count_method`.
    expected_word : str, optional
        Key whose count is verified after the run. Defaults to 'word'.
    expected_count : int, optional
        Count that `expected_word` must have. Defaults to 4336, the value
        this notebook checks for its corpus.

    Raises
    ------
    AssertionError
        If the count reported for `expected_word` differs from
        `expected_count`.

    Prints the method being benchmarked and the elapsed wall-clock time in
    seconds. Only the `count_method(sents)` call is timed; the sanity check
    runs after the clock is stopped.
    """
    print('Benchmarking %s' % count_method)
    start = time.time()
    counts = count_method(sents)
    time_taken = time.time() - start
    # Look the count up once so the check and the error message agree.
    actual_count = counts.get(expected_word)
    assert actual_count == expected_count, 'expected count for "%s": %s, actual: %s' % (
        expected_word, expected_count, actual_count)
    print('Time taken: %.2fs' % time_taken)

In [7]:
# Run the benchmark over the counting implementations defined above.
# NOTE(review): the trailing [5:] slice drops the first five entries (run_dict
# through run_counter_cython), so only the last five methods are executed.
# Remove the slice to benchmark every method.
for method in [
        run_dict, run_dict_try, run_defaultdict, run_counter_collections,
        run_counter_cython, run_counter_cython_defaultdict, run_counter_cython_unordered_map,
        run_fastcounter, run_fastcounter_cython,
        run_fastcounter_preshed][5:]:
    # A new Text8Corpus is constructed for every method rather than shared between runs.
    text8_sents = Text8Corpus(corpus_file)
    benchmark_count_method(method, text8_sents)

Benchmarking <function run_counter_cython_defaultdict at 0x7fe153e3c1b8>
Time taken: 4.24s
Benchmarking <function run_counter_cython_unordered_map at 0x7fe153e3c320>
Time taken: 3.16s
Benchmarking <function run_fastcounter at 0x7fe153e3c488>
Time taken: 5.68s
Benchmarking <function run_fastcounter_cython at 0x7fe153e3c500>
Time taken: 7.56s
Benchmarking <function run_fastcounter_preshed at 0x7fe153e3c578>
Time taken: 2.62s
