# Word Counting

In [3]:
# Sample sentence used as input by the word-count examples below.
sentence = "This cat jumped over this other cat!"

In [5]:
def stem(word):
    """Reduce a word to a bare lowercase token.

    The word is lowercased, then a fixed set of punctuation characters is
    stripped from its right edge and, afterwards, a (different) fixed set is
    stripped from its left edge. Characters inside the word are untouched.

    >>> stem("Hello!")
    'hello'
    """
    # The two sets are intentionally kept exactly as they were: the right-hand
    # set covers closing punctuation, the left-hand set covers opening marks.
    trailing_junk = ",.!)-*_?:;$'-\""
    leading_junk = "-*'\"(_$'"

    lowered = word.lower()
    without_tail = lowered.rstrip(trailing_junk)
    return without_tail.lstrip(leading_junk)

In [3]:
def wordcount(string):
    """Count how often each stemmed word occurs in a string.

    The string is split on whitespace, every piece is normalized with
    ``stem``, and the occurrences of each normalized form are tallied.

    Returns a dict mapping stemmed word -> number of occurrences, with keys
    in order of first appearance.
    """
    tally = {}
    for token in string.split():
        key = stem(token)
        # get() supplies 0 the first time a key is seen.
        tally[key] = tally.get(key, 0) + 1
    return tally

# Run the counter on the sample sentence; the dict is shown as the cell output.
wordcount(sentence)

{'cat': 2, 'jumped': 1, 'other': 1, 'over': 1, 'this': 2}

现在我们用`map`和`toolz`的`frequencies`函数来计算相同的结果


In [7]:
from toolz import frequencies

# Stem every whitespace-separated word of the sentence, then count the results.
frequencies(map(stem, sentence.split()))

{'cat': 2, 'jumped': 1, 'other': 1, 'over': 1, 'this': 2}

### Pipe

函数式代码中可能会出现**很多**括号。为了解决这个问题，可以使用`pipe`函数。以下面这段洗衣服的代码为例。

```
# Do Laundry
clothes = ...
wet_clothes = wash(clothes)
dry_clothes = dry(wet_clothes)
result = fold(dry_clothes)
```

数据依次被推送通过由 `wash`、`dry` 和 `fold` 组成的函数流水线，这是一种常见的模式。使用`pipe`，我们可以按顺序写出这三次转换：


```
result = pipe(clothes, wash, dry, fold)

result = compose(fold, dry, wash)(clothes)
```   

管道通过一系列函数（其余的参数）从左到右推动数据（第一个参数）。


In [1]:
from toolz import pipe

# Simple example
def double(x):
    """Return ``x`` multiplied by two."""
    doubled = 2 * x
    return doubled

# 3 -> double -> 6 -> double -> 12 -> str -> '12'
pipe(3, double, double, str)

'12'

In [8]:
from toolz.curried import map
    
# sentence -> split into words -> stem each word -> count occurrences.
# map(stem) works because `map` is the curried version imported above;
# the builtin map cannot be called with only a function.
pipe(sentence, str.split, map(stem), frequencies)

{'cat': 2, 'jumped': 1, 'other': 1, 'over': 1, 'this': 2}

## 将示例扩展为多行文件

现在我们对整个文件（而不是单个句子）做词频统计。下面看看上面的实现需要如何调整，才能适应这一变化。


In [12]:
def wordcount(file):
    """Count how often each stemmed word occurs across lines of text.

    Parameters
    ----------
    file : iterable of str, or str
        Usually an open text file, which is iterated line by line. A plain
        string is also accepted and treated as a single line, so this
        definition keeps working for the single-sentence call used earlier in
        the notebook instead of silently counting individual characters.

    Returns
    -------
    dict
        Maps each stemmed word to its number of occurrences; keys appear in
        order of first occurrence.
    """
    # This definition replaces the earlier wordcount(string). Iterating a str
    # yields characters, so wrap it to keep the word-level behaviour.
    lines = [file] if isinstance(file, str) else file

    counts = dict()
    for line in lines:
        for word in line.split():
            stemmed = stem(word)
            # get() supplies 0 the first time a stemmed word is seen.
            counts[stemmed] = counts.get(stemmed, 0) + 1

    return counts

# Count words over the book; the with-block closes the file when it exits.
with open('data/tale-of-two-cities.txt') as f:
    # next(f) raises StopIteration if the file has fewer than 112 lines.
    for i in range(112):  # Burn first 112 lines - they include the Gutenberg header
        next(f)
    # f continues after the skipped lines, so only the remaining lines are counted.
    result = wordcount(f)

# Display the resulting counts as the cell output.
result
    

{'it': 2013,
 'was': 1764,
 'the': 8157,
 'best': 40,
 'of': 4119,
 'times': 51,
 'worst': 18,
 'age': 20,
 'wisdom': 2,
 'foolishness': 1,
 'epoch': 2,
 'belief': 6,
 'incredulity': 1,
 'season': 7,
 'light': 89,
 'darkness': 28,
 'spring': 5,
 'hope': 84,
 'winter': 11,
 'despair': 6,
 'we': 177,
 'had': 1297,
 'everything': 27,
 'before': 232,
 'us': 107,
 'nothing': 149,
 'were': 657,
 'all': 571,
 'going': 87,
 'direct': 9,
 'to': 3540,
 'heaven': 35,
 'other': 193,
 'way': 180,
 'in': 2634,
 'short': 40,
 'period': 5,
 'so': 582,
 'far': 81,
 'like': 198,
 'present': 41,
 'that': 1904,
 'some': 229,
 'its': 227,
 'noisiest': 1,
 'authorities': 3,
 'insisted': 2,
 'on': 932,
 'being': 134,
 'received': 28,
 'for': 971,
 'good': 209,
 'or': 434,
 'evil': 9,
 'superlative': 1,
 'degree': 11,
 'comparison': 4,
 'only': 121,
 'there': 567,
 'a': 2967,
 'king': 22,
 'with': 1351,
 'large': 37,
 'jaw': 2,
 'and': 4993,
 'queen': 11,
 'plain': 16,
 'face': 187,
 'throne': 3,
 'england': 

In [13]:
from toolz import concat
from toolz.curried import drop

# The file word count written as one left-to-right pipeline:
#   open -> skip the first 112 lines -> split each line into words ->
#   flatten the per-line word lists -> stem each word -> count occurrences.
# `map` must be the curried version from toolz.curried (imported in an earlier
# cell); the builtin map cannot be called with only a function.
# NOTE(review): the file object returned by open is never explicitly closed here.
pipe('data/tale-of-two-cities.txt', open, drop(112), map(str.split), concat, map(stem), frequencies)

{'it': 2013,
 'was': 1764,
 'the': 8157,
 'best': 40,
 'of': 4119,
 'times': 51,
 'worst': 18,
 'age': 20,
 'wisdom': 2,
 'foolishness': 1,
 'epoch': 2,
 'belief': 6,
 'incredulity': 1,
 'season': 7,
 'light': 89,
 'darkness': 28,
 'spring': 5,
 'hope': 84,
 'winter': 11,
 'despair': 6,
 'we': 177,
 'had': 1297,
 'everything': 27,
 'before': 232,
 'us': 107,
 'nothing': 149,
 'were': 657,
 'all': 571,
 'going': 87,
 'direct': 9,
 'to': 3540,
 'heaven': 35,
 'other': 193,
 'way': 180,
 'in': 2634,
 'short': 40,
 'period': 5,
 'so': 582,
 'far': 81,
 'like': 198,
 'present': 41,
 'that': 1904,
 'some': 229,
 'its': 227,
 'noisiest': 1,
 'authorities': 3,
 'insisted': 2,
 'on': 932,
 'being': 134,
 'received': 28,
 'for': 971,
 'good': 209,
 'or': 434,
 'evil': 9,
 'superlative': 1,
 'degree': 11,
 'comparison': 4,
 'only': 121,
 'there': 567,
 'a': 2967,
 'king': 22,
 'with': 1351,
 'large': 37,
 'jaw': 2,
 'and': 4993,
 'queen': 11,
 'plain': 16,
 'face': 187,
 'throne': 3,
 'england': 

In [16]:
timeit pipe('data/tale-of-two-cities.txt', open, drop(112), map(str.split), concat, map(stem), frequencies)  # times the whole pipeline, including opening the file

166 ms ± 7.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
timeit with open('data/tale-of-two-cities.txt') as f: wordcount(f)  # NOTE(review): the 112 header lines are not skipped here, unlike the pipe run, so the two timings cover slightly different inputs

190 ms ± 5.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
