In [1]:
import dask.bag as db
import re

## Pride and prejudice from  
`http://www.gutenberg.org`  

Book Link : https://www.gutenberg.org/files/1342/1342-0.txt  
Jane Austin's Project Gutenberg page : https://www.gutenberg.org/ebooks/author/68

In [2]:
book_bag = db.from_url('https://www.gutenberg.org/files/1342/1342-0.txt')

In [3]:
book_bag.take(5)

(b'\xef\xbb\xbfThe Project Gutenberg EBook of Pride and Prejudice, by Jane Austen\r\n',
 b'\r\n',
 b'This eBook is for the use of anyone anywhere at no cost and with\r\n',
 b'almost no restrictions whatsoever.  You may copy it, give it away or\r\n',
 b're-use it under the terms of the Project Gutenberg License included\r\n')

In [4]:
strip_spaces = book_bag.map(lambda x:x.strip())

In [6]:
strip_spaces.take(10)

(b'\xef\xbb\xbfThe Project Gutenberg EBook of Pride and Prejudice, by Jane Austen',
 b'',
 b'This eBook is for the use of anyone anywhere at no cost and with',
 b'almost no restrictions whatsoever.  You may copy it, give it away or',
 b're-use it under the terms of the Project Gutenberg License included',
 b'with this eBook or online at www.gutenberg.org',
 b'',
 b'',
 b'Title: Pride and Prejudice',
 b'')

In [8]:
def decode_to_ascii(x):
    return x.decode("ascii","ignore") 

In [9]:
ascii_text = strip_spaces.map(decode_to_ascii)

In [10]:
ascii_text.take(10)

('The Project Gutenberg EBook of Pride and Prejudice, by Jane Austen',
 '',
 'This eBook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever.  You may copy it, give it away or',
 're-use it under the terms of the Project Gutenberg License included',
 'with this eBook or online at www.gutenberg.org',
 '',
 '',
 'Title: Pride and Prejudice',
 '')

In [11]:
def remove_punctuation(x):
    return re.sub(r'[^\w\s]','',x)

In [12]:
remove_punctuation = ascii_text.map(remove_punctuation)

In [13]:
remove_punctuation.take(10)

('The Project Gutenberg EBook of Pride and Prejudice by Jane Austen',
 '',
 'This eBook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever  You may copy it give it away or',
 'reuse it under the terms of the Project Gutenberg License included',
 'with this eBook or online at wwwgutenbergorg',
 '',
 '',
 'Title Pride and Prejudice',
 '')

In [14]:
lower_text = remove_punctuation.map(str.lower)

In [16]:
lower_text.take(10)

('the project gutenberg ebook of pride and prejudice by jane austen',
 '',
 'this ebook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever  you may copy it give it away or',
 'reuse it under the terms of the project gutenberg license included',
 'with this ebook or online at wwwgutenbergorg',
 '',
 '',
 'title pride and prejudice',
 '')

In [17]:
split_word_list = lower_text.map(lambda x: x.split(' '))

In [18]:
split_word_list.take(10)

(['the',
  'project',
  'gutenberg',
  'ebook',
  'of',
  'pride',
  'and',
  'prejudice',
  'by',
  'jane',
  'austen'],
 [''],
 ['this',
  'ebook',
  'is',
  'for',
  'the',
  'use',
  'of',
  'anyone',
  'anywhere',
  'at',
  'no',
  'cost',
  'and',
  'with'],
 ['almost',
  'no',
  'restrictions',
  'whatsoever',
  '',
  'you',
  'may',
  'copy',
  'it',
  'give',
  'it',
  'away',
  'or'],
 ['reuse',
  'it',
  'under',
  'the',
  'terms',
  'of',
  'the',
  'project',
  'gutenberg',
  'license',
  'included'],
 ['with', 'this', 'ebook', 'or', 'online', 'at', 'wwwgutenbergorg'],
 [''],
 [''],
 ['title', 'pride', 'and', 'prejudice'],
 [''])

In [20]:
def remove_empty_words(word_list):
    return list(filter(lambda a: a != '', word_list))

non_empty_words = split_word_list.filter(remove_empty_words)

In [22]:
non_empty_words.take(10)

(['the',
  'project',
  'gutenberg',
  'ebook',
  'of',
  'pride',
  'and',
  'prejudice',
  'by',
  'jane',
  'austen'],
 ['this',
  'ebook',
  'is',
  'for',
  'the',
  'use',
  'of',
  'anyone',
  'anywhere',
  'at',
  'no',
  'cost',
  'and',
  'with'],
 ['almost',
  'no',
  'restrictions',
  'whatsoever',
  '',
  'you',
  'may',
  'copy',
  'it',
  'give',
  'it',
  'away',
  'or'],
 ['reuse',
  'it',
  'under',
  'the',
  'terms',
  'of',
  'the',
  'project',
  'gutenberg',
  'license',
  'included'],
 ['with', 'this', 'ebook', 'or', 'online', 'at', 'wwwgutenbergorg'],
 ['title', 'pride', 'and', 'prejudice'],
 ['author', 'jane', 'austen'],
 ['posting', 'date', 'august', '26', '2008', 'ebook', '1342'],
 ['release', 'date', 'june', '1998'],
 ['last', 'updated', 'march', '10', '2018'])

In [23]:
all_words = non_empty_words.flatten()

In [24]:
type(all_words)

dask.bag.core.Bag

In [26]:
all_words.take(30)

('the',
 'project',
 'gutenberg',
 'ebook',
 'of',
 'pride',
 'and',
 'prejudice',
 'by',
 'jane',
 'austen',
 'this',
 'ebook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'almost',
 'no',
 'restrictions',
 'whatsoever',
 '')

In [27]:
change_to_key_value = all_words.map(lambda x: (x, 1))

In [28]:
change_to_key_value.take(4)

(('the', 1), ('project', 1), ('gutenberg', 1), ('ebook', 1))

## Using `groupby`

In [29]:
grouped_words = all_words.groupby(lambda x:x)

In [30]:
grouped_words.take(1)

(('the',
  ['the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
 

In [31]:
word_count = grouped_words.map(lambda x: (x[0], len(x[1])))

In [32]:
word_count.take(10)

(('the', 4496),
 ('project', 88),
 ('gutenberg', 30),
 ('ebook', 11),
 ('of', 3716),
 ('pride', 50),
 ('and', 3602),
 ('prejudice', 11),
 ('by', 657),
 ('jane', 263))

## Using `foldby`

In [33]:
change_to_key_value.take(10)

(('the', 1),
 ('project', 1),
 ('gutenberg', 1),
 ('ebook', 1),
 ('of', 1),
 ('pride', 1),
 ('and', 1),
 ('prejudice', 1),
 ('by', 1),
 ('jane', 1))

In [34]:
# Take a running count of a word
# In this case, the default value of 
# count needs to be provided
def add_bin_op(count, x):
    return count + x[1]

# Take the output from multiple bin_op(s)
# and add them to get the total count of
# a word
def add_combine_op(x, y):
    return x + y

word_count = change_to_key_value.foldby(lambda x: x[0],
                                       add_bin_op, 0,
                                       add_combine_op)

In [35]:
word_count.take(10)

(('the', 4496),
 ('project', 88),
 ('gutenberg', 30),
 ('ebook', 11),
 ('of', 3716),
 ('pride', 50),
 ('and', 3602),
 ('prejudice', 11),
 ('by', 657),
 ('jane', 263))

## Using `frequencies`

In [36]:
much_easier = all_words.frequencies()

In [37]:
much_easier.take(10)

(('the', 4496),
 ('project', 88),
 ('gutenberg', 30),
 ('ebook', 11),
 ('of', 3716),
 ('pride', 50),
 ('and', 3602),
 ('prejudice', 11),
 ('by', 657),
 ('jane', 263))

We see that there are stopwords with most frequencies. Lets remove them.

In [38]:
from spacy.lang.en import STOP_WORDS

In [39]:
without_stopwords = all_words.filter(lambda x: x not in STOP_WORDS)

In [40]:
new_freq = without_stopwords.frequencies()

In [41]:
new_freq.take(20)

(('project', 88),
 ('gutenberg', 30),
 ('ebook', 11),
 ('pride', 50),
 ('prejudice', 11),
 ('jane', 263),
 ('austen', 4),
 ('use', 26),
 ('cost', 7),
 ('restrictions', 2),
 ('whatsoever', 2),
 ('', 98),
 ('copy', 12),
 ('away', 119),
 ('reuse', 2),
 ('terms', 43),
 ('license', 15),
 ('included', 5),
 ('online', 4),
 ('wwwgutenbergorg', 3))

In [42]:
new_freq.topk(10)

dask.bag<topk-ag..., npartitions=1>

In [44]:
new_freq.topk(10, key=lambda x: x[1]).compute()

[('mr', 782),
 ('elizabeth', 594),
 ('said', 401),
 ('darcy', 370),
 ('mrs', 343),
 ('bennet', 293),
 ('miss', 283),
 ('jane', 263),
 ('bingley', 257),
 ('know', 237)]