# Bag Of Words

[Article link](https://nlpforhackers.io/language-models/)

* It has an oversimplified view of the language
* It takes into account only the frequency of the words in the language, not their order or position

In [3]:
from nltk.corpus import reuters
from collections import Counter

In [4]:
# Count how often each token occurs across the whole Reuters corpus.
counts = Counter(reuters.words())
# Total number of tokens. Summing the counts gives the same number as
# len(reuters.words()) without walking the corpus a second time.
total_count = sum(counts.values())

In [5]:
# Show the 20 most frequent tokens together with their counts
counts.most_common(n=20)

[('.', 94687),
 (',', 72360),
 ('the', 58251),
 ('of', 35979),
 ('to', 34035),
 ('in', 26478),
 ('said', 25224),
 ('and', 25043),
 ('a', 23492),
 ('mln', 18037),
 ('vs', 14120),
 ('-', 13705),
 ('for', 12785),
 ('dlrs', 11730),
 ("'", 11272),
 ('The', 10968),
 ('000', 10277),
 ('1', 9977),
 ('s', 9298),
 ('pct', 9093)]

In [6]:
# Turn every raw count into a relative frequency, updating `counts` in place
# (only values are reassigned, so iterating over the items is safe).
for token, occurrences in counts.items():
    counts[token] = occurrences / float(total_count)

In [7]:
# The five most frequent tokens, with the values now stored in `counts`
counts.most_common(n=5)

[('.', 0.055021758950689205),
 (',', 0.042047741270415905),
 ('the', 0.033849129031826936),
 ('of', 0.02090707135390124),
 ('to', 0.01977743054365126)]

In [8]:
# The frequencies should add up to 1 (up to floating-point rounding)
sum(counts.values())  # 1.0

1.0000000000006808

In [9]:
import random

# Generate 100 words of language: each word is drawn independently, with a
# probability proportional to its value in `counts`.
# random.choices does the cumulative-weight sampling itself, so it always
# returns exactly 100 words (a hand-rolled accumulator can come up short when
# rounding leaves the total just below the random draw) and it does not
# require the weights to sum to 1.
vocabulary = list(counts)
weights = [counts[w] for w in vocabulary]
text = random.choices(vocabulary, weights=weights, k=100)

' '.join(text)

'. , end all cents refineries was and UNIONS foreign markets of could The SHR mln middle contract was the , to Oct international of billion , ," , off , 4 lt their said back dirhams analysts emphasised 616 ICCO hold than income pct . government 000 . station problem filed 5 acquire defense for loophole major and provision to . in 40 , , the terms PRIMARY on lt to . > , ; , a finance partnership Driel by which production 24 the Gene before imported . Analysts , gold the Earlier Fed season loss billion reminder'

Now that we know the probability of every word, we can compute the probability of a text. Because the words were generated independently, we just need to multiply all of their probabilities together:

In [10]:
# Probability of the generated text: the product of the per-word values
# stored in `counts`, folded left to right starting from 1.0.
from functools import reduce
from operator import mul

word_probabilities = (counts[w] for w in text)
reduce(mul, word_probabilities, 1.0)

1.3493098526332951e-294

# Bigrams and Trigrams

In [11]:
from nltk import bigrams, trigrams
from collections import defaultdict

In [12]:
# Take the first sentence of the corpus (a list of tokens) as a running example
first_sentence = reuters.sents()[0]
print(first_sentence)

['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'Asia', "'", 's', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', '-', 'reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.']


In [13]:
# Get the bigrams: every pair of adjacent tokens in the sentence
print(list(bigrams(first_sentence)))

[('ASIAN', 'EXPORTERS'), ('EXPORTERS', 'FEAR'), ('FEAR', 'DAMAGE'), ('DAMAGE', 'FROM'), ('FROM', 'U'), ('U', '.'), ('.', 'S'), ('S', '.-'), ('.-', 'JAPAN'), ('JAPAN', 'RIFT'), ('RIFT', 'Mounting'), ('Mounting', 'trade'), ('trade', 'friction'), ('friction', 'between'), ('between', 'the'), ('the', 'U'), ('U', '.'), ('.', 'S'), ('S', '.'), ('.', 'And'), ('And', 'Japan'), ('Japan', 'has'), ('has', 'raised'), ('raised', 'fears'), ('fears', 'among'), ('among', 'many'), ('many', 'of'), ('of', 'Asia'), ('Asia', "'"), ("'", 's'), ('s', 'exporting'), ('exporting', 'nations'), ('nations', 'that'), ('that', 'the'), ('the', 'row'), ('row', 'could'), ('could', 'inflict'), ('inflict', 'far'), ('far', '-'), ('-', 'reaching'), ('reaching', 'economic'), ('economic', 'damage'), ('damage', ','), (',', 'businessmen'), ('businessmen', 'and'), ('and', 'officials'), ('officials', 'said'), ('said', '.')]


In [14]:
# Get the padded bigrams: None marks the slot before the first token and
# the slot after the last one
print (list(bigrams(first_sentence, pad_left=True, pad_right=True)))

[(None, 'ASIAN'), ('ASIAN', 'EXPORTERS'), ('EXPORTERS', 'FEAR'), ('FEAR', 'DAMAGE'), ('DAMAGE', 'FROM'), ('FROM', 'U'), ('U', '.'), ('.', 'S'), ('S', '.-'), ('.-', 'JAPAN'), ('JAPAN', 'RIFT'), ('RIFT', 'Mounting'), ('Mounting', 'trade'), ('trade', 'friction'), ('friction', 'between'), ('between', 'the'), ('the', 'U'), ('U', '.'), ('.', 'S'), ('S', '.'), ('.', 'And'), ('And', 'Japan'), ('Japan', 'has'), ('has', 'raised'), ('raised', 'fears'), ('fears', 'among'), ('among', 'many'), ('many', 'of'), ('of', 'Asia'), ('Asia', "'"), ("'", 's'), ('s', 'exporting'), ('exporting', 'nations'), ('nations', 'that'), ('that', 'the'), ('the', 'row'), ('row', 'could'), ('could', 'inflict'), ('inflict', 'far'), ('far', '-'), ('-', 'reaching'), ('reaching', 'economic'), ('economic', 'damage'), ('damage', ','), (',', 'businessmen'), ('businessmen', 'and'), ('and', 'officials'), ('officials', 'said'), ('said', '.'), ('.', None)]


In [15]:
# Get the trigrams: every run of three adjacent tokens in the sentence
print(list(trigrams(first_sentence)))

[('ASIAN', 'EXPORTERS', 'FEAR'), ('EXPORTERS', 'FEAR', 'DAMAGE'), ('FEAR', 'DAMAGE', 'FROM'), ('DAMAGE', 'FROM', 'U'), ('FROM', 'U', '.'), ('U', '.', 'S'), ('.', 'S', '.-'), ('S', '.-', 'JAPAN'), ('.-', 'JAPAN', 'RIFT'), ('JAPAN', 'RIFT', 'Mounting'), ('RIFT', 'Mounting', 'trade'), ('Mounting', 'trade', 'friction'), ('trade', 'friction', 'between'), ('friction', 'between', 'the'), ('between', 'the', 'U'), ('the', 'U', '.'), ('U', '.', 'S'), ('.', 'S', '.'), ('S', '.', 'And'), ('.', 'And', 'Japan'), ('And', 'Japan', 'has'), ('Japan', 'has', 'raised'), ('has', 'raised', 'fears'), ('raised', 'fears', 'among'), ('fears', 'among', 'many'), ('among', 'many', 'of'), ('many', 'of', 'Asia'), ('of', 'Asia', "'"), ('Asia', "'", 's'), ("'", 's', 'exporting'), ('s', 'exporting', 'nations'), ('exporting', 'nations', 'that'), ('nations', 'that', 'the'), ('that', 'the', 'row'), ('the', 'row', 'could'), ('row', 'could', 'inflict'), ('could', 'inflict', 'far'), ('inflict', 'far', '-'), ('far', '-', 'rea

In [16]:
# Get the padded trigrams: the sentence start is preceded by two None markers
print(list(trigrams(first_sentence, pad_left=True, pad_right=True)))

[(None, None, 'ASIAN'), (None, 'ASIAN', 'EXPORTERS'), ('ASIAN', 'EXPORTERS', 'FEAR'), ('EXPORTERS', 'FEAR', 'DAMAGE'), ('FEAR', 'DAMAGE', 'FROM'), ('DAMAGE', 'FROM', 'U'), ('FROM', 'U', '.'), ('U', '.', 'S'), ('.', 'S', '.-'), ('S', '.-', 'JAPAN'), ('.-', 'JAPAN', 'RIFT'), ('JAPAN', 'RIFT', 'Mounting'), ('RIFT', 'Mounting', 'trade'), ('Mounting', 'trade', 'friction'), ('trade', 'friction', 'between'), ('friction', 'between', 'the'), ('between', 'the', 'U'), ('the', 'U', '.'), ('U', '.', 'S'), ('.', 'S', '.'), ('S', '.', 'And'), ('.', 'And', 'Japan'), ('And', 'Japan', 'has'), ('Japan', 'has', 'raised'), ('has', 'raised', 'fears'), ('raised', 'fears', 'among'), ('fears', 'among', 'many'), ('among', 'many', 'of'), ('many', 'of', 'Asia'), ('of', 'Asia', "'"), ('Asia', "'", 's'), ("'", 's', 'exporting'), ('s', 'exporting', 'nations'), ('exporting', 'nations', 'that'), ('nations', 'that', 'the'), ('that', 'the', 'row'), ('the', 'row', 'could'), ('row', 'could', 'inflict'), ('could', 'inflict

We’re going to build a trigram model from the Reuters corpus. Building a bigram model is completely analogous and easier.

In [17]:
# Trigram counts: model[(w1, w2)][w3] is how many times w3 follows the pair
# (w1, w2). Missing contexts and missing words default to an empty table and
# to 0 respectively.
model = defaultdict(lambda: defaultdict(int))

for sent in reuters.sents():
    # Padding on both sides means sentence starts and ends are counted too.
    for first, second, third in trigrams(sent, pad_left=True, pad_right=True):
        context = (first, second)
        model[context][third] += 1

In [21]:
# model[(w1, w2)][w3] is looked up here with the context "what the":
# "economists" follows "what the" 2 times
print(model["what", "the"]["economists"])

2


In [23]:
# Looking up an unseen word yields the defaultdict's default of 0; note that
# the lookup itself also stores that zero entry in the model.
print(model["what", "the"]["nonexistingword"]) # 0 times

0


In [25]:
# The (None, None) context is the left padding, i.e. the start of a sentence
print(model[None, None]["The"]) # 8839 sentences start with "The"

8839


In [59]:
# Let's transform the counts to probabilities: within each (w1, w2) context,
# divide every follower count by the context's total.
for followers in model.values():
    # A dedicated name keeps the corpus-wide `total_count` from the unigram
    # section from being overwritten.
    context_total = float(sum(followers.values()))
    for w3 in followers:
        # A context holding only zero entries has no mass to distribute;
        # assign 0.0 instead of dividing by zero.
        followers[w3] = followers[w3] / context_total if context_total > 0 else 0.0

In [60]:
# After normalisation this is the probability of "economists" given "what the"
model["what", "the"]["economists"]

0.043478260869565244

In [61]:
# A word never seen after "what the" has probability 0.0
model["what", "the"]["nonexistingword"]

0.0

In [62]:
# Probability that a sentence starts with "The"
model[None, None]["The"]

0.16154324146505644

Let's generate some text with this model.

In [66]:
import random

# Seed the text with the left padding so the first word is drawn from the
# sentence-start context (None, None).
text = [None, None]

sentence_finished = False
prob = 1.0  # running probability of the generated sentence
while not sentence_finished:
    r = random.random()
    accumulator = .0
    # Look the current two-word context up once per draw instead of on every
    # inner-loop step.
    followers = model[tuple(text[-2:])]
    appended = False

    for word, word_prob in followers.items():
        accumulator += word_prob
        # Zero-probability entries (e.g. left behind by lookups of unseen
        # words) must never be emitted, even if r happens to be exactly 0.0.
        if word_prob > 0.0 and accumulator >= r:
            prob *= word_prob
            text.append(word)
            appended = True
            break

    if not appended:
        if accumulator <= 0.0:
            # Dead-end context with no probability mass: stop rather than
            # loop forever.
            break
        # Rounding left the total mass just below r: draw again.
        continue

    # Two trailing None markers produced by the model mean the sentence ended.
    # Checking only after an append keeps the initial [None, None] seed from
    # being mistaken for an end of sentence.
    if text[-2:] == [None, None]:
        sentence_finished = True

In [67]:
# Join the generated tokens, dropping the None padding markers
' '.join([t for t in text if t])

'ICCO TO EXAMINE BUFFER STOCK ACCORD CLOSER , DELEGATES SAY The Bank of England said it will drill an exploratory well , it said the other extreme , that is what is due begin producing .'

In [68]:
# Product of the trigram probabilities of the words that were generated
prob

2.786301029126488e-26