# Exercise - Language Model

## n-Grams and Language Models

### Tokenise the corpus

In [26]:
# tokenize
!cat wiki-en-flower.txt | tr ' ' '\n' > wiki-en-flower-token.txt

### Determine the number of word tokens and the number of word types in the corpus.

In [28]:
# word types
!sort wiki-en-flower-token.txt | uniq -c | wc -l

7454


In [29]:
# tokens
!cat wiki-en-flower-token.txt | wc -l

33584


### Generate the bigrams and the trigrams that appear in the corpus.

In [7]:
!tail -n+2 wiki-en-flower-token.txt > tmp1.txt
!paste -d ' ' wiki-en-flower-token.txt tmp1.txt > bigram.txt

In [10]:
!tail -n+2 tmp1.txt > tmp2.txt
!paste -d ' ' wiki-en-flower-token.txt tmp1.txt tmp2.txt > trigram.txt

In [11]:
# clean up
!rm tmp1.txt
!rm tmp2.txt

### How many bigram and trigram types and tokens does the corpus have?

In [8]:
# bigrams
!sort bigram.txt | uniq -c | wc -l

21878


In [12]:
# trigrams
!sort trigram.txt | uniq -c | wc -l

29588


### Name two bigrams and two trigrams that contain the word sunflower and appear more often than once in the corpus. How often do these bigrams and trigrams appear in the corpus?

In [13]:
with open("bigram.txt", "r", encoding="utf-8") as f:
    bigrams = f.readlines()
with open("trigram.txt", "r", encoding="utf-8") as f:
    trigrams = f.readlines()

In [14]:
sunflower = []

for gram in bigrams + trigrams:
    if "sunflower" in gram:
        sunflower.append(gram)

In [15]:
print(sunflower)

['or sunflower\n', 'sunflower ,\n', 'the sunflower\n', 'sunflower family\n', '( sunflowers\n', 'sunflowers ,\n', 'including sunflower\n', 'sunflower (\n', '" \'sunflower\n', "'sunflower family\n", '( sunflower\n', 'sunflower )\n', 'domestic sunflower\n', 'sunflower )\n', ', sunflower\n', 'sunflower seeds\n', 'as sunflower\n', 'sunflower and\n', '; sunflower\n', 'sunflower 1196.6\n', 'in sunflowers\n', 'sunflowers ,\n', 'as sunflower\n', 'sunflower oil\n', ', sunflower\n', 'sunflower seed\n', 'and sunflowers\n', 'sunflowers .\n', 'a sunflower\n', "sunflower 's\n", 'as sunflower\n', 'sunflower ,\n', 'a sunflower\n', 'sunflower was\n', 'that sunflowers\n', 'sunflowers and\n', 'from sunflower\n', 'sunflower ,\n', ', sunflower\n', 'sunflower ,\n', ', sunflower\n', 'sunflower seed\n', 'with sunflowers\n', 'sunflowers ,\n', ', sunflower\n', 'sunflower ,\n', ', sunflower\n', 'sunflower ,\n', 'and sunflower\n', 'sunflower production\n', ', sunflower\n', 'sunflower ,\n', ', sunflower\n', 'sunflo

In [16]:
import collections

c = collections.Counter(sunflower)
# appears more than once
significants = []
for k in c.keys():
    if c[k] > 1:
        significants.append(k)

In [17]:
significants

['sunflower ,\n',
 'sunflowers ,\n',
 'sunflower (\n',
 'sunflower )\n',
 ', sunflower\n',
 'sunflower seeds\n',
 'as sunflower\n',
 'sunflower and\n',
 'sunflower seed\n',
 'and sunflowers\n',
 'a sunflower\n',
 'and sunflower\n',
 'sunflower production\n',
 'of sunflower\n',
 '" sunflower\n',
 ', sunflowers\n',
 'sunflower ) ,\n',
 ', sunflower seeds\n',
 'sunflower seeds ,\n',
 'such as sunflower\n',
 'beets , sunflower\n',
 ', sunflower seed\n',
 'sunflower seed ,\n',
 'sunflower , and\n',
 ', sunflower ,\n',
 'sunflowers , and\n',
 'and sunflower production\n',
 'producer of sunflower\n',
 'of sunflower seed\n',
 'sunflower seed and\n',
 'the " sunflower\n',
 ', sunflowers ,\n']

In [18]:
# frequency of appearing
n_appearence = len(significants)
n_appearence

32

### Estimate the probability of the bigram sunflower seeds using maximum likelihood estimation.

In [21]:
sun_seeds = "sunflower seeds\n"

In [22]:
mle = c[sun_seeds] / n_appearence
mle

0.15625

## Smoothing

### Determine the unigram frequencies for the four word forms and, of, sunflower, seeds, and the bigram frequencies for the 16 bigram combinations of these four word forms.

In [23]:
with open("wiki-en-flower-token.txt", encoding="utf-8") as f:
    tokens = f.readlines()

len(tokens)

33584

In [24]:
word_freqs = collections.Counter(tokens)

In [25]:
unigrams = ["and\n", "of\n", "sunflower\n", "seeds\n"]
for u in unigrams:
    print("{} => {}".format(u, word_freqs[u]))

and
 => 952
of
 => 1034
sunflower
 => 36
seeds
 => 30


In [26]:
from itertools import chain, combinations

combos = list(chain.from_iterable(combinations(unigrams, r) for r in range(len(unigrams) + 1)))

In [27]:
len(combos)

16

In [28]:
combos

[(),
 ('and\n',),
 ('of\n',),
 ('sunflower\n',),
 ('seeds\n',),
 ('and\n', 'of\n'),
 ('and\n', 'sunflower\n'),
 ('and\n', 'seeds\n'),
 ('of\n', 'sunflower\n'),
 ('of\n', 'seeds\n'),
 ('sunflower\n', 'seeds\n'),
 ('and\n', 'of\n', 'sunflower\n'),
 ('and\n', 'of\n', 'seeds\n'),
 ('and\n', 'sunflower\n', 'seeds\n'),
 ('of\n', 'sunflower\n', 'seeds\n'),
 ('and\n', 'of\n', 'sunflower\n', 'seeds\n')]