In [25]:
import re
from collections import defaultdict as dd, Counter
from nltk.util import bigrams
import nltk
from sklearn.model_selection import train_test_split as tts
import math
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [26]:

with open('transcript.txt', 'r') as f:
  transcript = [re.split(r'(\d+\w\d\w\d+)', l)[-1].strip() for l in f.readlines()]

In [27]:

# divide data 80 : 20
train_transcript, test_transcipt =  tts(transcript, test_size=0.2, random_state=42)


In [28]:
# replace low frequency word with <UNK> in train set
word_counter = Counter()
for sentence in train_transcript:
  tokens = nltk.word_tokenize(sentence)
  word_counter.update(tokens)

threshold = 10

def replace_word_with_unk(tokens, threshold, word_counter):
  return [token if word_counter[token] > threshold else '<unk>' for token in tokens]

sentences_with_unk = []
for sentence in train_transcript:
  tokens = nltk.word_tokenize(sentence)
  sentences_with_unk.append(replace_word_with_unk(tokens, threshold, word_counter))

In [29]:
# build bigram, unigram table
bigram_model = dd(lambda: dd(lambda: 0))
unigram_model = dd(lambda: 0)
total_token = 0

for sentence in sentences_with_unk:
  # bigram
  for w1, w2 in bigrams(sentence, pad_right=True, pad_left=True):
    bigram_model[w1][w2] += 1
  # unigram
  for w in sentence:
    unigram_model[w] += 1
    total_token += 1


In [30]:
# normalize
for w in unigram_model.keys():
  unigram_model[w] /= total_token 

for w1 in bigram_model:
  total_count = sum(bigram_model[w1].values())
  for w2 in bigram_model[w1]: 
    bigram_model[w1][w2] /= total_count

In [31]:
# replace unknown word in test set with <unk>
test_sentence_with_unk = []

for sentence in test_transcipt:
  tokens = nltk.word_tokenize(sentence)
  test_sentence_with_unk.append(replace_word_with_unk(tokens, threshold, word_counter))

In [32]:

# evaluate function (log perplexity)
def cal_perplexity_bigram(test_sentence, bigram_model_):
  prob = 0
  for w1, w2 in bigrams(test_sentence, pad_right=True, pad_left=True):
    if bigram_model_[w1][w2] > 0:
      prob += math.log(bigram_model_[w1][w2])
  return math.exp(-(prob / len(test_sentence))) # perplexity

def cal_perplexity_unigram(test_sentence, unigram_model_):
  prob = 0
  for w in test_sentence:
    if unigram_model_[w] > 0:
      prob += math.log(unigram_model_[w])
  return math.exp(-(prob / len(test_sentence)))

In [33]:
bigram_perplexity = sum(map(lambda x : math.log(x), [cal_perplexity_bigram(sentence, bigram_model) for sentence in test_sentence_with_unk]))
unigram_perplexity = sum(map(lambda x : math.log(x), [cal_perplexity_unigram(sentence, unigram_model) for sentence in test_sentence_with_unk]))



In [34]:
print("The Unigram perplexity is : ", unigram_perplexity)

The Unigram perplexity is :  8383.284551889115


In [35]:
print("The Bigram perplexity is: ", bigram_perplexity)

The Bigram perplexity is:  4773.931733399603
