In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import random
import string

In [2]:
import nltk
from nltk.tag import pos_tag
from nltk import wordnet, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [3]:
# Download the NLTK data packages into the local nltk_data directory.
# Tokenizer models (unzipped under tokenizers/); presumably what word_tokenize needs.
nltk.download('punkt')
# Part-of-speech tagger model (unzipped under taggers/).
nltk.download('averaged_perceptron_tagger')
# WordNet data; NOTE(review): nothing in the visible cells uses the tagger or WordNet.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Dataset

In [5]:
# Corpus container: one entry per document. The loading cell below appends to this
# list, so re-run this cell before re-running the loader to avoid duplicate entries.
X = []

In [6]:
# Read the corpus and merge each run of consecutive non-blank lines into one
# document. Every entry appended to X is a one-element list holding the joined
# text, which is the shape LanguageModel.fit reads via doc[0].
with open('edgar_allan_poe.txt') as f:
    stanza = []
    for raw in f:
        text = raw.replace('\n', '')
        if len(text) < 2:
            # A line shorter than two characters acts as a separator. Only emit a
            # document when something was collected, so consecutive blank lines
            # no longer produce empty documents.
            if stanza:
                X.append([' '.join(stanza)])
                stanza = []
        else:
            stanza.append(text)
    # Flush the last stanza: without this the final document is lost whenever
    # the file does not end with a blank line.
    if stanza:
        X.append([' '.join(stanza)])

In [7]:
# Number of documents collected from the corpus file.
len(X)

79

# Class

In [23]:
class LanguageModel():
  """Second-order Markov text generator built from raw word counts.

  State (all filled by ``fit``):
    TF: word -> number of occurrences over all fitted documents.
    Pi: word -> number of documents that start with that word.
    A:  first word -> {second word -> count}; only the first transition of
        each document is recorded here.
    A3: word[t-2] -> {word[t-1] -> {word[t] -> count}} for every later position.
    n:  number of documents passed to ``fit`` (documents with no tokens included).
  """

  def __init__(self):
    self.TF = {}
    self.Pi = {}
    self.A = {}
    self.A3 = {}
    self.n = 0

  def text_preprocess(self, s):
    """Return ``s`` lower-cased with every ``string.punctuation`` character removed."""
    return s.lower().translate(str.maketrans('', '', string.punctuation))

  def add_TF(self, word):
    """Increment the occurrence count of ``word`` in ``self.TF``."""
    self.TF[word] = self.TF.get(word, 0) + 1

  def fit(self, docs):
    """Accumulate counts from ``docs`` into the model.

    Args:
      docs: iterable of documents; the text of each one is read from ``doc[0]``.

    Counts are added to whatever the model already holds, so calling ``fit``
    again (or twice on the same data) keeps accumulating.
    """
    for doc in docs:
      tokens = word_tokenize(self.text_preprocess(doc[0]))
      prev_2 = ''
      prev_1 = ''
      for position, word in enumerate(tokens):
        if position == 0:
          # first word of the document -> initial-state counts
          self.Pi[word] = self.Pi.get(word, 0) + 1
        elif position == 1:
          # second word -> first-order counts conditioned on the first word
          followers = self.A.setdefault(prev_1, {})
          followers[word] = followers.get(word, 0) + 1
        else:
          # every later word -> counts conditioned on the two preceding words
          followers = self.A3.setdefault(prev_2, {}).setdefault(prev_1, {})
          followers[word] = followers.get(word, 0) + 1

        self.add_TF(word)
        prev_2, prev_1 = prev_1, word
      self.n += 1

  def _sample(self, counts):
    """Draw one key of ``counts`` with probability proportional to its count."""
    keys = list(counts.keys())
    weights = np.array(list(counts.values()), dtype=float)
    # Normalise by the table's own total so the probabilities always sum to 1.
    idx = np.random.choice(len(keys), p=weights / weights.sum())
    return keys[idx]

  def generate(self, start='', token_limit=20):
    """Generate text by sampling from the fitted counts.

    Args:
      start: optional first word (lower-cased before use). When empty, the first
        word is drawn from the initial-word counts ``Pi``.
      token_limit: maximum number of words to sample (``start`` is not counted).

    Returns:
      The words joined by single spaces with the first one capitalised, or ''
      when nothing could be produced (e.g. the model has not been fitted).

    Sampling stops early as soon as the current context has no recorded
    continuation. Draws come from NumPy's global random state.
    """
    buffer = []
    if start != '':
      buffer.append(start.lower())

    for _ in range(token_limit):
      try:
        if len(buffer) == 0:
          if not self.Pi:
            # unfitted model: nothing to sample from
            break
          # Pi is normalised by its own total rather than by self.n: self.n also
          # counts documents without tokens, which made the probabilities sum to
          # less than 1 and np.random.choice raise ValueError.
          word = self._sample(self.Pi)
        elif len(buffer) == 1:
          word = self._sample(self.A[buffer[-1]])
        else:
          word = self._sample(self.A3[buffer[-2]][buffer[-1]])
      except KeyError:
        # unseen context -> no continuation available
        break
      buffer.append(word)

    if not buffer:
      return ''
    return ' '.join([buffer[0].capitalize()] + buffer[1:])

In [24]:
# Fresh model with empty count tables.
model = LanguageModel()

In [25]:
# Accumulate counts from every document in X. fit() adds to the existing tables,
# so re-running this cell without recreating the model double-counts the corpus.
model.fit(X)

In [26]:
# Sample up to 100 words following the seed word "i". No random seed is set in the
# visible cells, so the generated text differs from run to run.
model.generate(start='i', token_limit=100)

'I saw but them they were the world thy gentle ways thy grace thy more than beauty shall be and given in beauty from his birth whose fervid flickering torch of life was lit from the silver tinkling throats of the bells ah the bells the heavy iron bells how horrible a monody there floats from their throats from their deeptoned throats from their throats from the sun did rivulets run and all thy melody of lipbegotten words'