In [1]:
import pandas as pd
import numpy as np
import os
import re
import requests
import time

This is an NLP project that generates new text from the patterns found in any body of text it is given. It works by computing a series of conditional probabilities from an N-gram division of the words, and then sampling new text from the resulting model. It also includes a web-scraping component, since the text data comes from the Project Gutenberg website. Below is an example of the model being used on the book *Beowulf*.

In [2]:
def _strip_gutenberg_boilerplate(text):
    """
    Return the part of `text` that lies between the two '***'-delimited
    marker lines of a Project Gutenberg file.

    The file is expected to look like
        <header> *** START ... *** <book> *** END ... *** <licence>
    and only <book> is returned. When a marker is missing the function keeps
    as much text as it can instead of slicing with a -1 index.
    """
    marker = "***"

    start_open = text.find(marker)
    if start_open == -1:
        # No markers at all: nothing to strip.
        return text

    start_close = text.find(marker, start_open + len(marker))
    if start_close == -1:
        # Opening marker line is never closed: leave the text untouched.
        return text

    body_start = start_close + len(marker)
    end_open = text.find(marker, body_start)
    if end_open == -1:
        # No END marker: keep everything after the START line.
        return text[body_start:]
    return text[body_start:end_open]


def get_book(url, delay=7, timeout=30):
    """
    Download a plain-text book and strip the Project Gutenberg header/footer.

    :param url: address of the plain-text file to download.
    :param delay: seconds to wait before issuing the request (default 7,
        presumably to stay polite to the server); pass 0 to skip the pause.
    :param timeout: seconds to wait for the server before giving up.
    :returns: the book text between the START and END marker lines, with
        Windows line endings ("\\r\\n") normalised to "\\n".
    :raises requests.HTTPError: if the server answers with an error status,
        so an error page is never mistaken for book text.
    """
    if delay > 0:
        time.sleep(delay)
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    text = response.text.replace("\r\n", "\n")
    return _strip_gutenberg_boilerplate(text)


# Exploratory download of a sample Gutenberg text, using the same marker
# arithmetic as get_book. Unlike get_book, the result is NOT truncated at the
# closing '*** END ... ***' marker, so `text` still contains the trailing licence.
response = requests.get('https://www.gutenberg.org/files/57988/57988-0.txt', timeout=30)
# Fail loudly on an HTTP error instead of slicing an error page as if it were a book.
response.raise_for_status()
text = response.text.replace("\r\n", "\n")

# Skip past the opening '*** ' of the START marker line ...
first = text.find("***")
text = text[first + 4:]

# ... then past the '***' that closes that line (the search starts 3 characters in,
# hence the +6 = 3 skipped characters + 3 marker characters).
second = text[3:].find("***")
text = text[second + 6:]


In [5]:
def tokenize(book_string):
    """
    Split raw book text into a flat list of word and punctuation tokens.

    Every paragraph (text separated by a blank line) is wrapped in marker
    tokens: "\\x02" marks the start and "\\x03" marks the end. Punctuation
    characters become tokens of their own, and an ellipsis ("...") is kept
    together as a single token.

    :param book_string: the text to tokenize.
    :returns: list of tokens that always begins with "\\x02", ends with
        "\\x03" and never contains an empty string.

    :Example:
    >>> tokenize("Wait... what")[1:-1]
    ['Wait', '...', 'what']
    >>> len(tokenize(""))
    2
    """
    punctuation = '''!()-[]{};:'"\\,’<>./?@#$%^&*_~'''
    # The ellipsis alternative comes first so that "..." is matched as a whole
    # before its dots can be split up one by one.
    splitter = re.compile(r"\.\.\.|[" + re.escape(punctuation) + "]")

    text = re.sub("^\n+", " \x02 ", book_string)
    text = re.sub("\n+$", " \x03", text)
    # A blank line closes one paragraph and opens the next.
    text = re.sub("(\n\n)+", " \x03 \x02 ", text)
    text = text.replace("\n", " ")
    text = splitter.sub(lambda match: " " + match.group(0) + " ", text)

    # Filtering (rather than a single list.remove) drops the empty strings that
    # padding produces at BOTH ends of the text, and copes with empty input.
    tokens = [token for token in text.split(" ") if token]

    if not tokens or tokens[0] != "\x02":
        tokens.insert(0, "\x02")
    if tokens[-1] != "\x03":
        tokens.append("\x03")
    return tokens

In [6]:
class UnigramLM(object):
    """Unigram language model: each token is drawn independently of its neighbours."""

    def __init__(self, tokens):
        """
        Initializes a Unigram language model using a
        list of tokens. It trains the language model
        using `train` and saves it to an attribute
        self.mdl.
        """
        self.mdl = self.train(tokens)

    def train(self, tokens):
        """
        Trains a unigram language model given a list of tokens.
        The output is a series indexed on distinct tokens, and
        values giving the probability of a token occurring
        in the language.

        :Example:
        >>> tokens = tuple('one one two three one two four'.split())
        >>> unig = UnigramLM(tokens)
        >>> isinstance(unig.mdl, pd.Series)
        True
        >>> set(unig.mdl.index) == set('one two three four'.split())
        True
        >>> unig.mdl.loc['one'] == 3 / 7
        True
        """
        frame = pd.DataFrame({"words": pd.Series(tokens)})
        frame["count"] = 1
        # Occurrences per distinct token, then normalised to relative frequencies.
        counts = frame.groupby("words")["count"].count()
        return counts / counts.sum()

    def probability(self, words):
        """
        probability gives the probability a sequence of words
        appears under the language model.
        :param: words: a tuple of tokens
        :returns: the probability `words` appears under the language
        model; 0 if any of the words was never seen in training.

        :Example:
        >>> tokens = tuple('one one two three one two four'.split())
        >>> unig = UnigramLM(tokens)
        >>> unig.probability(('five',))
        0
        >>> p = unig.probability(('one', 'two'))
        >>> np.isclose(p, 0.12244897959, atol=0.0001)
        True
        """
        try:
            return np.prod(self.mdl.loc[list(words)])
        except KeyError:
            # At least one word is not in the vocabulary.
            return 0

    def sample(self, M):
        """
        sample selects tokens from the language model of length M, returning
        a string of tokens.

        >>> tokens = tuple('one one two three one two four'.split())
        >>> unig = UnigramLM(tokens)
        >>> samp = unig.sample(1000)
        >>> isinstance(samp, str)
        True
        >>> len(samp.split()) == 1000
        True
        >>> s = pd.Series(samp.split()).value_counts(normalize=True).loc['one']
        >>> np.isclose(s, 0.41, atol=0.05).all()
        True
        """
        vocabulary = list(self.mdl.index)
        weights = list(self.mdl.values)
        return " ".join(list(np.random.choice(vocabulary, M, p=weights)))

In [7]:
class NGramLM(object):
    """
    N-gram language model (N >= 2): the probability of a token depends on the
    N-1 tokens before it. Each model also holds the model of order N-1 in
    `prev_mdl` (down to a unigram model), which is used for the first tokens
    of a sequence, where fewer than N-1 tokens of context exist.
    """

    def __init__(self, N, tokens):
        """
        Initializes a N-gram language model using a
        list of tokens. It trains the language model
        using `train` and saves it to an attribute
        self.mdl.

        :raises ValueError: if N is smaller than 2.
        """
        # Validate before doing any (potentially expensive) training.
        if N < 2:
            raise ValueError('N must be greater than 1')

        self.N = N
        self.ngrams = self.create_ngrams(tokens)
        self.mdl = self.train(self.ngrams)

        if N == 2:
            self.prev_mdl = UnigramLM(tokens)
        else:
            self.prev_mdl = NGramLM(N - 1, tokens)

    def create_ngrams(self, tokens):
        """
        create_ngrams takes in a list of tokens and returns a list of N-grams
        (tuples of N consecutive tokens), in order of appearance.

        :Example:
        >>> tokens = tuple('\x02 one two three one four \x03'.split())
        >>> bigrams = NGramLM(2, [])
        >>> out = bigrams.create_ngrams(tokens)
        >>> isinstance(out[0], tuple)
        True
        >>> out[0]
        ('\\x02', 'one')
        >>> out[2]
        ('two', 'three')
        """
        # Zipping the sequence against itself shifted by 0..N-1 positions yields
        # every window of N consecutive tokens.
        shifted = [tokens[offset:] for offset in range(self.N)]
        return list(zip(*shifted))

    def train(self, ngrams):
        """
        Trains a n-gram language model given a list of n-grams.
        The output is a dataframe with three columns (ngram, n1gram, prob),
        one row per distinct n-gram, where n1gram is the n-gram without its
        last token and prob = count(ngram) / count(n1gram).
        An empty input gives an empty dataframe with the same columns.

        :Example:
        >>> tokens = tuple('\x02 one two three one four \x03'.split())
        >>> bigrams = NGramLM(2, tokens)
        >>> set(bigrams.mdl.columns) == set('ngram n1gram prob'.split())
        True
        >>> bigrams.mdl.shape == (6, 3)
        True
        >>> bigrams.mdl['prob'].min() == 0.5
        True
        """
        if len(ngrams) == 0:
            return pd.DataFrame(columns=["ngram", "n1gram", "prob"])

        table = pd.DataFrame({
            "ngram": pd.Series(ngrams),
            "n1gram": pd.Series([gram[:-1] for gram in ngrams]),
        })
        ngram_count = table.groupby("ngram")["n1gram"].transform("count")
        context_count = table.groupby("n1gram")["ngram"].transform("count")
        table["prob"] = ngram_count / context_count
        return table.drop_duplicates()

    def probability(self, words):
        """
        probability gives the probability a sequence of words
        appears under the language model.

        Every N-gram in `words` contributes its conditional probability once
        per occurrence; the first N-1 words, which lack a full context, are
        scored by the lower-order models. A sequence shorter than N is scored
        entirely by the lower-order models.

        :param: words: a tuple of tokens
        :returns: the probability `words` appears under the language
        model; 0 if it contains an N-gram that was never seen in training.

        :Example:
        >>> tokens = tuple('\x02 one two one three one two \x03'.split())
        >>> bigrams = NGramLM(2, tokens)
        >>> p = bigrams.probability('two one three'.split())
        >>> np.isclose(p, (1/4) * (1/2) * (1/3))
        True
        >>> bigrams.probability('one two five'.split()) == 0
        True
        """
        words = tuple(words)
        if len(words) < self.N:
            # Not enough tokens for a single N-gram: back off completely.
            return self.prev_mdl.probability(words)

        known = dict(zip(self.mdl["ngram"], self.mdl["prob"]))
        prob = 1.0
        for gram in self.create_ngrams(words):
            if gram not in known:
                return 0
            prob = prob * known[gram]

        # The leading N-1 words are handled by the model one order below, which
        # in turn delegates its own leading words further down the chain.
        return prob * self.prev_mdl.probability(words[:self.N - 1])

    def _continuations(self):
        """Map each (N-1)-token context to ([possible next tokens], [their probabilities])."""
        table = {}
        rows = zip(self.mdl["ngram"], self.mdl["n1gram"], self.mdl["prob"])
        for gram, context, prob in rows:
            next_tokens, probs = table.setdefault(context, ([], []))
            next_tokens.append(gram[-1])
            probs.append(prob)
        return table

    def sample(self, M):
        """
        sample selects tokens from the language model of length M, returning
        a string of tokens.

        The string starts with the START token, followed by M tokens of which
        the last one is always the STOP token (so it splits into M + 1 tokens;
        M = 0 gives the START token alone). While fewer than N-1 tokens exist,
        the next token is drawn from the lower-order model that matches the
        available context. If the current context was never followed by
        anything in training, the rest of the sample is filled with STOP tokens.

        :Example:
        >>> tokens = tuple('\x02 one two three one four \x03'.split())
        >>> bigrams = NGramLM(2, tokens)
        >>> samp = bigrams.sample(3)
        >>> len(samp.split()) == 4  # don't count the initial START token.
        True
        >>> samp[:2] == '\\x02 '
        True
        >>> set(samp.split()) <= {'\\x02', '\\x03', 'one', 'two', 'three', 'four'}
        True
        """
        # Build one lookup table per model order (N, N-1, ..., 2) up front, so
        # drawing a token no longer scans the whole model.
        tables = {}
        model = self
        while isinstance(model, NGramLM):
            tables[model.N] = model._continuations()
            model = model.prev_mdl

        tokens = ["\x02"]
        dead_end = False
        for _ in range(M - 1):
            # Use the highest order for which enough context is available.
            order = min(len(tokens) + 1, self.N)
            context = tuple(tokens[-(order - 1):])
            options = None if dead_end else tables[order].get(context)
            if options is None:
                dead_end = True
                tokens.append("\x03")
                continue
            next_tokens, probs = options
            pick = np.random.choice(len(next_tokens), p=probs)
            tokens.append(next_tokens[pick])

        if M >= 1:
            tokens.append("\x03")
        return " ".join(tokens)

In [13]:
# Download Beowulf from Project Gutenberg and split it into tokens.
# NOTE: get_book pauses before sending the request, so this cell takes several seconds.
beowulf = get_book('https://www.gutenberg.org/ebooks/16328.txt.utf-8')
beo_tokens = tokenize(beowulf)

In [14]:
# Train a 5-gram model; the constructor also trains the 4-, 3- and 2-gram models
# and the unigram model that it falls back on.
beo = NGramLM(5, beo_tokens )

In [15]:
# Generate a random passage. No NumPy seed is set, so the output differs on every run.
beo.sample(100)

'\x02 _ When the hero arrives in his own land , Higelac treats him as a distinguished guest . He is the great - grandfather of Hrothgar , so prominent in the poem . } \x03 \x02 It pains me in spirit to any to tell it , What grief in Heorot Grendel hath caused me , 20 What horror unlooked - for , by hatred unceasing . Waned is my war - band , purify Heorot . I have heard on inquiry , Outstruck in its stroke , when to struggle he carried The wonderful war - sword : \x03'