# Final Project

In [1]:
# Read the entire contents of 'emma-full.txt' (decoded as UTF-8) into the
# module-level string `s`; the file is closed when the `with` block exits.
with open('emma-full.txt', 'r', encoding='UTF-8') as f:
    s = f.read()

In [2]:
import random
import string
import re

def count_nwords(s, n = 1):
    '''
    Tally every run of n consecutive words in a string.

    Words are the whitespace-separated pieces of s (as given by str.split()).
    Input: s, the string
           n, how many consecutive words make up one n-word (default 1)
    Output: a dictionary whose keys are tuples of n words and whose values
            are the number of times that tuple occurs in s; keys appear in
            order of first occurrence. Empty when s has fewer than n words.
    '''
    tokens = s.split()
    tally = {}
    # the last window that still holds n words starts at len(tokens) - n
    last_start = len(tokens) - n
    for start in range(last_start + 1):
        gram = tuple(tokens[start:start + n])
        tally[gram] = tally.get(gram, 0) + 1
    return tally

class MarkovText():
    def __init__(self, text, n, length, seed):
        '''
        Introduce this class! TO-BE-ADDED
        '''
        self.text = text
        self.n = n
        self.length = length
        self.seed = seed

    def markov_text_word(self):
        '''
        Generate fake text with words as units.
        Args:
            None
        Returns:
            fake: a string, the fake text
        '''
         # check the type of the input seed
        try:
            self.seed.split()
        except AttributeError as a:
            print("The seed should be a string.")
            return a
        
        # test the input
        if self.n > len(self.seed.split()):
            raise ValueError("n has to be smaller or equal to the number of words in seed.")
 
        # initialize
        word_dict = count_nwords(self.text, self.n + 1)
        fake = self.seed
        
        while len(fake.split()) < self.length:
            previous = tuple(fake.split()[(-self.n):])

            # filter dict to keep only matching grams
            sub = {}
            for key in word_dict:
                if key[:self.n] == previous:
                    sub[key] = word_dict[key]
            
            # convert to lists for use with random.choices
            choices = list(sub.keys())
            weights = [sub[key] for key in choices]

            # make choice
            # if no possible choice could be found, try to decrease n by 1.
            try:
                new_nword = random.choices(choices, weights)[0]
            except IndexError:
                if self.n > 1:
                    self.n -= 1
                    print("cannot find matching words, trying n-1 =", self.n)
                    return self.markov_text_word()
                else:
                    return ValueError("Please try another seed")

            # add to s
            new_word = new_nword[-1]
            fake += " " + new_word
    
        return fake
    
    def make_a_sentence(self):
        '''
        Generate a sentence. Prompt the user to choose whether to 
        generate this sentence by characters or by words.
        Arg:
            None
        Return:
            s: string, a sentence
        '''
        sentenceEnders = re.compile('[.!?:;]')
        by = input("Do you want to generate text by characters or by words? \nEnter 'c' or 'w': ")
        if by == "c":
            s = self.markov_text_char()
            s = sentenceEnders.split(s)[0]
        elif by == "w":
            s = self.markov_text_word()
            s = sentenceEnders.split(s)[0]
        else:  
            s = "WARNING: No text generated. Please try again and enter either 'c' or 'w'."
        return s