In [3]:
import random
import string

def count_characters(s):
    """
    Tally how many times each character appears in a string.
    s: str, the text to scan.
    return counts, a dict mapping each distinct character of s to the number of times it occurs
    """
    counts = {}

    # walk the characters directly; a character seen for the
    # first time starts from 0 before being incremented
    for char in s:
        counts[char] = counts.get(char, 0) + 1

    return counts

def count_ngrams(s, n=1):
    """
    Tally the n-grams (substrings of length n) that occur in a string.
    s: str, the text to scan.
    n: the length of the substrings to count.
    return counts, a dict mapping each n-gram found in s to its number of occurrences
    """
    counts = {}

    # slide a window of width n across s, one start position at a time;
    # the last window starts at len(s) - n
    last_start = len(s) - n
    for start in range(last_start + 1):
        window = s[start:start + n]
        counts[window] = counts.get(window, 0) + 1

    return counts


def markov_text(s, n, length = 100, seed = "Emma Woodhouse"):
    """
    Generate fake text according to an n-th order Markov model, with data from a user-supplied corpus. 
    s: the text from which to learn grams
    n: the order of the Markov model, i.e. how many trailing characters determine the next one. 
       n = 0 samples every character independently of what came before. 
    length: the number of synthetic characters to generate. The length of the output string will be equal to this plus the length of the seed string. 
    seed: the initial string. For n > 0 its last n characters must occur in s followed by at least one more character.
    return fake, the seed followed by the generated characters
    raises ValueError if n is negative
    raises IndexError if at some step no (n+1)-gram of s starts with the last n characters generated so far
    """
    if n < 0:
        raise ValueError("n must be non-negative.")

    # Index the (n+1)-gram counts by their n-character prefix once, up front,
    # so that every step is a dictionary lookup instead of a scan over all grams.
    # Candidates keep the insertion order of the counts dict.
    followers = {}
    for gram, count in count_ngrams(s, n+1).items():
        next_chars, weights = followers.setdefault(gram[:-1], ([], []))
        next_chars.append(gram[-1])
        weights.append(count)

    # initialize
    fake = seed
    for _ in range(length):
        # fake[-0:] would be the whole string, so the empty context
        # of an order-0 model has to be spelled out explicitly
        previous = fake[-n:] if n > 0 else ""

        if previous not in followers:
            raise IndexError(
                f"No {n+1}-gram in the corpus starts with {previous!r}; cannot continue generating."
            )
        next_chars, weights = followers[previous]

        # draw the next character with probability proportional to its count
        fake += random.choices(next_chars, weights)[0]

    return fake

In [None]:
# Read the whole corpus into one string; the relative path means
# 'emma-full.txt' is looked up in the current working directory.
with open('emma-full.txt', 'r', encoding='UTF-8') as f:
    s = f.read()

In [12]:
# 4th-order character model: append 100 generated characters to the seed "Emma"
markov_text(s, n = 4, length = 100, seed = "Emma")

'Emma.”\n\n“Aye, preferableness, quite end at Marticular disengaged do noise. You speak it make too, if her'

In [204]:
def count_nwords(s, n = 1):
    '''
    Count the number of occurrences of n-words (runs of n consecutive words) in a string.
    Input: s, the string; it is broken into words with s.split()
           n, the size of n-words to count
    Output: a dictionary keyed by n-words, each a tuple of n words, whose values are the number of occurrences in s
    '''
    counts = {}
    words = s.split()
    for i in range(len(words) - n + 1):
        # Key by a tuple of words, not by a joined string: markov_text_word
        # slices keys word by word (key[:n]) and compares the result with a
        # tuple of words, which a string key could never equal.
        nwords = tuple(words[i:(i+n)])
        counts[nwords] = counts.get(nwords, 0) + 1
    return counts

In [16]:
import random

def markov_text_word(s, n, length = 100, seed = "Emma Woodhouse"):
    """
    Generate fake text according to an n-th order Markov model whose units are words
    s: the text from which to learn words
    n: the order of the Markov model, i.e. how many trailing words determine the next one.
       n = 0 samples every word independently of what came before.
    length: the number of synthetic words to generate.
    seed: the initial string. For n > 0 its last n words must occur in s followed by at least one more word.
    return fake, the seed followed by the generated words, each preceded by a single space
    raises ValueError if n is negative
    raises IndexError if at some step no (n+1)-word run of s starts with the last n words generated so far
    """
    if n < 0:
        raise ValueError("n must be non-negative.")

    # Index the (n+1)-word counts by their leading n words once, up front,
    # so that every step is a dictionary lookup instead of a scan over all keys.
    followers = {}
    for key, count in count_nwords(s, n+1).items():
        # n-words may arrive keyed by a tuple of words or by one
        # space-joined string; work on tuples either way
        gram = tuple(key.split()) if isinstance(key, str) else tuple(key)
        next_words, weights = followers.setdefault(gram[:n], ([], []))
        next_words.append(gram[-1])
        weights.append(count)

    # initialize
    fake = seed
    # running word list, so the growing text is not re-split at every step
    words = seed.split()
    for _ in range(length):
        # words[-0:] would be the whole list, so the empty context
        # of an order-0 model has to be spelled out explicitly
        previous = tuple(words[-n:]) if n > 0 else ()

        if previous not in followers:
            raise IndexError(
                f"No run of {n+1} words in the corpus starts with {previous!r}; cannot continue generating."
            )
        next_words, weights = followers[previous]

        # draw the next word with probability proportional to its count
        new_word = random.choices(next_words, weights)[0]
        fake += " " + new_word
        words.append(new_word)

    return fake

# project starts here

In [14]:
import random
import string
import re

In [15]:
def count_ngrams(s, n=1):
    """
    Build a frequency table of the length-n substrings of a string.
    s: str, the string to scan.
    n: how many characters each n-gram contains.
    return a dict whose keys are the n-grams seen in s and whose values are how often each was seen
    """
    tally = {}

    # every offset from 0 up to len(s) - n starts one n-gram
    for offset in range(len(s) - n + 1):
        piece = s[offset:offset + n]
        # unseen n-grams are entered at zero, then counted like any other
        if piece not in tally:
            tally[piece] = 0
        tally[piece] += 1

    return tally

In [16]:
def count_characters(s):
    """
    Build a frequency table of the characters of a string.
    s: str, the string to scan.
    return a dict whose keys are the characters seen in s and whose values are how often each was seen
    """
    tally = {}

    for symbol in s:
        # unseen characters are entered at zero, then counted like any other
        if symbol not in tally:
            tally[symbol] = 0
        tally[symbol] += 1

    return tally


def count_nwords(s, n = 1):
    '''
    Count the number of occurrences of n-words (runs of n consecutive words) in a string.
    Input: s, the string; it is broken into words with s.split()
           n, the size of n-words to count
    Output: a dictionary keyed by n-words (tuples of n words) whose values are the number of occurrences in s
    '''
    tokens = s.split()
    tally = {}
    # every start position from 0 up to len(tokens) - n begins one n-word
    for start in range(len(tokens) - n + 1):
        window = tuple(tokens[start:start + n])
        tally[window] = tally.get(window, 0) + 1
    return tally

In [19]:
class MarkovText():
    '''
    Markov-chain text generator trained on a corpus, with characters or words as units.

    The (n+1)-gram counts of the corpus are computed once at construction, over
    characters and over whitespace-separated words. The generation methods then
    repeatedly draw the next unit with probability proportional to how often it
    follows the current n-unit context in those counts.

    Attributes:
        text: string, the corpus the counts are learned from
        n: int, the order of the model (number of context units); 0 means no context
        length: int, the number of characters (markov_text_char) or words
            (markov_text_word) appended to the seed by one generation call
        seed: string, the text every generated string starts with
        char_dict: counts of the (n+1)-character substrings of text
        word_dict: counts of the runs of n+1 words of text, keyed by tuples of words
    '''
    def __init__(self, text, n, length, seed):
        '''
        Store the settings and count the (n+1)-grams of the corpus.
        Args:
            text: string, the corpus
            n: int, the order of the model, must not be negative
            length: int, the number of units to generate per call
            seed: string, the initial text
        Raises:
            ValueError: if n is negative
        '''
        if n < 0:
            raise ValueError("n has to be a non-negative integer.")
        self.text = text
        self.n = n
        self.length = length
        self.seed = seed
        self.char_dict = count_ngrams(self.text, self.n + 1)
        self.word_dict = count_nwords(self.text, self.n + 1)

    @staticmethod
    def _index_by_context(counts):
        '''Group gram counts by context (all units but the last): context -> (next units, weights).'''
        index = {}
        for gram, count in counts.items():
            followers, weights = index.setdefault(gram[:-1], ([], []))
            followers.append(gram[-1])
            weights.append(count)
        return index

    @staticmethod
    def _sample_next(index, previous):
        '''Draw the unit that follows the context `previous`, weighted by the counts in `index`.'''
        if previous not in index:
            raise ValueError("No matched n-gram.")
        followers, weights = index[previous]
        return random.choices(followers, weights)[0]

    def markov_text_char(self):
        '''
        generate fake text with characters as units
        Args:
            None
        Returns:
            fake: a string, the seed followed by self.length generated characters
        Raises:
            ValueError: if the seed has fewer than n characters, or if at some
                step no counted gram starts with the last n characters
        '''
        # test the input
        if self.n > len(self.seed):
            raise ValueError("n has to be smaller or equal to the number of characters in seed.")

        # build the context lookup once per call instead of scanning
        # every gram for every generated character
        index = self._index_by_context(self.char_dict)

        # initialize
        fake = self.seed
        for _ in range(self.length):
            # fake[-0:] would be the whole string, so the empty
            # context of an order-0 model is spelled out explicitly
            previous = fake[-self.n:] if self.n > 0 else ""
            fake += self._sample_next(index, previous)

        return fake


    def markov_text_word(self):
        '''
        Generate fake text with words as units.
        Args:
            None
        Returns:
            fake: a string, the seed followed by self.length generated words,
                each preceded by a single space
        Raises:
            ValueError: if the seed has fewer than n words, or if at some
                step no counted run of words starts with the last n words
        '''
        # running word list, so the growing text is not re-split at every step
        words = self.seed.split()

        # test the input
        if self.n > len(words):
            raise ValueError("n has to be smaller or equal to the number of words in seed.")

        # build the context lookup once per call instead of scanning
        # every run of words for every generated word
        index = self._index_by_context(self.word_dict)

        # initialize
        fake = self.seed
        for _ in range(self.length):
            # words[-0:] would be the whole list, so the empty
            # context of an order-0 model is spelled out explicitly
            previous = tuple(words[-self.n:]) if self.n > 0 else ()
            new_word = self._sample_next(index, previous)
            fake += " " + new_word
            words.append(new_word)

        return fake

    def make_a_sentence(self, by=None):
        '''
        Generate a sentence: generated text cut off at the first sentence-ending
        character ('.', '!', '?', ':' or ';') that appears after the seed.
        The ending character itself is not part of the result.
        Arg:
            by: "c" to generate by characters, "w" to generate by words.
                When omitted, the user is prompted to choose.
        Return:
            s: string, a sentence; or a warning message if the choice is
               neither "c" nor "w"
        '''
        sentenceEnders = re.compile('[.!?:;]')
        if by is None:
            by = input("Do you want to generate text by characters or by words? \nEnter 'c' or 'w': ")

        if by == "c":
            s = self.markov_text_char()
        elif by == "w":
            s = self.markov_text_word()
        else:
            return "WARNING: No text generated. Please try again and enter either 'c' or 'w'."

        # Look for the end of the sentence only in the generated part, so that
        # punctuation inside the seed (e.g. "Mr. Knightley") does not cut it short.
        generated = s[len(self.seed):]
        return self.seed + sentenceEnders.split(generated)[0]
            

### Ideas regarding text comparison:
1. Compare the character/word frequencies of the two texts.
2. Compare the reading-ease scores of the two texts.

In [8]:
# Read the whole corpus into one string; s is the training text handed to
# MarkovText below. The relative path means 'emma-full.txt' is looked up in
# the current working directory.
with open('emma-full.txt', 'r', encoding='UTF-8') as f:
    s = f.read()

In [21]:
# test class methods: a 4th-order model trained on s and seeded with "Emma";
# each generation call appends 20 units (characters here) to the seed
Test = MarkovText(text = s, n = 4, length = 20, seed = "Emma")
Test.markov_text_char()

'Emma, “in vain, Emma—can'

In [184]:
# same model, generating 20 words instead of 20 characters
Test.markov_text_word()

'Emma was very superior intelligence, of some reluctance the suspicion first to Emma, which must often think and who will be'

In [187]:
# generate one sentence; the method prompts for 'c' (characters) or 'w' (words)
Test.make_a_sentence()

Do you want to generate text by characters or by words? 
Enter 'c' or 'w': w


'Emma was not suffer more before she could not a sofa'

In [189]:
# generate three sentences; every call prompts for the mode again
for i in range(3):
    print(Test.make_a_sentence())

Do you want to generate text by characters or by words? 
Enter 'c' or 'w': c
Emmakint mstheave han It
Do you want to generate text by characters or by words? 
Enter 'c' or 'w': t
Do you want to generate text by characters or by words? 
Enter 'c' or 'w': w
Emma and after a consideration of its getting to which bids fair but his principal in approving


In [190]:
# collect the generated sentences in a list instead of printing them
sentences = []
for i in range(2):
    sentences.append(Test.make_a_sentence())

Do you want to generate text by characters or by words? 
Enter 'c' or 'w': w
Do you want to generate text by characters or by words? 
Enter 'c' or 'w': c


In [191]:
# display the stored sentences
sentences

['Emma said she, “and you discovered, and comfort or settling till this spring',
 'Emmatherdixok yost Eldie']