In [None]:
from collections import defaultdict
import random

def split_text_into_words(text):
    """Return the whitespace-separated tokens of ``text`` as a list.

    ``str.split()`` is called without arguments, so any run of whitespace
    counts as a single separator and punctuation stays attached to the
    word it touches.
    """
    words = text.split()
    return words


In [None]:
def add_state_transitions(words, chain_length, markov_chain):
    """
    Records every (state -> following word) transition found in ``words``.

    A state is a tuple of ``chain_length`` consecutive words. For each state,
    the count stored at ``markov_chain[state][following_word]`` is increased by
    one. ``markov_chain`` is updated in place and nothing is returned.
    """
    window_count = len(words) - chain_length
    for start in range(window_count):
        end = start + chain_length  # Index of the word right after the window
        key = tuple(words[start:end])
        follower = words[end]
        markov_chain[key][follower] += 1


In [None]:
def convert_counts_to_probabilities(markov_chain):
    """
    Replaces each state's transition counts with transition probabilities.

    For every state, each count is divided by the sum of that state's counts
    and the resulting plain dict is stored back under the same key, so
    ``markov_chain`` is modified in place and nothing is returned.
    """
    for state, counts in markov_chain.items():
        denominator = sum(counts.values())
        probabilities = {}
        for word, count in counts.items():
            probabilities[word] = count / denominator
        # Re-assigning an existing key is safe while iterating over items()
        markov_chain[state] = probabilities


In [None]:
def build_markov_chain(text, chain_length):
    """
    Creates a Markov chain model from ``text`` using states of ``chain_length`` words.

    Returns a mapping from each state (a tuple of ``chain_length`` words) to a
    dict holding the probability of every word observed right after that state.
    """
    tokens = split_text_into_words(text)
    model = defaultdict(lambda: defaultdict(int))
    add_state_transitions(tokens, chain_length, model)  # Collect raw counts
    convert_counts_to_probabilities(model)  # Counts -> probabilities, in place
    return model


In [4]:
def generate(text, start_words, chain_length, num_generated, rng=None):
    """
    Generates text by walking a Markov chain built from ``text``.

    Parameters:
        text: Source text used to build the chain.
        start_words: Sequence of exactly ``chain_length`` words used as the
            initial state; they always form the beginning of the result.
        chain_length: Number of words that make up one state.
        num_generated: Target total number of words in the result, counting
            the start words.
        rng: Optional object exposing a ``choices`` method, such as a seeded
            ``random.Random`` instance, for reproducible output. When omitted,
            the module-level ``random`` functions are used.

    Returns:
        The generated words joined by single spaces. The result is shorter
        than ``num_generated`` when the walk reaches a state with no recorded
        successor (this includes a start state that never occurs in ``text``,
        in which case only the start words are returned).

    Raises:
        ValueError: If ``len(start_words)`` differs from ``chain_length``.
    """
    current_state = tuple(start_words)
    # Validate before building the model so invalid input fails fast
    # instead of paying for a chain that is never used.
    if len(current_state) != chain_length:
        raise ValueError("The length of start_words must be equal to chain_length")

    chooser = random if rng is None else rng
    markov_chain = build_markov_chain(text, chain_length)

    generated_words = list(current_state)  # Start with the initial state
    for _ in range(num_generated - chain_length):
        # .get() avoids inserting unseen states into the defaultdict model
        next_word_choices = markov_chain.get(current_state)
        if not next_word_choices:
            break  # No known successor for this state
        candidates = list(next_word_choices)
        weights = list(next_word_choices.values())
        next_word = chooser.choices(candidates, weights=weights)[0]
        generated_words.append(next_word)
        # Slide the window: drop the oldest word, append the new one
        current_state = (*current_state[1:], next_word)

    return ' '.join(generated_words)


In [5]:
# Example usage: generate up to 10 words from a short passage, using
# two-word states and starting from "This is".
chain_length = 2
num_generated = 10
start_words = ["This", "is"]
text = "This is an example text. It is not very long, but it is meaningful for our example. This example shows how Markov chains can be used."
print(generate(text=text, start_words=start_words, chain_length=chain_length, num_generated=num_generated))


This is an example text. It is not very long,


In [6]:
# Test Case 1: Simple Repetitive Text
# Every state has exactly one possible successor here, so the walk simply
# reproduces the repeating pattern.
chain_length1 = 2
num_generated1 = 6
start_words1 = ["hello", "world"]
text1 = "hello world hello world hello world"
print(generate(text=text1, start_words=start_words1, chain_length=chain_length1, num_generated=num_generated1))


hello world hello world hello world


In [11]:
# Test Case 2: Handling Unknown Start Words
# The state ("two", "four") never occurs in text2, so generate() finds no
# successor and returns only the start words. generate() raises ValueError
# only when len(start_words) != chain_length, which is not the case here;
# the except branch is a guard for that length mismatch.
chain_length2 = 2
num_generated2 = 5
start_words2 = ["two", "four"]
text2 = "one two three, three four five"
try:
    print(generate(text=text2, start_words=start_words2, chain_length=chain_length2, num_generated=num_generated2))
except ValueError as err:
    print(err)


two four


In [8]:
# Test Case 3: Short Text with Limited Options
# The state ("like", "to") is followed by either "eat" or "play" in text3, so
# the output of this cell is random and can differ between runs.
chain_length3 = 2
num_generated3 = 10
start_words3 = ["I", "like"]
text3 = "I like to eat apples. I like to play games."
print(generate(text=text3, start_words=start_words3, chain_length=chain_length3, num_generated=num_generated3))


I like to play games.


In [9]:
# Test Case 4: Long Text with Punctuation
# Words are split on whitespace only, so punctuation stays attached to the
# word it follows (e.g. "sentence." and "sentence" are different words).
chain_length4 = 2
num_generated4 = 20
start_words4 = ["This", "is"]
text4 = "This is a sentence. This sentence is longer. This one? Quite short. Finally, a very long sentence indeed, don't you think?"
print(generate(text=text4, start_words=start_words4, chain_length=chain_length4, num_generated=num_generated4))


This is a sentence. This sentence is longer. This one? Quite short. Finally, a very long sentence indeed, don't you


In [10]:
# Test Case 5: Non-English Text
# The model only relies on whitespace-separated tokens, so the same call
# works on Spanish input.
chain_length5 = 2
num_generated5 = 10
start_words5 = ["esto", "es"]
text5 = "esto es una prueba. esta prueba es para verificar el comportamiento."
print(generate(text=text5, start_words=start_words5, chain_length=chain_length5, num_generated=num_generated5))


esto es una prueba. esta prueba es para verificar el
