In [None]:
import random

def preprocess_text(text):
    """Lower-case ``text`` and split it on whitespace.

    Punctuation is not removed: it stays attached to the neighbouring word.

    Args:
        text: The raw input string.

    Returns:
        A list of lower-cased, whitespace-separated tokens (empty for blank
        input).
    """
    lowered = text.lower()
    return lowered.split()

def build_transition_matrix(words, chain_length):
    """Map every run of ``chain_length`` consecutive words to its followers.

    Args:
        words: The corpus as a sequence of word tokens.
        chain_length: Number of consecutive words that form one state.

    Returns:
        A dict whose keys are tuples of ``chain_length`` words and whose
        values list, in corpus order and with repeats, each word seen
        directly after that tuple.
    """
    followers_by_state = {}
    window_count = len(words) - chain_length
    for start in range(window_count):
        end = start + chain_length
        state = tuple(words[start:end])
        follower = words[end]
        # Repeats are kept on purpose: they encode how often a follower occurs.
        followers_by_state.setdefault(state, []).append(follower)
    return followers_by_state

def generate_text(word_dict, start_word, output_length, chain_length=10):
    """
    Generates text by walking a Markov chain of word transitions.

    The walk starts at ``start_word`` and is built iteratively, so long
    outputs cannot hit the interpreter's recursion limit.

    Args:
        word_dict: A dictionary mapping each word to a dictionary of
            ``{next_word: count}`` transitions.
        start_word: The word to start the generated text. It is emitted
            verbatim (via ``str``) even if it is not in ``word_dict``.
        output_length: The maximum number of words to generate.
        chain_length: The maximum length of the word chain (default 10). When
            positive, at most ``min(output_length, chain_length)`` words are
            produced and each next word comes from ``select_next_word``.
            When zero or negative the cap is disabled and each next word is
            drawn uniformly from the current word's followers.

    Returns:
        A string containing the generated words separated by single spaces,
        with no trailing space. Empty when ``output_length`` is not positive.

    Raises:
        TypeError: If ``output_length`` or ``chain_length`` cannot be compared
            with an integer.
    """
    if output_length <= 0:
        return ""

    capped = chain_length > 0
    generated = []
    current_word = start_word
    words_left = output_length
    chain_left = chain_length

    while True:
        generated.append(str(current_word))

        # Check both budgets before drawing, so no next word is selected
        # (and no randomness consumed) for a word that would never be emitted.
        words_left -= 1
        if words_left <= 0:
            break
        if capped:
            chain_left -= 1
            if chain_left <= 0:
                break

        # With no transitions at all there is nothing left to choose from.
        if not word_dict:
            break

        if capped:
            current_word = select_next_word(word_dict, current_word)
        else:
            followers = word_dict.get(current_word)
            if followers:
                current_word = random.choice(list(followers))
            else:
                # Unknown or dead-end word: restart from a random known word,
                # mirroring the fallback used on the capped path.
                current_word = random.choice(list(word_dict))

    return " ".join(generated)

def select_next_word(word_dict, current_word):
    """Selects the next word, weighted by how often it followed ``current_word``.

    Args:
        word_dict: A dictionary mapping each word to a dictionary of
            ``{next_word: count}`` transitions.
        current_word: The word whose follower should be chosen.

    Returns:
        A follower of ``current_word`` drawn with probability proportional to
        its count. If ``current_word`` is unknown or has no recorded
        followers, a uniformly random key of ``word_dict`` is returned instead.

    Raises:
        IndexError: If the fallback is needed but ``word_dict`` is empty.
    """
    followers = word_dict.get(current_word)
    if not followers:
        # Unknown word or dead end: restart from any known word.
        return random.choice(list(word_dict))

    candidates = list(followers.keys())
    counts = list(followers.values())
    # random.choices accepts relative weights, so the raw counts can be used
    # directly without normalising them to probabilities first.
    return random.choices(candidates, weights=counts)[0]

# Sample text corpus (replace with your own corpus or file reading logic)
text_corpus = "This is a sample text corpus. It contains various words and phrases that can be used to generate new text. The model will learn the probabilities of word transitions and use them to create a sequence of words that resembles the original text."

# Clean the corpus text: lower-case it and strip basic punctuation in one pass
corpus_text = text_corpus.lower()
corpus_text = corpus_text.translate(str.maketrans("", "", ",.!?"))
words = corpus_text.split()

# Count word transitions: word_dict[word][next_word] -> number of occurrences
word_dict = {}
for current_word, next_word in zip(words, words[1:]):
    follower_counts = word_dict.setdefault(current_word, {})
    follower_counts[next_word] = follower_counts.get(next_word, 0) + 1

# Example usage
text = generate_text(word_dict, start_word="the", output_length=10)
print(text)

# Test cases
# Optional keys: "word_dict" overrides the corpus transitions for one case,
# and "chain_length" falls back to generate_text's default of 10 when omitted.
test_cases = [
    {"name": "Basic Functionality", "start_word": "the", "output_length": 5, "chain_length": 5},
    {"name": "Handling Unknown Start Word", "start_word": "Pangea", "output_length": 7, "chain_length": 1},
    {"name": "Edge Case - Empty Corpus", "start_word": "any", "output_length": 3, "chain_length": 10, "word_dict": {}},  # Runs against an empty transition table
    {"name": "Zero Output Length", "start_word": "is", "output_length": 0, "chain_length": 10},
    {"name": "Very Long Output Length", "start_word": "a", "output_length": 1000, "chain_length": 10},  # Adjust output_length as needed
    {"name": "Non-string starting word", "start_word": 10, "output_length": 5, "chain_length": 10},  # Pass an integer as starting word
    {"name": "Non-numeric output length", "start_word": "once", "output_length": "ten"},
]
for idx, case in enumerate(test_cases, 1):
    try:
        text = generate_text(
            case.get("word_dict", word_dict),
            case["start_word"],
            case["output_length"],
            # A missing chain_length must not mask the error the case is
            # actually meant to provoke, so fall back to the default.
            case.get("chain_length", 10),
        )
    except (ValueError, TypeError, KeyError, IndexError) as e:
        # IndexError covers choosing from an empty transition table.
        print(f"Test Case {idx}: {case['name']} - Error:", e)
    else:
        print(f"Test Case {idx}: {case['name']} - Output:", text)


the model will learn the probabilities of words and use
Test Case 1: Basic Functionality - Output: the model will learn the
Test Case 2: Handling Unknown Start Word - Output: Pangea
Test Case 3: Edge Case - Empty Corpus - Output: any use them 
Test Case 4: Zero Output Length - Output: 
Test Case 5: Very Long Output Length - Output: a sequence of words and phrases that can be used
Test Case 6: Non-string starting word - Output: 10 new text corpus it 
Test Case 7: Non-numeric output length - Error: 'chain_length'
