In [None]:
import random

def generate(filename: str, start_words: list[str], chain_length: int, num_generated: int) -> str:
  """
  Generate text that imitates the word sequences of a file using a Markov chain.

  The file is read, lower-cased and split on whitespace. Every run of
  ``chain_length`` consecutive words is mapped to the list of words that
  follow it in the file; duplicates are kept, so a follower that occurs more
  often is proportionally more likely to be chosen.

  Args:
      filename: The name of the file to read the text from.
      start_words: Words to start the generated text with (must be the same
          length as chain_length). The list is not modified. The words are
          matched against the text case-insensitively but appear in the
          result exactly as given.
      chain_length: The number of preceding words used to predict the next word.
      num_generated: The number of words to append after start_words. Zero or
          a negative value appends nothing.

  Returns:
      start_words followed by num_generated generated words, joined by single
      spaces.

  Raises:
      ValueError: If len(start_words) differs from chain_length, or if words
          have to be generated but the file contains no words.
  """
  # Validate the cheap argument check first, before touching the file system.
  if len(start_words) != chain_length:
    raise ValueError("start_words list must be the same length as chain_length")

  # Read the text, normalise case, and split into whitespace-separated words.
  with open(filename, 'r') as f:
    words = f.read().lower().split()

  # Without any words there is nothing to sample from (random.choice would
  # otherwise fail with an unhelpful IndexError).
  if num_generated > 0 and not words:
    raise ValueError("cannot generate text: the file contains no words")

  # Map each chain_length-word sequence to every word that follows it.
  transitions = {}
  for i in range(len(words) - chain_length):
    key = tuple(words[i:i + chain_length])
    transitions.setdefault(key, []).append(words[i + chain_length])

  # Work on a copy so the caller's start_words list is left untouched.
  generated_text = list(start_words)
  for _ in range(num_generated):
    # Explicit start index: a plain [-chain_length:] slice would return the
    # whole list when chain_length is 0.
    tail = generated_text[len(generated_text) - chain_length:]
    # The table keys are lower-case, so lower-case the context as well;
    # otherwise capitalised start words could never match.
    context = tuple(word.lower() for word in tail)
    followers = transitions.get(context)
    # Unknown sequence: fall back to a random word from the whole text.
    generated_text.append(random.choice(followers if followers else words))

  return " ".join(generated_text)

# Explanation for start_words length
# start_words must be the same length as chain_length because the Markov chain model predicts the next word from a sequence of previous words, and chain_length defines how many previous words are considered for this prediction.

# If the length of start_words is less than chain_length, there won't be enough context to predict the first word using the Markov chain. Similarly, if it's longer, it won't match the expected sequence length for the model. By keeping them the same size, we ensure a valid starting point for the text generation process.


In [None]:
# Generator settings: corpus file, seed words, chain order, and total output length
filename = "file.txt" # Replace with your file path
chain_length = 3
start_words = ["The", "weather", "is"]  # One seed word per chain position
num_generated = 10

# The seed words count toward the total, so only the remainder is requested
generated_text = generate(
  filename,
  start_words,
  chain_length,
  num_generated - len(start_words),
)

# Show the result
print(generated_text)


The weather is for lately. in the shining throughout in


In [None]:
# Corpus (per the author's note): The quick brown fox jumps over the lazy dog.
filename = "short_text.txt"
chain_length = 1
start_words = ["The"]
num_generated = 2

# One seed word plus one generated word makes the requested total of two
generated_text = generate(
  filename,
  start_words,
  chain_length,
  num_generated - len(start_words),
)
# Prints the seed word followed by a single generated word
print(generated_text)

# Test Case Verdict: Pass if the output is "The" followed by one word taken from the source text

The lazy


In [None]:
def run_test_case(filename, start_words, chain_length, num_generated, expected_start):
  """
  Run generate() once and report whether its output begins with expected_start.

  Args:
      filename: File passed through to generate().
      start_words: Start words passed through to generate().
      chain_length: Chain length passed through to generate().
      num_generated: Number of words passed through to generate().
      expected_start: Prefix the generated text is expected to begin with.

  Prints a pass message plus the generated text, or a failure message. A
  ValueError raised during the run is caught and printed as "Error: ...";
  any other exception propagates to the caller.
  """
  try:
    output = generate(filename, start_words, chain_length, num_generated)
    if not output.startswith(expected_start):
      print(f"Test Case Failed: Expected start '{expected_start}' not found in generated text.")
    else:
      print(f"Test Case Passed: Generated text starts with '{expected_start}'.")
      print("generated text: ",output)
  except ValueError as err:
    print(f"Error: {err}")

# Each entry: (filename, start_words, chain_length, num_generated, expected_start).
# num_generated is the total number of words wanted, start words included; the
# loop below subtracts len(start_words) before calling run_test_case.
test_cases = [
  # Test Case 1: Simple text with a short chain length.
  # File content (per the author's note): The quick brown fox jumps over the lazy dog.
  # One start word matches chain_length 1; a total of 1 leaves nothing to generate.
  ("short_text.txt", ["The"], 1, 1, "The"),

  # Test Case 2: Text with repeated phrases, start words of the wrong length.
  # File content (per the author's note): I like pizza. I like pizza with cheese.
  # I like pizza with cheese and pepperoni.
  # Two start words against chain_length 1 is expected to be reported as an error.
  ("repeated_phrases.txt", ["I", "like"], 1, 1, "I like"),

  # Test Case 3: Longer chain length with matching start words.
  # Uses the sample text provided earlier describing various weather conditions.
  ("file.txt", ["The", "weather", "is"], 3, 5, "The weather is"),

  # Test Case 4: Text with unseen words (a real news article with diverse vocabulary).
  # A total of 3 equals the number of start words, so nothing is generated.
  ("news_article.txt", ["The", "government", "announced"], 3, 3, "The government announced"),
]

for filename, start_words, chain_length, num_generated, expected_start in test_cases:
  run_test_case(filename, start_words, chain_length, num_generated - len(start_words), expected_start)


Test Case Passed: Generated text starts with 'The'.
generated text:  The
Error: start_words list must be the same length as chain_length
Test Case Passed: Generated text starts with 'The weather is'.
generated text:  The weather is the difficult
Test Case Passed: Generated text starts with 'The government announced'.
generated text:  The government announced
