<a href="https://colab.research.google.com/github/SofiaAkhtar/WE-Module3/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import random

def preprocess_text(text):
  """Preprocesses the text by splitting it into words and converting them to lowercase."""
  words = text.lower().split()
  return words

def build_transitions(words):
  """Builds a dictionary to store the word transitions.

  Args:
      words: A list of words representing the preprocessed text.

  Returns:
      A dictionary where keys are current words and values are lists of following words.
  """
  transitions = {}
  prev_word = None
  for current_word in words:
    if prev_word is not None:
      if prev_word not in transitions:
        transitions[prev_word] = []
      transitions[prev_word].append(current_word)
    prev_word = current_word
  return transitions

def generate_chain(transitions, start_word, output_length):
  """Generates a markov chain of length output_length starting with start_word.

  Args:
      transitions: A dictionary where keys are current words and values are lists of following words.
      start_word: The word to start the markov chain with.
      output_length: The length of the markov chain to generate.

  Returns:
      A list of words representing the generated markov chain.
  """
  markov_chain = [start_word]
  for _ in range(output_length - 1):
    if start_word not in transitions:
      # If the current word doesn't have any following words, choose a random word
      next_word = random.choice(words)
    else:
      # Choose a random word from the following words based on their probabilities
      next_word = random.choice(transitions[start_word])
    markov_chain.append(next_word)
    start_word = next_word
  return markov_chain

# Example usage
text = "This is an example text to generate a markov chain."
words = preprocess_text(text)
transitions = build_transitions(words)
start_word = "This"
output_length = 10
markov_chain = generate_chain(transitions, start_word, output_length)
print("Markov chain:", markov_chain)


Markov chain: ['This', 'this', 'is', 'an', 'example', 'text', 'to', 'generate', 'a', 'markov']


In [3]:
text = ""
# Test with empty text
try:
  words = preprocess_text(text)
  transitions = build_transitions(words)
  start_word = "This"  # Should raise an error
  output_length = 10
  markov_chain = generate_chain(transitions, start_word, output_length)
except Exception as e:
  print("Expected error for empty text:", e)


Expected error for empty text: list index out of range


In [4]:
text = "Hello"
words = preprocess_text(text)
transitions = build_transitions(words)
start_word = "Hello"
output_length = 5
markov_chain = generate_chain(transitions, start_word, output_length)
print("Markov chain (single word):", markov_chain)


Markov chain (single word): ['Hello', 'hello', 'hello', 'hello', 'hello']


In [5]:
text = "This is a sentence. How does it work?"
words = preprocess_text(text)
transitions = build_transitions(words)
start_word = "This"
output_length = 8
markov_chain = generate_chain(transitions, start_word, output_length)
print("Markov chain (punctuation):", markov_chain)


Markov chain (punctuation): ['This', 'sentence.', 'how', 'does', 'it', 'work?', 'sentence.', 'how']


In [6]:
# Replace with a longer text passage
text = "This is a longer text passage to test the code's performance with a larger dataset. It allows us to see how the Markov chain captures the word relationships over a bigger sample."
words = preprocess_text(text)
transitions = build_transitions(words)
start_word = "This"
output_length = 15
markov_chain = generate_chain(transitions, start_word, output_length)
print("Markov chain (long text):", markov_chain)


Markov chain (long text): ['This', 'is', 'a', 'bigger', 'sample.', 'a', 'larger', 'dataset.', 'it', 'allows', 'us', 'to', 'see', 'how', 'the']


In [7]:
text = "The quick brown fox jumps over the lazy dog."
words = preprocess_text(text)
transitions = build_transitions(words)
start_word = "Apple"  # Not present in the text
output_length = 7
markov_chain = generate_chain(transitions, start_word, output_length)
print("Markov chain (specific start word):", markov_chain)


Markov chain (specific start word): ['Apple', 'dog.', 'fox', 'jumps', 'over', 'the', 'quick']
