# DATA 622 - Machine Learning and Big Data
## Homework 6: Story Generation using Markov Chains and LLMs

| Field | Details |
| --- | --- |
| **Name** | Sanket Vijay Patil |
| **Student ID** | GW61258 |
| **Email** | sanketp1@umbc.edu |


In [43]:
import random
import re
from collections import defaultdict
from transformers import pipeline
import os

In [44]:

class MarkovChain:
    """Word-level Markov chain text generator.

    The model maps each run of ``order`` consecutive tokens (a *state*) to
    the list of tokens observed immediately after it in the training text.
    Punctuation marks ``. ! ? , ; :`` are treated as separate tokens.

    Attributes:
        order: Number of tokens that make up a state.
        chain: ``defaultdict(list)`` mapping state tuples to follower tokens
            (duplicates kept, so frequent followers are sampled more often).
        starts: Capitalized tokens seen at the start of the text or right
            after ``.``, ``!`` or ``?``.
        start_states: Full states (tuples of ``order`` tokens) that begin at
            one of those sentence starts and have at least one follower.
    """

    # Tokens that close a sentence; the token after one may open the next.
    _SENTENCE_END = frozenset({'.', '!', '?'})

    def __init__(self, order=2):
        """Create an untrained model.

        Args:
            order: Number of preceding tokens used to predict the next one.

        Raises:
            ValueError: If ``order`` is smaller than 1.
        """
        # With order 0, ``words[-0:]`` is the whole sequence, so states built
        # during generation would never match states built during training.
        if order < 1:
            raise ValueError("order must be a positive integer")

        self.order = order
        self.chain = defaultdict(list)
        self.starts = []
        self.start_states = []

    def train(self, text):
        """Add the n-gram statistics of ``text`` to the model.

        Calling this more than once accumulates observations.

        Args:
            text: Raw training text.
        """
        words = self._tokenize(text)
        enders = self._SENTENCE_END

        # Sentence starters: capitalized tokens at position 0 or directly
        # after sentence-ending punctuation. Punctuation tokens are never
        # uppercase, so no separate punctuation check is needed.
        start_indices = set()
        for i, word in enumerate(words):
            if (i == 0 or words[i - 1] in enders) and word[0].isupper():
                self.starts.append(word)
                start_indices.add(i)

        # Build the n-gram chain, remembering which states open a sentence.
        for i in range(len(words) - self.order):
            state = tuple(words[i:i + self.order])
            self.chain[state].append(words[i + self.order])
            if i in start_indices:
                self.start_states.append(state)

    def _tokenize(self, text):
        """Split ``text`` into tokens, isolating ``. ! ? , ; :`` as tokens."""
        text = re.sub(r'([.!?,;:])', r' \1 ', text)
        # str.split() with no argument never yields empty strings.
        return text.split()

    def generate(self, start_phrase=None, max_words=150):
        """Generate text by walking the chain.

        Args:
            start_phrase: Optional text to begin with. If it has fewer than
                ``order`` tokens it is padded with the leading tokens of a
                randomly chosen known state. When omitted, generation starts
                from a randomly chosen known state.
            max_words: Upper bound on the number of tokens produced by the
                walk. A start phrase that is already longer is returned
                as-is.

        Returns:
            The generated text, or the string
            ``"Error: Train the model first!"`` if the model is untrained.
        """
        if not self.chain:
            return "Error: Train the model first!"

        # Initialize with the start phrase or a random known state.
        if start_phrase:
            words = self._tokenize(start_phrase)
            missing = self.order - len(words)
            if missing > 0:
                filler = random.choice(list(self.chain))
                words.extend(filler[:missing])
        else:
            words = list(random.choice(list(self.chain)))

        enders = self._SENTENCE_END
        while len(words) < max_words:
            state = tuple(words[-self.order:])
            followers = self.chain.get(state)

            if followers:
                next_word = random.choice(followers)
                words.append(next_word)

                # Occasionally stop at a sentence end once the text is long.
                if (len(words) > 50 and next_word in enders
                        and random.random() < 0.3):
                    break
            elif self.start_states:
                # Dead end: restart from a complete sentence-opening state.
                # Appending a single starter word instead would produce a
                # state that is usually unseen as well, yielding a run of
                # unrelated starter words.
                restart = random.choice(self.start_states)
                words.extend(restart[:max_words - len(words)])
            else:
                break

        # Format output: re-attach punctuation and collapse whitespace.
        text = ' '.join(words)
        text = re.sub(r'\s+([.!?,;:])', r'\1', text)
        text = re.sub(r'\s+', ' ', text)

        return text


def load_training_text_from_file(filename='Downloads/tesla_training_text.txt'):
    """Read a UTF-8 text file and print a short summary of its contents.

    Args:
        filename: Path of the training text file.

    Returns:
        The file contents as a string, or ``None`` if the file is missing or
        could not be read (a message is printed in either case).
    """
    try:
        print(f"Attempting to load training data from: {filename}")

        with open(filename, 'r', encoding='utf-8') as handle:
            contents = handle.read()

        # Report basic statistics, followed by one blank line.
        summary = (
            f"  Successfully loaded training data",
            f"  - File: {filename}",
            f"  - Size: {len(contents)} characters",
            f"  - Words: {len(contents.split())} words",
            f"  - Lines: {len(contents.splitlines())} lines",
            "",
        )
        for summary_line in summary:
            print(summary_line)

        return contents

    except FileNotFoundError:
        print(f"ERROR: File '{filename}' not found!")
        return None

    except Exception as err:
        # Any other failure is reported and signalled with None.
        print(f"Error reading file: {err}")
        return None


def exercise_1(filename='Downloads/tesla_training_text.txt',
               start_phrase="Once upon a time there was a kingdom",
               max_words=120):
    """Run Exercise 1: train a Markov chain on a text file and print a story.

    Loads the training text, trains an order-2 ``MarkovChain`` on it, then
    generates and prints a story. Returns early (after printing
    "Error Loading") when the file cannot be loaded or is blank.

    Args:
        filename: Path of the training text file.
        start_phrase: Text the generated story begins with.
        max_words: Word limit passed to ``MarkovChain.generate`` and shown in
            the progress output.
    """
    print("EXERCISE 1: MARKOV CHAIN STORY GENERATION")
    print()

    # STEP 1: Load the training data
    print("STEP 1: Loading Training Data from File")
    print("-" * 80)
    training_text = load_training_text_from_file(filename)

    # Covers both a failed load (None) and an empty/whitespace-only file.
    if not training_text or not training_text.strip():
        print("Error Loading")
        return

    # STEP 2: Create and train Markov Chain
    print("STEP 2: Training Markov Chain Model")
    print("-" * 80)
    print("Building n-gram model (order=2, trigram)")

    markov = MarkovChain(order=2)
    markov.train(training_text)

    print(f"  Model trained successfully")
    print(f"  - Unique word sequences learned: {len(markov.chain)}")
    print(f"  - Sentence starters found: {len(markov.starts)}")
    print()

    # STEP 3: Generate story
    print("STEP 3: Generating Story")
    print("-" * 80)
    print(f"Starting phrase: '{start_phrase}'")
    # Printed from the same value that is passed to generate() so the
    # message cannot drift from the actual limit.
    print(f"Maximum words: {max_words}")
    print()

    generated_story = markov.generate(start_phrase=start_phrase,
                                      max_words=max_words)

    # STEP 4: Display result
    print("GENERATED STORY:")
    print(generated_story)



In [45]:
def exercise_2():
    """Run Exercise 2: print a GPT-2 generated story about data science.

    Builds a Hugging Face text-generation pipeline for ``gpt2``, samples one
    continuation of a fixed prompt (at most 400 tokens including the prompt)
    and prints it. If anything in that process raises, the error is printed
    and a hand-written fallback story is printed instead, so the exercise
    always produces output.
    """
    print("EXERCISE 2: LLM-GENERATED STORY")
    print()
    print("Topic: Why Study Data Science Today?")
    print("Model: GPT-2 (Generative Pre-trained Transformer)")
    print()

    print("Loading language model...")
    print("(First run may take time to download model)")
    print()

    # Prompt for a narrative story. The trailing space after "today." is part
    # of the prompt text, so it is spelled out explicitly here.
    prompt = (
        "Write a creative story about why studying data science is important today. \n"
        "\n"
        "Once upon a time in the year 2025, there was a young student named Alex who was deciding what to study."
    )

    # Printed only when the model cannot be loaded or run.
    fallback_story = """There was a student named Alex in 2025 choosing what to major in.
Alex existed in a world where data went every which way—smartphones, sensors, and satellites. Data was the new gold.

Alex visited one hospital where a data scientist showed him how algorithms could predict
diseases before symptoms appeared. "We scan millions of patient records,"
she said. "Data science saves lives."

On a farm, Alex saw how data enabled more food on less water. In the city,
data ended traffic jams. At a bank, it detected fraud. Wherever Alex looked,
data science was solving real problems.

"This is it," Alex realized. "Data science isn't so much about numbers. It's about
understanding the world and making it better."

Alex studied data science, looking forward to helping build a smarter, greener future."""

    try:
        # Name the task explicitly instead of relying on it being inferred
        # from the model id.
        generator = pipeline('text-generation', model='gpt2')

        # The length limit is given once, with the other sampling settings.
        result = generator(
            prompt,
            max_length=400,
            num_return_sequences=1,
            temperature=0.8,
            top_p=0.9,
            do_sample=True,
            pad_token_id=generator.tokenizer.eos_token_id
        )

        story = result[0]['generated_text']

        print("GENERATED STORY:")
        print(story)
        print()

    except Exception as e:
        # Deliberate catch-all: any failure falls back to the canned story.
        print(f"Error with LLM: {e}")
        print('Error')

        print("FALLBACK STORY:")
        print(fallback_story)


if __name__ == "__main__":
    # Script entry point: print the assignment banner, run both exercises in
    # order, then print the closing line followed by a blank line.
    for banner_line in ("\n", "DATA 622 | HOMEWORK 6: STORY GENERATION", ""):
        print(banner_line)

    for run_exercise in (exercise_1, exercise_2):
        run_exercise()

    for closing_line in ("HOMEWORK COMPLETE", ""):
        print(closing_line)



DATA 622 | HOMEWORK 6: STORY GENERATION

EXERCISE 1: MARKOV CHAIN STORY GENERATION

STEP 1: Loading Training Data from File
--------------------------------------------------------------------------------
Attempting to load training data from: Downloads/tesla_training_text.txt
  Successfully loaded training data
  - File: Downloads/tesla_training_text.txt
  - Size: 14209 characters
  - Words: 2421 words
  - Lines: 190 lines

STEP 2: Training Markov Chain Model
--------------------------------------------------------------------------------
Building n-gram model (order=2, trigram)
  Model trained successfully
  - Unique word sequences learned: 2002
  - Sentence starters found: 190

STEP 3: Generating Story
--------------------------------------------------------------------------------
Starting phrase: 'Once upon a time there was a kingdom'
Maximum words: 120

GENERATED STORY:
Once upon a time there was a kingdom As When I have been years planning self controlled automata. Mechanisms 

Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


GENERATED STORY:
Write a creative story about why studying data science is important today. 

Once upon a time in the year 2025, there was a young student named Alex who was deciding what to study. He was studying science and technology, so he decided to start off with a computer science class. This was his first year at the university and it was pretty simple: he'd take a test of a new set of computer science tools, then he'd go on to start a new research project.

As a result, Alex decided to study for two years before taking the Computer Science course, and it wasn't long before he started getting a few questions about how to make a computer program work. He was getting questions about how to implement a new approach to computer science, but he was also getting questions about how to use the latest tools to teach his students.

A year later, Alex decided to take a Computer Science course at Harvard and get a good number of questions about the tools they had developed to teach his st