# Text Analysis and Generation

In [None]:
import unicodedata
import random

def split_line(line):
    """Break a line into whitespace-separated words, treating em-dashes as separators."""
    without_dashes = line.replace('—', ' ')
    words = without_dashes.split()
    return words

def get_punctuation(filename):
    """Collect every distinct punctuation character that appears in a file.

    A character counts as punctuation when its Unicode general category
    (as reported by ``unicodedata.category``) starts with ``'P'``.

    Args:
        filename: Path of the text file to scan.

    Returns:
        A string containing each punctuation character found exactly once,
        in order of first appearance.
    """
    # A dict (rather than a set) keeps the characters in first-seen order.
    punc_marks = {}
    # The context manager guarantees the file is closed, even on error.
    with open(filename) as file:
        for line in file:
            for char in line:
                if unicodedata.category(char).startswith('P'):
                    punc_marks[char] = 1
    return ''.join(punc_marks)

def clean_word(word, punctuation):
    """Trim any of the given punctuation characters from both ends of word, then lowercase it."""
    trimmed = word.strip(punctuation)
    return trimmed.lower()

def build_word_counter(filename, punctuation):
    """Build a frequency histogram of the cleaned words in a text file.

    Each line is split with ``split_line`` and every resulting word is
    normalised with ``clean_word`` before being counted.

    Args:
        filename: Path of the text file to read.
        punctuation: String of characters to strip from the edges of each word.

    Returns:
        A dict mapping each cleaned word to the number of times it occurs.
    """
    word_counter = {}
    # The context manager guarantees the file is closed, even on error.
    with open(filename) as file:
        for line in file:
            for word in split_line(line):
                word = clean_word(word, punctuation)
                word_counter[word] = word_counter.get(word, 0) + 1
    return word_counter

def second_element(t):
    """Sort-key helper: pick out the item at index 1 (the value of a (key, value) pair)."""
    value = t[1]
    return value

def print_most_common(word_counter, num=5):
    """Print the num highest-count entries of word_counter as tab-separated 'count word' lines."""
    ranked = list(word_counter.items())
    # Stable descending sort by count, so ties keep their original order.
    ranked.sort(key=second_element, reverse=True)
    top_entries = ranked[:num]
    for word, freq in top_entries:
        print(freq, word, sep='\t')

def subtract(d1, d2):
    """Build a new dictionary holding the items of d1 whose keys do not occur in d2."""
    return {key: value for key, value in d1.items() if key not in d2}

def choose_from_hist(hist):
    """Pick a random word from a histogram, with probability proportional to its count."""
    # Repeat each word once per occurrence so a uniform choice is frequency-weighted.
    pool = [word for word, freq in hist.items() for _ in range(freq)]
    return random.choice(pool)

def add_bigram(bigram, successor_map):
    """Record the second word of a bigram as a successor of its first word."""
    head, follower = bigram
    # setdefault creates the successor list the first time a head word is seen.
    followers = successor_map.setdefault(head, [])
    followers.append(follower)

def shift(prefix, word):
    """Return a new tuple made of prefix without its first item, followed by word."""
    tail = prefix[1:]
    return tail + (word,)

def generate_text(mapping, n=50):
    """Print up to n random words by walking the bigram-to-successors mapping."""
    if not mapping:
        return

    # Start from a randomly chosen key of the mapping.
    current = random.choice(list(mapping))
    for _ in range(n):
        candidates = mapping.get(current)
        if candidates:
            chosen = random.choice(candidates)
            print(chosen, end=' ')
            # Slide the window: keep the second word and append the new one.
            current = (current[1], chosen)
        else:
            # Dead end: the current bigram has no recorded successors.
            break
    print()

### What are the differences between large language models like ChatGPT and Markov chain text analysis?

While Markov chains and Large Language Models (LLMs) both generate text by predicting the next word in a sequence, they differ fundamentally in context, representation, and complexity. A Markov chain is "memoryless": it relies on a dictionary of exact string matches (like `successor_map`) and considers only a tiny window of previous words—typically just one or two—so its output often drifts into nonsense as the chain grows. In contrast, LLMs use the Transformer architecture and its attention mechanism to maintain a "memory" of thousands of words, and they process language through word embeddings—mathematical vectors that represent the semantic meaning of words and the relationships between them rather than just their spelling. Consequently, while a Markov chain can only recombine word sequences found explicitly in its source text, an LLM can reason, take context into account, and generate entirely original, coherent responses based on statistical patterns learned from vast amounts of human language.

In [None]:
# Write a function that counts the number of times each trigram (sequence of three words) appears.

def count_trigram(trigram, trigram_map):
    """Increment the count of a trigram's third word under its two-word prefix."""
    first, second, third = trigram[0], trigram[1], trigram[2]

    # trigram_map maps (first, second) -> {third: count}.
    counts = trigram_map.setdefault((first, second), {})
    counts.setdefault(third, 0)
    counts[third] += 1

def process_word_trigram(filename, punctuation=None):
    """Read a file and build a mapping of trigram frequencies.

    Every word is split with ``split_line`` and normalised with
    ``clean_word``; each run of three consecutive words is then recorded
    with ``count_trigram``.

    Args:
        filename: Path of the text file to read.
        punctuation: String of characters to strip from the edges of each
            word. When omitted, it is derived from the file itself with
            ``get_punctuation``.

    Returns:
        A dict mapping each two-word prefix tuple to a dict of
        ``{third_word: count}``.
    """
    # clean_word requires the punctuation set; calling it with the word alone
    # raises TypeError, so fall back to the punctuation found in the file.
    if punctuation is None:
        punctuation = get_punctuation(filename)

    words = []
    # The context manager guarantees the file is closed, even on error.
    with open(filename) as file:
        for line in file:
            for word in split_line(line):
                words.append(clean_word(word, punctuation))

    trigram_map = {}
    # Zipping the list with itself shifted by one and two yields every
    # consecutive three-word window; fewer than three words yields none.
    for trigram in zip(words, words[1:], words[2:]):
        count_trigram(trigram, trigram_map)

    return trigram_map

In [None]:
# Write a function called add_trigram that takes a list of three words and either adds or updates an item in successor_map, using the first two words as the key and the third word as a possible successor.

def add_trigram(window, mapping=None):
    """Record the third word of a trigram as a successor of its first two words.

    Args:
        window: Sequence of three words.
        mapping: Dictionary to update, mapping a two-word tuple to the list
            of words seen after it. When omitted, the module-level
            ``successor_map`` is updated, so existing one-argument calls
            behave as before.

    Returns:
        None. The mapping is modified in place.
    """
    if mapping is None:
        # Fall back to the notebook's global map for one-argument calls.
        mapping = successor_map

    prefix = tuple(window[:2])
    suffix = window[2]

    mapping.setdefault(prefix, []).append(suffix)

### Write a loop that generates 50 more words by following these steps:
1. In successor_map, look up the list of words that can follow bigram.
2. Choose one of them at random and print it.
3. For the next iteration, make a new bigram that contains the second word from
bigram and the chosen successor.

In [None]:
import random

# Toy successor map with a single starting bigram.
successor_map = {
    ('the', 'cat'): ['sat', 'jumped'],
}
successors = list(successor_map.keys())
bigram = random.choice(successors)

# Emit up to 50 words, stopping early when the current bigram has no successors.
for i in range(50):
    possible_suffixes = successor_map.get(bigram)

    if possible_suffixes:
        next_word = random.choice(possible_suffixes)
        print(next_word, end=' ')
        # Slide the window: drop the first word and append the chosen one.
        bigram = bigram[1:] + (next_word,)
    else:
        break

sat 