# Day 4: Mathematical Foundations I - Probability Basics

**Learning Objective**: Understand how probability drives text generation in language models.

**Time**: 15 minutes of hands-on practice

**Prerequisites**: Read [Day 4 Guide](../../../docs/daily-guides/week01/day04-probability-basics.md) first (10 minutes)

## 🎯 Today's Focus: Probability in Language Generation

Let's explore how probability makes text generation possible through interactive examples.

In [None]:
# Setup: Import libraries for probability exploration
import random
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from typing import List, Dict, Tuple

# Confirm that the setup cell ran and announce the topic of the session.
print(
    "🚀 Day 4 Environment Ready!",
    "Today we'll explore: Probability in Language Models",
    sep="\n",
)

## 🎲 Basic Probability Concepts

Let's start with fundamental probability concepts using simple examples.

In [None]:
def probability_basics_demo() -> Dict[str, float]:
    """
    Demonstrate basic probability concepts with two printed examples.

    Prints a fair coin-flip example, then a hard-coded next-word
    distribution for the context 'The weather is ___' together with the
    distribution's total.

    Returns:
        Dict mapping each candidate next word to its probability.
    """
    print("🎲 PROBABILITY BASICS")
    print("="*30)

    # Simple coin flip probability
    print("\n🪙 Coin Flip Example:")
    print("   Possible outcomes: ['Heads', 'Tails']")
    print("   Probability of Heads: 0.5 (50%)")
    print("   Probability of Tails: 0.5 (50%)")
    print("   Total probability: 0.5 + 0.5 = 1.0 (100%)")

    # Word prediction probability
    print("\n📝 Word Prediction Example:")
    next_word_probs = {
        'good': 0.4,
        'great': 0.3,
        'excellent': 0.2,
        'bad': 0.1
    }

    print("   Context: 'The weather is ___'")
    print("   Possible next words and their probabilities:")
    for word, prob in next_word_probs.items():
        # Format the percentage explicitly: in binary floating point
        # 0.3 * 100 evaluates to 30.000000000000004, which would otherwise
        # be printed verbatim.
        print(f"      '{word}': {prob} ({prob*100:.0f}%)")

    # Summing floats accumulates rounding error, so the raw total can print
    # as a value a hair below 1.0 right next to "(must equal 1.0)". Round
    # for display only; the returned dict is untouched.
    total_prob = sum(next_word_probs.values())
    print(f"   Total probability: {total_prob:.1f} (must equal 1.0)")

    print("\n💡 Key insight: AI assigns probabilities to every possible next word")

    return next_word_probs


# Run the demo and keep the returned word -> probability dict; it is passed to
# visualize_probability_distribution() in the visualization cell.
word_probabilities = probability_basics_demo()

## 📊 Visualizing Probability Distributions

Let's visualize how probabilities look and how they affect text generation.

In [None]:
def visualize_probability_distribution(probabilities: Dict[str, float],
                                       num_samples: int = 10) -> List[str]:
    """
    Show a word distribution as a text bar chart and sample words from it.

    Args:
        probabilities: Mapping of word -> probability. The values are used
            as sampling weights, and bars are scaled relative to the largest
            value.
        num_samples: How many words to draw for the simulation (default 10).

    Returns:
        The sampled words, in the order they were drawn.

    Raises:
        ValueError: If ``probabilities`` is empty or its largest value is
            not positive, because neither a chart nor a sample can be
            produced from such input.
    """
    print("📊 VISUALIZING PROBABILITY DISTRIBUTION")

    if not probabilities:
        raise ValueError("probabilities must contain at least one word")

    words = list(probabilities.keys())
    probs = list(probabilities.values())

    max_prob = max(probs)
    if max_prob <= 0:
        # Guard the bar scaling below against division by zero.
        raise ValueError("at least one probability must be greater than zero")

    # Text-based visualization
    print("\n📈 Probability Bar Chart (text version):")
    for word, prob in probabilities.items():
        bar_length = int((prob / max_prob) * 20)  # Scale to 20 characters
        bar = "█" * bar_length
        print(f"   {word:10} {bar} {prob:.1f}")

    # Simulation: draw all samples in one weighted call.
    print(f"\n🎲 Simulation: {num_samples} random selections based on probabilities:")
    selections = random.choices(words, weights=probs, k=num_samples)

    selection_counts = Counter(selections)
    print(f"   Selected words: {selections}")
    print(f"   Selection counts: {dict(selection_counts)}")

    print("\n💡 Notice: Higher probability words appear more often (but not always!)")

    return selections


# Chart and sample from the distribution returned by probability_basics_demo();
# the sampled words are kept in `selections`.
selections = visualize_probability_distribution(word_probabilities)

## 🔗 Conditional Probability in Language

Language models use conditional probability - the probability of a word given the previous context.

In [None]:
def conditional_probability_demo():
    """
    Demonstrate conditional probability P(word | context) with toy data.

    Prints the three most likely next words for each hard-coded context and
    then the probability of one shared word ('good') under that context, so
    the output actually shows the same word receiving a different
    probability depending on what came before.

    Returns:
        Dict mapping each context string to its {next_word: probability}
        distribution. Every distribution sums to 1.0.
    """
    print("🔗 CONDITIONAL PROBABILITY IN LANGUAGE")
    print("="*45)

    # Word present in every context; its probability is printed per context
    # to back up the closing "different probabilities" message.
    tracked_word = 'good'

    # Different contexts lead to different probability distributions.
    # 'good' is deliberately given a different value in each one.
    contexts = {
        "The weather is": {
            'sunny': 0.3,
            'rainy': 0.2,
            'cloudy': 0.2,
            'good': 0.2,
            'terrible': 0.1
        },
        "The dog is": {
            'running': 0.25,
            'sleeping': 0.25,
            'barking': 0.2,
            'good': 0.15,
            'hungry': 0.15
        },
        "I am": {
            'happy': 0.3,
            'tired': 0.25,
            'learning': 0.2,
            'good': 0.1,
            'confused': 0.15
        }
    }

    print("Context affects probability! Same word, different contexts:")

    for context, word_probs in contexts.items():
        print(f"\n📝 Context: '{context} ___'")
        print("   Top 3 likely next words:")

        # Sort by probability and show top 3. sorted() is stable, so words
        # with equal probability keep their order from the dict above.
        sorted_words = sorted(word_probs.items(),
                              key=lambda x: x[1], reverse=True)[:3]
        for word, prob in sorted_words:
            print(f"      '{word}': {prob} ({prob*100:.0f}%)")

        # The tracked word is not always in the top 3, so show it explicitly.
        print(
            f"   P('{tracked_word}' | '{context}') = {word_probs[tracked_word]:.2f}")

    print("\n🎯 KEY INSIGHT: P(word | context) - Probability depends on what came before!")
    print("\n💡 This is why 'good' has different probabilities in different contexts")

    return contexts


# Run the demo and keep the returned context -> distribution mapping.
context_probabilities = conditional_probability_demo()

## ⚡ Interactive Probability Exploration

Let's create our own probability-based text generator to see how it works!

In [None]:
def interactive_probability_generator(context: str, num_words: int = 3) -> str:
    """
    Extend ``context`` word by word using a toy last-word transition table.

    At each step the last word selects a probability distribution over next
    words. One random number in [0, 1) is drawn, printed, and used to pick
    the next word by walking the cumulative probabilities, so the printed
    value is the one that decides the choice. Words without an entry in the
    table (or an empty context) fall back to a uniform pick from a small
    list of filler words.

    Args:
        context: Starting text. The table lookup is case-insensitive.
        num_words: Number of words to append.

    Returns:
        The context followed by the generated words, separated by spaces.
    """
    print("⚡ INTERACTIVE PROBABILITY GENERATOR")
    print(f"Starting context: '{context}'")
    print("="*50)

    # Simple probability model based on last word
    word_transitions = {
        'the': {'cat': 0.3, 'dog': 0.3, 'weather': 0.2, 'AI': 0.2},
        'cat': {'is': 0.4, 'runs': 0.3, 'sleeps': 0.3},
        'dog': {'is': 0.4, 'runs': 0.3, 'barks': 0.3},
        'weather': {'is': 0.6, 'looks': 0.4},
        'AI': {'is': 0.5, 'learns': 0.3, 'helps': 0.2},
        'is': {'good': 0.25, 'great': 0.25, 'running': 0.2, 'sunny': 0.15, 'learning': 0.15},
        'runs': {'fast': 0.4, 'quickly': 0.3, 'away': 0.3},
        'sleeps': {'peacefully': 0.5, 'quietly': 0.3, 'soundly': 0.2}
    }
    # Context words are lowercased but the table has a mixed-case key ('AI'),
    # so index the table by lowercase key to make the lookup case-insensitive.
    transitions_by_lower = {
        key.lower(): dist for key, dist in word_transitions.items()}
    fallback_words = ['and', 'the', 'is', 'very']

    current_text = context
    words = current_text.lower().split()

    for step in range(num_words):
        print(f"\n🔄 Step {step + 1}:")
        print(f"   Current text: '{current_text}'")

        possible_words = None
        if words:
            last_word = words[-1]
            print(f"   Looking at last word: '{last_word}'")
            possible_words = transitions_by_lower.get(last_word.lower())

        if possible_words is not None:
            print(f"   Possible next words: {possible_words}")

            # Show probability decision process
            print("   🎲 Rolling probability dice...")
            random_value = random.random()
            print(f"   Random value: {random_value:.3f}")

            # Weighted selection driven by the value shown above: scale it
            # to the total weight and take the first word whose cumulative
            # weight exceeds it.
            threshold = random_value * sum(possible_words.values())
            cumulative = 0.0
            chosen_word = None
            for candidate, weight in possible_words.items():
                # Keeping the latest candidate means the last word wins if
                # float rounding leaves the threshold at or above the total.
                chosen_word = candidate
                cumulative += weight
                if threshold < cumulative:
                    break

            print(
                f"   ✅ Selected: '{chosen_word}' (probability: {possible_words[chosen_word]})")
        else:
            # Fallback for unknown words and for an empty context
            chosen_word = random.choice(fallback_words)
            if words:
                print(f"   ⚠️ Unknown word, using fallback: '{chosen_word}'")
            else:
                print(f"   ⚠️ No previous word, using fallback: '{chosen_word}'")

        # Avoid a leading space when the starting context was empty.
        current_text = f"{current_text} {chosen_word}" if current_text else chosen_word
        words.append(chosen_word)

        print(f"   📝 Updated text: '{current_text}'")

    print(f"\n🎉 FINAL GENERATED TEXT: '{current_text}'")
    print("\n💡 This shows how probability guides each word choice!")

    return current_text


# Try the generator: extend "The cat" by three sampled words and keep the
# resulting text. No random seed is set in this cell, so the output is
# expected to vary between runs.
generated_text = interactive_probability_generator("The cat", 3)

## 🎯 Temperature: Controlling Randomness

Let's explore how 'temperature' affects probability distributions and creativity.

In [None]:
def temperature_effects_demo(
        temperatures: Tuple[float, ...] = (0.1, 0.5, 1.0, 1.5, 2.0)) -> List[float]:
    """
    Demonstrate how temperature reshapes a probability distribution.

    For each temperature, rescales a fixed four-word distribution, prints
    the resulting probabilities, and prints a one-line description of the
    effect.

    Args:
        temperatures: Temperatures to demonstrate, in order. 0 is allowed
            and means greedy selection (all mass on the most likely word).

    Returns:
        The temperatures that were demonstrated, as a list.

    Raises:
        ValueError: If any temperature is negative.
    """
    print("🌡️ TEMPERATURE EFFECTS ON PROBABILITY")
    print("="*45)

    # Original probabilities
    original_probs = {'good': 0.4, 'great': 0.3,
                      'excellent': 0.2, 'amazing': 0.1}

    def apply_temperature(probs: Dict[str, float], temperature: float) -> Dict[str, float]:
        """Return ``probs`` rescaled by ``temperature`` and renormalized."""
        if temperature < 0:
            raise ValueError(
                f"temperature must be non-negative, got {temperature}")
        if temperature == 0:
            # Greedy: the limit of the scaling below as temperature -> 0
            # puts all probability on the most likely word.
            max_word = max(probs, key=probs.get)
            return {word: (1.0 if word == max_word else 0.0) for word in probs}

        # p ** (1/T) followed by normalization is the same as applying
        # softmax to log(p) / T, i.e. standard temperature scaling expressed
        # on probabilities instead of logits.
        scaled_probs = {word: prob ** (1 / temperature)
                        for word, prob in probs.items()}

        # Normalize to sum to 1
        total = sum(scaled_probs.values())
        return {word: prob / total for word, prob in scaled_probs.items()}

    temperatures = list(temperatures)

    print(f"Original probabilities: {original_probs}")
    print("\n🌡️ Effect of different temperatures:")

    for temp in temperatures:
        # apply_temperature handles temperature 0 itself, so no special
        # casing is needed here.
        temp_probs = apply_temperature(original_probs, temp)

        print(f"\n   Temperature {temp}:")
        for word, prob in temp_probs.items():
            print(f"      '{word}': {prob:.3f}")

        # Describe the effect
        if temp <= 0.5:
            print("      → More predictable, conservative choices")
        elif temp <= 1.0:
            print("      → Balanced creativity and predictability")
        else:
            print("      → More creative, diverse choices")

    print("\n💡 Key insights:")
    print("   • Low temperature → More predictable text")
    print("   • High temperature → More creative/random text")
    print("   • Temperature = 1.0 → Use original probabilities")

    return temperatures


# Run the temperature demo. The return value is not stored; as the last
# expression of the cell it is presumably echoed by the notebook.
temperature_effects_demo()

## 🏆 Day 4 Knowledge Check

Test your understanding of probability in language models:

In [None]:
def day4_knowledge_check():
    """
    Print the Day 4 question/answer review sheet on probability concepts.

    Each entry is shown as a numbered question followed by its short answer
    and a one-line justification. Always returns True.
    """
    print("📋 Day 4 Knowledge Check: Probability in Language Models")

    # (question, short answer, justification) triples, in display order.
    quiz_items = (
        (
            "What must all probabilities in a distribution sum to?",
            "1.0 (100%)",
            "Complete certainty across all possibilities",
        ),
        (
            "How does context affect word probabilities?",
            "Changes the distribution",
            "P(word|context) varies with different contexts",
        ),
        (
            "What happens with low temperature (0.1)?",
            "More predictable output",
            "Concentrates probability on likely words",
        ),
        (
            "What happens with high temperature (2.0)?",
            "More creative output",
            "Spreads probability more evenly",
        ),
        (
            "Why don't language models always pick the highest probability word?",
            "To avoid repetition",
            "Sampling adds variety and creativity",
        ),
    )

    print("\nProbability concepts in language generation:")

    number = 0
    for prompt, short_answer, rationale in quiz_items:
        number += 1
        # One print per entry; the embedded newlines reproduce the
        # three-line Q / A / Why layout.
        print(f"\n{number}. Q: {prompt}\n   A: {short_answer}\n   Why: {rationale}")

    print("\n🎯 If you understand these probability concepts, you've mastered Day 4!")
    return True


# Print the review sheet. The function returns True, which as the cell's last
# expression is presumably echoed by the notebook.
day4_knowledge_check()

## 📝 Day 4 Reflection (5 minutes)

Reflect on probability in language generation:

In [None]:
# Reflection prompts and the pointer to the next lesson, emitted in a single
# call. Entries that start with "\n" produce the blank separator lines.
print("\n".join((
    "📝 Day 4 Reflection Questions:",
    "\n1. How does probability make language generation possible?",
    "   Your answer: [Write your explanation here]",
    "\n2. Why is conditional probability important for coherent text?",
    "   Your answer: [Write your reasoning here]",
    "\n3. When would you prefer low vs high temperature settings?",
    "   Your answer: [Write your use cases here]",
    "\n🎯 Tomorrow: We'll explore LOSS FUNCTIONS - how AI learns to improve",
    "📖 Next guide: Day 5 - Mathematical Foundations II (Loss Functions)",
)))

## ✅ Day 4 Completion Checklist

Before moving to Day 5, confirm you can:

- [ ] Explain how probability drives text generation
- [ ] Understand conditional probability P(word|context)
- [ ] Describe how temperature affects creativity
- [ ] Recognize why probabilities must sum to 1.0
- [ ] Connect probability concepts to language model behavior

**🎉 Day 4 Complete!** Ready for [Day 5: Loss Functions](day05-loss-functions.ipynb)?

In [None]:
# Enhanced Day 4: Probability in AI - Interactive Exploration
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats


def explore_probability_distributions():
    """
    Plot four example probability distributions and print a short summary.

    Draws a 2x2 matplotlib figure:
      1. a uniform density on [0, 1],
      2. the standard normal density on [-4, 4],
      3. a bar chart of the softmax of five fixed logits,
      4. a randomly sampled 8-position weight vector, labelled as
         attention weights.
    The figure is shown with plt.show(), after which the sums and the
    largest entries of the softmax and attention vectors are printed.

    The attention weights are drawn from a Dirichlet distribution rather
    than produced by a model, and no seed is set inside this function, so
    that panel is expected to differ between runs.

    Returns:
        Dict with keys 'softmax_probs' (array), 'attention_weights' (array)
        and 'categories' (list of labels).
    """
    print("🎲 Probability Distributions in AI")
    print("=" * 50)

    # Create subplot for different distributions
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

    # 1. Uniform Distribution (baseline for comparison)
    # Constant density 1 over [0, 1].
    x_uniform = np.linspace(0, 1, 1000)
    y_uniform = np.ones_like(x_uniform)

    ax1.plot(x_uniform, y_uniform, 'b-', linewidth=3,
             label='Uniform Distribution')
    ax1.fill_between(x_uniform, y_uniform, alpha=0.3, color='blue')
    ax1.set_title('Uniform Distribution\n(Equal probability for all outcomes)')
    ax1.set_xlabel('Value')
    ax1.set_ylabel('Probability Density')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Normal Distribution (Gaussian)
    # Density of a normal with mean 0 and standard deviation 1.
    x_normal = np.linspace(-4, 4, 1000)
    y_normal = stats.norm.pdf(x_normal, 0, 1)

    ax2.plot(x_normal, y_normal, 'g-', linewidth=3, label='Standard Normal')
    ax2.fill_between(x_normal, y_normal, alpha=0.3, color='green')
    ax2.axvline(0, color='red', linestyle='--', alpha=0.7, label='Mean')
    ax2.set_title('Normal Distribution\n(Bell curve - common in AI)')
    ax2.set_xlabel('Value')
    ax2.set_ylabel('Probability Density')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # 3. Softmax-like distribution (categorical)
    categories = ['Cat', 'Dog', 'Bird', 'Fish', 'Rabbit']
    # Simulate model confidence
    # Hard-coded scores, one per category, in the same order.
    logits = np.array([2.1, 1.8, 0.5, 0.3, 0.8])
    # Softmax: exponentiate, then divide by the total so the result sums to 1.
    softmax_probs = np.exp(logits) / np.sum(np.exp(logits))

    bars = ax3.bar(categories, softmax_probs, color=[
                   'orange', 'brown', 'blue', 'cyan', 'pink'], alpha=0.7)
    ax3.set_title('Softmax Distribution\n(AI model predictions)')
    ax3.set_xlabel('Categories')
    ax3.set_ylabel('Probability')
    ax3.tick_params(axis='x', rotation=45)

    # Add probability values on bars
    # Each label is centered horizontally and placed 0.01 above the bar top.
    for bar, prob in zip(bars, softmax_probs):
        height = bar.get_height()
        ax3.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                 f'{prob:.3f}', ha='center', va='bottom', fontweight='bold')

    ax3.grid(True, alpha=0.3)

    # 4. Attention weights visualization
    sequence_length = 8
    # Random stand-in for attention: a Dirichlet sample with concentration 2
    # for every position, which is non-negative and sums to 1.
    attention_weights = np.random.dirichlet(np.ones(sequence_length) * 2)
    positions = np.arange(sequence_length)

    ax4.plot(positions, attention_weights, 'ro-', linewidth=2,
             markersize=8, label='Attention Weights')
    ax4.fill_between(positions, attention_weights, alpha=0.3, color='red')
    ax4.set_title(
        'Attention Probability Distribution\n(Where the model "looks")')
    ax4.set_xlabel('Sequence Position')
    ax4.set_ylabel('Attention Weight')
    ax4.set_xticks(positions)
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Print probability insights
    print("\n📊 Probability Insights:")
    print(
        f"Softmax predictions sum to: {np.sum(softmax_probs):.6f} (should be 1.0)")
    print(
        f"Most confident prediction: {categories[np.argmax(softmax_probs)]} ({softmax_probs.max():.3f})")
    print(
        f"Attention weights sum to: {np.sum(attention_weights):.6f} (should be 1.0)")
    # Position is the 0-based index into the 8-element weight vector.
    print(
        f"Most attended position: {np.argmax(attention_weights)} ({attention_weights.max():.3f})")

    return {
        'softmax_probs': softmax_probs,
        'attention_weights': attention_weights,
        'categories': categories
    }


# Run the probability exploration (shows the figure and prints the summary)
# and keep the returned dict of softmax probabilities, attention weights and
# category labels.
prob_data = explore_probability_distributions()