# Tutorial 3: Topic Modeling and Communication Patterns

**Goal:** Discover actual topics and patterns in central bank communications using objective, measurable techniques.

**What you'll learn:**
- Topic modeling with LDA (discover hidden themes)
- Concrete keyword and term tracking
- Language complexity metrics (objective readability)
- Vocabulary diversity analysis
- Communication pattern changes
- Comparing language use across banks

**Time:** ~1 hour

**Why these methods?**
These are **objective, measurable** techniques based on actual word usage and patterns, not subjective "sentiment" guessing.

## Step 1: Setup

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import re

# NLP tools
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk

%matplotlib inline
plt.style.use('seaborn-v0_8-darkgrid')

# Load data function
def load_statements(directory, bank_name):
    """
    Load every ``.txt`` statement in a directory into a date-sorted DataFrame.

    The statement date is taken from the file name: the trailing ``.txt``
    extension (and a trailing ``-txt`` marker, if present) is removed and the
    remainder is parsed with ``pd.to_datetime``.

    Args:
        directory: Folder containing one text file per statement.
        bank_name: Label written to the ``bank`` column of every row.

    Returns:
        DataFrame with columns ``date`` (datetime), ``bank`` and ``text``,
        sorted by date with a fresh 0..n-1 index. A folder without any
        ``.txt`` file yields an empty frame that still has these columns.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        ValueError: If a file name cannot be parsed as a date.
    """
    statements = []
    # os.listdir order is arbitrary; sorting it (together with the stable
    # sort below) keeps the row order reproducible even when dates tie.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):
            continue
        # Remove only the trailing markers, never a match inside the name.
        date_str = filename[:-len('.txt')]
        if date_str.endswith('-txt'):
            date_str = date_str[:-len('-txt')]
        filepath = os.path.join(directory, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read()
        statements.append({'date': date_str, 'bank': bank_name, 'text': text})

    # Naming the columns explicitly keeps them present when nothing was loaded,
    # so the 'date' lookups below do not raise KeyError on an empty folder.
    df = pd.DataFrame(statements, columns=['date', 'bank', 'text'])
    df['date'] = pd.to_datetime(df['date'])
    return df.sort_values('date', kind='mergesort').reset_index(drop=True)

# Read each bank's statements, then stack them into one chronological table
fed_data = load_statements(directory='../usa-central-bank/fomc-statements', bank_name='Fed')
nz_data = load_statements(directory='../nz-central-bank/ocr', bank_name='RBNZ')
all_data = (
    pd.concat([fed_data, nz_data], ignore_index=True)
    .sort_values('date')
    .reset_index(drop=True)
)

print(f"âœ“ Loaded {len(all_data)} statements")

## Step 2: Topic Modeling with LDA

**LDA (Latent Dirichlet Allocation)** discovers hidden topics automatically by finding words that tend to appear together.

This is **objective** - it's based on actual statistical patterns in the text, not guessing emotions.

In [None]:
# Build the document-term matrix used as LDA input.
# CountVectorizer produces plain word counts (this is not TF-IDF weighting).
vectorizer = CountVectorizer(
    stop_words='english',  # drop common English stop words
    min_df=2,              # keep a word only if it occurs in at least 2 documents
    max_df=0.8,            # drop words that occur in more than 80% of documents
    max_features=200,      # cap the vocabulary at the 200 most frequent words
)

# One row per statement, one column per vocabulary word
doc_term_matrix = vectorizer.fit_transform(all_data['text'])
feature_names = vectorizer.get_feature_names_out()

print(f"Created matrix: {doc_term_matrix.shape[0]} documents, {doc_term_matrix.shape[1]} words")
print(f"\nSample words: {', '.join(feature_names[:20])}")

In [None]:
# How many topics the model is asked to find
n_topics = 5

# random_state pins the seed so reruns give the same topics;
# fit() returns the fitted estimator, so it can be chained on construction.
lda = LatentDirichletAllocation(
    max_iter=20,
    random_state=42,
    n_components=n_topics,
).fit(doc_term_matrix)

print("âœ“ Topics discovered!")

In [None]:
# Display the topics
def display_topics(model, feature_names, n_top_words=10):
    """
    Collect the highest-weighted words of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, iterated as one weight
            vector per topic (each vector must support ``argsort``).
        feature_names: Words indexed by the positions of those weights.
        n_top_words: How many words to keep per topic.

    Returns:
        Dict mapping "Topic 1", "Topic 2", ... to a list of words ordered
        from highest to lowest weight.
    """
    def strongest_words(weights):
        # argsort is ascending: take the tail, then flip to strongest-first
        tail = weights.argsort()[-n_top_words:]
        return [feature_names[position] for position in tail[::-1]]

    return {
        f"Topic {number}": strongest_words(weights)
        for number, weights in enumerate(model.components_, start=1)
    }

# Top 10 words per topic, keyed by "Topic N"
topics = display_topics(lda, feature_names, n_top_words=10)

# Header followed by a rule of 80 '=' characters
print("Discovered Topics:", "=" * 80, sep="\n")
for topic_name in topics:
    words = topics[topic_name]
    print(f"\n{topic_name}:")
    print(f"  {', '.join(words)}")

print("\nðŸ’¡ These topics were discovered automatically from word co-occurrence patterns.")
print("   Can you identify what each topic is about? (e.g., inflation, employment, policy)")

## Step 3: Track Topic Prevalence Over Time

Let's see which topics are emphasized in different time periods.

In [None]:
# Topic weights per statement: rows follow doc_term_matrix, one column per topic
topic_distributions = lda.transform(doc_term_matrix)

# Store each topic's weights as its own column: topic_1 ... topic_<n_topics>
for topic_no in range(1, n_topics + 1):
    all_data[f'topic_{topic_no}'] = topic_distributions[:, topic_no - 1]

# Only the Fed rows are plotted here
fed_topics = all_data[all_data['bank'] == 'Fed']

plt.figure(figsize=(14, 6))
for topic_no in range(1, n_topics + 1):
    plt.plot(
        fed_topics['date'],
        fed_topics[f'topic_{topic_no}'],
        label=f'Topic {topic_no}',
        marker='o',
        linewidth=2,
    )

plt.title('Topic Evolution in Fed Statements', fontsize=14, fontweight='bold')
plt.xlabel('Date', fontsize=12)
plt.ylabel('Topic Weight', fontsize=12)
plt.legend(fontsize=10, loc='upper left')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\nðŸ’¡ This shows which topics were emphasized at different times.")
print("   Look for shifts that might correspond to economic events.")

## Step 4: Concrete Economic Term Tracking

Instead of vague "sentiment", let's track **actual economic terms** and see what the bank is focusing on.

In [None]:
# Economic vocabulary to track.
# Each key is a category label (later used as a column name in all_data);
# each value lists the lowercase words or phrases counted for that category.
economic_terms = {
    "Inflation": [
        "inflation",
        "price",
        "prices",
    ],
    "Employment": [
        "employment",
        "labor",
        "jobs",
        "unemployment",
    ],
    "Growth": [
        "growth",
        "economic activity",
        "expansion",
    ],
    "Risk": [
        "risk",
        "uncertainty",
        "uncertain",
    ],
    "Policy": [
        "policy",
        "monetary policy",
        "target",
    ],
    "Financial": [
        "financial",
        "market",
        "markets",
        "credit",
    ],
}

def count_term_mentions(text, terms):
    """
    Count mentions of any of the given terms in a text (case-insensitive).

    Every stretch of text is counted at most once, even when several terms
    overlap: "prices" is one mention for ['price', 'prices'], and
    "monetary policy" is one mention for ['policy', 'monetary policy'].
    A term only matches at the start of a word, so 'labor' does not fire
    inside "elaborate" and 'employment' does not fire inside "unemployment".
    Word endings are still allowed, so 'risk' also counts "risks".

    Args:
        text: The document to search.
        terms: Iterable of words or phrases; empty strings are ignored.

    Returns:
        Number of non-overlapping mentions as an int (0 if no usable terms).
    """
    # Longest term first so the regex alternation prefers 'prices' over
    # 'price' at the same position; the secondary key keeps the order stable.
    unique_terms = sorted(
        {term.lower() for term in terms if term},
        key=lambda term: (-len(term), term),
    )
    if not unique_terms:
        return 0

    alternation = '|'.join(re.escape(term) for term in unique_terms)
    # (?<!\w) means "not preceded by a word character", i.e. a word start.
    pattern = r'(?<!\w)(?:' + alternation + r')'
    return len(re.findall(pattern, text.lower()))

# Raw mention count per statement, stored in a column named after the category
for category, terms in economic_terms.items():
    all_data[category] = all_data['text'].apply(count_term_mentions, args=(terms,))

# Document length in whitespace-separated words, used as the denominator
all_data['word_count'] = all_data['text'].str.split().str.len()

# Rescale every category to mentions per 100 words
for category in economic_terms:
    all_data[f'{category}_per_100'] = all_data[category].div(all_data['word_count']).mul(100)

print("âœ“ Term tracking complete")
all_data[['date', 'bank', *(f'{cat}_per_100' for cat in economic_terms)]].head()

In [None]:
# Plot term frequency over time: one panel per category, one line per bank.
# The grid is sized from the number of categories, so adding entries to
# economic_terms no longer overruns a fixed 2x3 layout.
categories = list(economic_terms)
n_cols = 3
n_rows = max(1, -(-len(categories) // n_cols))  # ceiling division, at least one row

# 5 inches of height per row reproduces the 16x10 figure for two rows.
# squeeze=False always returns a 2-D array, so flatten() works for any grid.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 5 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, category in zip(axes, categories):
    for bank in all_data['bank'].unique():
        bank_data = all_data[all_data['bank'] == bank]
        ax.plot(bank_data['date'], bank_data[f'{category}_per_100'],
                marker='o', label=bank, linewidth=2, markersize=4)

    ax.set_title(f'{category} Mentions', fontweight='bold')
    ax.set_ylabel('Mentions per 100 words')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Hide panels left over when the category count is not a multiple of n_cols
for unused_ax in axes[len(categories):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

print("\nðŸ’¡ This shows concrete, measurable changes in what central banks talk about.")

## Step 5: Language Complexity Analysis

Measure **objective** readability metrics. Are statements getting simpler or more complex?

In [None]:
def calculate_complexity_metrics(text):
    """
    Calculate objective text complexity metrics.

    Words are whitespace-separated tokens with leading/trailing punctuation
    removed ("inflation," is measured as "inflation"); tokens that are pure
    punctuation are ignored. A sentence ends at a run of '.', '!' or '?'
    (optionally followed by a closing quote or bracket) that is itself
    followed by whitespace or the end of the text, so decimal points such as
    "2.5" do not split a sentence. Abbreviations followed by a space
    (e.g. "U.S. economy") are still treated as a sentence end.

    Args:
        text: The document to analyse.

    Returns:
        Dict with:
            avg_sentence_length: words per sentence
            avg_word_length: characters per word
            long_words_pct: percentage of words with 7 or more characters
        All three values are 0 when the text has no words or no sentences.
    """
    # Strip punctuation stuck to either end of a token so it does not inflate
    # word length or the long-word share.
    edge_punctuation = re.compile(r'^\W+|\W+$')
    stripped = (edge_punctuation.sub('', token) for token in text.split())
    words = [word for word in stripped if word]

    # The lookahead requires whitespace/end after the terminator, which keeps
    # "2.5" and the inner dot of "U.S." from counting as sentence breaks.
    sentences = re.split(r'''[.!?]+["'”’)\]]*(?=\s|$)''', text)
    sentences = [s for s in sentences if s.strip()]  # Remove empty

    if not words or not sentences:
        return {'avg_sentence_length': 0, 'avg_word_length': 0, 'long_words_pct': 0}

    word_lengths = [len(word) for word in words]
    long_words = sum(1 for length in word_lengths if length >= 7)

    return {
        'avg_sentence_length': len(words) / len(sentences),
        'avg_word_length': sum(word_lengths) / len(words),
        'long_words_pct': (long_words / len(words)) * 100
    }

# Run the metric function once per statement; each result is a dict
complexity = all_data['text'].apply(calculate_complexity_metrics)

# Unpack every dict entry into its own column of all_data
for metric_name in ('avg_sentence_length', 'avg_word_length', 'long_words_pct'):
    all_data[metric_name] = complexity.apply(lambda result, key=metric_name: result[key])

print("âœ“ Complexity metrics calculated")
all_data[['date', 'bank', 'avg_sentence_length', 'avg_word_length', 'long_words_pct']].head()

In [None]:
# One panel per complexity metric, one line per bank
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# (column in all_data, panel title)
metrics = [
    ('avg_sentence_length', 'Average Sentence Length (words)'),
    ('avg_word_length', 'Average Word Length (characters)'),
    ('long_words_pct', 'Long Words (%)')
]

for panel, (metric, title) in zip(axes, metrics):
    for bank in all_data['bank'].unique():
        bank_data = all_data.loc[all_data['bank'] == bank]
        panel.plot(bank_data['date'], bank_data[metric],
                   marker='o', label=bank, linewidth=2)

    panel.set_title(title, fontweight='bold')
    # Y label is derived from the column name, e.g. "Avg Sentence Length"
    panel.set_ylabel(metric.replace('_', ' ').title())
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nðŸ’¡ Increasing values = more complex language.")
print("   This is objective and measurable, unlike sentiment.")

## Step 6: Vocabulary Diversity

Are statements using more varied language, or repeating the same words?

In [None]:
def calculate_vocabulary_diversity(text):
    """
    Calculate lexical diversity (Type-Token Ratio).
    Higher = more diverse vocabulary.

    Words are lower-cased, whitespace-separated tokens with leading/trailing
    punctuation removed, so "Inflation," and "inflation." count as the same
    word. Tokens that consist only of punctuation are ignored.

    Args:
        text: The document to analyse.

    Returns:
        Number of distinct words divided by the total number of words,
        or 0 when the text contains no words.
    """
    # Without this trimming, every punctuation variant of a word would be
    # counted as a separate type and inflate the ratio.
    edge_punctuation = re.compile(r'^\W+|\W+$')
    stripped = (edge_punctuation.sub('', token) for token in text.lower().split())
    words = [word for word in stripped if word]

    if not words:
        return 0

    return len(set(words)) / len(words)

# Type-token ratio of every statement
all_data['vocab_diversity'] = all_data['text'].map(calculate_vocabulary_diversity)

# One line per bank on a shared time axis
plt.figure(figsize=(14, 6))
for bank in all_data['bank'].unique():
    bank_data = all_data.loc[all_data['bank'] == bank]
    plt.plot(
        bank_data['date'],
        bank_data['vocab_diversity'],
        marker='o',
        label=bank,
        linewidth=2,
    )

plt.title('Language Diversity Over Time', fontsize=14, fontweight='bold')
plt.xlabel('Date', fontsize=12)
plt.ylabel('Vocabulary Diversity (Type-Token Ratio)', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\nðŸ’¡ Higher diversity = more varied language (less repetitive).")
print("   Lower diversity = more standardized, formulaic language.")

## Step 7: Most Distinctive Words by Bank

What words are characteristic of each bank? (Using TF-IDF)

In [None]:
# Compare banks using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine all text for each bank: one long document per bank
bank_texts = all_data.groupby('bank')['text'].apply(lambda x: ' '.join(x)).to_dict()

# Calculate TF-IDF.
# NOTE(review): the corpus here has only one document per bank, so the IDF
# part is estimated from very few documents - confirm this is intended.
tfidf = TfidfVectorizer(max_features=50, stop_words='english')
tfidf_matrix = tfidf.fit_transform(bank_texts.values())
# Use a dedicated name: `feature_names` holds the LDA vocabulary from Step 2,
# and overwriting it would break re-running the topic display cell.
tfidf_feature_names = tfidf.get_feature_names_out()

# Get top words for each bank (rows of tfidf_matrix follow bank_texts order)
for idx, bank in enumerate(bank_texts.keys()):
    scores = tfidf_matrix[idx].toarray()[0]
    top_indices = scores.argsort()[-15:][::-1]  # highest score first
    top_words = [(tfidf_feature_names[i], scores[i]) for i in top_indices]

    print(f"\nMost Distinctive Words for {bank}:")
    print("=" * 60)
    for word, score in top_words[:10]:
        print(f"  {word:20s} {score:.3f}")

print("\nðŸ’¡ These words are statistically distinctive to each bank's language.")

## Step 8: Changing Language Patterns

Detect when language patterns shift significantly.

In [None]:
# Statement-to-statement change in key metrics, computed within each bank.
# The first statement of every bank has no predecessor and therefore gets NaN.
change_sources = {
    'length_change': 'word_count',
    'complexity_change': 'avg_sentence_length',
    'diversity_change': 'vocab_diversity',
}
for change_column, source_column in change_sources.items():
    all_data[change_column] = all_data.groupby('bank')[source_column].diff()

# Report the five largest positive jumps for length and for complexity
length_view = ['date', 'bank', 'word_count', 'length_change']
print("Biggest Length Increases:")
print(all_data.nlargest(5, 'length_change')[length_view])

complexity_view = ['date', 'bank', 'avg_sentence_length', 'complexity_change']
print("\nBiggest Complexity Increases:")
print(all_data.nlargest(5, 'complexity_change')[complexity_view])

print("\nðŸ’¡ These dates might mark important communication shifts.")

## Step 9: Summary Dashboard

Bring it all together in one comprehensive view.

In [None]:
# Summary dashboard: a 2x2 grid of the main views from the earlier steps
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# Panel 1 (top-left): topic weights for the Fed only, limited to the
# first three topics to keep the panel readable
topic_panel = axes[0, 0]
fed_topics = all_data[all_data['bank'] == 'Fed']
for i in range(min(3, n_topics)):
    topic_panel.plot(
        fed_topics['date'],
        fed_topics[f'topic_{i+1}'],
        label=f'Topic {i+1}',
        marker='o',
        linewidth=2,
    )
topic_panel.set_title('Topic Evolution (Fed)', fontweight='bold')
topic_panel.set_ylabel('Topic Weight')
topic_panel.legend()
topic_panel.grid(True, alpha=0.3)

# Panels 2-4 share one recipe: a single metric drawn as one line per bank.
# Each entry is (axes, column in all_data, panel title, y-axis label).
bank_panels = [
    (axes[0, 1], 'Inflation_per_100', 'Inflation Mentions per 100 Words', 'Mentions'),
    (axes[1, 0], 'avg_sentence_length', 'Sentence Length (Complexity)', 'Words per Sentence'),
    (axes[1, 1], 'vocab_diversity', 'Vocabulary Diversity', 'Type-Token Ratio'),
]
for panel, column, panel_title, y_label in bank_panels:
    for bank in all_data['bank'].unique():
        bank_data = all_data[all_data['bank'] == bank]
        panel.plot(bank_data['date'], bank_data[column],
                   marker='o', label=bank, linewidth=2)
    panel.set_title(panel_title, fontweight='bold')
    panel.set_ylabel(y_label)
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.suptitle('Central Bank Communication Analysis Dashboard',
             fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

## 🎯 What You Learned

1. **Topic Modeling (LDA)**: Discover hidden themes statistically
2. **Concrete Term Tracking**: Measure actual economic focus areas
3. **Complexity Metrics**: Objective readability measurements
4. **Vocabulary Diversity**: Measure language variation
5. **TF-IDF Analysis**: Find distinctive words
6. **Change Detection**: Identify significant shifts

**Key Difference from Sentiment Analysis:**
- These are **objective, measurable** metrics
- Based on **actual word usage and patterns**
- Not trying to guess "emotions" from formal documents
- More appropriate for analyzing technical communications

## 🚀 Next Steps

In Tutorial 4, we'll learn:
- Creating publication-ready visualizations
- Interactive dashboards
- Exporting results

## 💡 Try It Yourself

1. Experiment with different numbers of topics in LDA
2. Add more economic terms to track
3. Compare early vs late periods for each bank
4. Find correlations between metrics (e.g., complexity vs diversity)

In [None]:
# Exercise space
# YOUR CODE HERE
