<a href="https://colab.research.google.com/github/TCU-DCDA/WRIT20833-2025/blob/main/notebooks/exercises/Review_08_Data_Visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WRIT 20833 Review 08: Data Visualization

Create compelling visualizations to communicate cultural data insights.

**Make a copy:** File > Save a copy in Drive

## Exercise 1: Setting Up Visualization Tools
Import libraries and create basic plots.

In [None]:
# Import visualization libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from collections import Counter

# Plot defaults for the whole notebook: stock matplotlib style, 10x6 inch
# figures and a slightly larger font.
plt.style.use('default')
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 12,
})

# seaborn is optional; when it is installed its "whitegrid" theme is applied
# on top of the matplotlib defaults.
try:
    import seaborn as sns
    sns.set_style("whitegrid")
    print(" Seaborn loaded for enhanced styling")
except ImportError:
    print(" Seaborn not available, using matplotlib defaults")

# Sample cultural dataset: ten novels, one list per column.
books_data = {
    'title': [
        '1984', 'Pride and Prejudice', "The Handmaid's Tale", 'Beloved',
        'The Great Gatsby', 'To Kill a Mockingbird', 'Jane Eyre',
        'Wuthering Heights', 'The Catcher in the Rye', 'Lord of the Flies',
    ],
    'author': [
        'George Orwell', 'Jane Austen', 'Margaret Atwood', 'Toni Morrison',
        'F. Scott Fitzgerald', 'Harper Lee', 'Charlotte Brontë',
        'Emily Brontë', 'J.D. Salinger', 'William Golding',
    ],
    'year': [1949, 1813, 1985, 1987, 1925, 1960, 1847, 1847, 1951, 1954],
    'pages': [328, 432, 311, 275, 180, 281, 507, 416, 277, 224],
    'genre': [
        'Dystopian', 'Romance', 'Dystopian', 'Historical Fiction', 'Modernist',
        'Coming of Age', 'Gothic', 'Gothic', 'Coming of Age', 'Allegorical',
    ],
    'rating': [4.2, 4.1, 4.3, 4.4, 3.9, 4.5, 4.0, 3.8, 3.7, 3.9],
}

books_df = pd.DataFrame(books_data)

print("Sample Cultural Dataset:")
print(books_df.head())
print(f"\nDataset shape: {books_df.shape}")

# Smoke test: one scatter plot drawn through the object-oriented Axes API.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(books_df['year'], books_df['pages'], alpha=0.7)
ax.set_xlabel('Publication Year')
ax.set_ylabel('Number of Pages')
ax.set_title('Book Length vs. Publication Year')
ax.grid(True, alpha=0.3)
plt.show()

print(" Basic plotting functionality working!")

## Exercise 2: Bar Charts for Categorical Data
Visualize distributions and comparisons in cultural data.

In [None]:
# Bar chart: how many books fall into each genre.
genre_counts = books_df['genre'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(genre_counts.index, genre_counts.values, color='skyblue', alpha=0.8)
ax.set_xlabel('Literary Genre')
ax.set_ylabel('Number of Books')
ax.set_title('Distribution of Books by Genre')
plt.xticks(rotation=45, ha='right')

# Write each count just above the top of its bar, centred horizontally.
for bar in bars:
    count = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_center, count + 0.02, f'{int(count)}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Horizontal bars: mean rating per genre, lowest at the bottom.
genre_ratings = (
    books_df.groupby('genre')['rating']
    .mean()
    .sort_values(ascending=True)
)

fig, ax = plt.subplots(figsize=(10, 6))
colors = plt.cm.viridis(np.linspace(0, 1, len(genre_ratings)))
bars = ax.barh(genre_ratings.index, genre_ratings.values, color=colors, alpha=0.8)
ax.set_xlabel('Average Rating')
ax.set_ylabel('Genre')
ax.set_title('Average Book Ratings by Genre')

# Print the value just to the right of each bar's end.
for bar, rating in zip(bars, genre_ratings.values):
    bar_middle = bar.get_y() + bar.get_height() / 2
    ax.text(rating + 0.02, bar_middle, f'{rating:.2f}', va='center', ha='left')

ax.set_xlim(0, 5)
plt.tight_layout()
plt.show()

# Side-by-side bar charts: average pages and average rating by century.
#
# Each book is tagged with the first year of its century (1847 -> 1800,
# 1949 -> 1900) so that the value reads naturally as "1800s" / "1900s".
# Rounding *up* instead (1847 -> 1900) would label 19th-century novels
# as "1900s" on the axis and in the printed insight.
books_df['century'] = (books_df['year'] // 100) * 100
century_stats = books_df.groupby('century').agg({
    'pages': 'mean',
    'rating': 'mean'
})
century_ticks = list(century_stats.index)
century_labels = [f'{century}s' for century in century_ticks]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Average pages by century
ax1.bar(century_stats.index, century_stats['pages'], color='coral', alpha=0.8, width=30)
ax1.set_xticks(century_ticks)
ax1.set_xticklabels(century_labels)
ax1.set_xlabel('Century')
ax1.set_ylabel('Average Pages')
ax1.set_title('Average Book Length by Century')
ax1.grid(True, alpha=0.3)

# Average rating by century
ax2.bar(century_stats.index, century_stats['rating'], color='lightgreen', alpha=0.8, width=30)
ax2.set_xticks(century_ticks)
ax2.set_xticklabels(century_labels)
ax2.set_xlabel('Century')
ax2.set_ylabel('Average Rating')
ax2.set_title('Average Book Rating by Century')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# value_counts() is sorted descending, so position 0 is the most common genre;
# genre_ratings is sorted ascending, so the last entry is the best-rated genre.
print("Bar chart insights:")
print(f"Most common genre: {genre_counts.index[0]} ({genre_counts.iloc[0]} books)")
print(f"Highest-rated genre: {genre_ratings.index[-1]} ({genre_ratings.iloc[-1]:.2f} average rating)")
print(f"Century with longest books: {century_stats['pages'].idxmax()}s ({century_stats['pages'].max():.0f} avg pages)")

## Exercise 3: Scatter Plots and Correlations
Explore relationships between numerical variables.

In [None]:
# Scatter plot of page count against publication year, with a fitted line.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(books_df['year'], books_df['pages'], s=100, alpha=0.7, c='blue')
ax.set_xlabel('Publication Year')
ax.set_ylabel('Number of Pages')
ax.set_title('Book Length Over Time')
ax.grid(True, alpha=0.3)

# Degree-1 least-squares fit: z holds (slope, intercept), p evaluates the line.
z = np.polyfit(books_df['year'], books_df['pages'], 1)
p = np.poly1d(z)
trend_label = f'Trend line (slope: {z[0]:.2f})'
ax.plot(books_df['year'], p(books_df['year']), "r--", alpha=0.8, label=trend_label)
ax.legend()
plt.show()

# Rating against length, one colour (and one legend entry) per genre.
fig, ax = plt.subplots(figsize=(12, 8))
genres = books_df['genre'].unique()
colors = plt.cm.tab10(np.linspace(0, 1, len(genres)))

for genre, genre_color in zip(genres, colors):
    genre_data = books_df[books_df['genre'] == genre]
    ax.scatter(genre_data['pages'], genre_data['rating'],
               c=[genre_color], label=genre, s=100, alpha=0.7)

ax.set_xlabel('Number of Pages')
ax.set_ylabel('Rating')
ax.set_title('Book Rating vs Length by Genre')
# Anchor the legend outside the axes so it does not cover any points.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Bubble chart: publication year drives both the bubble size and its colour.
fig, ax = plt.subplots(figsize=(12, 8))
# Offset from the earliest year (+10 so the oldest book is still visible), then scale.
sizes = (books_df['year'] - books_df['year'].min() + 10) * 3

scatter = ax.scatter(books_df['pages'], books_df['rating'],
                     s=sizes, c=books_df['year'], cmap='viridis',
                     alpha=0.7, edgecolors='black', linewidth=0.5)

fig.colorbar(scatter, ax=ax, label='Publication Year')
ax.set_xlabel('Number of Pages')
ax.set_ylabel('Rating')
ax.set_title('Book Ratings vs Length (bubble size = recency)')
ax.grid(True, alpha=0.3)

# Label only the standouts: rated above 4.3 or longer than 450 pages.
is_notable = (books_df['rating'] > 4.3) | (books_df['pages'] > 450)
notable_books = books_df[is_notable]
for book_title, page_count, book_rating in zip(
        notable_books['title'], notable_books['pages'], notable_books['rating']):
    # Titles longer than 15 characters are cut and marked with an ellipsis.
    point_label = book_title if len(book_title) <= 15 else book_title[:15] + '...'
    ax.annotate(point_label,
                (page_count, book_rating),
                xytext=(5, 5), textcoords='offset points',
                fontsize=9, alpha=0.8)

plt.tight_layout()
plt.show()

# Correlation analysis
correlation_vars = ['year', 'pages', 'rating']
correlation_matrix = books_df[correlation_vars].corr()

print("Correlation Analysis:")
print(correlation_matrix.round(3))

# Find the strongest relationship between two *different* variables.
# Only the cells above the diagonal are searched: that skips the
# self-correlations (always 1.0) and visits every pair exactly once, so the
# answer does not depend on de-duplicating floating-point values.
abs_corr = correlation_matrix.abs().to_numpy()
pair_rows, pair_cols = np.triu_indices_from(abs_corr, k=1)
best = np.nanargmax(abs_corr[pair_rows, pair_cols])
strongest_pair = (correlation_matrix.index[pair_rows[best]],
                  correlation_matrix.columns[pair_cols[best]])
strongest_value = abs_corr[pair_rows[best], pair_cols[best]]
print(f"\nStrongest correlation: {strongest_pair} = {strongest_value:.3f}")

## Exercise 4: Time Series and Historical Trends
Visualize changes in cultural data over time.

In [None]:
# Larger hand-built dataset (16 works, 1605-2003) for the time-series panels.
# 'themes_complexity' and 'social_relevance' are subjective 1-10 scores.
historical_books = {
    'year': [1605, 1667, 1719, 1813, 1847, 1847, 1851, 1884,
             1925, 1949, 1951, 1954, 1960, 1985, 1987, 2003],
    'title': [
        'Don Quixote', 'Paradise Lost', 'Robinson Crusoe', 'Pride and Prejudice',
        'Jane Eyre', 'Wuthering Heights', 'Moby Dick', 'Adventures of Huckleberry Finn',
        'The Great Gatsby', '1984', 'The Catcher in the Rye', 'Lord of the Flies',
        'To Kill a Mockingbird', "The Handmaid's Tale", 'Beloved', 'The Kite Runner',
    ],
    'pages': [863, 453, 364, 432, 507, 416, 635, 366,
              180, 328, 277, 224, 281, 311, 275, 372],
    'themes_complexity': [7, 9, 5, 6, 8, 9, 10, 7, 8, 9, 7, 8, 9, 9, 10, 7],
    'social_relevance': [8, 6, 4, 5, 7, 6, 8, 9, 7, 10, 6, 7, 10, 9, 9, 8],
}

hist_df = pd.DataFrame(historical_books)

# 2x2 grid: three single-measure panels plus one normalized comparison.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
years = hist_df['year']

# Panels 1-3 share the same recipe, so describe them as data:
# (axes, column, line/marker format, colour, y-label, title)
single_measure_panels = [
    (ax1, 'pages', 'o-', 'blue', 'Number of Pages', 'Book Length Trends Over Time'),
    (ax2, 'themes_complexity', 's-', 'green', 'Thematic Complexity (1-10)', 'Literary Complexity Over Time'),
    (ax3, 'social_relevance', '^-', 'red', 'Social Relevance (1-10)', 'Social Impact Over Time'),
]
for panel, column, fmt, line_color, y_label, panel_title in single_measure_panels:
    panel.plot(years, hist_df[column], fmt, color=line_color, markersize=6, linewidth=2)
    panel.set_xlabel('Publication Year')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.grid(True, alpha=0.3)

# Panel 1 additionally shades the area under the page-count line.
ax1.fill_between(years, hist_df['pages'], alpha=0.3, color='blue')

# The two score panels get a fixed axis so the 1-10 scale is fully visible.
for score_panel in (ax2, ax3):
    score_panel.set_ylim(0, 11)

# Panel 4: put all three measures on a common 0-1 scale.
# Pages use min-max scaling; the scores are mapped from 1-10 onto 0-1.
fewest_pages = hist_df['pages'].min()
most_pages = hist_df['pages'].max()
pages_norm = (hist_df['pages'] - fewest_pages) / (most_pages - fewest_pages)
complexity_norm = (hist_df['themes_complexity'] - 1) / 9
relevance_norm = (hist_df['social_relevance'] - 1) / 9

normalized_series = [
    (pages_norm, 'o-', 'Length (normalized)'),
    (complexity_norm, 's-', 'Complexity (normalized)'),
    (relevance_norm, '^-', 'Social Relevance (normalized)'),
]
for values, fmt, series_label in normalized_series:
    ax4.plot(years, values, fmt, label=series_label, linewidth=2, markersize=5)
ax4.set_xlabel('Publication Year')
ax4.set_ylabel('Normalized Score (0-1)')
ax4.set_title('Comparative Literary Trends')
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Era-based analysis
def categorize_era(year):
    """Return the literary-era label for a publication year.

    Eras are half-open ranges: anything before 1800 is 'Pre-1800',
    1800-1899 is '19th Century', 1900-1949 is 'Early 20th Century',
    1950-1999 is 'Late 20th Century', and everything else is
    '21st Century'.
    """
    # (exclusive upper bound, label), checked from the earliest era forward.
    era_upper_bounds = (
        (1800, 'Pre-1800'),
        (1900, '19th Century'),
        (1950, 'Early 20th Century'),
        (2000, 'Late 20th Century'),
    )
    for upper_bound, era_label in era_upper_bounds:
        if year < upper_bound:
            return era_label
    return '21st Century'

hist_df['era'] = hist_df['year'].apply(categorize_era)

# Per-era averages; the 'title' column of the result holds the number of books.
era_analysis = hist_df.groupby('era').agg({
    'pages': 'mean',
    'themes_complexity': 'mean',
    'social_relevance': 'mean',
    'title': 'count'
})

# groupby() sorts the era labels alphabetically ("19th Century",
# "21st Century", "Early 20th Century", ..., "Pre-1800"), which scrambles the
# timeline. Put the rows back in chronological order, keeping only the eras
# that actually occur in the data.
era_order = ['Pre-1800', '19th Century', 'Early 20th Century',
             'Late 20th Century', '21st Century']
era_analysis = era_analysis.reindex(
    [era for era in era_order if era in era_analysis.index]
)

print("Era-based Analysis:")
print(era_analysis.round(2))

# Create era comparison chart: three bars per era, centred on integer positions.
fig, ax = plt.subplots(figsize=(12, 8))
x = np.arange(len(era_analysis.index))
width = 0.25

# Bring the measures onto a comparable 0-1 range: pages relative to the
# longest era average, the 1-10 scores divided by 10.
pages_norm_era = era_analysis['pages'] / era_analysis['pages'].max()
complexity_norm_era = era_analysis['themes_complexity'] / 10
relevance_norm_era = era_analysis['social_relevance'] / 10

ax.bar(x - width, pages_norm_era, width, label='Avg Length (normalized)', alpha=0.8)
ax.bar(x, complexity_norm_era, width, label='Avg Complexity', alpha=0.8)
ax.bar(x + width, relevance_norm_era, width, label='Avg Social Relevance', alpha=0.8)

ax.set_xlabel('Literary Era')
ax.set_ylabel('Normalized Score')
ax.set_title('Literary Characteristics by Historical Era')
ax.set_xticks(x)
ax.set_xticklabels(era_analysis.index, rotation=45, ha='right')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Exercise 5: Advanced Visualizations
Create more sophisticated plots for cultural analysis.

In [None]:
# Pie chart of genre shares, with the largest slice pulled out slightly.
fig, ax = plt.subplots(figsize=(10, 8))
genre_counts = books_df['genre'].value_counts()
colors = plt.cm.Set3(np.linspace(0, 1, len(genre_counts)))

# value_counts() is sorted descending, so slice 0 is the most common genre.
explode = [0] * len(genre_counts)
if explode:
    explode[0] = 0.05

wedges, texts, autotexts = ax.pie(genre_counts.values, labels=genre_counts.index,
                                  autopct='%1.1f%%', colors=colors, startangle=90,
                                  explode=explode)

ax.set_title('Distribution of Books by Genre', fontsize=16, fontweight='bold')

# Make the percentage labels inside the slices white and bold.
plt.setp(autotexts, color='white', fontweight='bold')

# Equal aspect ratio keeps the pie circular.
ax.axis('equal')
plt.show()

# Histogram of ratings with the mean and median marked as vertical lines.
mean_rating = books_df['rating'].mean()
median_rating = books_df['rating'].median()

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(books_df['rating'], bins=8, color='lightcoral', alpha=0.7, edgecolor='black')
ax.axvline(mean_rating, color='red', linestyle='--', linewidth=2,
           label=f'Mean: {mean_rating:.2f}')
ax.axvline(median_rating, color='blue', linestyle='--', linewidth=2,
           label=f'Median: {median_rating:.2f}')
ax.set_xlabel('Rating')
ax.set_ylabel('Number of Books')
ax.set_title('Distribution of Book Ratings')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

# Box plot for ratings by genre: one box per genre, in genre_counts order.
plt.figure(figsize=(12, 8))
genre_data = [books_df[books_df['genre'] == genre]['rating'].values for genre in genre_counts.index]
# The genre names are attached through xticks further down rather than
# boxplot(labels=...): that keyword is deprecated since Matplotlib 3.9
# (renamed tick_labels), while setting the ticks works on every version.
box_plot = plt.boxplot(genre_data, patch_artist=True)

# Color the boxes (patch_artist=True makes them fillable patches)
colors = plt.cm.viridis(np.linspace(0, 1, len(box_plot['boxes'])))
for patch, color in zip(box_plot['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)

plt.xlabel('Genre')
plt.ylabel('Rating')
plt.title('Rating Distribution by Genre')
# boxplot draws box k at x position k, counting from 1.
plt.xticks(range(1, len(genre_data) + 1), genre_counts.index, rotation=45, ha='right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Correlation heatmap drawn with plain matplotlib.
# Derive a few extra numeric columns so the matrix has more to show.
extended_df = books_df.assign(
    word_estimate=books_df['pages'] * 250,   # rough words-per-page estimate
    age_years=2025 - books_df['year'],
    title_length=books_df['title'].str.len(),
)

numerical_cols = ['year', 'pages', 'rating', 'word_estimate', 'age_years', 'title_length']
corr_matrix = extended_df[numerical_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
# Fix the colour scale to the full [-1, 1] range so 0 maps to the midpoint.
im = ax.imshow(corr_matrix, cmap='RdBu_r', aspect='auto', vmin=-1, vmax=1)
fig.colorbar(im, ax=ax, label='Correlation Coefficient')

# Write each coefficient into its cell; strong values sit on dark colours,
# so they are printed in white.
for (row, col), coefficient in np.ndenumerate(corr_matrix.to_numpy()):
    ax.text(col, row, f'{coefficient:.2f}',
            ha="center", va="center",
            color="black" if abs(coefficient) < 0.5 else "white",
            fontweight='bold')

tick_positions = range(len(numerical_cols))
plt.xticks(tick_positions, numerical_cols, rotation=45, ha='right')
plt.yticks(tick_positions, numerical_cols)
ax.set_title('Correlation Matrix Heatmap', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Stacked bar chart for era and genre combinations
extended_df['era'] = extended_df['year'].apply(categorize_era)
era_genre_counts = pd.crosstab(extended_df['era'], extended_df['genre'])

# crosstab sorts the era labels alphabetically; restore chronological order,
# keeping only the eras that actually occur in the data.
era_order = ['Pre-1800', '19th Century', 'Early 20th Century',
             'Late 20th Century', '21st Century']
era_genre_counts = era_genre_counts.reindex(
    [era for era in era_order if era in era_genre_counts.index]
)

# Create the axes first and hand them to pandas. Calling plt.figure() and then
# DataFrame.plot(figsize=...) makes pandas open a second figure, leaving the
# first one to be displayed empty.
fig, ax = plt.subplots(figsize=(12, 8))
era_genre_counts.plot(kind='bar', stacked=True, ax=ax,
                      colormap='tab20', alpha=0.8)
ax.set_xlabel('Literary Era')
ax.set_ylabel('Number of Books')
ax.set_title('Genre Distribution Across Literary Eras')
ax.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45, ha='right')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Most correlated pair of *different* variables: search only the cells above
# the diagonal so self-correlations are excluded and each pair is seen once.
# (De-duplicating the coefficients instead would merge perfectly correlated
# pairs such as pages/word_estimate with the diagonal.)
abs_corr = corr_matrix.abs().to_numpy()
pair_rows, pair_cols = np.triu_indices_from(abs_corr, k=1)
best = np.nanargmax(abs_corr[pair_rows, pair_cols])
most_correlated = (corr_matrix.index[pair_rows[best]],
                   corr_matrix.columns[pair_cols[best]])

print("Advanced visualization insights:")
print(f"Most correlated variables: {most_correlated}")
print(f"Era with most genre diversity: {(era_genre_counts > 0).sum(axis=1).idxmax()}")
print(f"Rating standard deviation: {books_df['rating'].std():.3f}")

## Exercise 6: Creating Your Own Visualizations
Apply visualization techniques to your own cultural dataset.

In [None]:
# TODO: Create your own cultural dataset for visualization
# Consider: Movies, songs, artworks, historical events, etc.
#
# NOTE: the analysis code that follows reads the columns 'item', 'creator',
# 'year', 'category', 'score' and 'popularity'. Keep these key names when you
# swap in your own values, or rename the columns in that code as well.
# It treats the data as customized once the first 'item' is no longer 'Item A'.

your_cultural_data = {
    # TODO: Replace with your own data
    # Example structure for movies (shown for inspiration only; these key
    # names differ from the ones the analysis code reads):
    # 'title': ['Movie 1', 'Movie 2', 'Movie 3', ...],
    # 'director': ['Director 1', 'Director 2', 'Director 1', ...],
    # 'year': [2020, 2018, 2019, ...],
    # 'genre': ['Action', 'Drama', 'Comedy', ...],
    # 'rating': [8.5, 7.2, 8.9, ...],
    # 'budget_millions': [150, 50, 75, ...]
    
    # Placeholder data - replace with your interests!
    # Every list must have the same number of entries (one per item).
    'item': ['Item A', 'Item B', 'Item C', 'Item D', 'Item E', 'Item F'],
    'creator': ['Creator 1', 'Creator 2', 'Creator 1', 'Creator 3', 'Creator 2', 'Creator 4'],
    'year': [2020, 2018, 2019, 2021, 2017, 2022],
    'category': ['Type X', 'Type Y', 'Type X', 'Type Z', 'Type Y', 'Type X'],
    'score': [8.5, 7.2, 8.9, 6.8, 7.8, 9.1],
    'popularity': [85, 62, 78, 45, 69, 92]
}

# One row per item, one column per key above.
your_df = pd.DataFrame(your_cultural_data)

print("YOUR CULTURAL VISUALIZATION PROJECT")
print("=" * 40)

# Everything below reads these columns. Check for them up front so a dataset
# with different key names (for example the movie structure suggested in the
# template) produces a helpful message instead of a KeyError.
required_columns = ['item', 'creator', 'year', 'category', 'score', 'popularity']
missing_columns = [col for col in required_columns if col not in your_df.columns]

if missing_columns:
    print(f"Your dataset is missing these columns: {', '.join(missing_columns)}")
    print(f"Expected columns: {', '.join(required_columns)}")
    print("Rename your keys to match, or update the column names used in this cell.")

# Check if user has customized the data (the placeholder starts with 'Item A')
elif len(your_df) > 0 and your_df['item'].iloc[0] != 'Item A':
    print("Analyzing your custom cultural dataset...\n")

    # Basic info about dataset
    print(f"Dataset contains {len(your_df)} items")
    print(f"Categories: {your_df['category'].nunique()} unique")
    print(f"Creators: {your_df['creator'].nunique()} unique")
    print(f"Year range: {your_df['year'].min()} - {your_df['year'].max()}")
    print()

    # Create visualizations for user's data
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Category distribution
    category_counts = your_df['category'].value_counts()
    ax1.bar(category_counts.index, category_counts.values, color='skyblue', alpha=0.8)
    ax1.set_title('Distribution by Category')
    ax1.set_xlabel('Category')
    ax1.set_ylabel('Count')

    # 2. Score vs Popularity scatter (one colour per item)
    colors = plt.cm.viridis(np.linspace(0, 1, len(your_df)))
    ax2.scatter(your_df['score'], your_df['popularity'], c=colors, s=100, alpha=0.7)
    ax2.set_xlabel('Score')
    ax2.set_ylabel('Popularity')
    ax2.set_title('Score vs Popularity')

    # 3. Time series of scores. Sort by year first: a line plot connects the
    #    points in row order, so unsorted years would zigzag back and forth.
    by_year = your_df.sort_values('year')
    ax3.plot(by_year['year'], by_year['score'], 'o-', linewidth=2, markersize=8)
    ax3.set_xlabel('Year')
    ax3.set_ylabel('Score')
    ax3.set_title('Scores Over Time')
    ax3.grid(True, alpha=0.3)

    # 4. Creator comparison
    creator_stats = your_df.groupby('creator')['score'].mean().sort_values(ascending=False)
    ax4.barh(creator_stats.index, creator_stats.values, color='lightgreen', alpha=0.8)
    ax4.set_xlabel('Average Score')
    ax4.set_ylabel('Creator')
    ax4.set_title('Average Scores by Creator')

    plt.tight_layout()
    plt.show()

    # Statistical insights
    # value_counts() is sorted descending, so position 0 is the busiest creator.
    creator_counts = your_df['creator'].value_counts()
    print("INSIGHTS FROM YOUR DATA:")
    print(f"Highest scored item: {your_df.loc[your_df['score'].idxmax(), 'item']} ({your_df['score'].max()})")
    print(f"Most popular item: {your_df.loc[your_df['popularity'].idxmax(), 'item']} ({your_df['popularity'].max()})")
    print(f"Most productive creator: {creator_counts.index[0]} ({creator_counts.iloc[0]} items)")

    # Correlation between score and popularity
    correlation = your_df['score'].corr(your_df['popularity'])
    print(f"Score-Popularity correlation: {correlation:.3f}")

else:
    print("Please customize the 'your_cultural_data' dictionary above!")
    print()
    print("Example datasets you could create:")
    print(" Books: title, author, year, genre, pages, rating")
    print(" Movies: title, director, year, genre, budget, box_office, rating")
    print(" Songs: title, artist, year, genre, duration, chart_position")
    print(" Artworks: title, artist, year, medium, price, museum")
    print(" Historical Events: event, year, location, significance, casualties")
    print(" TV Shows: title, creator, start_year, seasons, rating, network")
    print()

    # Show example with placeholder data anyway
    plt.figure(figsize=(10, 6))
    plt.scatter(your_df['score'], your_df['popularity'], s=100, alpha=0.7, c='red')
    plt.xlabel('Score')
    plt.ylabel('Popularity')
    plt.title('Example Visualization (Replace with Your Data!)')
    plt.grid(True, alpha=0.3)

    # Label every point with its item name, nudged 5 points up and right.
    for item_name, item_score, item_popularity in zip(
            your_df['item'], your_df['score'], your_df['popularity']):
        plt.annotate(item_name, (item_score, item_popularity),
                     xytext=(5, 5), textcoords='offset points', fontsize=9)

    plt.show()

print("\n" + "="*50)
print("VISUALIZATION DESIGN TIPS:")
print(" Choose appropriate chart types for your data")
print(" Use color strategically to highlight insights")
print(" Include clear titles and axis labels")
print(" Consider your audience and what story you want to tell")
print(" Don't overcomplicate - clarity is key")

## Exercise 7: Interactive and Multi-panel Dashboards
Create comprehensive visualization dashboards.

In [None]:
# Create a comprehensive cultural analysis dashboard
def create_cultural_dashboard(df, title="Cultural Analysis Dashboard"):
    """Create a multi-panel dashboard for cultural data analysis.

    Draws eleven panels on a 3x4 grid: a text overview, genre pie chart,
    rating histogram, publications per decade, rating vs. year, ratings by
    genre (box plot), length vs. rating with a trend line, books per author,
    average rating by century, the five top-rated books and a correlation
    matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Non-empty frame with the columns 'title', 'author', 'year', 'pages',
        'genre' and 'rating'. The frame is only read; it is not modified.
    title : str, optional
        Heading shown across the top of the figure.

    Returns
    -------
    matplotlib.figure.Figure
        The dashboard figure. Call ``plt.show()`` to display it.
    """

    def shorten(text, limit):
        # Cut long labels, adding the ellipsis only when something was removed.
        return text[:limit] + '...' if len(text) > limit else text

    fig = plt.figure(figsize=(20, 15))
    fig.suptitle(title, fontsize=20, fontweight='bold', y=0.98)

    # Create a grid layout: 3 rows x 4 equally sized columns
    gs = fig.add_gridspec(3, 4, height_ratios=[1, 1, 1], width_ratios=[1, 1, 1, 1])

    # Sorted descending, so position 0 is the most common genre.
    genre_counts = df['genre'].value_counts()

    # 1. Overview statistics (top-left)
    ax1 = fig.add_subplot(gs[0, 0])
    top_rated_title = df.loc[df['rating'].idxmax(), 'title']
    stats_text = "\n".join([
        "Dataset Overview",
        "",
        f"Total Items: {len(df)}",
        f"Year Range: {df['year'].min()}-{df['year'].max()}",
        f"Avg Rating: {df['rating'].mean():.2f}",
        f"Avg Pages: {df['pages'].mean():.0f}",
        "",
        f"Top Genre: {genre_counts.index[0]}",
        f"({genre_counts.iloc[0]} books)",
        "",
        "Highest Rated:",
        shorten(top_rated_title, 20),
        f"({df['rating'].max()}/5.0)",
    ])

    ax1.text(0.05, 0.95, stats_text, transform=ax1.transAxes, fontsize=11,
             verticalalignment='top', fontfamily='monospace',
             bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8))
    ax1.set_xlim(0, 1)
    ax1.set_ylim(0, 1)
    ax1.axis('off')

    # 2. Genre distribution pie chart (top-center-left)
    ax2 = fig.add_subplot(gs[0, 1])
    pie_colors = plt.cm.Set3(np.linspace(0, 1, len(genre_counts)))
    ax2.pie(genre_counts.values, labels=genre_counts.index,
            autopct='%1.1f%%', colors=pie_colors, startangle=90)
    ax2.set_title('Genre Distribution', fontweight='bold')

    # 3. Rating distribution histogram (top-center-right), mean marked in red
    ax3 = fig.add_subplot(gs[0, 2])
    ax3.hist(df['rating'], bins=6, color='lightcoral', alpha=0.7, edgecolor='black')
    ax3.axvline(df['rating'].mean(), color='red', linestyle='--', linewidth=2)
    ax3.set_xlabel('Rating')
    ax3.set_ylabel('Count')
    ax3.set_title('Rating Distribution', fontweight='bold')
    ax3.grid(True, alpha=0.3)

    # 4. Publication timeline (top-right): books per decade
    ax4 = fig.add_subplot(gs[0, 3])
    decade_counts = (df['year'] // 10 * 10).value_counts().sort_index()
    ax4.bar(decade_counts.index, decade_counts.values, width=8, color='gold', alpha=0.8)
    ax4.set_xlabel('Decade')
    ax4.set_ylabel('Books Published')
    ax4.set_title('Publication Timeline', fontweight='bold')
    ax4.tick_params(axis='x', rotation=45)

    # 5. Scatter plot: Year vs Rating (middle-left)
    ax5 = fig.add_subplot(gs[1, 0])
    scatter = ax5.scatter(df['year'], df['rating'], c=df['pages'], cmap='viridis',
                          s=80, alpha=0.7, edgecolors='black', linewidth=0.5)
    ax5.set_xlabel('Publication Year')
    ax5.set_ylabel('Rating')
    ax5.set_title('Rating vs Publication Year\n(color = page count)', fontweight='bold')
    ax5.grid(True, alpha=0.3)
    fig.colorbar(scatter, ax=ax5, label='Pages')

    # 6. Box plot: Ratings by genre (middle-center, spans two columns)
    ax6 = fig.add_subplot(gs[1, 1:3])
    genre_data = [df[df['genre'] == genre]['rating'].values for genre in genre_counts.index]
    # Labels are set through the ticks instead of boxplot(labels=...), which is
    # deprecated since Matplotlib 3.9; box k is drawn at x position k (from 1).
    bp = ax6.boxplot(genre_data, patch_artist=True)
    ax6.set_xticks(range(1, len(genre_data) + 1))
    ax6.set_xticklabels(genre_counts.index)
    box_colors = plt.cm.viridis(np.linspace(0, 1, len(bp['boxes'])))
    for patch, color in zip(bp['boxes'], box_colors):
        patch.set_facecolor(color)
        patch.set_alpha(0.7)
    ax6.set_xlabel('Genre')
    ax6.set_ylabel('Rating')
    ax6.set_title('Rating Distribution by Genre', fontweight='bold')
    ax6.tick_params(axis='x', rotation=45)
    ax6.grid(True, alpha=0.3)

    # 7. Page count vs Rating (middle-right)
    ax7 = fig.add_subplot(gs[1, 3])
    ax7.scatter(df['pages'], df['rating'], alpha=0.7, s=80, color='orange', edgecolors='black')
    # Add trend line (degree-1 least-squares fit)
    z = np.polyfit(df['pages'], df['rating'], 1)
    p = np.poly1d(z)
    ax7.plot(df['pages'], p(df['pages']), "r--", alpha=0.8)
    ax7.set_xlabel('Pages')
    ax7.set_ylabel('Rating')
    ax7.set_title('Length vs Rating', fontweight='bold')
    ax7.grid(True, alpha=0.3)

    # 8. Author productivity (bottom-left): up to six authors with most books
    ax8 = fig.add_subplot(gs[2, 0])
    author_counts = df['author'].value_counts().head(6)
    ax8.barh(range(len(author_counts)), author_counts.values, color='lightgreen', alpha=0.8)
    ax8.set_yticks(range(len(author_counts)))
    ax8.set_yticklabels([shorten(name, 15) for name in author_counts.index])
    ax8.set_xlabel('Number of Books')
    ax8.set_title('Most Prolific Authors', fontweight='bold')

    # 9. Century comparison (bottom-center-left)
    ax9 = fig.add_subplot(gs[2, 1])
    # Group by the first year of each century (1847 -> 1800) so ticks read
    # "1800s"/"1900s". A local Series is used so the caller's DataFrame does
    # not gain or lose a column as a side effect of plotting.
    century_start = (df['year'] // 100) * 100
    century_ratings = df['rating'].groupby(century_start).mean()
    ax9.bar(century_ratings.index, century_ratings.values, color='purple', alpha=0.8, width=30)
    ax9.set_xticks(list(century_ratings.index))
    ax9.set_xticklabels([f'{century}s' for century in century_ratings.index])
    ax9.set_xlabel('Century')
    ax9.set_ylabel('Average Rating')
    ax9.set_title('Avg Rating by Century', fontweight='bold')

    # 10. Top rated books (bottom-center-right)
    ax10 = fig.add_subplot(gs[2, 2])
    top_books = df.nlargest(5, 'rating')
    bars = ax10.barh(range(len(top_books)), top_books['rating'], color='red', alpha=0.8)
    ax10.set_yticks(range(len(top_books)))
    ax10.set_yticklabels([shorten(book_title, 20) for book_title in top_books['title']])
    ax10.set_xlabel('Rating')
    ax10.set_title('Top 5 Highest Rated', fontweight='bold')

    # Add rating values just past the end of each bar
    for bar, rating in zip(bars, top_books['rating']):
        ax10.text(rating + 0.02, bar.get_y() + bar.get_height() / 2,
                  f'{rating:.2f}', va='center', ha='left')

    # 11. Correlation matrix (bottom-right)
    ax11 = fig.add_subplot(gs[2, 3])
    corr_vars = ['year', 'pages', 'rating']
    corr_matrix = df[corr_vars].corr()
    # Fixed [-1, 1] colour range so that 0 maps to the colormap midpoint.
    ax11.imshow(corr_matrix, cmap='RdBu_r', aspect='auto', vmin=-1, vmax=1)

    # Add correlation values; strong ones sit on dark cells, so use white text
    for (row, col), coefficient in np.ndenumerate(corr_matrix.to_numpy()):
        ax11.text(col, row, f'{coefficient:.2f}',
                  ha="center", va="center",
                  color="white" if abs(coefficient) > 0.5 else "black",
                  fontweight='bold')

    ax11.set_xticks(range(len(corr_vars)))
    ax11.set_yticks(range(len(corr_vars)))
    ax11.set_xticklabels(corr_vars)
    ax11.set_yticklabels(corr_vars)
    ax11.set_title('Correlation Matrix', fontweight='bold')

    fig.tight_layout()
    return fig

# Build the dashboard for the sample books and display it.
dashboard = create_cultural_dashboard(books_df, "Literary Analysis Dashboard")
plt.show()

# Look up the headline facts once, then print them as a short report.
top_rated_title = books_df.loc[books_df['rating'].idxmax(), 'title']
longest_title = books_df.loc[books_df['pages'].idxmax(), 'title']
genre_tally = books_df['genre'].value_counts()  # sorted, most common first
publication_span = books_df['year'].max() - books_df['year'].min()

insight_lines = [
    f" Dataset contains {len(books_df)} books from {books_df['year'].nunique()} different years",
    f" Average book length: {books_df['pages'].mean():.0f} pages",
    f" Average rating: {books_df['rating'].mean():.2f}/5.0",
    f" Highest rated: {top_rated_title} ({books_df['rating'].max()})",
    f" Longest book: {longest_title} ({books_df['pages'].max()} pages)",
    f" Most common genre: {genre_tally.index[0]} ({genre_tally.iloc[0]} books)",
    f" Publication span: {publication_span} years",
]

print("DASHBOARD INSIGHTS:")
print("=" * 30)
for insight in insight_lines:
    print(insight)

## Summary

You explored:
- Setting up visualization libraries and creating basic plots
- Bar charts for categorical cultural data analysis
- Scatter plots and correlation analysis
- Time series visualization for historical trends
- Advanced visualization techniques (heatmaps, box plots, pie charts)
- Creating comprehensive multi-panel dashboards
- Critical evaluation of visualization ethics and effectiveness

**Key Visualization Types:**
- **Bar Charts**: Categorical comparisons, distributions
- **Scatter Plots**: Relationships, correlations, trends
- **Line Charts**: Time series, historical changes
- **Histograms**: Data distributions, patterns
- **Box Plots**: Statistical summaries, outliers
- **Heatmaps**: Correlation matrices, complex relationships
- **Pie Charts**: Proportional data, compositions
- **Dashboards**: Comprehensive multi-dimensional analysis

**Design Principles:**
- Choose appropriate chart types for your data and message
- Use color strategically and accessibility-consciously
- Provide clear titles, labels, and legends
- Consider your audience and context
- Balance detail with clarity
- Be aware of potential biases and misrepresentations

**Cultural Applications:**
- Literary analysis across time periods and genres
- Historical trend identification
- Cross-cultural comparisons
- Pattern recognition in large cultural datasets
- Communicating research findings to diverse audiences

**Next:** Review 09 will integrate all skills in a comprehensive exercise.

---
 