In [1]:
"""
Geographic Analysis Script for Book Bans Project
State-level patterns, political correlations, and mapping
Author: Joseph (Geographic & Dashboard Specialist)
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

def load_data():
    """Load all necessary datasets"""
    
    # Load PEN America data
    bans_df = pd.read_csv('pen_america_banned_books.csv')
    
    # Load political data
    political_df = pd.read_csv('data/geographic/state_political_classification.csv')
    
    # Load demographics
    demographics_df = pd.read_csv('data/geographic/state_demographics.csv')
    
    return bans_df, political_df, demographics_df

def analyze_state_patterns(bans_df):
    """Analyze basic state-level patterns"""
    
    print("\n=== STATE-LEVEL ANALYSIS ===\n")
    
    # Count bans per state
    state_counts = bans_df['State'].value_counts()
    
    print(f"Total states with bans: {len(state_counts)}")
    print(f"\nTop 10 states by number of bans:")
    print(state_counts.head(10))
    
    print(f"\nBottom 10 states:")
    print(state_counts.tail(10))
    
    # Save state counts
    state_counts_df = state_counts.reset_index()
    state_counts_df.columns = ['State', 'Total_Bans']
    state_counts_df.to_csv('data/processed/bans_by_state.csv', index=False)
    
    return state_counts_df

def calculate_per_capita(bans_df, demographics_df):
    """Calculate bans per capita"""
    
    print("\n=== PER CAPITA ANALYSIS ===\n")
    
    # Count bans per state
    state_counts = bans_df['State'].value_counts().reset_index()
    state_counts.columns = ['State', 'Total_Bans']
    
    # Merge with population data
    merged = state_counts.merge(demographics_df[['State', 'Population']], on='State', how='left')
    
    # Calculate per capita (per 100,000 people)
    merged['Bans_Per_100k'] = (merged['Total_Bans'] / merged['Population']) * 100000
    merged = merged.sort_values('Bans_Per_100k', ascending=False)
    
    print("Top 10 states by bans per 100k population:")
    print(merged[['State', 'Total_Bans', 'Population', 'Bans_Per_100k']].head(10))
    
    # Save
    merged.to_csv('data/processed/bans_per_capita.csv', index=False)
    
    return merged

def political_analysis(bans_df, political_df, demographics_df):
    """Analyze relationship between politics and bans"""
    
    print("\n=== POLITICAL ANALYSIS ===\n")
    
    # Count bans per state
    state_counts = bans_df['State'].value_counts().reset_index()
    state_counts.columns = ['State', 'Total_Bans']
    
    # Merge with political and population data
    merged = state_counts.merge(political_df, on='State', how='left')
    merged = merged.merge(demographics_df[['State', 'Population']], on='State', how='left')
    merged['Bans_Per_100k'] = (merged['Total_Bans'] / merged['Population']) * 100000
    
    # Compare by political leaning
    red_states = merged[merged['Political_Leaning'] == 'Red']
    blue_states = merged[merged['Political_Leaning'] == 'Blue']
    
    print(f"Red states: n={len(red_states)}")
    print(f"  Average bans per 100k: {red_states['Bans_Per_100k'].mean():.2f}")
    print(f"  Total bans: {red_states['Total_Bans'].sum()}")
    
    print(f"\nBlue states: n={len(blue_states)}")
    print(f"  Average bans per 100k: {blue_states['Bans_Per_100k'].mean():.2f}")
    print(f"  Total bans: {blue_states['Total_Bans'].sum()}")
    
    # Statistical test
    t_stat, p_value = stats.ttest_ind(red_states['Bans_Per_100k'].dropna(), 
                                       blue_states['Bans_Per_100k'].dropna())
    
    print(f"\nIndependent t-test:")
    print(f"  t-statistic: {t_stat:.4f}")
    print(f"  p-value: {p_value:.4f}")
    
    if p_value < 0.05:
        print("  ** Statistically significant difference **")
    else:
        print("  No significant difference")
    
    # Correlation with Trump vote %
    if 'Trump_2020_Percent' in merged.columns:
        correlation = merged[['Trump_2020_Percent', 'Bans_Per_100k']].corr().iloc[0, 1]
        print(f"\nCorrelation with Trump vote %: {correlation:.3f}")
    
    # Save results
    with open('documentation/political_patterns.md', 'w') as f:
        f.write("# Political Patterns in Book Banning\n\n")
        f.write(f"**Red States:**\n")
        f.write(f"- Count: {len(red_states)}\n")
        f.write(f"- Avg bans per 100k: {red_states['Bans_Per_100k'].mean():.2f}\n")
        f.write(f"- Total bans: {red_states['Total_Bans'].sum()}\n\n")
        f.write(f"**Blue States:**\n")
        f.write(f"- Count: {len(blue_states)}\n")
        f.write(f"- Avg bans per 100k: {blue_states['Bans_Per_100k'].mean():.2f}\n")
        f.write(f"- Total bans: {blue_states['Total_Bans'].sum()}\n\n")
        f.write(f"**Statistical Test:**\n")
        f.write(f"- t-statistic: {t_stat:.4f}\n")
        f.write(f"- p-value: {p_value:.4f}\n")
        if p_value < 0.05:
            f.write(f"- **Conclusion:** Significant difference between red and blue states\n")
        else:
            f.write(f"- **Conclusion:** No significant difference\n")
    
    return merged

def create_choropleth_map(data_df, column, title, filename):
    """Create interactive choropleth map using Plotly"""
    
    fig = px.choropleth(
        data_df,
        locations='State',
        locationmode='USA-states',
        color=column,
        hover_name='State',
        hover_data={column: ':.2f'},
        color_continuous_scale='Reds',
        scope='usa',
        title=title
    )
    
    fig.update_layout(
        title_font_size=18,
        geo=dict(
            bgcolor='rgba(0,0,0,0)',
            lakecolor='lightblue'
        ),
        height=600,
        margin=dict(l=0, r=0, t=50, b=0)
    )
    
    # Save
    fig.write_html(f'maps/{filename}.html')
    print(f"✓ Created: maps/{filename}.html")
    
    return fig

def create_political_map(merged_df):
    """Create map showing bans with political overlay"""
    
    # Define colors for political leaning
    color_map = {'Red': '#FF4444', 'Blue': '#4444FF', 'Purple': '#AA44AA'}
    merged_df['Color'] = merged_df['Political_Leaning'].map(color_map)
    
    fig = go.Figure()
    
    # Add choropleth for political leaning
    fig.add_trace(go.Choropleth(
        locations=merged_df['State'],
        locationmode='USA-states',
        z=merged_df['Total_Bans'],
        colorscale='Reds',
        text=merged_df['State'],
        hovertemplate='<b>%{text}</b><br>' +
                      'Political Leaning: %{customdata[0]}<br>' +
                      'Total Bans: %{z}<br>' +
                      'Bans per 100k: %{customdata[1]:.2f}<extra></extra>',
        customdata=np.column_stack((merged_df['Political_Leaning'], 
                                    merged_df['Bans_Per_100k'])),
        colorbar=dict(title="Total Bans")
    ))
    
    fig.update_layout(
        title='Book Bans by State with Political Leaning',
        title_font_size=18,
        geo=dict(
            scope='usa',
            bgcolor='rgba(0,0,0,0)',
            lakecolor='lightblue'
        ),
        height=600,
        margin=dict(l=0, r=0, t=50, b=0)
    )
    
    fig.write_html('maps/political_bans_map.html')
    print("✓ Created: maps/political_bans_map.html")
    
    return fig

def regional_analysis(bans_df):
    """Analyze by US region"""
    
    print("\n=== REGIONAL ANALYSIS ===\n")
    
    # Define regions (you may need to adjust based on your state codes)
    regions = {
        'Northeast': ['CT', 'ME', 'MA', 'NH', 'RI', 'VT', 'NJ', 'NY', 'PA'],
        'South': ['DE', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA', 'WV', 'AL', 'KY', 
                  'MS', 'TN', 'AR', 'LA', 'OK', 'TX'],
        'Midwest': ['IL', 'IN', 'MI', 'OH', 'WI', 'IA', 'KS', 'MN', 'MO', 'NE', 
                    'ND', 'SD'],
        'West': ['AZ', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT', 'WY', 'AK', 'CA', 
                 'HI', 'OR', 'WA']
    }
    
    # Create reverse mapping
    state_to_region = {}
    for region, states in regions.items():
        for state in states:
            state_to_region[state] = region
    
    # Add region column
    bans_df['Region'] = bans_df['State'].map(state_to_region)
    
    # Count by region
    regional_counts = bans_df['Region'].value_counts()
    print("Bans by region:")
    print(regional_counts)
    
    # Create visualization
    plt.figure(figsize=(10, 6))
    regional_counts.plot(kind='bar', color='steelblue')
    plt.title('Book Bans by US Region', fontsize=14, fontweight='bold')
    plt.xlabel('Region', fontsize=12)
    plt.ylabel('Number of Bans', fontsize=12)
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.savefig('maps/regional_comparison.png', dpi=300)
    plt.close()
    print("✓ Created: maps/regional_comparison.png")
    
    return regional_counts

# Main execution
if __name__ == "__main__":
    print("GEOGRAPHIC ANALYSIS FOR BOOK BANS PROJECT")
    print("=" * 50)
    
    # Load data
    bans_df, political_df, demographics_df = load_data()
    
    # Run analyses
    state_counts = analyze_state_patterns(bans_df)
    per_capita = calculate_per_capita(bans_df, demographics_df)
    political_data = political_analysis(bans_df, political_df, demographics_df)
    regional_counts = regional_analysis(bans_df)
    
    # Create maps
    create_choropleth_map(state_counts, 'Total_Bans', 
                         'Total Book Bans by State', 'bans_by_state')
    create_choropleth_map(per_capita, 'Bans_Per_100k', 
                         'Book Bans Per 100,000 Population', 'bans_per_capita')
    create_political_map(political_data)
    
    print("\n" + "=" * 50)
    print("ANALYSIS COMPLETE!")
    print("Check the maps/ folder for visualizations.")


GEOGRAPHIC ANALYSIS FOR BOOK BANS PROJECT


FileNotFoundError: [Errno 2] No such file or directory: 'pen_america_banned_books.csv'