In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency, ttest_ind, f_oneway, pearsonr
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Notebook banner: a 60-character rule above and below the three header lines.
# A single print call with a newline separator emits the same five lines.
print(
    "=" * 60,
    "STATISTICAL TESTING & SIGNIFICANCE ANALYSIS",
    "Member: ITBIN-2211-0184",
    "Time: 11:00 AM - 1:00 PM",
    "=" * 60,
    sep="\n",
)

STATISTICAL TESTING & SIGNIFICANCE ANALYSIS
Member: ITBIN-2211-0184
Time: 11:00 AM - 1:00 PM


In [3]:
def _collect_tests(test_results, prefix, stat_key):
    """Pull one family of tests out of ``test_results``.

    Selects every key that starts with ``prefix`` and returns four parallel
    lists: the matching keys, the keys with ``prefix`` removed (used as tick
    labels), the values stored under ``stat_key`` and the values stored under
    ``'p_value'``.
    """
    keys = [k for k in test_results if k.startswith(prefix)]
    names = [k.replace(prefix, '') for k in keys]
    statistics = [test_results[k][stat_key] for k in keys]
    p_vals = [test_results[k]['p_value'] for k in keys]
    return keys, names, statistics, p_vals


def _neg_log10_p(p_values, zero_floor=20.0):
    """Convert p-values into ``-log10(p)`` bar lengths for the summary panel.

    A p-value of exactly zero (floating-point underflow) has no finite
    ``-log10``; it is drawn at least as long as every other bar (and never
    shorter than ``zero_floor``) so it cannot look *less* significant than a
    tiny-but-representable p-value. A NaN p-value (test could not be computed)
    is drawn with length 0 rather than being presented as significant.
    """
    finite = [-np.log10(p) for p in p_values if p > 0]
    ceiling = max([zero_floor] + finite)

    lengths = []
    for p in p_values:
        if p > 0:
            lengths.append(-np.log10(p))
        elif p == 0:
            lengths.append(ceiling)
        else:
            # NaN fails both comparisons above.
            lengths.append(0.0)
    return lengths


def visualize_statistical_tests(df, test_results, significance_level=0.05,
                                output_path='results/plots/statistical_tests_summary.png'):
    """Create a 2x3 summary figure for the statistical test results.

    Panels: party-vs-label heatmap, subject-vs-label heatmap, ANOVA
    F-statistics, t-test statistics, Pearson correlations and a
    ``-log10(p)`` summary of every test.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'party_affiliation'``, ``'subject'`` and
        ``'label'``; these are cross-tabulated for the two heatmaps.
    test_results : dict
        Must contain ``'chi2_party'`` and ``'chi2_subject'`` entries with
        ``'statistic'`` and ``'p_value'``. Entries whose key starts with
        ``'anova_'`` need ``'f_statistic'``/``'p_value'``, ``'ttest_'`` need
        ``'t_statistic'``/``'p_value'`` and ``'corr_'`` need
        ``'correlation'``/``'p_value'``.
    significance_level : float, optional
        Threshold below which a p-value is highlighted as significant.
    output_path : str or path-like, optional
        Where the PNG is written. Missing parent directories are created.

    Returns
    -------
    None
        The figure is saved to ``output_path`` and shown with ``plt.show()``.
    """
    print("\n📊 Creating statistical test visualizations...")

    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle('Statistical Significance Test Results', fontsize=16, fontweight='bold')

    # 1. Party affiliation vs Label heatmap (row-normalised, 10 most frequent parties)
    party_label_crosstab = pd.crosstab(df['party_affiliation'], df['label'], normalize='index')
    top_parties = df['party_affiliation'].value_counts().head(10).index
    party_subset = party_label_crosstab.loc[top_parties]

    sns.heatmap(party_subset, annot=True, cmap='YlOrRd', ax=axes[0,0], fmt='.3f')
    axes[0,0].set_title(f'Party vs Label Distribution\n(χ²={test_results["chi2_party"]["statistic"]:.2f}, p={test_results["chi2_party"]["p_value"]:.2e})')
    axes[0,0].set_xlabel('Truth Label')
    axes[0,0].set_ylabel('Party Affiliation')

    # 2. Subject vs Label heatmap (row-normalised, 8 most frequent subjects)
    subject_label_crosstab = pd.crosstab(df['subject'], df['label'], normalize='index')
    top_subjects = df['subject'].value_counts().head(8).index
    subject_subset = subject_label_crosstab.loc[top_subjects]

    sns.heatmap(subject_subset, annot=True, cmap='YlGnBu', ax=axes[0,1], fmt='.3f')
    # Show the p-value as well, matching the party panel.
    axes[0,1].set_title(f'Subject vs Label Distribution\n(χ²={test_results["chi2_subject"]["statistic"]:.2f}, p={test_results["chi2_subject"]["p_value"]:.2e})')
    axes[0,1].set_xlabel('Truth Label')
    axes[0,1].set_ylabel('Subject')

    # 3. ANOVA F-statistics
    anova_features, anova_names, anova_f_stats, anova_p_vals = _collect_tests(
        test_results, 'anova_', 'f_statistic')

    colors = ['red' if p < significance_level else 'blue' for p in anova_p_vals]
    bars = axes[0,2].bar(range(len(anova_names)), anova_f_stats, color=colors, alpha=0.7)
    axes[0,2].set_xticks(range(len(anova_names)))
    axes[0,2].set_xticklabels(anova_names, rotation=45, ha='right')
    axes[0,2].set_title('ANOVA F-statistics\n(Red = Significant)')
    axes[0,2].set_ylabel('F-statistic')

    # Add p-value annotations just above each bar
    for i, (bar, p_val) in enumerate(zip(bars, anova_p_vals)):
        height = bar.get_height()
        axes[0,2].text(i, height + 0.5, f'p={p_val:.2e}', ha='center', fontsize=9)

    # 4. T-test results visualization
    ttest_features, ttest_names, ttest_stats, ttest_p_vals = _collect_tests(
        test_results, 'ttest_', 't_statistic')

    colors = ['red' if p < significance_level else 'blue' for p in ttest_p_vals]
    axes[1,0].bar(range(len(ttest_names)), ttest_stats, color=colors, alpha=0.7)
    axes[1,0].set_xticks(range(len(ttest_names)))
    axes[1,0].set_xticklabels(ttest_names, rotation=45, ha='right')
    # The bars are colour-coded exactly like the ANOVA panel, so say so.
    axes[1,0].set_title('T-test Statistics\n(True vs False Statements; Red = Significant)')
    axes[1,0].set_ylabel('T-statistic')

    # 5. Correlation visualization
    corr_features, corr_names, corr_coeffs, corr_p_vals = _collect_tests(
        test_results, 'corr_', 'correlation')
    corr_names = [name.replace('_truth', '') for name in corr_names]

    axes[1,1].bar(range(len(corr_names)), corr_coeffs, color='purple', alpha=0.7)
    axes[1,1].set_xticks(range(len(corr_names)))
    axes[1,1].set_xticklabels(corr_names, rotation=45, ha='right')
    axes[1,1].set_title('Feature Correlations with Truth Score')
    axes[1,1].set_ylabel('Pearson Correlation Coefficient')
    axes[1,1].axhline(0, color='gray', linestyle='--')

    # Add significance markers beyond the free end of each bar: above positive
    # bars, below negative ones (otherwise the star is drawn inside the bar).
    for i, (coeff, p_val) in enumerate(zip(corr_coeffs, corr_p_vals)):
        if p_val < significance_level:
            negative = coeff < 0
            axes[1,1].text(i, coeff - 0.02 if negative else coeff + 0.02, '*',
                           ha='center', va='top' if negative else 'bottom',
                           fontsize=14, color='red')

    # 6. P-value summary. Labels carry the test family so that the two
    # chi-square rows, and features shared between families, stay distinguishable.
    all_tests = [('Chi-square (party)', 'chi2_party'), ('Chi-square (subject)', 'chi2_subject')] + \
                [(f'ANOVA: {name}', key) for key, name in zip(anova_features, anova_names)] + \
                [(f'T-test: {name}', key) for key, name in zip(ttest_features, ttest_names)] + \
                [(f'Corr: {name}', key) for key, name in zip(corr_features, corr_names)]

    test_names = [label for label, _ in all_tests]
    p_values = [test_results[key]['p_value'] for _, key in all_tests]

    # -log10(p) bar lengths; zero and NaN p-values are handled by the helper.
    log_p_values = _neg_log10_p(p_values)

    axes[1,2].barh(range(len(test_names)), log_p_values, color='teal', alpha=0.7)
    axes[1,2].set_yticks(range(len(test_names)))
    axes[1,2].set_yticklabels(test_names)
    axes[1,2].set_title('Statistical Significance Summary')
    axes[1,2].set_xlabel('-log10(p-value)')
    axes[1,2].axvline(-np.log10(significance_level), color='red', linestyle='--',
                      label=f'p={significance_level}')
    axes[1,2].legend()

    plt.tight_layout()
    plt.subplots_adjust(top=0.92)

    # savefig does not create directories; make sure the target folder exists.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"Statistical test visualizations saved to '{output_path}'")
    plt.show()

SyntaxError: unterminated string literal (detected at line 169) (2954275967.py, line 169)