In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency, ttest_ind, f_oneway, pearsonr
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Session banner: title, member ID and time slot framed by two 60-character rules.
# A single print call with a newline separator writes the same five lines.
print(
    "=" * 60,
    "STATISTICAL TESTING & SIGNIFICANCE ANALYSIS",
    "Member: ITBIN-2211-0184",
    "Time: 11:00 AM - 1:00 PM",
    "=" * 60,
    sep="\n",
)

STATISTICAL TESTING & SIGNIFICANCE ANALYSIS
Member: ITBIN-2211-0184
Time: 11:00 AM - 1:00 PM


In [5]:
def load_data(path='../data/processed/train_processed.csv'):
    """Load the preprocessed dataset and make sure the analysis columns exist.

    Steps, in order:
      1. Read the CSV at ``path``.
      2. Normalise column names (strip surrounding whitespace, lower-case).
      3. Derive ``text_length`` / ``word_count`` from ``statement`` when they
         are absent; missing statements count as empty text.
      4. Fill ``credibility_score`` with seeded random placeholder values and
         ``split`` with ``'train'`` when those columns are absent.
      5. Recover ``label`` from the first column whose name contains
         ``label``, ``truth`` or ``category`` when ``label`` is absent.
      6. Verify that ``label``, ``party_affiliation``, ``subject`` and
         ``speaker`` are all present.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the preprocessed CSV. Defaults to the Day 1 output file.

    Returns
    -------
    pandas.DataFrame
        The prepared frame, or an EMPTY DataFrame when the file is missing,
        a required column cannot be found, or any other error occurs while
        loading. Callers should check ``df.empty`` before using the result.

    Notes
    -----
    Errors are reported via ``print`` and never raised, so the notebook keeps
    running after a failed load.
    """
    try:
        # Attempt to load preprocessed data from Day 1
        df = pd.read_csv(path)
        print(f"✅ Loaded preprocessed data ({len(df)} rows)")

        # Standardize column names: strip and convert to lowercase
        df.columns = df.columns.str.strip().str.lower()

        # Derive text features. NaN statements are treated as empty strings so
        # one missing value does not abort the whole load.
        if 'statement' in df.columns:
            statements = df['statement'].fillna('').astype(str)
            if 'text_length' not in df.columns:
                df['text_length'] = statements.str.len()
            if 'word_count' not in df.columns:
                df['word_count'] = statements.str.split().str.len()

        if 'credibility_score' not in df.columns:
            # These values are synthetic placeholders, not measurements: any
            # statistic computed on them is meaningless. A fixed seed keeps
            # the numbers identical across Restart & Run All.
            print("⚠️ 'credibility_score' column not found - filling with random placeholder values")
            df['credibility_score'] = np.random.default_rng(42).uniform(0, 1, len(df))

        # Add missing 'split' column with default value
        if 'split' not in df.columns:
            print("⚠️ 'split' column not found - adding default 'train' value")
            df['split'] = 'train'

        # Handle missing 'label' column specifically. This must happen BEFORE
        # the required-column check, otherwise a successfully recovered label
        # would still be reported as missing.
        if 'label' not in df.columns:
            print("⚠️ 'label' column not found - attempting to create from existing data")
            # Try to find similar column names
            possible_labels = [
                col for col in df.columns
                if 'label' in col or 'truth' in col or 'category' in col
            ]

            if not possible_labels:
                print("⛔ Could not find suitable label column - analysis will fail")
                return pd.DataFrame()

            print(f"   Using '{possible_labels[0]}' as label column")
            df['label'] = df[possible_labels[0]]

        # Ensure required columns exist (evaluated after label recovery)
        required_columns = ['label', 'party_affiliation', 'subject', 'speaker']
        missing = [col for col in required_columns if col not in df.columns]
        if missing:
            print(f"⚠️ Missing columns: {', '.join(missing)}")
            return pd.DataFrame()

        return df

    except FileNotFoundError:
        print(f"⛔ Error: Preprocessed data not found at '{path}'")
        print("Please run Day 1 processing first!")
        return pd.DataFrame()
    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an empty
        # frame instead of raising.
        print(f"⛔ Unexpected error loading data: {str(e)}")
        return pd.DataFrame()


🎯 DETAILED LABEL DISTRIBUTION ANALYSIS


KeyError: 'label'