In [1]:
# Advanced Visualizations and Data Profiling
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
import os
import json
import warnings
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

class AdvancedEDAAnalyzer:
    def __init__(self, data_path):
        # Verify file exists
        if not os.path.exists(data_path):
            raise FileNotFoundError(f"Dataset not found at: {data_path}")
            
        self.df = pd.read_csv(data_path, sep='\t', header=0)
        self.setup_data()
    
    def setup_data(self):
        """Prepare data for analysis"""
        # Define column names based on LIAR dataset structure
        columns = ['id', 'label', 'statement', 'subject', 'speaker', 'speaker_job', 
                  'state_info', 'party_affiliation', 'barely_true_counts', 
                  'false_counts', 'half_true_counts', 'mostly_true_counts', 
                  'pants_fire_counts', 'context']
        
        if len(self.df.columns) == len(columns):
            self.df.columns = columns
        else:
            warnings.warn(f"Dataset has {len(self.df.columns)} columns, expected {len(columns)}. Using original column names.")
        
        # Create derived features
        self.df['text_length'] = self.df['statement'].str.len()
        self.df['word_count'] = self.df['statement'].str.split().str.len()
        
        # Calculate credibility scores
        credibility_cols = ['barely_true_counts', 'false_counts', 'half_true_counts',
                          'mostly_true_counts', 'pants_fire_counts']
        self.df['total_statements'] = self.df[credibility_cols].sum(axis=1)
        self.df['credibility_score'] = (
            (self.df['mostly_true_counts'] * 1.0 + 
             self.df['half_true_counts'] * 0.5 + 
             self.df['barely_true_counts'] * 0.25) / 
            (self.df['total_statements'] + 1e-5)  # Avoid division by zero
    
    def create_interactive_dashboard(self):
        """Create comprehensive interactive dashboard"""
        # Create subplot figure
        fig = make_subplots(
            rows=3, cols=2,
            subplot_titles=('Label Distribution', 'Text Length by Label',
                          'Credibility Score Distribution', 'Party vs Label Analysis',
                          'Subject Category Analysis', 'Credibility vs Text Length'),
            specs=[[{"type": "bar"}, {"type": "box"}],
                   [{"type": "histogram"}, {"type": "heatmap"}],
                   [{"type": "bar"}, {"type": "scatter"}]]
        )
        
        # 1. Label Distribution
        label_counts = self.df['label'].value_counts()
        fig.add_trace(
            go.Bar(x=label_counts.index, y=label_counts.values, 
                   name='Label Count', showlegend=False),
            row=1, col=1
        )
        
        # 2. Text Length by Label - Box Plot
        for label in self.df['label'].unique():
            data = self.df[self.df['label'] == label]['text_length']
            fig.add_trace(
                go.Box(y=data, name=label, showlegend=False),
                row=1, col=2
            )
        
        # 3. Credibility Score Distribution
        fig.add_trace(
            go.Histogram(x=self.df['credibility_score'], name='Credibility', 
                        showlegend=False, nbinsx=30),
            row=2, col=1
        )
        
        # 4. Party vs Label Heatmap
        party_label_crosstab = pd.crosstab(
            self.df['party_affiliation'], self.df['label'], normalize='index'
        )
        fig.add_trace(
            go.Heatmap(z=party_label_crosstab.values,
                      x=party_label_crosstab.columns,
                      y=party_label_crosstab.index,
                      colorscale='Blues', showscale=True,
                      colorbar=dict(title='Proportion')),
            row=2, col=2
        )
        
        # 5. Subject Category Analysis
        subject_counts = self.df['subject'].value_counts().head(10)
        fig.add_trace(
            go.Bar(x=subject_counts.values, y=subject_counts.index,
                   orientation='h', name='Subject Count', showlegend=False),
            row=3, col=1
        )
        
        # 6. Credibility vs Text Length
        fig.add_trace(
            go.Scatter(x=self.df['text_length'], y=self.df['credibility_score'],
                      mode='markers', name='Correlation', showlegend=False,
                      marker=dict(opacity=0.6, size=5)),
            row=3, col=2
        )
        
        # Update layout
        fig.update_layout(
            height=1200, width=1400,
            title_text="Fake News Dataset - Comprehensive Analysis Dashboard",
            title_x=0.5
        )
        
        # Save and show
        os.makedirs("results/figures", exist_ok=True)
        fig.write_html("results/figures/interactive_dashboard.html")
        fig.show()
        
        return fig
    
    def create_word_clouds(self):
        """Generate word clouds for different truth labels"""
        fig, axes = plt.subplots(2, 3, figsize=(20, 12))
        axes = axes.flatten()
        
        labels = self.df['label'].unique()[:6]
        
        for i, label in enumerate(labels):
            # Get statements for this label
            statements = self.df[self.df['label'] == label]['statement']
            text = ' '.join(statements.dropna().astype(str))
            
            # Create word cloud
            wordcloud = WordCloud(
                width=800, height=400, 
                background_color='white',
                max_words=100,
                colormap='viridis'
            ).generate(text)
            
            # Plot
            axes[i].imshow(wordcloud, interpolation='bilinear')
            axes[i].set_title(f'Word Cloud - {label.upper()}', 
                            fontsize=14, fontweight='bold')
            axes[i].axis('off')
        
        plt.tight_layout()
        os.makedirs("results/figures", exist_ok=True)
        plt.savefig('results/figures/word_clouds_by_label.png', 
                   dpi=300, bbox_inches='tight')
        plt.show()
    
    def statistical_significance_analysis(self):
        """Perform statistical tests"""
        results = {}
        
        # Chi-square test for party affiliation and label
        contingency_table = pd.crosstab(
            self.df['party_affiliation'], self.df['label']
        )
        chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
        results['party_label_association'] = {
            'chi2': chi2,
            'p_value': p_value,
            'significant': p_value < 0.05
        }
        
        # ANOVA for text length across labels
        groups = [group['text_length'].values for name, group in 
                 self.df.groupby('label')]
        f_stat, p_value = stats.f_oneway(*groups)
        results['text_length_anova'] = {
            'f_statistic': f_stat,
            'p_value': p_value,
            'significant': p_value < 0.05
        }
        
        # Correlation analysis
        numeric_cols = ['text_length', 'word_count', 'credibility_score',
                       'total_statements']
        correlation_matrix = self.df[numeric_cols].corr()
        
        # Create correlation heatmap
        plt.figure(figsize=(10, 8))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm',
                   center=0, square=True, fmt=".2f")
        plt.title('Feature Correlation Matrix')
        plt.tight_layout()
        os.makedirs("results/figures", exist_ok=True)
        plt.savefig('results/figures/correlation_matrix.png', dpi=300)
        plt.show()
        
        results['correlations'] = correlation_matrix
        
        return results
    
    def create_speaker_analysis(self):
        """Analyze speaker patterns"""
        # Top speakers by statement count
        speaker_counts = self.df['speaker'].value_counts().head(15)
        
        # Speaker credibility analysis
        speaker_credibility = self.df.groupby('speaker').agg({
            'credibility_score': 'mean',
            'total_statements': 'first',
            'label': 'count'
        }).sort_values('label', ascending=False).head(15)
        
        # Create visualization
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 12))
        
        # Top speakers
        speaker_counts.plot(kind='barh', ax=ax1, color='skyblue')
        ax1.set_title('Top 15 Speakers by Statement Count')
        ax1.set_xlabel('Number of Statements')
        
        # Speaker credibility
        speaker_credibility.plot(x='total_statements', y='credibility_score',
                               kind='scatter', ax=ax2, s=100, alpha=0.7)
        ax2.set_title('Speaker Credibility vs Total Statements')
        ax2.set_xlabel('Total Statements Made')
        ax2.set_ylabel('Average Credibility Score')
        
        # Annotate top points
        for i, row in speaker_credibility.iterrows():
            ax2.annotate(i, (row['total_statements'], row['credibility_score']),
                         xytext=(5, 5), textcoords='offset points')
        
        plt.tight_layout()
        os.makedirs("results/figures", exist_ok=True)
        plt.savefig('results/figures/speaker_analysis.png', dpi=300)
        plt.show()
        
        return speaker_credibility
    
    def generate_comprehensive_report(self):
        """Generate final EDA report"""
        report = {
            'dataset_overview': {
                'total_samples': len(self.df),
                'features': list(self.df.columns),
                'label_distribution': self.df['label'].value_counts().to_dict(),
                'missing_values': self.df.isnull().sum().to_dict()
            },
            'text_statistics': {
                'avg_text_length': self.df['text_length'].mean(),
                'avg_word_count': self.df['word_count'].mean(),
                'text_length_by_label': self.df.groupby('label')['text_length'].mean().to_dict()
            },
            'metadata_insights': {
                'unique_speakers': self.df['speaker'].nunique(),
                'unique_subjects': self.df['subject'].nunique(),
                'party_distribution': self.df['party_affiliation'].value_counts().to_dict(),
                'avg_credibility_by_party': self.df.groupby('party_affiliation')['credibility_score'].mean().to_dict()
            }
        }
        
        # Save report
        os.makedirs("results/reports", exist_ok=True)
        with open('results/reports/eda_comprehensive_report.json', 'w') as f:
            json.dump(report, f, indent=2)
        
        return report

# Main execution
if __name__ == "__main__":
    # Create results directories
    os.makedirs("results/figures", exist_ok=True)
    os.makedirs("results/reports", exist_ok=True)
    
    try:
        # Use training set by default
        DATA_PATH = 'data/raw/train.tsv'
        
        print(f"Starting EDA analysis with dataset: {DATA_PATH}")
        analyzer = AdvancedEDAAnalyzer(DATA_PATH)
        
        # Generate all analyses
        print("Creating interactive dashboard...")
        dashboard = analyzer.create_interactive_dashboard()
        
        print("Generating word clouds...")
        analyzer.create_word_clouds()
        
        print("Performing statistical analysis...")
        statistical_results = analyzer.statistical_significance_analysis()
        
        print("Analyzing speaker patterns...")
        speaker_analysis = analyzer.create_speaker_analysis()
        
        print("Generating comprehensive report...")
        comprehensive_report = analyzer.generate_comprehensive_report()
        
        print("\nAdvanced EDA Analysis Complete!")
        print("Generated files:")
        print("- results/figures/interactive_dashboard.html")
        print("- results/figures/word_clouds_by_label.png")
        print("- results/figures/correlation_matrix.png")
        print("- results/figures/speaker_analysis.png")
        print("- results/reports/eda_comprehensive_report.json")
        
    except FileNotFoundError as e:
        print(f"Error: {e}")
        print("Please verify the dataset path exists")
    except Exception as e:
        print(f"Unexpected error: {e}")

SyntaxError: '(' was never closed (1197512271.py, line 50)

In [None]:
# Cell 6: Speaker Credibility Analysis
def speaker_credibility_analysis():
    """Comprehensive speaker credibility analysis"""
    # Calculate credibility metrics
    credibility_cols = ['barely_true_counts', 'false_counts', 'half_true_counts', 
                       'mostly_true_counts', 'pants_fire_counts']
    
    full_df['total_statements'] = full_df[credibility_cols].sum(axis=1)
    full_df['true_ratio'] = (full_df['mostly_true_counts'] + full_df['half_true_counts']) / (full_df['total_statements'] + 1)
    full_df['false_ratio'] = (full_df['false_counts'] + full_df['pants_fire_counts']) / (full_df['total_statements'] + 1)
    
    # Top speakers by statement count
    top_speakers = full_df.groupby('speaker').agg({
        'total_statements': 'first',
        'true_ratio': 'first',
        'false_ratio': 'first',
        'statement': 'count'
    }).sort_values('statement', ascending=False).head(15)
    
    # Visualization
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Top 15 Most Active Speakers', 'Credibility Score Distribution',
                       'True vs False Ratio by Speaker Type', 'Speaker Activity by Party'),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]]
    )
    
    # Top speakers
    fig.add_trace(
        go.Bar(x=top_speakers.index[:10], y=top_speakers['statement'][:10],
               name='Statement Count'),
        row=1, col=1
    )
    
    # Credibility distribution
    fig.add_trace(
        go.Histogram(x=full_df['true_ratio'], nbinsx=30, name='True Ratio Distribution'),
        row=1, col=2
    )
    
    # Party analysis
    party_stats = full_df.groupby('party_affiliation').agg({
        'true_ratio': 'mean',
        'false_ratio': 'mean',
        'statement': 'count'
    }).sort_values('statement', ascending=False).head(8)
    
    fig.add_trace(
        go.Bar(x=party_stats.index, y=party_stats['true_ratio'], 
               name='Avg True Ratio'),
        row=2, col=1
    )
    
    fig.add_trace(
        go.Bar(x=party_stats.index, y=party_stats['statement'], 
               name='Total Statements'),
        row=2, col=2
    )
    
    fig.update_layout(height=800, showlegend=True)
    fig.show()
    fig.write_html('results/figures/speaker_credibility_analysis.html')
    
    return top_speakers, party_stats

top_speakers, party_stats = speaker_credibility_analysis()

In [None]:
# Cell 7: Political Bias Analysis
def political_bias_analysis():
    """Analyze political bias patterns"""
    # Create party-label crosstab
    party_label_crosstab = pd.crosstab(full_df['party_affiliation'], full_df['label'], normalize='index')
    
    # Filter out parties with less than 50 statements
    party_counts = full_df['party_affiliation'].value_counts()
    major_parties = party_counts[party_counts >= 50].index
    filtered_crosstab = party_label_crosstab.loc[major_parties]
    
    # Create heatmap
    plt.figure(figsize=(12, 8))
    sns.heatmap(filtered_crosstab, annot=True, fmt='.3f', cmap='RdYlBu_r', 
                center=0.16, cbar_kws={'label': 'Proportion'})
    plt.title('Truth Label Distribution by Political Party (Normalized)', fontsize=14)
    plt.xlabel('Truth Label')
    plt.ylabel('Political Party')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig('results/figures/political_bias_heatmap.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    # Subject analysis by party
    subject_party = pd.crosstab(full_df['subject'], full_df['party_affiliation'])
    top_subjects = full_df['subject'].value_counts().head(10).index
    subject_party_filtered = subject_party.loc[top_subjects, major_parties]
    
    plt.figure(figsize=(14, 8))
    sns.heatmap(subject_party_filtered, annot=True, fmt='d', cmap='Blues')
    plt.title('Statement Count by Subject and Political Party', fontsize=14)
    plt.xlabel('Political Party')
    plt.ylabel('Subject')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig('results/figures/subject_party_heatmap.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    return filtered_crosstab, subject_party_filtered

bias_analysis, subject_analysis = political_bias_analysis()

In [None]:
# Cell 8: Comprehensive Data Profiling
def create_data_profile():
    """Create comprehensive data profiling report"""
    
    profile_data = {}
    
    # Basic dataset info
    profile_data['dataset_info'] = {
        'total_samples': len(full_df),
        'train_samples': len(train_df),
        'test_samples': len(test_df),
        'validation_samples': len(valid_df),
        'features': len(full_df.columns),
        'memory_usage_mb': full_df.memory_usage(deep=True).sum() / 1024**2
    }
    
    # Missing values analysis
    missing_analysis = full_df.isnull().sum()
    profile_data['missing_values'] = missing_analysis[missing_analysis > 0].to_dict()
    
    # Label distribution
    profile_data['label_distribution'] = full_df['label'].value_counts().to_dict()
    
    # Text statistics
    profile_data['text_statistics'] = {
        'avg_text_length': full_df['text_length'].mean(),
        'avg_word_count': full_df['word_count'].mean(),
        'avg_sentence_count': full_df['sentence_count'].mean(),
        'max_text_length': full_df['text_length'].max(),
        'min_text_length': full_df['text_length'].min(),
    }
    
    # Categorical variable stats
    categorical_vars = ['subject', 'speaker', 'party_affiliation', 'state_info']
    profile_data['categorical_stats'] = {}
    
    for var in categorical_vars:
        if var in full_df.columns:
            profile_data['categorical_stats'][var] = {
                'unique_values': full_df[var].nunique(),
                'most_frequent': full_df[var].mode()[0] if len(full_df[var].mode()) > 0 else 'N/A',
                'frequency_of_most_frequent': full_df[var].value_counts().iloc[0]
            }
    
    # Save profile as JSON
    import json
    with open('results/reports/data_profile.json', 'w') as f:
        json.dump(profile_data, f, indent=2, default=str)
    
    return profile_data

data_profile = create_data_profile()
print("Data profile created and saved!")

In [2]:
# Cell 8: Comprehensive Data Profiling
def create_data_profile():
    """Create comprehensive data profiling report"""
    
    profile_data = {}
    
    # Basic dataset info
    profile_data['dataset_info'] = {
        'total_samples': len(full_df),
        'train_samples': len(train_df),
        'test_samples': len(test_df),
        'validation_samples': len(valid_df),
        'features': len(full_df.columns),
        'memory_usage_mb': full_df.memory_usage(deep=True).sum() / 1024**2
    }
    
    # Missing values analysis
    missing_analysis = full_df.isnull().sum()
    profile_data['missing_values'] = missing_analysis[missing_analysis > 0].to_dict()
    
    # Label distribution
    profile_data['label_distribution'] = full_df['label'].value_counts().to_dict()
    
    # Text statistics
    profile_data['text_statistics'] = {
        'avg_text_length': full_df['text_length'].mean(),
        'avg_word_count': full_df['word_count'].mean(),
        'avg_sentence_count': full_df['sentence_count'].mean(),
        'max_text_length': full_df['text_length'].max(),
        'min_text_length': full_df['text_length'].min(),
    }
    
    # Categorical variable stats
    categorical_vars = ['subject', 'speaker', 'party_affiliation', 'state_info']
    profile_data['categorical_stats'] = {}
    
    for var in categorical_vars:
        if var in full_df.columns:
            profile_data['categorical_stats'][var] = {
                'unique_values': full_df[var].nunique(),
                'most_frequent': full_df[var].mode()[0] if len(full_df[var].mode()) > 0 else 'N/A',
                'frequency_of_most_frequent': full_df[var].value_counts().iloc[0]
            }
    
    # Save profile as JSON
    import json
    with open('results/reports/data_profile.json', 'w') as f:
        json.dump(profile_data, f, indent=2, default=str)
    
    return profile_data

data_profile = create_data_profile()
print("Data profile created and saved!")

NameError: name 'full_df' is not defined

In [3]:
# Cell 9: Save All Analysis Results
def save_day3_results():
    """Save all Day 3 analysis results"""
    
    # Create summary statistics
    summary_stats = {
        'text_metrics': full_df[['text_length', 'word_count', 'sentence_count', 
                                'avg_word_length', 'capital_ratio']].describe(),
        'label_distribution': full_df['label'].value_counts(),
        'party_distribution': full_df['party_affiliation'].value_counts(),
        'subject_distribution': full_df['subject'].value_counts()
    }
    
    # Save processed dataset
    full_df.to_csv('results/processed_liar_dataset.csv', index=False)
    
    # Save summary statistics
    with pd.ExcelWriter('results/reports/day3_analysis_summary.xlsx') as writer:
        summary_stats['text_metrics'].to_excel(writer, sheet_name='Text_Metrics')
        summary_stats['label_distribution'].to_excel(writer, sheet_name='Label_Distribution')
        summary_stats['party_distribution'].to_excel(writer, sheet_name='Party_Distribution')
        summary_stats['subject_distribution'].to_excel(writer, sheet_name='Subject_Distribution')
        
        # Add top speakers and party analysis
        top_speakers.to_excel(writer, sheet_name='Top_Speakers')
        party_stats.to_excel(writer, sheet_name='Party_Statistics')
    
    print("All Day 3 results saved successfully!")
    print(f"Files saved in:")
    print(f"- results/figures/ (visualizations)")
    print(f"- results/reports/ (analysis summaries)")
    print(f"- results/processed_liar_dataset.csv (processed data)")

save_day3_results()

NameError: name 'full_df' is not defined