In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Plot styling: matplotlib's default style sheet plus seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# Project banner; sep="\n" puts each entry on its own output line
print(
    "=== FAKE NEWS DETECTION PROJECT ===",
    "Member: ITBIN-2211-0184",
    "Role: EDA & Documentation",
    "Environment Setup Complete!",
    sep="\n",
)

=== FAKE NEWS DETECTION PROJECT ===
Member: ITBIN-2211-0184
Role: EDA & Documentation
Environment Setup Complete!


In [3]:
## 1. DATA LOADING AND INITIAL EXPLORATION

def load_liar_dataset():
    """Load all three splits of the LIAR dataset"""
    try:
        # Column names for LIAR dataset
        columns = [
            'label', 'statement', 'subject', 'speaker', 'speaker_job',
            'state_info', 'party_affiliation', 'barely_true_counts',
            'false_counts', 'half_true_counts', 'mostly_true_counts',
            'pants_fire_counts', 'context'
        ]
        
        # Load datasets
        train_df = pd.read_csv('data/raw/train.tsv', sep='\t', names=columns, header=None)
        test_df = pd.read_csv('data/raw/test.tsv', sep='\t', names=columns, header=None)
        valid_df = pd.read_csv('data/raw/valid.tsv', sep='\t', names=columns, header=None)
        
        # Add dataset split information
        train_df['split'] = 'train'
        test_df['split'] = 'test'
        valid_df['split'] = 'valid'
        
        # Combine all splits for comprehensive analysis
        full_df = pd.concat([train_df, test_df, valid_df], ignore_index=True)
        
        print("✅ Dataset loaded successfully!")
        print(f"📊 Training samples: {len(train_df)}")
        print(f"📊 Testing samples: {len(test_df)}")
        print(f"📊 Validation samples: {len(valid_df)}")
        print(f"📊 Total samples: {len(full_df)}")
        
        return train_df, test_df, valid_df, full_df
        
    except FileNotFoundError:
        print("❌ Dataset files not found!")
        print("Please ensure the following files are in data/raw/:")
        print("- train.tsv")
        print("- test.tsv") 
        print("- valid.tsv")
        return None, None, None, None

In [4]:
# Load the dataset
# All four names are None when a TSV file is missing (load_liar_dataset prints a
# message instead of raising), which is why later cells guard on
# `full_df is not None`.
train_df, test_df, valid_df, full_df = load_liar_dataset()

❌ Dataset files not found!
Please ensure the following files are in data/raw/:
- train.tsv
- test.tsv
- valid.tsv


In [5]:
## 2. BASIC DATA EXPLORATION

def basic_data_exploration(df):
    """Print a structural overview of `df` and return its summary tables.

    The printed report covers the frame's shape, per-column dtypes, the
    columns that contain missing values, and the distribution of the
    'label' column (count and percentage per label).

    Args:
        df: DataFrame with at least a 'label' column.

    Returns:
        (missing_df, label_counts): `missing_df` has one row per column of
        `df` with 'Missing Count' and 'Percentage'; `label_counts` is the
        result of `df['label'].value_counts()`.
    """
    n_rows, n_cols = df.shape

    print("\n=== BASIC DATA EXPLORATION ===")

    # Shape summary
    print(f"📋 Dataset Shape: {df.shape}")
    print(f"📋 Features: {n_cols}")
    print(f"📋 Samples: {n_rows}")

    # Per-column dtypes
    print("\n📊 Data Types:")
    print(df.dtypes)

    # Missing values: the table holds every column, only affected ones are printed
    print("\n🔍 Missing Values:")
    null_counts = df.isnull().sum()
    missing_df = null_counts.to_frame('Missing Count').assign(
        Percentage=(null_counts / len(df)) * 100
    )
    has_missing = missing_df['Missing Count'] > 0
    print(missing_df[has_missing])

    # Label distribution, most frequent label first
    print("\n🎯 Label Distribution:")
    label_counts = df['label'].value_counts()
    label_share = df['label'].value_counts(normalize=True) * 100
    for label, count in label_counts.items():
        print(f"{label}: {count} ({label_share[label]:.1f}%)")

    return missing_df, label_counts

# Perform basic exploration
# Skipped when loading failed (full_df is None); in that case missing_info and
# label_dist are never defined.
if full_df is not None:
    missing_info, label_dist = basic_data_exploration(full_df)

In [28]:
def create_comprehensive_visualizations(df):
    """Draw a 12-panel EDA figure for `df`, save it as a PNG, show it, and return it.

    NOTE(review): a later cell in this notebook defines a function with the
    same name; when the cells run top to bottom that later definition replaces
    this one. Keep a single definition.

    Columns read from `df`: 'label', 'text_length', 'word_count',
    'party_affiliation', 'subject', 'credibility_score', 'truth_score',
    'sentence_count', 'total_statements', 'split', 'speaker'.
    NOTE(review): load_liar_dataset only provides 'label', 'subject',
    'speaker', 'party_affiliation' and 'split'; the remaining columns must be
    added by a step that is not visible here -- confirm.

    Args:
        df: DataFrame holding the columns listed above.

    Returns:
        The matplotlib Figure holding the 4x3 grid of panels.

    Side effects:
        Prints progress messages, writes
        'results/plots/comprehensive_eda.png' at 300 dpi, and calls plt.show().
    """
    print("\n=== CREATING VISUALIZATIONS ===")
    
    # Set up the plotting area
    fig = plt.figure(figsize=(20, 24))
    
    # 1. Label Distribution
    plt.subplot(4, 3, 1)
    label_counts = df['label'].value_counts()
    # One Set3 colour per label, spread evenly over the colormap
    colors = plt.cm.Set3(np.linspace(0, 1, len(label_counts)))
    bars = plt.bar(label_counts.index, label_counts.values, color=colors)
    plt.title('Label Distribution', fontsize=14, fontweight='bold')
    plt.xlabel('Truth Labels')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    
    # Add value labels on bars
    # NOTE(review): the +50 offset is in data units (counts), so its visual gap
    # depends on how tall the bars are.
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 50,
                f'{int(height)}', ha='center', va='bottom')
    
    # 2. Text Length Distribution
    plt.subplot(4, 3, 2)
    plt.hist(df['text_length'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    plt.title('Statement Length Distribution', fontsize=14, fontweight='bold')
    plt.xlabel('Text Length (characters)')
    plt.ylabel('Frequency')
    # Dashed vertical line marking the mean text length
    plt.axvline(df['text_length'].mean(), color='red', linestyle='--', 
                label=f'Mean: {df["text_length"].mean():.0f}')
    plt.legend()
    
    # 3. Word Count by Label
    plt.subplot(4, 3, 3)
    sns.boxplot(data=df, x='label', y='word_count')
    plt.title('Word Count by Label', fontsize=14, fontweight='bold')
    plt.xlabel('Truth Labels')
    plt.ylabel('Word Count')
    plt.xticks(rotation=45)
    
    # 4. Party Affiliation Distribution
    plt.subplot(4, 3, 4)
    party_counts = df['party_affiliation'].value_counts().head(10)
    plt.barh(range(len(party_counts)), party_counts.values)
    plt.yticks(range(len(party_counts)), party_counts.index)
    plt.title('Top 10 Party Affiliations', fontsize=14, fontweight='bold')
    plt.xlabel('Count')
    
    # 5. Subject Distribution
    plt.subplot(4, 3, 5)
    subject_counts = df['subject'].value_counts().head(10)
    plt.barh(range(len(subject_counts)), subject_counts.values, color='lightcoral')
    plt.yticks(range(len(subject_counts)), subject_counts.index)
    plt.title('Top 10 Subjects', fontsize=14, fontweight='bold')
    plt.xlabel('Count')
    
    # 6. Credibility Score Distribution
    plt.subplot(4, 3, 6)
    plt.hist(df['credibility_score'], bins=30, alpha=0.7, color='lightgreen', edgecolor='black')
    plt.title('Speaker Credibility Score Distribution', fontsize=14, fontweight='bold')
    plt.xlabel('Credibility Score')
    plt.ylabel('Frequency')
    # Dashed vertical line marking the mean credibility score
    plt.axvline(df['credibility_score'].mean(), color='red', linestyle='--',
                label=f'Mean: {df["credibility_score"].mean():.3f}')
    plt.legend()
    
    # 7. Label vs Credibility Score
    plt.subplot(4, 3, 7)
    sns.boxplot(data=df, x='label', y='credibility_score')
    plt.title('Credibility Score by Label', fontsize=14, fontweight='bold')
    plt.xlabel('Truth Labels')
    plt.ylabel('Credibility Score')
    plt.xticks(rotation=45)
    
    # 8. Truth Score vs Text Length
    plt.subplot(4, 3, 8)
    plt.scatter(df['text_length'], df['truth_score'], alpha=0.5, color='purple')
    plt.title('Truth Score vs Text Length', fontsize=14, fontweight='bold')
    plt.xlabel('Text Length')
    plt.ylabel('Truth Score')
    
    # Add trend line
    # NOTE(review): the two dropna() calls are independent, so if NaNs sit in
    # different rows of the two columns the x and y inputs to polyfit end up
    # misaligned or of different lengths.
    z = np.polyfit(df['text_length'].dropna(), df['truth_score'].dropna(), 1)
    p = np.poly1d(z)
    plt.plot(df['text_length'], p(df['text_length']), "r--", alpha=0.8)
    
    # 9. Correlation Heatmap
    plt.subplot(4, 3, 9)
    # Select numeric columns for correlation
    numeric_cols = ['text_length', 'word_count', 'sentence_count', 'credibility_score', 
                   'truth_score', 'total_statements']
    corr_matrix = df[numeric_cols].corr()
    sns.heatmap(corr_matrix, annot=True, cmap='RdYlBu_r', center=0, 
                square=True, linewidths=0.5)
    plt.title('Feature Correlation Matrix', fontsize=14, fontweight='bold')
    
    # 10. Statement Count by Split
    plt.subplot(4, 3, 10)
    split_counts = df['split'].value_counts()
    plt.pie(split_counts.values, labels=split_counts.index, autopct='%1.1f%%',
            colors=['lightblue', 'lightgreen', 'lightyellow'])
    plt.title('Data Split Distribution', fontsize=14, fontweight='bold')
    
    # 11. Top Speakers by Statement Count
    plt.subplot(4, 3, 11)
    top_speakers = df['speaker'].value_counts().head(10)
    plt.barh(range(len(top_speakers)), top_speakers.values, color='orange')
    plt.yticks(range(len(top_speakers)), top_speakers.index)
    plt.title('Top 10 Speakers by Statement Count', fontsize=14, fontweight='bold')
    plt.xlabel('Number of Statements')
    
    # 12. Average Truth Score by Party
    plt.subplot(4, 3, 12)
    # The ten parties with the highest mean truth_score (not the ten largest parties)
    party_truth = df.groupby('party_affiliation')['truth_score'].mean().sort_values(ascending=False).head(10)
    plt.barh(range(len(party_truth)), party_truth.values, color='mediumpurple')
    plt.yticks(range(len(party_truth)), party_truth.index)
    plt.title('Average Truth Score by Party (Top 10)', fontsize=14, fontweight='bold')
    plt.xlabel('Average Truth Score')
    
    plt.tight_layout()
    # NOTE(review): savefig does not create 'results/plots/'; the directory has
    # to exist before this runs.
    plt.savefig('results/plots/comprehensive_eda.png', dpi=300, bbox_inches='tight')
    print("📊 Comprehensive EDA visualization saved to 'results/plots/comprehensive_eda.png'")
    plt.show()
    
    return fig


In [32]:
## 4. COMPREHENSIVE VISUALIZATIONS

def create_comprehensive_visualizations(df, save_path='results/plots/comprehensive_eda.png'):
    """Draw a 12-panel EDA figure for `df`, save it as a PNG, show it, and return it.

    NOTE(review): an earlier cell in this notebook defines a function with the
    same name; this later definition is the one in effect once the cells have
    run top to bottom. Keep a single definition.

    Args:
        df: DataFrame with the columns 'label', 'text_length', 'word_count',
            'party_affiliation', 'subject', 'credibility_score',
            'truth_score', 'sentence_count', 'total_statements', 'split' and
            'speaker'.
        save_path: Where the PNG is written. Defaults to the path that was
            previously hard-coded. Missing parent directories are created.

    Returns:
        The matplotlib Figure holding the 4x3 grid of panels.

    Raises:
        KeyError: if any required column is absent. The check runs before the
            figure is created so a failed call leaves no empty figure behind.

    Side effects:
        Prints progress messages, writes the PNG at 300 dpi, calls plt.show().
    """
    import os  # function-scope import: only needed to create the output directory

    required_cols = [
        'label', 'text_length', 'word_count', 'party_affiliation', 'subject',
        'credibility_score', 'truth_score', 'sentence_count',
        'total_statements', 'split', 'speaker',
    ]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(
            f"create_comprehensive_visualizations: missing required columns {missing_cols}"
        )

    print("\n=== CREATING VISUALIZATIONS ===")

    # Set up the plotting area
    fig = plt.figure(figsize=(20, 24))

    # 1. Label Distribution
    plt.subplot(4, 3, 1)
    label_counts = df['label'].value_counts()
    # One Set3 colour per label, spread evenly over the colormap
    colors = plt.cm.Set3(np.linspace(0, 1, len(label_counts)))
    bars = plt.bar(label_counts.index, label_counts.values, color=colors)
    plt.title('Label Distribution', fontsize=14, fontweight='bold')
    plt.xlabel('Truth Labels')
    plt.ylabel('Count')
    plt.xticks(rotation=45)

    # Add value labels on bars. The gap is 1% of the tallest bar so it looks
    # the same whatever the size of the dataset (a fixed offset in count units
    # does not).
    label_offset = 0.01 * label_counts.max()
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + label_offset,
                 f'{int(height)}', ha='center', va='bottom')

    # 2. Text Length Distribution
    plt.subplot(4, 3, 2)
    mean_length = df['text_length'].mean()
    plt.hist(df['text_length'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    plt.title('Statement Length Distribution', fontsize=14, fontweight='bold')
    plt.xlabel('Text Length (characters)')
    plt.ylabel('Frequency')
    plt.axvline(mean_length, color='red', linestyle='--',
                label=f'Mean: {mean_length:.0f}')
    plt.legend()

    # 3. Word Count by Label
    plt.subplot(4, 3, 3)
    sns.boxplot(data=df, x='label', y='word_count')
    plt.title('Word Count by Label', fontsize=14, fontweight='bold')
    plt.xlabel('Truth Labels')
    plt.ylabel('Word Count')
    plt.xticks(rotation=45)

    # 4. Party Affiliation Distribution
    plt.subplot(4, 3, 4)
    party_counts = df['party_affiliation'].value_counts().head(10)
    plt.barh(range(len(party_counts)), party_counts.values)
    plt.yticks(range(len(party_counts)), party_counts.index)
    plt.title('Top 10 Party Affiliations', fontsize=14, fontweight='bold')
    plt.xlabel('Count')

    # 5. Subject Distribution
    plt.subplot(4, 3, 5)
    subject_counts = df['subject'].value_counts().head(10)
    plt.barh(range(len(subject_counts)), subject_counts.values, color='lightcoral')
    plt.yticks(range(len(subject_counts)), subject_counts.index)
    plt.title('Top 10 Subjects', fontsize=14, fontweight='bold')
    plt.xlabel('Count')

    # 6. Credibility Score Distribution
    plt.subplot(4, 3, 6)
    mean_credibility = df['credibility_score'].mean()
    plt.hist(df['credibility_score'], bins=30, alpha=0.7, color='lightgreen', edgecolor='black')
    plt.title('Speaker Credibility Score Distribution', fontsize=14, fontweight='bold')
    plt.xlabel('Credibility Score')
    plt.ylabel('Frequency')
    plt.axvline(mean_credibility, color='red', linestyle='--',
                label=f'Mean: {mean_credibility:.3f}')
    plt.legend()

    # 7. Label vs Credibility Score
    plt.subplot(4, 3, 7)
    sns.boxplot(data=df, x='label', y='credibility_score')
    plt.title('Credibility Score by Label', fontsize=14, fontweight='bold')
    plt.xlabel('Truth Labels')
    plt.ylabel('Credibility Score')
    plt.xticks(rotation=45)

    # 8. Truth Score vs Text Length
    plt.subplot(4, 3, 8)
    plt.scatter(df['text_length'], df['truth_score'], alpha=0.5, color='purple')
    plt.title('Truth Score vs Text Length', fontsize=14, fontweight='bold')
    plt.xlabel('Text Length')
    plt.ylabel('Truth Score')

    # Add trend line. Rows are dropped jointly so x and y stay paired; dropping
    # NaNs from each column separately can misalign them or give polyfit
    # inputs of different lengths.
    trend_data = df[['text_length', 'truth_score']].dropna()
    # A line needs at least two distinct x values to be determined
    if trend_data['text_length'].nunique() > 1:
        coefficients = np.polyfit(trend_data['text_length'], trend_data['truth_score'], 1)
        trend = np.poly1d(coefficients)
        # A straight line is fully drawn by its two end points
        x_ends = np.array([trend_data['text_length'].min(), trend_data['text_length'].max()])
        plt.plot(x_ends, trend(x_ends), "r--", alpha=0.8)

    # 9. Correlation Heatmap
    plt.subplot(4, 3, 9)
    # Select numeric columns for correlation
    numeric_cols = ['text_length', 'word_count', 'sentence_count', 'credibility_score',
                    'truth_score', 'total_statements']
    corr_matrix = df[numeric_cols].corr()
    sns.heatmap(corr_matrix, annot=True, cmap='RdYlBu_r', center=0,
                square=True, linewidths=0.5)
    plt.title('Feature Correlation Matrix', fontsize=14, fontweight='bold')

    # 10. Statement Count by Split
    plt.subplot(4, 3, 10)
    split_counts = df['split'].value_counts()
    plt.pie(split_counts.values, labels=split_counts.index, autopct='%1.1f%%',
            colors=['lightblue', 'lightgreen', 'lightyellow'])
    plt.title('Data Split Distribution', fontsize=14, fontweight='bold')

    # 11. Top Speakers by Statement Count
    plt.subplot(4, 3, 11)
    top_speakers = df['speaker'].value_counts().head(10)
    plt.barh(range(len(top_speakers)), top_speakers.values, color='orange')
    plt.yticks(range(len(top_speakers)), top_speakers.index)
    plt.title('Top 10 Speakers by Statement Count', fontsize=14, fontweight='bold')
    plt.xlabel('Number of Statements')

    # 12. Average Truth Score by Party
    plt.subplot(4, 3, 12)
    # The ten parties with the highest mean truth_score (not the ten largest parties)
    party_truth = df.groupby('party_affiliation')['truth_score'].mean().sort_values(ascending=False).head(10)
    plt.barh(range(len(party_truth)), party_truth.values, color='mediumpurple')
    plt.yticks(range(len(party_truth)), party_truth.index)
    plt.title('Average Truth Score by Party (Top 10)', fontsize=14, fontweight='bold')
    plt.xlabel('Average Truth Score')

    plt.tight_layout()

    # savefig does not create directories, so make sure the target exists
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"📊 Comprehensive EDA visualization saved to '{save_path}'")
    plt.show()

    return fig

# Create visualizations
# Skipped when loading failed (full_df is None).
# NOTE(review): the function reads columns such as 'text_length', 'word_count',
# 'credibility_score' and 'truth_score' that load_liar_dataset does not create,
# and no cell shown here adds them (execution counts jump from In [5] to
# In [28]) -- confirm a feature-engineering step runs on full_df first.
if full_df is not None:
    viz_fig = create_comprehensive_visualizations(full_df)

In [33]:
## 5. ADVANCED ANALYSIS

def advanced_analysis(df):
    """Run and print three follow-up analyses on `df`.

    1. Chi-square test of independence on the party_affiliation x label
       contingency table.
    2. Pearson correlations of truth_score with text_length and with
       credibility_score.
    3. Per-subject aggregates (truth_score mean/std/count, mean text_length,
       mean credibility_score), rounded to 3 decimals; the five most frequent
       subjects are printed.

    Args:
        df: DataFrame with 'party_affiliation', 'label', 'subject',
            'text_length', 'credibility_score', 'truth_score' and
            'total_statements' columns.

    Returns:
        (party_label_crosstab, subject_analysis): the contingency table and
        the full per-subject aggregate table.
    """
    print("\n=== ADVANCED ANALYSIS ===")

    # Imported inside the function, as this cell has always done
    from scipy.stats import chi2_contingency

    # --- Party vs label association -------------------------------------
    party_label_crosstab = pd.crosstab(df['party_affiliation'], df['label'])
    statistic, p_value, degrees, _expected = chi2_contingency(party_label_crosstab)

    print("🧮 Chi-square test (Party vs Label):")
    print(f"   Chi-square statistic: {statistic:.4f}")
    print(f"   P-value: {p_value:.4e}")
    print(f"   Degrees of freedom: {degrees}")

    # --- Correlations with truth_score ----------------------------------
    corr_columns = ['text_length', 'credibility_score', 'truth_score', 'total_statements']
    corr_table = df[corr_columns].corr()
    length_vs_truth = corr_table.at['text_length', 'truth_score']
    credibility_vs_truth = corr_table.at['credibility_score', 'truth_score']

    print("\n📊 Key Correlations:")
    print(f"   Text Length vs Truth Score: {length_vs_truth:.4f}")
    print(f"   Credibility vs Truth Score: {credibility_vs_truth:.4f}")

    # --- Per-subject aggregates -----------------------------------------
    aggregations = {
        'truth_score': ['mean', 'std', 'count'],
        'text_length': 'mean',
        'credibility_score': 'mean',
    }
    subject_analysis = df.groupby('subject').agg(aggregations).round(3)

    # head() with no argument keeps the first five, i.e. the most frequent subjects
    most_common_subjects = df['subject'].value_counts().head().index
    print("\n📈 Subject Analysis (Top 5 by count):")
    print(subject_analysis.loc[most_common_subjects])

    return party_label_crosstab, subject_analysis

# Perform advanced analysis
# Skipped when loading failed (full_df is None); crosstab_result and
# subject_stats are then never defined.
if full_df is not None:
    crosstab_result, subject_stats = advanced_analysis(full_df)


In [34]:
## 6. DATA QUALITY ASSESSMENT

def data_quality_assessment(df, short_threshold=10, long_threshold=500):
    """Assess data quality, print a report, and return the findings.

    Args:
        df: DataFrame with a 'statement' column. If a 'text_length' column is
            present it is used for the length checks; otherwise the length of
            each statement string is used instead.
        short_threshold: Statements with length below this count as
            "very short". Defaults to 10, the value previously hard-coded.
        long_threshold: Statements with length above this count as
            "very long". Defaults to 500, the value previously hard-coded.

    Returns:
        dict with the keys:
          'completeness'          - per-column percentage of non-null values
          'total_duplicates'      - number of fully duplicated rows
          'duplicate_percentage'  - duplicates as a percentage of all rows
          'unique_counts'         - per-column number of distinct values
          'complete_percentage'   - percentage of rows with no nulls at all
          'empty_statements'      - statements that are null or blank
          'very_short_statements' - statements shorter than short_threshold
          'very_long_statements'  - statements longer than long_threshold
        An empty frame yields 0.0 for the percentage figures instead of
        raising ZeroDivisionError.
    """
    print("\n=== DATA QUALITY ASSESSMENT ===")

    total = len(df)

    def _pct(part):
        # Percentage of `part` among all rows; 0.0 when there are no rows
        return (part / total) * 100 if total else 0.0

    quality_report = {}

    # Completeness (per column; all-NaN for an empty frame)
    completeness = (df.count() / total) * 100
    quality_report['completeness'] = completeness

    # Duplicates
    duplicates = df.duplicated().sum()
    quality_report['total_duplicates'] = duplicates
    quality_report['duplicate_percentage'] = _pct(duplicates)

    # Unique values
    quality_report['unique_counts'] = df.nunique()

    # Rows without a single missing value
    complete_pct = _pct(df.dropna().shape[0])
    quality_report['complete_percentage'] = complete_pct

    # Text quality checks
    statements = df['statement']
    # A statement is "empty" when it is missing or contains only whitespace
    is_blank = statements.isna() | statements.astype(str).str.strip().eq('')
    empty_statements = is_blank.sum()

    # Prefer the precomputed length column; fall back to the statement's own
    # length so the check still works when that column was never created.
    # NOTE(review): assumes 'text_length' is the character length of
    # 'statement' -- confirm against the feature-engineering step.
    if 'text_length' in df.columns:
        lengths = df['text_length']
    else:
        lengths = statements.str.len()
    very_short_statements = (lengths < short_threshold).sum()
    very_long_statements = (lengths > long_threshold).sum()

    quality_report['empty_statements'] = empty_statements
    quality_report['very_short_statements'] = very_short_statements
    quality_report['very_long_statements'] = very_long_statements

    print(f"📋 Data Quality Report:")
    print(f"   Total samples: {total}")
    print(f"   Complete records: {complete_pct:.1f}%")
    print(f"   Duplicate records: {duplicates} ({quality_report['duplicate_percentage']:.1f}%)")
    print(f"   Empty statements: {empty_statements}")
    print(f"   Very short statements (<{short_threshold} chars): {very_short_statements}")
    print(f"   Very long statements (>{long_threshold} chars): {very_long_statements}")

    # Feature completeness: list only the columns that have gaps
    print(f"\n📊 Feature Completeness:")
    for col in df.columns:
        if completeness[col] < 100:
            print(f"   {col}: {completeness[col]:.1f}% complete")

    return quality_report

# Assess data quality
# Skipped when loading failed (full_df is None); quality_report is then never
# defined.
if full_df is not None:
    quality_report = data_quality_assessment(full_df)