In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Set style for better visualizations
plt.style.use('default')
sns.set_palette("husl")

print("=== FAKE NEWS DETECTION PROJECT ===")
print("Member: ITBIN-2211-0184")
print("Role: EDA & Documentation")
print("Environment Setup Complete!")

=== FAKE NEWS DETECTION PROJECT ===
Member: ITBIN-2211-0184
Role: EDA & Documentation
Environment Setup Complete!


In [3]:
## 1. DATA LOADING AND INITIAL EXPLORATION

def load_liar_dataset():
    """Load all three splits of the LIAR dataset"""
    try:
        # Column names for LIAR dataset
        columns = [
            'label', 'statement', 'subject', 'speaker', 'speaker_job',
            'state_info', 'party_affiliation', 'barely_true_counts',
            'false_counts', 'half_true_counts', 'mostly_true_counts',
            'pants_fire_counts', 'context'
        ]
        
        # Load datasets
        train_df = pd.read_csv('data/raw/train.tsv', sep='\t', names=columns, header=None)
        test_df = pd.read_csv('data/raw/test.tsv', sep='\t', names=columns, header=None)
        valid_df = pd.read_csv('data/raw/valid.tsv', sep='\t', names=columns, header=None)
        
        # Add dataset split information
        train_df['split'] = 'train'
        test_df['split'] = 'test'
        valid_df['split'] = 'valid'
        
        # Combine all splits for comprehensive analysis
        full_df = pd.concat([train_df, test_df, valid_df], ignore_index=True)
        
        print("✅ Dataset loaded successfully!")
        print(f"📊 Training samples: {len(train_df)}")
        print(f"📊 Testing samples: {len(test_df)}")
        print(f"📊 Validation samples: {len(valid_df)}")
        print(f"📊 Total samples: {len(full_df)}")
        
        return train_df, test_df, valid_df, full_df
        
    except FileNotFoundError:
        print("❌ Dataset files not found!")
        print("Please ensure the following files are in data/raw/:")
        print("- train.tsv")
        print("- test.tsv") 
        print("- valid.tsv")
        return None, None, None, None

In [4]:
# Load the dataset
train_df, test_df, valid_df, full_df = load_liar_dataset()

❌ Dataset files not found!
Please ensure the following files are in data/raw/:
- train.tsv
- test.tsv
- valid.tsv


In [5]:
## 2. BASIC DATA EXPLORATION

def basic_data_exploration(df):
    """Perform basic exploration of the dataset"""
    print("\n=== BASIC DATA EXPLORATION ===")
    
    # Dataset shape
    print(f"📋 Dataset Shape: {df.shape}")
    print(f"📋 Features: {df.shape[1]}")
    print(f"📋 Samples: {df.shape[0]}")
    
    # Data types
    print("\n📊 Data Types:")
    print(df.dtypes)
    
    # Missing values
    print("\n🔍 Missing Values:")
    missing = df.isnull().sum()
    missing_pct = (missing / len(df)) * 100
    missing_df = pd.DataFrame({
        'Missing Count': missing,
        'Percentage': missing_pct
    })
    print(missing_df[missing_df['Missing Count'] > 0])
    
    # Label distribution
    print("\n🎯 Label Distribution:")
    label_counts = df['label'].value_counts()
    label_pct = df['label'].value_counts(normalize=True) * 100
    
    for label in label_counts.index:
        print(f"{label}: {label_counts[label]} ({label_pct[label]:.1f}%)")
    
    return missing_df, label_counts

# Perform basic exploration
if full_df is not None:
    missing_info, label_dist = basic_data_exploration(full_df)

In [6]:
## 4. COMPREHENSIVE VISUALIZATIONS

def create_comprehensive_visualizations(df):
    """Create comprehensive EDA visualizations"""
    print("\n=== CREATING VISUALIZATIONS ===")
    
    # Set up the plotting area
    fig = plt.figure(figsize=(20, 24))

In [7]:
try:
    # Print data inspection info
    print("All columns in DataFrame:", df.columns.tolist())
    print("\nFirst 5 rows:")
    print(df.head())
    
    # Find potential label columns
    possible_labels = [col for col in df.columns 
                      if 'label' in col.lower() 
                      or 'class' in col.lower() 
                      or 'category' in col.lower()
                      or 'target' in col.lower()]
    
    print("\nPossible label columns:", possible_labels)
    
    if not possible_labels:
        raise ValueError("No obvious label column found in the DataFrame")
    
    # Use the first found label column
    label_column = possible_labels[0]
    
    # 1. Label Distribution
    plt.figure(figsize=(10, 6))
    plt.subplot(4, 3, 1)
    label_counts = df[label_column].value_counts()
    colors = plt.cm.Set3(np.linspace(0, 1, len(label_counts)))
    bars = plt.bar(label_counts.index, label_counts.values, color=colors)
    plt.title(f'{label_column} Distribution', fontsize=14, fontweight='bold')
    plt.xlabel('Truth Labels')
    plt.ylabel('Count')
    plt.xticks(rotation=45)

    # Add value Labels on bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 50,
                f'{int(height)}', ha='center', va='bottom')
    
    plt.tight_layout()
    plt.show()

except NameError:
    print("Error: DataFrame 'df' is not defined. Please load your data first.")
except Exception as e:
    print(f"An error occurred: {str(e)}")

NameError: name 'df' is not defined

In [None]:
print("\nFirst 5 rows:")
print(df.head())

In [None]:
# Rename columns
new_columns = {
    '2635.json': 'id',
    'false': 'label',
    'Says the Annies List political group supports third-trimester abortions on demand.': 'claim',
    'abortion': 'topic',
    'dwayne-bohac': 'person',
    'State representative': 'position',
    'Texas': 'state',
    'republican': 'party',
    '@': 'source_1',
    '@.1': 'source_2',
    '@.2': 'source_3',
    '@.3': 'source_4',
    'a mailer': 'source_5'
}

df = df.rename(columns=new_columns)
print(df.columns)  # Verify changes

In [None]:
# Countplot for 'label' column
sns.countplot(data=df, x='label')
plt.title("Distribution of True/Fake Claims")
plt.show()

In [None]:
# Group by topic and label
topic_counts = df.groupby(['topic', 'label']).size().unstack()
topic_counts.plot(kind='bar', stacked=True)
plt.title("Claims by Topic and Veracity")
plt.ylabel("Count")
plt.show()

In [None]:
clean_df = df[['id', 'label', 'claim', 'topic', 'person']]
sns.barplot(data=clean_df, x='topic', y='label') 

In [None]:
new_columns = {
    'false': 'label',  # Assuming 'false' is the original column name for labels
}
df = df.rename(columns=new_columns)

In [None]:
# Plot label distribution
plt.figure(figsize=(8, 6))
sns.countplot(x="label", data=df, palette="Set2")

plt.title("Label Distribution", fontsize=14)
plt.xlabel("Label")
plt.ylabel("Count")
plt.show()

In [None]:
df.describe().to_csv("../results/0184_day1_stats_summary.txt", sep="\t")