In [1]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Add root directory to Python path
current_dir = os.path.dirname(os.getcwd())
root_dir = os.path.dirname(current_dir) if os.path.basename(current_dir) == "notebooks" else current_dir
sys.path.append(root_dir)


In [3]:
# Import custom modules after setting up paths
try:
    from src.preprocessing.text_processor import TextPreprocessor
    from src.preprocessing.metadata_processor import MetadataProcessor
    print("✓ Custom modules imported successfully")
except ImportError as e:
    print(f"✗ Module import error: {e}")
    print("Continuing with limited functionality")

✗ Module import error: No module named 'src.preprocessing.text_processor'
Continuing with limited functionality


In [4]:
#Set up plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Starting Data Loading and Preprocessing Pipeline")
print("=" * 60)

Starting Data Loading and Preprocessing Pipeline


In [5]:
def load_liar_dataset():
    """Load the LIAR dataset from TSV files with robust path handling."""
    try:
        # Get current directory and adjust paths
        current_dir = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
        
        # Navigate to correct data directory
        if "notebooks" in current_dir:
            data_dir = os.path.join(os.path.dirname(current_dir), "data", "raw")
        else:
            data_dir = os.path.join(current_dir, "data", "raw")
            
        print(f"Loading data from: {data_dir}")
        
        # Define file paths
        train_file = os.path.join(data_dir, "train.tsv")
        test_file = os.path.join(data_dir, "test.tsv")
        valid_file = os.path.join(data_dir, "valid.tsv")
        
        # Column names for LIAR dataset
        columns = [
            'label', 'statement', 'subject', 'speaker', 'speaker_job', 
            'state_info', 'party_affiliation', 'barely_true_counts', 
            'false_counts', 'half_true_counts', 'mostly_true_counts', 
            'pants_fire_counts', 'context'
        ]
        
        datasets = {}
        
        # Load training data
        train_df = pd.read_csv(train_file, sep='\t', header=None, names=columns)
        datasets['train'] = train_df
        print(f"✓ Loaded training data: {train_df.shape}")
            
        # Load test data
        test_df = pd.read_csv(test_file, sep='\t', header=None, names=columns)
        datasets['test'] = test_df
        print(f"✓ Loaded test data: {test_df.shape}")
            
        # Load validation data
        valid_df = pd.read_csv(valid_file, sep='\t', header=None, names=columns)
        datasets['valid'] = valid_df
        print(f"✓ Loaded validation data: {valid_df.shape}")
            
        return datasets
        
    except Exception as e:
        print(f"Error loading data: {e}")
        print("Creating sample data...")
        return create_sample_data()

In [6]:
def create_sample_data():
    """Create sample data with simplified structure"""
    sample_data = pd.DataFrame({
        'label': ['true', 'false', 'half-true'],
        'statement': [
            'The sky is blue',
            'Water boils at 50 degrees Celsius',
            'Humans have 5 senses'
        ],
        'subject': ['science', 'science', 'biology'],
        'speaker': ['John Doe', 'Jane Smith', 'Alan Turing'],
        'speaker_job': ['Scientist', 'Researcher', 'Mathematician'],
        'state_info': ['CA', 'NY', 'UK'],
        'party_affiliation': ['Independent', 'Democrat', 'Nonpartisan'],
        'barely_true_counts': [1, 3, 2],
        'false_counts': [0, 5, 1],
        'half_true_counts': [2, 1, 4],
        'mostly_true_counts': [4, 0, 3],
        'pants_fire_counts': [0, 2, 0],
        'context': ['Weather report', 'Science class', 'Biology lecture']
    })
    return {'train': sample_data, 'test': sample_data, 'valid': sample_data}

In [7]:
try:
    datasets = load_liar_dataset()
    print(f"Loaded datasets: {list(datasets.keys())}")
except Exception as e:
    print(f"Critical error: {e}")
    datasets = create_sample_data()

# ==========================================
# 2. INITIAL DATA EXPLORATION (SAFER)
# ==========================================
print("\n" + "=" * 60)
print("INITIAL DATA EXPLORATION")
print("=" * 60)

if 'train' in datasets:
    train_df = datasets['train']
    
    # ... (rest of your exploration code remains the same) ...
    # Remember to create '../results/figures/' directory if it doesn't exist
    os.makedirs('../results/figures/', exist_ok=True)
    
    # Save plot
    plt.tight_layout()
    plt.savefig('../results/figures/initial_label_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()
else:
    print("No training data available in datasets")

Loading data from: C:\Users\mayur\fake-news-detection\data\raw
✓ Loaded training data: (10240, 13)
✓ Loaded test data: (1267, 13)
✓ Loaded validation data: (1284, 13)
Loaded datasets: ['train', 'test', 'valid']

INITIAL DATA EXPLORATION


<Figure size 800x550 with 0 Axes>