# Data Preprocessing Pipeline

**COMP647 Assignment 02 - Student ID: 1163127**

This notebook implements comprehensive data preprocessing for Lending Club loan data analysis.

## 1. Import Libraries and Setup

In [None]:
# Essential data processing libraries
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning utilities
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# System and utility libraries
import warnings
import os
from pathlib import Path

# Configuration
# Install a global "ignore" filter so warnings do not clutter notebook output.
# NOTE(review): this also hides deprecation and numerical warnings — consider
# narrowing the filter if those matter for the analysis.
warnings.filterwarnings('ignore')
# Remove the column display limit so wide DataFrames are shown in full.
pd.set_option('display.max_columns', None)
# Apply matplotlib's 'default' style sheet to subsequent plots.
plt.style.use('default')

## 2. Data Loading Functions

In [None]:
def load_sample_data(sample_size='10000', data_path='../data/processed/'):
    """
    Load the accepted and rejected loan sample datasets.

    The files are expected to be named ``accepted_sample_<sample_size>.csv``
    and ``rejected_sample_<sample_size>.csv`` inside ``data_path``.

    Parameters:
    sample_size (str or int): Sample-size suffix used in the file names
        (e.g. '1000', '10000', '50000').
    data_path (str or path-like): Directory containing the sample CSV files.
        Defaults to '../data/processed/'.

    Returns:
    tuple: (accepted_df, rejected_df) on success. If either file is missing
        or cannot be read, the error is printed and (None, None) is returned,
        so callers should check for None before using the result.
    """
    # Build both file paths up front so the directory is configurable
    accepted_path = os.path.join(data_path, f'accepted_sample_{sample_size}.csv')
    rejected_path = os.path.join(data_path, f'rejected_sample_{sample_size}.csv')

    print(f"Loading sample datasets (size: {sample_size})...")

    try:
        # Load accepted loans dataset
        accepted_df = pd.read_csv(accepted_path)
        print(f"Accepted loans loaded: {accepted_df.shape[0]:,} rows, {accepted_df.shape[1]} columns")

        # Load rejected loans dataset
        rejected_df = pd.read_csv(rejected_path)
        print(f"Rejected loans loaded: {rejected_df.shape[0]:,} rows, {rejected_df.shape[1]} columns")
    except FileNotFoundError as e:
        print(f"Error loading files: {e}")
        print("Please ensure data files are in the correct directory")
        return None, None
    except Exception as e:
        # Deliberate best-effort boundary for interactive use: report any
        # other read/parse failure and signal it with (None, None).
        print(f"Unexpected error during data loading: {e}")
        return None, None

    print("Data loading completed successfully!")
    return accepted_df, rejected_df

## 3. Missing Value Analysis

In [None]:
def analyze_missing_values(df):
    """
    Summarise missing values per column and for the dataset as a whole.

    Prints a short report and returns the underlying numbers.

    Parameters:
    df (pd.DataFrame): Input dataframe

    Returns:
    dict: Missing value analysis results with keys
        'missing_summary' (pd.DataFrame of columns that have missing values,
            sorted by 'Missing_Percentage' descending),
        'total_missing_values' (int),
        'columns_with_missing' (int),
        'overall_missing_percentage' (float, 0.0 for a dataframe with no cells),
        'structural_missing_90plus', 'high_missing_50_90',
        'moderate_missing_10_50', 'low_missing_under_10' (int column counts
            per severity bucket).
    """
    print(f"Analyzing missing values for dataset with shape: {df.shape}")

    n_rows = len(df)
    total_cells = n_rows * len(df.columns)

    # Calculate missing values for each column
    missing_data = df.isnull().sum()
    if n_rows:
        missing_percentage = (missing_data / n_rows) * 100
    else:
        # No rows means nothing can be missing; avoid a 0/0 division
        missing_percentage = missing_data * 0.0

    # Create detailed missing data summary
    missing_summary = pd.DataFrame({
        'Column': missing_data.index,
        'Missing_Count': missing_data.values,
        'Missing_Percentage': missing_percentage.values,
        'Data_Type': df.dtypes.values,
        'Unique_Values': df.nunique().values,
    })

    # Keep only columns with missing values, worst first
    missing_summary = missing_summary[missing_summary['Missing_Count'] > 0]
    missing_summary = missing_summary.sort_values('Missing_Percentage', ascending=False)

    # Calculate summary statistics
    total_missing = int(missing_data.sum())
    columns_with_missing = len(missing_summary)
    # Guard against an empty dataframe (zero rows or zero columns)
    overall_missing_pct = (total_missing / total_cells) * 100 if total_cells else 0.0

    print(f"Columns with missing values: {columns_with_missing}")
    print(f"Total missing values: {total_missing:,}")
    print(f"Overall missing percentage: {overall_missing_pct:.2f}%")

    # Categorize missing values by severity
    pct = missing_summary['Missing_Percentage']
    analysis_results = {
        'missing_summary': missing_summary,
        'total_missing_values': total_missing,
        'columns_with_missing': columns_with_missing,
        'overall_missing_percentage': overall_missing_pct,
        'structural_missing_90plus': int((pct > 90).sum()),
        'high_missing_50_90': int(((pct > 50) & (pct <= 90)).sum()),
        'moderate_missing_10_50': int(((pct > 10) & (pct <= 50)).sum()),
        'low_missing_under_10': int((pct <= 10).sum()),
    }

    if columns_with_missing == 0:
        print("No missing values found in dataset")
        return analysis_results

    print("\nMissing value categorization:")
    print(f"  Structural missing (>90%): {analysis_results['structural_missing_90plus']} columns")
    print(f"  High missing (50-90%): {analysis_results['high_missing_50_90']} columns")
    print(f"  Moderate missing (10-50%): {analysis_results['moderate_missing_10_50']} columns")
    print(f"  Low missing (<10%): {analysis_results['low_missing_under_10']} columns")

    print("\nTop 10 columns with highest missing percentages:")
    for row in missing_summary.head(10).itertuples(index=False):
        print(f"  {row.Column}: {row.Missing_Percentage:.1f}% ({row.Missing_Count:,} values)")

    return analysis_results

## 4. Data Preprocessing Pipeline

In [None]:
def preprocess_lending_data(df):
    """
    Run the preprocessing steps for Lending Club data and report progress.

    The input is copied first, so the caller's dataframe is left untouched.
    Currently the pipeline drops exact duplicate rows and reports on missing
    values and data types; missing value treatment is still a placeholder.

    Parameters:
    df (pd.DataFrame): Raw lending data

    Returns:
    pd.DataFrame: Copy of the input with duplicate rows removed
    """
    print(f"Starting preprocessing pipeline for dataset: {df.shape}")

    # Work on a copy so the original data is never modified
    working = df.copy()

    # Step 1: report size and memory footprint
    print("Step 1: Basic data validation")
    print(f"  Original shape: {working.shape}")
    memory_mb = working.memory_usage(deep=True).sum() / 1024**2
    print(f"  Memory usage: {memory_mb:.2f} MB")

    # Step 2: drop exact duplicate rows and report how many went away
    print("Step 2: Duplicate removal")
    rows_before = len(working)
    working = working.drop_duplicates()
    print(f"  Duplicates removed: {rows_before - len(working):,}")

    # Step 3: summarise missing values on the de-duplicated data
    print("Step 3: Missing value analysis")
    missing_report = analyze_missing_values(working)

    # Step 4: treatment is not implemented yet; only report what would happen
    print("Step 4: Missing value treatment")
    has_missing = missing_report['columns_with_missing'] > 0
    treatment_note = (
        "  Missing value treatment will be implemented in subsequent steps"
        if has_missing
        else "  No missing values found - skipping treatment"
    )
    print(treatment_note)

    # Step 5: report the current dtype distribution ahead of optimization
    print("Step 5: Data type optimization")
    dtype_counts = working.dtypes.value_counts().to_dict()
    print(f"  Current data types: {dtype_counts}")

    print(f"Preprocessing pipeline completed. Final shape: {working.shape}")

    return working

## 5. Main Execution

*This section will be populated as we build the preprocessing pipeline incrementally.*