# Exploratory Data Analysis (EDA)

**COMP647 Assignment 02 - Student ID: 1163127**

This notebook performs comprehensive exploratory data analysis of the Lending Club loan dataset to understand data patterns, relationships, and characteristics that will inform research question development.

## 1. Import Libraries and Setup

In [None]:
# Essential data processing libraries
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Statistical analysis libraries
from scipy import stats
from scipy.stats import chi2_contingency, pearsonr, spearmanr
from sklearn.preprocessing import StandardScaler, LabelEncoder

# System utilities
import warnings
import os
from pathlib import Path

# Configuration for better visualization
# NOTE(review): filtering with 'ignore' and no category silences every warning
# for the rest of the session, including pandas/numpy runtime and deprecation
# warnings - consider narrowing this while debugging.
warnings.filterwarnings('ignore')
# Show every column when a DataFrame is displayed instead of truncating
pd.set_option('display.max_columns', None)
plt.style.use('default')
sns.set_palette("husl")

# Figure size configuration
# Session-wide defaults; individual figures below pass their own figsize
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

## 2. Data Loading and Initial Setup

In [None]:
def load_preprocessed_data(sample_size='10000', data_path='../data/processed/'):
    """
    Load the accepted/rejected sample datasets for exploratory data analysis.

    The accepted-loans sample is copied and de-duplicated (exact duplicate
    rows are dropped); no other preprocessing is applied here. The
    rejected-loans sample is returned exactly as read from disk.

    Parameters:
    sample_size (str or int): Sample size embedded in the file names
        'accepted_sample_<size>.csv' and 'rejected_sample_<size>.csv'
    data_path (str): Directory that contains the sample CSV files

    Returns:
    tuple: (deduplicated_accepted_df, raw_rejected_df), or (None, None)
        when either file cannot be loaded
    """
    print(f"Loading sample datasets for EDA (size: {sample_size})...")

    # Define file paths
    accepted_file = f'accepted_sample_{sample_size}.csv'
    rejected_file = f'rejected_sample_{sample_size}.csv'

    try:
        # Load datasets
        df_accepted = pd.read_csv(os.path.join(data_path, accepted_file))
        df_rejected = pd.read_csv(os.path.join(data_path, rejected_file))

        print(f"Raw accepted loans: {df_accepted.shape[0]:,} rows, {df_accepted.shape[1]} columns")
        print(f"Raw rejected loans: {df_rejected.shape[0]:,} rows, {df_rejected.shape[1]} columns")

        # For EDA, we'll work with the accepted loans dataset.
        # Work on a copy so the frame read from disk is left untouched.
        df_processed = df_accepted.copy()

        # Remove duplicate rows
        initial_rows = len(df_processed)
        df_processed = df_processed.drop_duplicates()
        duplicates_removed = initial_rows - len(df_processed)

        if duplicates_removed > 0:
            print(f"Removed {duplicates_removed:,} duplicate rows")

        print(f"Preprocessed accepted loans ready for EDA: {df_processed.shape}")
        print("Data loading completed successfully!")

        return df_processed, df_rejected

    except FileNotFoundError as e:
        print(f"Error loading files: {e}")
        return None, None
    except Exception as e:
        # Deliberately broad: callers test the result against None instead of
        # handling exceptions, so any load failure is reported and mapped to
        # the (None, None) sentinel.
        print(f"Unexpected error: {e}")
        return None, None

## 3. Dataset Overview and Structure Analysis

In [None]:
def analyze_dataset_structure(df):
    """
    Comprehensive analysis of dataset structure and characteristics.

    Builds a per-column profile (dtype, missing and unique counts and
    percentages, plus min/max/mean/std for numeric columns or the most
    common value for all other columns), derives column counts per quality
    bucket, and prints a summary.

    Frames with zero rows or zero columns are handled: percentages are
    reported as 0.0 instead of dividing by zero, and the returned column
    table is empty but still carries the base columns.

    Parameters:
    df (DataFrame): Dataset to analyze

    Returns:
    dict: {'dataset_info': dict, 'column_analysis': DataFrame,
           'quality_summary': dict}
    """
    n_rows = len(df)
    print(f"Analyzing dataset structure for {df.shape[0]:,} rows and {df.shape[1]} columns")

    def _pct(count):
        # Share of rows as a percentage; 0.0 for an empty frame
        return (count / n_rows) * 100 if n_rows else 0.0

    # Basic dataset information
    dataset_info = {
        'shape': df.shape,
        'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024**2,
        'total_missing': df.isnull().sum().sum(),
        'data_types': df.dtypes.value_counts().to_dict()
    }

    # Column analysis
    column_analysis = []
    numeric_flags = []  # parallel to column_analysis: True for numeric columns
    for col in df.columns:
        series = df[col]
        missing_count = series.isnull().sum()
        unique_count = series.nunique()

        # One numeric test shared by the per-column statistics and the quality
        # summary. It covers every numeric dtype (int8..int64, unsigned,
        # nullable Int64/Float64, ...). Booleans are kept on the categorical
        # path.
        is_numeric = (pd.api.types.is_numeric_dtype(series)
                      and not pd.api.types.is_bool_dtype(series))

        col_info = {
            'column': col,
            'dtype': str(series.dtype),
            'missing_count': missing_count,
            'missing_pct': _pct(missing_count),
            'unique_count': unique_count,
            'unique_pct': _pct(unique_count)
        }

        # Add data type specific analysis
        if is_numeric:
            col_info['min_value'] = series.min()
            col_info['max_value'] = series.max()
            col_info['mean_value'] = series.mean()
            col_info['std_value'] = series.std()
        else:
            # For categorical data, get most common values
            value_counts = series.value_counts()
            has_values = len(value_counts) > 0
            col_info['most_common'] = value_counts.index[0] if has_values else None
            col_info['most_common_count'] = value_counts.iloc[0] if has_values else 0

        column_analysis.append(col_info)
        numeric_flags.append(is_numeric)

    # Convert to DataFrame for easier analysis
    if column_analysis:
        columns_df = pd.DataFrame(column_analysis)
    else:
        # No columns at all: keep the base schema so callers can still filter
        columns_df = pd.DataFrame(columns=[
            'column', 'dtype', 'missing_count', 'missing_pct',
            'unique_count', 'unique_pct'
        ])

    # Data quality categorization (counted from the per-column records so the
    # result does not depend on the dtypes pandas infers for columns_df)
    profiled = list(zip(column_analysis, numeric_flags))
    quality_summary = {
        'high_quality_cols': sum(1 for info, _ in profiled if info['missing_pct'] < 5),
        'medium_quality_cols': sum(1 for info, _ in profiled if 5 <= info['missing_pct'] < 20),
        'low_quality_cols': sum(1 for info, _ in profiled if info['missing_pct'] >= 20),
        'unique_identifier_cols': sum(1 for info, _ in profiled if info['unique_pct'] > 95),
        'categorical_cols': sum(1 for info, numeric in profiled
                                if info['unique_count'] < 20 and not numeric),
        'numeric_cols': sum(1 for _, numeric in profiled if numeric)
    }

    # Display summary
    print(f"\nDataset Structure Summary:")
    print(f"  Shape: {dataset_info['shape']}")
    print(f"  Memory usage: {dataset_info['memory_usage_mb']:.2f} MB")
    print(f"  Total missing values: {dataset_info['total_missing']:,}")

    print(f"\nData Types Distribution:")
    for dtype, count in dataset_info['data_types'].items():
        print(f"  {dtype}: {count} columns")

    print(f"\nData Quality Assessment:")
    print(f"  High quality columns (<5% missing): {quality_summary['high_quality_cols']}")
    print(f"  Medium quality columns (5-20% missing): {quality_summary['medium_quality_cols']}")
    print(f"  Low quality columns (>20% missing): {quality_summary['low_quality_cols']}")
    print(f"  Potential identifier columns (>95% unique): {quality_summary['unique_identifier_cols']}")
    print(f"  Categorical columns: {quality_summary['categorical_cols']}")
    print(f"  Numeric columns: {quality_summary['numeric_cols']}")

    return {
        'dataset_info': dataset_info,
        'column_analysis': columns_df,
        'quality_summary': quality_summary
    }

## 4. Statistical Summary Analysis

In [None]:
def generate_statistical_summary(df, max_categorical=10):
    """
    Generate comprehensive statistical summaries for numeric and categorical variables.

    Numeric columns (those selected by np.number) that have at least one
    non-null value get count, mean, median, std, min, max, quartiles, IQR,
    skewness, kurtosis, coefficient of variation and zero counts.
    Non-numeric columns get frequency statistics; only the first
    `max_categorical` of them are summarized.

    Parameters:
    df (DataFrame): Dataset to analyze
    max_categorical (int or None): Maximum number of non-numeric columns to
        summarize, taken in column order; None summarizes all of them

    Returns:
    dict: {'numeric_summary': dict, 'categorical_summary': dict,
           'numeric_columns': list, 'categorical_columns': list}
    """
    print("Generating comprehensive statistical summaries...")

    # Identify numeric and categorical columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    categorical_cols = df.select_dtypes(exclude=[np.number]).columns

    print(f"Analyzing {len(numeric_cols)} numeric and {len(categorical_cols)} categorical columns")

    n_rows = len(df)

    # Numeric variables statistical summary
    numeric_summary = {}
    for col in numeric_cols:
        series = df[col]
        if series.notna().sum() == 0:  # Only if column has non-null values
            continue

        # Compute each statistic once and reuse it for the derived values
        mean_value = series.mean()
        std_value = series.std()
        q25 = series.quantile(0.25)
        q75 = series.quantile(0.75)
        zero_count = (series == 0).sum()

        numeric_summary[col] = {
            'count': series.count(),
            'mean': mean_value,
            'median': series.median(),
            'std': std_value,
            'min': series.min(),
            'max': series.max(),
            'q25': q25,
            'q75': q75,
            'skewness': series.skew(),
            'kurtosis': series.kurtosis(),
            'iqr': q75 - q25,
            # Coefficient of variation is undefined for a zero mean; report 0
            'cv': std_value / mean_value if mean_value != 0 else 0,
            'zeros': zero_count,
            # n_rows > 0 here because the column has a non-null value
            'zeros_pct': (zero_count / n_rows) * 100
        }

    # Categorical variables summary
    categorical_summary = {}
    if max_categorical is None:
        summarized_cols = categorical_cols
    else:
        summarized_cols = categorical_cols[:max_categorical]

    for col in summarized_cols:
        series = df[col]
        value_counts = series.value_counts()
        has_values = len(value_counts) > 0
        top_count = value_counts.iloc[0] if has_values else 0
        missing_count = series.isnull().sum()

        categorical_summary[col] = {
            'unique_count': series.nunique(),
            'most_common': value_counts.index[0] if has_values else None,
            'most_common_count': top_count,
            'most_common_pct': (top_count / n_rows) * 100 if has_values else 0,
            'top_5_values': value_counts.head().to_dict(),
            'missing_count': missing_count,
            'missing_pct': (missing_count / n_rows) * 100 if n_rows else 0.0
        }

    # Display key insights
    print(f"\nKey Numeric Variable Insights:")
    if numeric_summary:
        # Find highly skewed variables
        highly_skewed = [(col, col_stats['skewness'])
                         for col, col_stats in numeric_summary.items()
                         if abs(col_stats['skewness']) > 2]
        if highly_skewed:
            print(f"  Highly skewed variables (|skewness| > 2): {len(highly_skewed)}")
            for col, skew in highly_skewed[:5]:
                print(f"    {col}: {skew:.2f}")

        # Find high variability variables
        high_cv = [(col, col_stats['cv'])
                   for col, col_stats in numeric_summary.items()
                   if col_stats['cv'] > 1]
        if high_cv:
            print(f"  High variability variables (CV > 1): {len(high_cv)}")

    print(f"\nKey Categorical Variable Insights:")
    if categorical_summary:
        # Find highly concentrated categorical variables
        concentrated = [(col, col_stats['most_common_pct'])
                        for col, col_stats in categorical_summary.items()
                        if col_stats['most_common_pct'] > 90]
        if concentrated:
            print(f"  Highly concentrated variables (>90% in one category): {len(concentrated)}")

        # Find high cardinality categorical variables
        high_cardinality = [(col, col_stats['unique_count'])
                            for col, col_stats in categorical_summary.items()
                            if col_stats['unique_count'] > 50]
        if high_cardinality:
            print(f"  High cardinality variables (>50 categories): {len(high_cardinality)}")

    return {
        'numeric_summary': numeric_summary,
        'categorical_summary': categorical_summary,
        'numeric_columns': list(numeric_cols),
        'categorical_columns': list(categorical_cols)
    }

## 5. Data Visualization Framework

Comprehensive visualization functions for distribution analysis and pattern discovery.

In [None]:
def create_distribution_plots(df, columns, plot_type='histogram'):
    """
    Draw one distribution panel per numeric variable.

    At most the first six entries of `columns` are plotted, laid out on a
    grid of up to three panels per row. Before plotting, values outside the
    1.5 * IQR fences are dropped from each variable. Columns that are missing
    from `df` or hold no non-null values leave their panel empty.

    Parameters:
    df (DataFrame): Dataset containing the variables
    columns (list): List of column names to plot
    plot_type (str): Type of plot ('histogram', 'boxplot', 'violin')

    Returns:
    None: Displays plots
    """
    if not columns or len(columns) == 0:
        print("No columns provided for visualization")
        return

    # Cap the number of panels to keep the figure readable
    selected = columns[:6]
    n_selected = len(selected)

    print(f"Creating {plot_type} plots for {n_selected} variables...")

    # Grid geometry: up to three panels per row, rows by ceiling division
    grid_cols = min(3, n_selected)
    grid_rows = -(-n_selected // grid_cols)

    # squeeze=False always yields a 2-D array, so one flatten covers every layout
    fig, axes_grid = plt.subplots(grid_rows, grid_cols,
                                  figsize=(15, 5*grid_rows), squeeze=False)
    panels = axes_grid.ravel()

    for panel, col in zip(panels, selected):
        if col not in df.columns or df[col].notna().sum() == 0:
            continue

        # Trim values outside the 1.5 * IQR fences for a clearer picture
        values = df[col].dropna()
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = q3 - q1
        trimmed = values[values.between(q1 - 1.5 * spread, q3 + 1.5 * spread)]

        if plot_type == 'histogram':
            centre_mean = trimmed.mean()
            centre_median = trimmed.median()
            panel.hist(trimmed, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
            panel.axvline(centre_mean, color='red', linestyle='--',
                          label=f'Mean: {centre_mean:.2f}')
            panel.axvline(centre_median, color='green', linestyle='--',
                          label=f'Median: {centre_median:.2f}')
            panel.legend()
        elif plot_type == 'boxplot':
            panel.boxplot(trimmed)
        elif plot_type == 'violin':
            # seaborn draws the violin onto the matplotlib axis it is given
            sns.violinplot(y=trimmed, ax=panel)

        panel.set_title(f'{col}\n(n={len(trimmed):,}, outliers removed)', fontsize=10)
        panel.grid(True, alpha=0.3)

    # Panels beyond the requested variables are hidden
    for spare in panels[n_selected:]:
        spare.set_visible(False)

    plt.tight_layout()
    plt.show()

    print(f"Distribution plots completed for {n_selected} variables")

## 6. Main Execution - Load Data and Begin Analysis

In [None]:
# Load the sample datasets and report a quick overview of the accepted loans
print("=== LOADING DATA FOR EXPLORATORY DATA ANALYSIS ===")
df_loans, df_rejected = load_preprocessed_data(sample_size='10000')

if df_loans is None:
    # The loader signals failure by returning None for both frames
    print("Failed to load data - please check file paths and ensure sample files exist")
else:
    print(f"\nSuccessfully loaded loans dataset: {df_loans.shape}")

    # Headline facts: dimensions, memory footprint, missing cells
    memory_mb = df_loans.memory_usage(deep=True).sum() / 1024**2
    missing_cells = df_loans.isnull().sum().sum()
    print("\n=== DATASET BASIC INFORMATION ===")
    print(f"Shape: {df_loans.shape}")
    print(f"Memory usage: {memory_mb:.2f} MB")
    print(f"Missing values: {missing_cells:,}")

    # Numbered preview of the first ten column names
    print("\nSample columns (first 10):")
    for position, col in enumerate(df_loans.columns[:10], start=1):
        print(f"  {position:2d}. {col}")

    print("\nDataset loaded successfully - ready for EDA!")

In [None]:
# Profile the dataset structure and shortlist variables for the later analysis
if df_loans is None:
    # Nothing was loaded: leave empty shortlists so later cells can test them
    analysis_numeric_vars = []
    analysis_categorical_vars = []
else:
    print("=== DATASET STRUCTURE ANALYSIS ===")
    structure_analysis = analyze_dataset_structure(df_loans)

    # Per-column profile table produced by the structure analysis
    column_analysis = structure_analysis['column_analysis']

    print("\n=== TOP 10 HIGHEST QUALITY COLUMNS ===")
    high_quality_cols = column_analysis.sort_values('missing_pct').head(10)
    for _, row in high_quality_cols.iterrows():
        print(f"{row['column']:25} | {row['dtype']:10} | {row['missing_pct']:5.1f}% missing | {row['unique_count']:8,} unique")

    print("\n=== POTENTIAL ANALYSIS TARGETS ===")
    # Shortlists are picked on data quality, in the table's column order
    numeric_dtype_mask = column_analysis['dtype'].str.contains('int|float')

    # Numeric: under 10% missing and more than 1% distinct values (not constant)
    numeric_candidate_mask = (
        numeric_dtype_mask
        & (column_analysis['missing_pct'] < 10)
        & (column_analysis['unique_pct'] > 1)
    )
    good_numeric = column_analysis.loc[numeric_candidate_mask, 'column'].head(8).tolist()

    # Categorical: under 20% missing and between 2 and 49 distinct values
    categorical_candidate_mask = (
        ~numeric_dtype_mask
        & (column_analysis['missing_pct'] < 20)
        & (column_analysis['unique_count'] > 1)
        & (column_analysis['unique_count'] < 50)
    )
    good_categorical = column_analysis.loc[categorical_candidate_mask, 'column'].head(6).tolist()

    for heading, names in (
        ("Good numeric variables for analysis", good_numeric),
        ("\nGood categorical variables for analysis", good_categorical),
    ):
        print(f"{heading} ({len(names)}):")
        for col in names:
            print(f"  - {col}")

    # Store variables for later analysis
    analysis_numeric_vars = good_numeric
    analysis_categorical_vars = good_categorical

In [None]:
# Generate comprehensive statistical summaries
#
# The per-column dictionaries are bound to `col_stats` rather than `stats`:
# at notebook top level, assigning to `stats` would replace the `scipy.stats`
# module imported in the setup cell and break every later `stats.<function>`
# call.
if df_loans is not None and len(analysis_numeric_vars) > 0:
    print("=== STATISTICAL SUMMARY ANALYSIS ===")
    stats_results = generate_statistical_summary(df_loans)

    # Display detailed statistics for key numeric variables
    print(f"\n=== DETAILED NUMERIC STATISTICS ===")
    numeric_stats = stats_results['numeric_summary']

    # Create a summary table for the top numeric variables
    if numeric_stats:
        summary_data = []
        for col in analysis_numeric_vars[:6]:  # Top 6 numeric variables
            if col in numeric_stats:
                col_stats = numeric_stats[col]
                # Values are pre-formatted as strings so the table prints
                # with fixed precision
                summary_data.append({
                    'Variable': col,
                    'Count': f"{col_stats['count']:,}",
                    'Mean': f"{col_stats['mean']:.2f}",
                    'Median': f"{col_stats['median']:.2f}",
                    'Std': f"{col_stats['std']:.2f}",
                    'Skewness': f"{col_stats['skewness']:.2f}",
                    'CV': f"{col_stats['cv']:.2f}",
                    'Zeros%': f"{col_stats['zeros_pct']:.1f}%"
                })

        if summary_data:
            summary_df = pd.DataFrame(summary_data)
            print(summary_df.to_string(index=False))

    # Display categorical variable insights
    print(f"\n=== CATEGORICAL VARIABLES INSIGHTS ===")
    categorical_stats = stats_results['categorical_summary']

    # Columns absent from the categorical summary are skipped silently
    for col in analysis_categorical_vars[:4]:  # Top 4 categorical variables
        if col in categorical_stats:
            col_stats = categorical_stats[col]
            print(f"\n{col}:")
            print(f"  Unique values: {col_stats['unique_count']:,}")
            print(f"  Most common: '{col_stats['most_common']}' ({col_stats['most_common_pct']:.1f}%)")
            print(f"  Missing: {col_stats['missing_pct']:.1f}%")
            if 'top_5_values' in col_stats and col_stats['top_5_values']:
                print(f"  Top categories: {list(col_stats['top_5_values'].keys())[:3]}")
else:
    print("No suitable numeric variables found for statistical analysis")

In [None]:
            # NOTE(review): this cell contains only this indented fragment. `ax2` and
            # `value_counts` are not defined in the cell, and the leading indentation
            # has no enclosing block, so the cell cannot run on its own. It looks like
            # the tail of a plotting function whose beginning is missing - presumably
            # `create_categorical_plots`, which a later cell calls but which is not
            # defined in the visible part of this file. TODO: restore the full function.
            #
            # Pie chart for top categories
            # One colour per category, sampled evenly from the Set3 colormap
            colors = plt.cm.Set3(np.linspace(0, 1, len(value_counts)))
            # Labels longer than 10 characters are cut to 10 and suffixed with '...';
            # autopct prints each wedge's share with one decimal place
            wedges, texts, autotexts = ax2.pie(value_counts.values, 
                                             labels=[str(x)[:10] + '...' if len(str(x)) > 10 else str(x) 
                                                   for x in value_counts.index],
                                             autopct='%1.1f%%', colors=colors, startangle=90)

In [None]:
# Phase 1: Basic Distribution Visualizations Implementation
# Create distribution plots for key numeric variables identified from EDA
#
# The normal density is imported under a private alias so the overlay does not
# depend on the module-level name `stats`, which is easy to rebind by accident
# in a notebook session.
from scipy.stats import norm as _normal_dist

if df_loans is not None and len(analysis_numeric_vars) > 0:
    print("=== PHASE 1: BASIC DISTRIBUTION VISUALIZATIONS ==")

    # Select key variables for visualization based on business importance
    key_variables = [
        'loan_amnt', 'annual_inc', 'int_rate', 'installment',
        'dti', 'fico_range_low', 'fico_range_high', 'open_acc'
    ]

    # Filter to available variables. Only numeric columns are kept because the
    # quantile, histogram and box-plot code below requires numeric data.
    available_key_vars = [
        var for var in key_variables
        if var in df_loans.columns and pd.api.types.is_numeric_dtype(df_loans[var])
    ][:6]

    if available_key_vars:
        print(f"Creating distribution visualizations for: {available_key_vars}")

        # 1. Histogram distributions with statistical overlays
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        axes = axes.flatten()

        for i, var in enumerate(available_key_vars):
            if var in df_loans.columns and df_loans[var].notna().sum() > 0:
                # Clean data by removing outliers for better visualization
                data = df_loans[var].dropna()
                Q1, Q3 = data.quantile([0.25, 0.75])
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                clean_data = data[(data >= lower_bound) & (data <= upper_bound)]

                # Create histogram with overlays
                axes[i].hist(clean_data, bins=40, alpha=0.7, color='lightblue',
                             edgecolor='navy', density=True, label='Distribution')

                # Add statistical overlays
                mean_val = clean_data.mean()
                median_val = clean_data.median()
                std_val = clean_data.std()

                axes[i].axvline(mean_val, color='red', linestyle='--', linewidth=2,
                                label=f'Mean: {mean_val:.2f}')
                axes[i].axvline(median_val, color='green', linestyle='--', linewidth=2,
                                label=f'Median: {median_val:.2f}')

                # Add normal distribution overlay for comparison. Skipped when
                # the spread is zero or undefined (constant or single-value
                # data), where a normal density cannot be evaluated.
                if std_val > 0:
                    x = np.linspace(clean_data.min(), clean_data.max(), 100)
                    normal_dist = _normal_dist.pdf(x, mean_val, std_val)
                    axes[i].plot(x, normal_dist, 'orange', linewidth=2, alpha=0.8,
                                 label='Normal Fit')

                axes[i].set_title(f'{var}\nSkewness: {clean_data.skew():.2f}, '
                                  f'Outliers Removed: {len(data) - len(clean_data):,}',
                                  fontsize=11)
                axes[i].set_xlabel(var)
                axes[i].set_ylabel('Density')
                axes[i].legend(fontsize=9)
                axes[i].grid(True, alpha=0.3)

        # Hide empty subplots
        for i in range(len(available_key_vars), len(axes)):
            axes[i].set_visible(False)

        plt.tight_layout()
        plt.suptitle('Phase 1: Distribution Analysis of Key Loan Variables',
                     fontsize=16, y=1.02)
        plt.show()

        # 2. Box plot analysis for outlier detection
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        axes = axes.flatten()

        for i, var in enumerate(available_key_vars):
            if var in df_loans.columns and df_loans[var].notna().sum() > 0:
                # Unlike the histograms, box plots use the full data so the
                # outliers themselves are visible
                data = df_loans[var].dropna()

                # Create box plot
                box_parts = axes[i].boxplot(data, patch_artist=True,
                                            boxprops=dict(facecolor='lightblue', alpha=0.7),
                                            medianprops=dict(color='red', linewidth=2),
                                            flierprops=dict(marker='o', markerfacecolor='red',
                                                            markersize=4, alpha=0.5))

                # Add statistical annotations
                Q1, Q2, Q3 = data.quantile([0.25, 0.5, 0.75])
                IQR = Q3 - Q1
                outlier_count = len(data[(data < Q1 - 1.5*IQR) | (data > Q3 + 1.5*IQR)])
                outlier_pct = (outlier_count / len(data)) * 100

                axes[i].set_title(f'{var}\nOutliers: {outlier_count:,} ({outlier_pct:.1f}%)\n'
                                  f'IQR: {IQR:.2f}', fontsize=11)
                axes[i].set_ylabel('Value')
                axes[i].grid(True, alpha=0.3)

                # Add quartile labels
                axes[i].text(1.1, Q1, f'Q1: {Q1:.1f}', fontsize=9, ha='left')
                axes[i].text(1.1, Q2, f'Q2: {Q2:.1f}', fontsize=9, ha='left', color='red')
                axes[i].text(1.1, Q3, f'Q3: {Q3:.1f}', fontsize=9, ha='left')

        # Hide empty subplots
        for i in range(len(available_key_vars), len(axes)):
            axes[i].set_visible(False)

        plt.tight_layout()
        plt.suptitle('Phase 1: Box Plot Analysis - Outlier Detection',
                     fontsize=16, y=1.02)
        plt.show()

        # 3. Distribution insights summary
        print(f"\n=== PHASE 1 INSIGHTS ===")
        for var in available_key_vars[:3]:  # Top 3 variables
            if var in df_loans.columns and df_loans[var].notna().sum() > 0:
                data = df_loans[var].dropna()
                skewness = data.skew()
                # NOTE(review): pandas reports excess kurtosis (a normal
                # distribution scores 0), so the "> 3" cut-off below is a strict
                # heavy-tail threshold - confirm this is the intended scale.
                kurtosis = data.kurtosis()
                cv = data.std() / data.mean() if data.mean() != 0 else 0

                print(f"\n{var.upper()}:")
                print(f"  Distribution: {'Right-skewed' if skewness > 1 else 'Left-skewed' if skewness < -1 else 'Approximately normal'}")
                print(f"  Variability: {'High' if cv > 1 else 'Moderate' if cv > 0.5 else 'Low'} (CV: {cv:.2f})")
                print(f"  Tail behavior: {'Heavy-tailed' if kurtosis > 3 else 'Light-tailed' if kurtosis < -1 else 'Normal-tailed'}")
                # Grade on the magnitude of the skew so strong left skew is
                # treated the same as strong right skew
                print(f"  Data quality: {'Excellent' if abs(skewness) < 2 and cv < 1 else 'Good' if abs(skewness) < 3 else 'Needs transformation'}")

        print(f"\nPhase 1 completed: {len(available_key_vars)} variables analyzed with distribution visualizations")
    else:
        print("No suitable variables found for Phase 1 visualization")
else:
    print("Dataset not available for Phase 1 visualization")

In [None]:
# Plot the shortlisted categorical variables and print a per-variable summary
if df_loans is None or len(analysis_categorical_vars) == 0:
    print("No suitable categorical variables available for visualization")
else:
    print("=== CATEGORICAL DISTRIBUTION VISUALIZATION ===")
    create_categorical_plots(df_loans, analysis_categorical_vars[:4])

    print("\n=== CATEGORICAL VARIABLES SUMMARY ===")
    for col in analysis_categorical_vars[:4]:
        if col not in df_loans.columns:
            continue

        unique_count = df_loans[col].nunique()
        missing_pct = (df_loans[col].isnull().sum() / len(df_loans)) * 100

        # Dominant category: first mode, or 'N/A' when no mode exists
        modes = df_loans[col].mode()
        most_common = modes[0] if len(modes) > 0 else 'N/A'

        # Share of all rows held by the most frequent category
        if unique_count > 0:
            most_common_pct = (df_loans[col].value_counts().iloc[0] / len(df_loans)) * 100
        else:
            most_common_pct = 0

        # Suitability grades: few categories with little missing data is best
        if 2 <= unique_count <= 20 and missing_pct < 15:
            suitability = 'High'
        elif unique_count <= 50:
            suitability = 'Moderate'
        else:
            suitability = 'Low'

        print(f"\n{col}:")
        print(f"  Categories: {unique_count:,}")
        print(f"  Missing: {missing_pct:.1f}%")
        print(f"  Dominant category: '{most_common}' ({most_common_pct:.1f}%)")
        print(f"  Analysis suitability: {suitability}")

## 7. Correlation Analysis and Relationships

Comprehensive correlation analysis to identify relationships between variables and potential patterns for research question development.

In [None]:
def analyze_correlations(df, numeric_columns, method='pearson', threshold=0.3):
    """
    Compute a correlation matrix and report the variable pairs that stand out.

    Only the requested columns that exist in `df` are used, capped at the
    first 15, and non-numeric columns among them are dropped. Every pair whose
    absolute correlation reaches `threshold` is recorded with a strength label
    and a direction, then the pairs are ranked by absolute correlation.

    Parameters:
    df (DataFrame): Dataset for analysis
    numeric_columns (list): List of numeric columns to analyze
    method (str): Correlation method ('pearson', 'spearman', 'kendall');
        any other value is treated as 'kendall'
    threshold (float): Minimum correlation strength to report

    Returns:
    dict: Keys 'correlation_matrix', 'strong_correlations',
        'analysis_columns' and 'method'; an empty dict when fewer than two
        column names are supplied
    """
    print(f"Performing {method} correlation analysis on {len(numeric_columns)} variables...")

    if len(numeric_columns) < 2:
        print("Need at least 2 numeric variables for correlation analysis")
        return {}

    # Keep requested columns that exist, capped at 15 for a manageable matrix
    analysis_cols = [name for name in numeric_columns if name in df.columns][:15]

    print(f"Analyzing correlations for {len(analysis_cols)} numeric variables")

    # Anything that is not 'pearson' or 'spearman' falls back to 'kendall'
    numeric_frame = df[analysis_cols].select_dtypes(include=[np.number])
    corr_method = method if method in ('pearson', 'spearman') else 'kendall'
    corr_matrix = numeric_frame.corr(method=corr_method)

    def _strength_label(magnitude):
        # Map an absolute correlation onto a descriptive band
        if magnitude >= 0.8:
            return 'Very Strong'
        if magnitude >= 0.6:
            return 'Strong'
        if magnitude >= 0.4:
            return 'Moderate'
        return 'Weak'

    # Walk the strict upper triangle so each pair is visited once and the
    # diagonal (self-correlation) is skipped
    labels = corr_matrix.columns
    upper_rows, upper_cols = np.triu_indices(len(labels), k=1)
    pair_records = []
    for row_pos, col_pos in zip(upper_rows, upper_cols):
        coefficient = corr_matrix.iloc[row_pos, col_pos]
        if pd.isna(coefficient) or abs(coefficient) < threshold:
            continue
        pair_records.append({
            'variable_1': labels[row_pos],
            'variable_2': labels[col_pos],
            'correlation': coefficient,
            'strength': _strength_label(abs(coefficient)),
            'direction': 'Positive' if coefficient > 0 else 'Negative'
        })

    # Rank pairs from strongest to weakest absolute correlation
    strong_correlations = sorted(pair_records,
                                 key=lambda record: abs(record['correlation']),
                                 reverse=True)

    # Report
    print("\n=== CORRELATION ANALYSIS RESULTS ===")
    print(f"Method: {method.capitalize()}")
    print(f"Strong correlations found (|r| >= {threshold}): {len(strong_correlations)}")

    if strong_correlations:
        print("\nTop 10 strongest correlations:")
        for rank, record in enumerate(strong_correlations[:10], start=1):
            print(f"{rank:2d}. {record['variable_1']} vs {record['variable_2']}")
            print(f"     Correlation: {record['correlation']:.3f} ({record['strength']}, {record['direction']})")

    # Pairs at or above 0.8 are flagged as multicollinearity concerns
    collinear_pairs = [record for record in strong_correlations
                       if abs(record['correlation']) >= 0.8]
    if collinear_pairs:
        print(f"\nMulticollinearity concerns (|r| >= 0.8): {len(collinear_pairs)}")
        for record in collinear_pairs:
            print(f"  - {record['variable_1']} vs {record['variable_2']}: {record['correlation']:.3f}")

    return {
        'correlation_matrix': corr_matrix,
        'strong_correlations': strong_correlations,
        'analysis_columns': analysis_cols,
        'method': method
    }

def create_correlation_heatmap(corr_matrix, title_suffix=''):
    """
    Draw an annotated heatmap of a correlation matrix, lower triangle only.

    Parameters:
    corr_matrix (DataFrame): Correlation matrix to visualize
    title_suffix (str): Text appended to the first line of the plot title

    Returns:
    None: The figure is displayed; an empty matrix only prints a notice
    """
    # Guard clause: nothing to draw for an empty matrix
    if corr_matrix.empty:
        print("No correlation matrix provided for visualization")
        return None

    n_variables = corr_matrix.shape[0]
    print(f"Creating correlation heatmap for {n_variables} variables...")

    plt.figure(figsize=(12, 10))

    # Mask the upper triangle (diagonal included) so only the lower triangle is drawn
    upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))

    heatmap_style = dict(
        annot=True,
        cmap='RdBu_r',
        center=0,
        fmt='.2f',
        square=True,
        linewidths=0.5,
        cbar_kws={"shrink": .8},
        annot_kws={'size': 8},
    )
    sns.heatmap(corr_matrix, mask=upper_triangle, **heatmap_style)

    # Title and axis labels
    title_text = f'Correlation Matrix Heatmap {title_suffix}\n(Lower Triangle Only)'
    plt.title(title_text, fontsize=14, fontweight='bold', pad=20)
    for set_axis_label in (plt.xlabel, plt.ylabel):
        set_axis_label('Variables', fontsize=12)

    # Slanted x tick labels stay readable when variable names are long
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)

    plt.tight_layout()
    plt.show()

    print("Correlation heatmap visualization completed")

def identify_research_opportunities(strong_correlations, distribution_insights=None):
    """
    Turn a list of correlation findings into candidate research questions.

    Parameters:
    strong_correlations (list): Dicts with 'variable_1', 'variable_2' and
        'correlation' keys describing each reported relationship
    distribution_insights (dict): Distribution analysis results (accepted but
        not used by this function)

    Returns:
    list: One dict per opportunity with 'category', 'question', 'rationale'
        and 'analysis_approach' keys; empty when no correlations are supplied
    """
    print("Identifying research opportunities from correlation patterns...")

    if not strong_correlations:
        print("No strong correlations found - limited research opportunities")
        return []

    # Bucket every pair in a single pass; a pair may land in more than one bucket
    multicollinear_pairs, positive_pairs, negative_pairs = [], [], []
    for pair in strong_correlations:
        r_value = pair['correlation']
        if abs(r_value) >= 0.8:
            multicollinear_pairs.append(pair)
        if r_value >= 0.6:
            positive_pairs.append(pair)
        if r_value <= -0.6:
            negative_pairs.append(pair)

    opportunities = []

    # Each non-empty bucket contributes one question built from its first pair
    if multicollinear_pairs:
        lead = multicollinear_pairs[0]
        opportunities.append({
            'category': 'Multicollinearity Investigation',
            'question': f'Are {lead["variable_1"]} and {lead["variable_2"]} measuring similar underlying factors?',
            'rationale': f'Very high correlation ({lead["correlation"]:.3f}) suggests potential redundancy or shared causation',
            'analysis_approach': 'Principal component analysis, factor analysis, or variable selection techniques'
        })

    if positive_pairs:
        lead = positive_pairs[0]
        opportunities.append({
            'category': 'Positive Relationship Analysis',
            'question': f'How does {lead["variable_1"]} influence {lead["variable_2"]} in lending decisions?',
            'rationale': f'Strong positive correlation ({lead["correlation"]:.3f}) suggests mutual reinforcement or causal relationship',
            'analysis_approach': 'Regression analysis, causal inference, or predictive modeling'
        })

    if negative_pairs:
        lead = negative_pairs[0]
        opportunities.append({
            'category': 'Inverse Relationship Analysis', 
            'question': f'Why do {lead["variable_1"]} and {lead["variable_2"]} show opposing patterns?',
            'rationale': f'Strong negative correlation ({lead["correlation"]:.3f}) indicates trade-off or substitution effects',
            'analysis_approach': 'Risk-return analysis, segmentation analysis, or behavioral studies'
        })

    # A generic modeling question is added once enough relationships exist
    n_relationships = len(strong_correlations)
    if n_relationships >= 5:
        opportunities.append({
            'category': 'Predictive Modeling',
            'question': 'Which combination of correlated variables best predicts loan outcomes?',
            'rationale': f'Multiple strong correlations ({n_relationships}) provide rich feature set for prediction',
            'analysis_approach': 'Machine learning models, feature selection, cross-validation'
        })

    # Report what was found
    print(f"\n=== RESEARCH OPPORTUNITIES IDENTIFIED ===")
    print(f"Total opportunities: {len(opportunities)}")

    for rank, item in enumerate(opportunities, start=1):
        print(f"\n{rank}. {item['category']}:")
        print(f"   Question: {item['question']}")
        print(f"   Rationale: {item['rationale']}")
        print(f"   Approach: {item['analysis_approach']}")

    return opportunities

# Cell-completion marker: printed once the definitions in this cell have been executed
print("Correlation analysis and research opportunity identification functions defined successfully")

In [None]:
# Phase 2: Correlation Analysis Visualizations Implementation
#
# Runs a Pearson correlation analysis over the numeric analysis variables and
# renders four views of the result:
#   1. lower-triangle correlation heatmap
#   2. distribution of absolute correlation strengths
#   3. scatter plots with fitted trend lines for the strongest pairs
#   4. Pearson vs Spearman comparison on a shared subset of variables
PHASE2_THRESHOLD = 0.3        # minimum |r| reported as a strong correlation
PHASE2_MAX_POINTS = 2000      # cap on points per scatter plot (limits overplotting)
PHASE2_COMPARISON_VARS = 10   # number of leading variables used for the Spearman run

if df_loans is not None and len(analysis_numeric_vars) >= 2:
    print("=== PHASE 2: CORRELATION ANALYSIS VISUALIZATIONS ==")
    
    # Perform enhanced correlation analysis with visualizations
    correlation_results = analyze_correlations(
        df_loans, 
        analysis_numeric_vars, 
        method='pearson', 
        threshold=PHASE2_THRESHOLD
    )
    
    if 'correlation_matrix' in correlation_results:
        corr_matrix = correlation_results['correlation_matrix']
        
        # 1. Enhanced correlation heatmap with better styling
        plt.figure(figsize=(14, 12))
        
        # Mask the upper triangle (diagonal included) so each pair is drawn once
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        
        # Custom colormap
        cmap = sns.diverging_palette(230, 20, as_cmap=True)
        
        sns.heatmap(
            corr_matrix,
            mask=mask,
            annot=True,
            cmap=cmap,
            center=0,
            fmt='.2f',
            square=True,
            linewidths=0.8,
            cbar_kws={"shrink": .8, "label": "Correlation Coefficient"},
            annot_kws={'size': 9, 'weight': 'bold'}
        )
        
        plt.title('Phase 2: Pearson Correlation Matrix\n(Lower Triangle Only)', 
                 fontsize=16, fontweight='bold', pad=20)
        plt.xlabel('Variables', fontsize=12, fontweight='bold')
        plt.ylabel('Variables', fontsize=12, fontweight='bold')
        plt.xticks(rotation=45, ha='right')
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.show()
        
        # 2. Correlation strength distribution
        # Take each pair once (strictly above the diagonal, so no self-correlations)
        # and drop pairs whose coefficient is NaN.
        corr_array = corr_matrix.to_numpy(dtype=float)
        pair_values = np.abs(corr_array[np.triu_indices_from(corr_array, k=1)])
        corr_values = pair_values[~np.isnan(pair_values)]
        
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
        
        # Histogram of correlation strengths
        ax1.hist(corr_values, bins=20, alpha=0.7, color='skyblue', edgecolor='navy')
        if corr_values.size:
            # The mean is undefined when every pair was NaN, so only draw it when data exists
            mean_corr = corr_values.mean()
            ax1.axvline(mean_corr, color='red', linestyle='--', linewidth=2, 
                       label=f'Mean: {mean_corr:.3f}')
        ax1.axvline(PHASE2_THRESHOLD, color='orange', linestyle='--', linewidth=2, 
                   label=f'Threshold: {PHASE2_THRESHOLD}')
        ax1.set_title('Distribution of Absolute Correlation Coefficients', fontweight='bold')
        ax1.set_xlabel('Absolute Correlation')
        ax1.set_ylabel('Frequency')
        ax1.legend()
        ax1.grid(True, alpha=0.3)
        
        # Correlation strength categories (fixed interpretive bands, independent of the threshold)
        categories = ['Weak\n(0-0.3)', 'Moderate\n(0.3-0.6)', 'Strong\n(0.6-0.8)', 'Very Strong\n(0.8-1.0)']
        counts = [
            int((corr_values < 0.3).sum()),
            int(((corr_values >= 0.3) & (corr_values < 0.6)).sum()),
            int(((corr_values >= 0.6) & (corr_values < 0.8)).sum()),
            int((corr_values >= 0.8).sum())
        ]
        
        colors = ['lightgray', 'lightblue', 'orange', 'red']
        bars = ax2.bar(categories, counts, color=colors, alpha=0.7, edgecolor='navy')
        
        # Add value labels on bars
        for bar, count in zip(bars, counts):
            height = bar.get_height()
            ax2.text(bar.get_x() + bar.get_width()/2., height + 0.1,
                    f'{count}', ha='center', va='bottom', fontweight='bold')
        
        ax2.set_title('Correlation Strength Categories', fontweight='bold')
        ax2.set_ylabel('Number of Variable Pairs')
        ax2.grid(True, alpha=0.3, axis='y')
        
        plt.suptitle('Phase 2: Correlation Analysis Summary', fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.show()
        
        # 3. Top correlations scatter plots
        strong_corrs = correlation_results.get('strong_correlations', [])
        if strong_corrs:
            print("\n=== TOP CORRELATION RELATIONSHIPS ===")
            
            # Select top 6 correlations for scatter plots (one per subplot)
            top_corrs = strong_corrs[:6]
            
            fig, axes = plt.subplots(2, 3, figsize=(18, 12))
            axes = axes.flatten()
            
            for i, (ax, corr_info) in enumerate(zip(axes, top_corrs)):
                var1 = corr_info['variable_1']
                var2 = corr_info['variable_2']
                corr_val = corr_info['correlation']
                
                if var1 not in df_loans.columns or var2 not in df_loans.columns:
                    ax.set_visible(False)
                    continue
                
                pair_data = df_loans[[var1, var2]].dropna()
                if pair_data.empty:
                    ax.set_visible(False)
                    continue
                
                # Sample data for plotting (to avoid overplotting). The cap must use
                # the row count AFTER dropna: asking for more rows than remain
                # makes DataFrame.sample raise ValueError.
                sample_data = pair_data.sample(
                    min(PHASE2_MAX_POINTS, len(pair_data)), random_state=42)
                
                # Create scatter plot
                ax.scatter(sample_data[var1], sample_data[var2], 
                          alpha=0.5, s=20, c='lightblue', edgecolors='navy')
                
                # Add trend line; a linear fit needs at least two distinct x values
                if sample_data[var1].nunique() > 1:
                    z = np.polyfit(sample_data[var1], sample_data[var2], 1)
                    p = np.poly1d(z)
                    x_trend = np.linspace(sample_data[var1].min(), sample_data[var1].max(), 100)
                    ax.plot(x_trend, p(x_trend), "r--", alpha=0.8, linewidth=2)
                
                # Formatting
                ax.set_title(f'{var1} vs {var2}\nCorr: {corr_val:.3f} ({corr_info["strength"]})', 
                            fontsize=11, fontweight='bold')
                ax.set_xlabel(var1)
                ax.set_ylabel(var2)
                ax.grid(True, alpha=0.3)
                
                print(f"  {i+1}. {var1} ↔ {var2}: {corr_val:.3f} ({corr_info['direction']} {corr_info['strength']})")
            
            # Hide empty subplots
            for ax in axes[len(top_corrs):]:
                ax.set_visible(False)
            
            plt.suptitle('Phase 2: Top Correlation Relationships - Scatter Plots', 
                        fontsize=16, fontweight='bold')
            plt.tight_layout()
            plt.show()
        
        # 4. Spearman vs Pearson comparison
        print("\n=== SPEARMAN VS PEARSON COMPARISON ===")
        comparison_vars = analysis_numeric_vars[:PHASE2_COMPARISON_VARS]
        spearman_results = analyze_correlations(
            df_loans, 
            comparison_vars, 
            method='spearman', 
            threshold=PHASE2_THRESHOLD
        )
        
        if spearman_results:
            # Count Pearson pairs only among the variables Spearman was run on;
            # comparing counts over different variable sets would be meaningless.
            # NOTE(review): assumes a pair's Pearson value does not depend on which
            # other columns were passed to analyze_correlations - confirm.
            comparison_set = set(comparison_vars)
            pearson_count = sum(
                1 for c in strong_corrs
                if c['variable_1'] in comparison_set and c['variable_2'] in comparison_set
            )
            spearman_count = len(spearman_results.get('strong_correlations', []))
            
            comparison_data = ['Pearson', 'Spearman']
            comparison_counts = [pearson_count, spearman_count]
            
            plt.figure(figsize=(10, 6))
            bars = plt.bar(comparison_data, comparison_counts, 
                          color=['lightblue', 'lightcoral'], alpha=0.7, edgecolor='navy')
            
            # Add value labels
            for bar, count in zip(bars, comparison_counts):
                height = bar.get_height()
                plt.text(bar.get_x() + bar.get_width()/2., height + 0.5,
                        f'{count}', ha='center', va='bottom', fontsize=14, fontweight='bold')
            
            plt.title('Correlation Methods Comparison\n(Strong Correlations Found)', 
                     fontsize=14, fontweight='bold')
            plt.ylabel(f'Number of Strong Correlations (|r| ≥ {PHASE2_THRESHOLD})')
            plt.grid(True, alpha=0.3, axis='y')
            
            # Add interpretation: pick the note and its box colour, then draw once
            if spearman_count > pearson_count:
                note = f'Non-linear relationships detected\n({spearman_count - pearson_count} additional)'
                note_color = "yellow"
            elif pearson_count > spearman_count:
                note = 'Linear relationships dominate'
                note_color = "lightgreen"
            else:
                note = 'Similar linear/non-linear patterns'
                note_color = "lightgray"
            plt.text(0.5, max(comparison_counts) * 0.8, note,
                    ha='center', fontsize=12,
                    bbox=dict(boxstyle="round,pad=0.3", facecolor=note_color, alpha=0.7))
            
            plt.tight_layout()
            plt.show()
            
            print(f"  Compared on the first {len(comparison_vars)} analysis variables")
            print(f"  Pearson correlations (linear): {pearson_count}")
            print(f"  Spearman correlations (monotonic): {spearman_count}")
            print(f"  Difference: {abs(spearman_count - pearson_count)} additional relationships")
        
        print(f"\nPhase 2 completed: Correlation analysis with {len(corr_matrix.columns)} variables")
    else:
        print("Correlation matrix not available for Phase 2 visualization")
else:
    print("Insufficient numeric variables for Phase 2 correlation analysis")

In [None]:
# Additional correlation analysis with Spearman method for comparison.
# Spearman is run on the first 10 analysis variables only, so the Pearson count
# it is compared against is restricted to pairs drawn from those same variables.
if df_loans is not None and len(analysis_numeric_vars) >= 2:
    print("=== SPEARMAN CORRELATION COMPARISON ===")
    
    # Spearman correlation for non-linear relationships
    comparison_vars = analysis_numeric_vars[:10]
    spearman_results = analyze_correlations(
        df_loans, 
        comparison_vars, 
        method='spearman', 
        threshold=0.3
    )
    
    # Compare Pearson vs Spearman results (the Pearson cell may not have been run)
    if 'correlation_results' in locals() and spearman_results:
        comparison_set = set(comparison_vars)
        # Only Pearson pairs whose two variables were both part of the Spearman run
        # are counted; otherwise the two counts would cover different variable sets.
        pearson_strong = sum(
            1 for c in correlation_results.get('strong_correlations', [])
            if c['variable_1'] in comparison_set and c['variable_2'] in comparison_set
        )
        spearman_strong = len(spearman_results.get('strong_correlations', []))
        
        print(f"\nCorrelation Method Comparison:")
        print(f"  Variables compared: {len(comparison_vars)}")
        print(f"  Pearson strong correlations: {pearson_strong}")
        print(f"  Spearman strong correlations: {spearman_strong}")
        
        if spearman_strong > pearson_strong:
            print(f"  Insight: More non-linear relationships detected ({spearman_strong - pearson_strong} additional)")
        elif pearson_strong > spearman_strong:
            print(f"  Insight: Primarily linear relationships dominate")
        else:
            print(f"  Insight: Similar linear and non-linear relationship patterns")
else:
    print("Skipping Spearman correlation comparison")

## 8. Comprehensive EDA Summary and Insights

This section synthesizes the exploratory data analysis findings (dataset structure, summary statistics, correlations, and distributions) into actionable insights for research question development.

In [None]:
def generate_eda_summary(df, structure_analysis, stats_results, correlation_results, distribution_results):
    """
    Generate comprehensive EDA summary with key insights and recommendations.
    
    This function synthesizes all analysis results into actionable insights
    for research question development and further analysis directions. Any of
    the result arguments may be empty or None; the matching summary section is
    then left empty instead of raising.
    
    Parameters:
    df (DataFrame): Original dataset (may be None)
    structure_analysis (dict): Dataset structure analysis results; the
        'quality_summary' and 'dataset_info' entries are read
    stats_results (dict): Statistical summary results; 'numeric_summary' maps
        each variable to a dict providing 'skewness' and 'cv'
    correlation_results (dict): Correlation analysis results; the
        'strong_correlations' list of dicts with a 'correlation' value is read
    distribution_results (dict): Distribution analysis results keyed by
        variable; 'data_quality' and 'outlier_percentage' are read
    
    Returns:
    dict: Comprehensive summary with the keys 'dataset_overview',
        'data_quality_assessment', 'key_findings', 'research_recommendations'
        and 'analysis_readiness'. 'overall_completeness' and
        'data_quality_score' are omitted from the quality assessment when the
        dataset has no cells, since a missing-value percentage is undefined.
    """
    print("Generating comprehensive EDA summary and insights...")
    
    summary = {
        'dataset_overview': {},
        'data_quality_assessment': {},
        'key_findings': [],
        'research_recommendations': [],
        'analysis_readiness': {}
    }
    
    # Dataset Overview
    total_cells = 0  # rows x columns; stays 0 when no dataset is supplied
    if df is not None:
        n_records = len(df)
        n_variables = len(df.columns)
        total_cells = n_records * n_variables
        summary['dataset_overview'] = {
            'total_records': n_records,
            'total_variables': n_variables,
            'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024**2,
            'analysis_period': 'Sample data for development'
        }
    
    # Data Quality Assessment
    if structure_analysis:
        quality_summary = structure_analysis.get('quality_summary', {})
        dataset_info = structure_analysis.get('dataset_info', {})
        
        # The missing percentage needs a non-zero cell count; with a missing or
        # empty dataset the completeness figures are left out instead of
        # dividing by zero.
        completeness_part = {}
        score_part = {}
        if total_cells > 0:
            total_missing_pct = (dataset_info.get('total_missing', 0) / total_cells) * 100
            completeness_part = {'overall_completeness': 100 - total_missing_pct}
            score_part = {
                'data_quality_score': 'Excellent' if total_missing_pct < 5 else 
                                     'Good' if total_missing_pct < 15 else 
                                     'Fair' if total_missing_pct < 30 else 'Poor'
            }
        
        summary['data_quality_assessment'] = {
            **completeness_part,
            'high_quality_variables': quality_summary.get('high_quality_cols', 0),
            'medium_quality_variables': quality_summary.get('medium_quality_cols', 0),
            'low_quality_variables': quality_summary.get('low_quality_cols', 0),
            'numeric_variables': quality_summary.get('numeric_cols', 0),
            'categorical_variables': quality_summary.get('categorical_cols', 0),
            **score_part
        }
    
    # Key Statistical Findings
    if stats_results:
        numeric_stats = stats_results.get('numeric_summary', {})
        if numeric_stats:
            # Identify highly skewed variables (|skewness| > 2)
            highly_skewed = [(var, var_stats['skewness']) 
                             for var, var_stats in numeric_stats.items() 
                             if abs(var_stats['skewness']) > 2]
            
            # Identify high variability variables (coefficient of variation > 1)
            high_variability = [(var, var_stats['cv']) 
                                for var, var_stats in numeric_stats.items() 
                                if var_stats['cv'] > 1]
            
            summary['key_findings'].extend([
                f"Identified {len(highly_skewed)} highly skewed variables requiring transformation",
                f"Found {len(high_variability)} variables with high coefficient of variation (>1.0)",
                f"Statistical analysis completed for {len(numeric_stats)} numeric variables"
            ])
    
    # Correlation Findings
    if correlation_results:
        strong_corrs = correlation_results.get('strong_correlations', [])
        if strong_corrs:
            very_strong = [c for c in strong_corrs if abs(c['correlation']) >= 0.8]
            moderate_strong = [c for c in strong_corrs if 0.4 <= abs(c['correlation']) < 0.8]
            
            summary['key_findings'].extend([
                f"Discovered {len(strong_corrs)} significant variable relationships (|r| >= 0.3)",
                f"Identified {len(very_strong)} potential multicollinearity concerns (|r| >= 0.8)",
                f"Found {len(moderate_strong)} moderate to strong correlations for analysis"
            ])
    
    # Distribution Findings
    if distribution_results:
        high_quality_distributions = sum(1 for insights in distribution_results.values() 
                                        if insights.get('data_quality') == 'High')
        outlier_heavy = sum(1 for insights in distribution_results.values() 
                           if insights.get('outlier_percentage', 0) > 10)
        
        summary['key_findings'].extend([
            f"Analyzed {len(distribution_results)} variable distributions",
            f"{high_quality_distributions} variables show high-quality distributions",
            f"{outlier_heavy} variables require outlier treatment (>10% outliers)"
        ])
    
    # Research Recommendations (static guidance, independent of the inputs)
    summary['research_recommendations'] = [
        "Focus on high-quality variables with strong correlations for predictive modeling",
        "Investigate causal relationships between strongly correlated variables",
        "Apply appropriate transformations to highly skewed variables",
        "Consider dimensionality reduction for variables with multicollinearity",
        "Develop hypotheses based on correlation patterns and business logic"
    ]
    
    # Analysis Readiness Assessment: four independent checks worth 25 points each
    readiness_score = 0
    if summary['data_quality_assessment'].get('overall_completeness', 0) > 90:
        readiness_score += 25
    if correlation_results and len(correlation_results.get('strong_correlations', [])) > 5:
        readiness_score += 25
    if stats_results and len(stats_results.get('numeric_summary', {})) > 5:
        readiness_score += 25
    if structure_analysis and structure_analysis.get('quality_summary', {}).get('high_quality_cols', 0) > 10:
        readiness_score += 25
    
    ready_for_modeling = readiness_score >= 70
    summary['analysis_readiness'] = {
        'readiness_score': readiness_score,
        'readiness_level': 'Excellent' if readiness_score >= 90 else 
                          'Good' if readiness_score >= 70 else
                          'Fair' if readiness_score >= 50 else 'Poor',
        'ready_for_modeling': ready_for_modeling,
        'recommended_next_steps': [
            'Feature engineering and selection',
            'Research question formulation', 
            'Hypothesis development',
            'Predictive model development'
        ] if ready_for_modeling else [
            'Additional data cleaning required',
            'Missing value imputation needed',
            'Data quality improvement necessary'
        ]
    }
    
    return summary

# Cell-completion marker: printed once generate_eda_summary has been defined
print("EDA summary generation function defined successfully")

In [None]:
# Generate comprehensive EDA summary and print it section by section
if df_loans is not None:
    print("=== COMPREHENSIVE EDA SUMMARY ===")
    
    # Collect all analysis results; earlier analysis cells may not have been run,
    # in which case an empty dict is passed for the missing result.
    eda_summary = generate_eda_summary(
        df_loans,
        structure_analysis if 'structure_analysis' in locals() else {},
        stats_results if 'stats_results' in locals() else {},
        correlation_results if 'correlation_results' in locals() else {},
        distribution_results if 'distribution_results' in locals() else {}
    )
    
    # Display summary results
    print(f"\n=== DATASET OVERVIEW ===")
    overview = eda_summary['dataset_overview']
    # The ',' thousands separator is only valid for numbers, so the 'N/A'
    # fallback has to be chosen before formatting rather than passed through it.
    total_records = overview.get('total_records')
    total_variables = overview.get('total_variables')
    records_text = f"{total_records:,}" if total_records is not None else 'N/A'
    variables_text = f"{total_variables:,}" if total_variables is not None else 'N/A'
    print(f"Total records: {records_text}")
    print(f"Total variables: {variables_text}")
    print(f"Memory usage: {overview.get('memory_usage_mb', 0):.2f} MB")
    
    print(f"\n=== DATA QUALITY ASSESSMENT ===")
    quality = eda_summary['data_quality_assessment']
    print(f"Overall completeness: {quality.get('overall_completeness', 0):.1f}%")
    print(f"Data quality score: {quality.get('data_quality_score', 'Unknown')}")
    print(f"High quality variables: {quality.get('high_quality_variables', 0)}")
    print(f"Numeric variables: {quality.get('numeric_variables', 0)}")
    print(f"Categorical variables: {quality.get('categorical_variables', 0)}")
    
    print(f"\n=== KEY FINDINGS ===")
    for i, finding in enumerate(eda_summary['key_findings'], 1):
        print(f"{i:2d}. {finding}")
    
    print(f"\n=== RESEARCH RECOMMENDATIONS ===")
    for i, rec in enumerate(eda_summary['research_recommendations'], 1):
        print(f"{i:2d}. {rec}")
    
    print(f"\n=== ANALYSIS READINESS ===")
    readiness = eda_summary['analysis_readiness']
    is_ready = readiness.get('ready_for_modeling', False)
    print(f"Readiness score: {readiness.get('readiness_score', 0)}/100")
    print(f"Readiness level: {readiness.get('readiness_level', 'Unknown')}")
    print(f"Ready for modeling: {'Yes' if is_ready else 'No'}")
    
    print(f"\nRecommended next steps:")
    for i, step in enumerate(readiness.get('recommended_next_steps', []), 1):
        print(f"  {i}. {step}")
    
    print(f"\n{'='*50}")
    print(f"EXPLORATORY DATA ANALYSIS COMPLETED SUCCESSFULLY")
    # The closing line must agree with the readiness verdict printed above
    if is_ready:
        print(f"Dataset is ready for feature engineering and research question development")
    else:
        print(f"Dataset needs further preparation before modeling - see recommended next steps")
    print(f"{'='*50}")
else:
    print("No dataset available for EDA summary generation")