# Research Questions Development

**COMP647 Assignment 02 - Student ID: 1163127**

This notebook develops research questions based on the exploratory data analysis (EDA) findings from the Lending Club loan dataset. Each research question is supported by evidence from our statistical analysis and correlation studies.

## 1. Import Libraries and Define the Data Loader

In [None]:
# Essential libraries for analysis
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical analysis
from scipy import stats
from scipy.stats import chi2_contingency

# System utilities
import warnings
import os

# Configuration
# Notebook-wide display settings; each call below changes global state for the
# rest of the session.
# Ignore every warning (no category or module filter is given).
warnings.filterwarnings('ignore')
# Lift pandas' cap on the number of columns shown when printing DataFrames.
pd.set_option('display.max_columns', None)
# Use matplotlib's 'default' style sheet and seaborn's "husl" colour palette.
plt.style.use('default')
sns.set_palette("husl")

# Status messages confirming this setup cell ran to completion.
print("Libraries imported successfully")
print("Ready to develop research questions based on EDA findings")

In [None]:
def load_analysis_data(sample_size='10000'):
    """
    Read the processed accepted-loans sample and return it without duplicates.

    The file is looked up as '../data/processed/accepted_sample_<sample_size>.csv'
    relative to the current working directory. Exact duplicate rows are dropped
    and a short size/memory summary is printed.

    Parameters:
    sample_size (str): Suffix identifying which sample file to read

    Returns:
    DataFrame: De-duplicated lending data, or None when the file is missing
               or any other error occurs while loading (the error is printed)
    """
    print(f"Loading analysis dataset (sample size: {sample_size})...")

    csv_location = os.path.join(
        '../data/processed/',
        f'accepted_sample_{sample_size}.csv',
    )

    try:
        # Same light preprocessing as the EDA step: drop exact duplicate rows
        loans = pd.read_csv(csv_location).drop_duplicates()

        row_count, column_count = loans.shape
        print(f"Dataset loaded: {row_count:,} rows, {column_count} columns")

        megabytes = loans.memory_usage(deep=True).sum() / 1024**2
        print(f"Memory usage: {megabytes:.2f} MB")
    except FileNotFoundError as e:
        print(f"Error loading data: {e}")
        return None
    except Exception as e:
        # Best-effort loader: report the problem and let the caller handle None
        print(f"Unexpected error: {e}")
        return None
    else:
        return loans

## 2. Research Question Framework

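Based on the EDA findings, we develop research questions that explore meaningful patterns in the lending data. The helper functions in this section identify key numeric and categorical variables, candidate target variables, and correlated variable pairs to ground those questions.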

In [None]:
def _select_numeric_variables(df):
    """
    Describe numeric columns that pass the quality and variability filter.

    A column is kept when less than 15% of its values are missing and its
    share of distinct values lies strictly between 1% and 95% of the rows.

    Parameters:
    df (DataFrame): Non-empty input dataset

    Returns:
    list: One descriptor dict per selected column, in column order
    """
    row_count = len(df)
    selected = []

    for col in df.select_dtypes(include=[np.number]).columns:
        missing_pct = (df[col].isnull().sum() / row_count) * 100
        unique_count = df[col].nunique()
        unique_pct = (unique_count / row_count) * 100

        if missing_pct < 15 and 1 < unique_pct < 95:
            selected.append({
                'variable': col,
                'missing_pct': missing_pct,
                'unique_values': unique_count,
                'data_type': 'numeric',
                'business_relevance': get_business_relevance(col)
            })

    return selected


def _select_categorical_variables(df):
    """
    Describe non-numeric columns with moderate cardinality.

    A column is kept when less than 20% of its values are missing and it has
    between 2 and 20 distinct values (inclusive).

    Parameters:
    df (DataFrame): Non-empty input dataset

    Returns:
    list: One descriptor dict per selected column, in column order
    """
    row_count = len(df)
    selected = []

    for col in df.select_dtypes(exclude=[np.number]).columns:
        missing_pct = (df[col].isnull().sum() / row_count) * 100
        unique_count = df[col].nunique()

        if missing_pct < 20 and 2 <= unique_count <= 20:
            # mode() is empty only when the column has no non-null values
            modes = df[col].mode()
            selected.append({
                'variable': col,
                'missing_pct': missing_pct,
                'unique_values': unique_count,
                'data_type': 'categorical',
                'top_category': modes.iloc[0] if len(modes) > 0 else None,
                'business_relevance': get_business_relevance(col)
            })

    return selected


def _find_target_candidates(df, limit=5):
    """
    List the first columns whose names contain a target-related keyword.

    Parameters:
    df (DataFrame): Input dataset
    limit (int): Maximum number of candidates to return

    Returns:
    list: Candidate dicts (variable, rationale, unique_values), in column order
    """
    keywords = ('status', 'grade', 'outcome', 'result', 'default', 'paid')
    candidates = []

    for col in df.columns:
        if len(candidates) >= limit:
            # Only the first `limit` matches are reported, so stop scanning
            break
        col_lower = col.lower()
        if any(keyword in col_lower for keyword in keywords):
            candidates.append({
                'variable': col,
                'rationale': f'Contains target-related keyword: {get_target_keyword(col)}',
                'unique_values': df[col].nunique()
            })

    return candidates


def _find_correlated_pairs(df, variables, limit=8):
    """
    Find variable pairs whose absolute correlation is at least 0.4.

    Parameters:
    df (DataFrame): Input dataset
    variables (list): Names of the numeric columns to correlate
    limit (int): Maximum number of pairs to return

    Returns:
    list: Pair dicts sorted by descending absolute correlation; each pair is
          labelled 'Strong' (|r| >= 0.6) or 'Moderate'
    """
    corr_matrix = df[variables].corr()
    names = corr_matrix.columns
    pairs = []

    # Walk the upper triangle so each unordered pair is visited once
    for i, var1 in enumerate(names):
        for j in range(i + 1, len(names)):
            corr_value = corr_matrix.iloc[i, j]
            if pd.isna(corr_value) or abs(corr_value) < 0.4:
                continue
            pairs.append({
                'var1': var1,
                'var2': names[j],
                'correlation': corr_value,
                'strength': 'Strong' if abs(corr_value) >= 0.6 else 'Moderate'
            })

    pairs.sort(key=lambda pair: abs(pair['correlation']), reverse=True)
    return pairs[:limit]


def analyze_key_variables(df):
    """
    Identify key variables and relationships for research question development.

    The dataset is scanned for numeric and categorical columns that pass simple
    missing-value and cardinality filters, for columns whose names suggest a
    target variable, and for correlated pairs among the first ten selected
    numeric columns. A short summary is printed.

    Parameters:
    df (DataFrame): Input dataset

    Returns:
    dict: Analysis results with the keys 'dataset_overview',
          'key_numeric_variables', 'key_categorical_variables',
          'potential_target_variables' and 'high_correlation_pairs';
          an empty dict when df is None or empty
    """
    print("Analyzing key variables for research question development...")

    if df is None or df.empty:
        print("No data available for analysis")
        return {}

    numeric_variables = _select_numeric_variables(df)
    categorical_variables = _select_categorical_variables(df)
    target_candidates = _find_target_candidates(df)

    # Correlations need at least two numeric variables; only the first ten
    # selected ones are compared to keep this a quick check
    correlated_pairs = []
    if len(numeric_variables) >= 2:
        top_numeric = [var['variable'] for var in numeric_variables[:10]]
        correlated_pairs = _find_correlated_pairs(df, top_numeric)

    analysis_results = {
        'dataset_overview': {
            'total_loans': len(df),
            'total_features': len(df.columns),
            # Share of non-null cells across the whole table, in percent
            'data_completeness': ((df.count().sum()) / (len(df) * len(df.columns))) * 100
        },
        'key_numeric_variables': numeric_variables,
        'key_categorical_variables': categorical_variables,
        'potential_target_variables': target_candidates,
        'high_correlation_pairs': correlated_pairs
    }

    # Display analysis summary
    print("\nVariable Analysis Summary:")
    print(f"  High-quality numeric variables: {len(numeric_variables)}")
    print(f"  Suitable categorical variables: {len(categorical_variables)}")
    print(f"  Potential target variables: {len(target_candidates)}")
    print(f"  Strong correlations found: {len(correlated_pairs)}")

    return analysis_results

def get_business_relevance(column_name):
    """
    Classify a variable into a business relevance category from its name.

    The lower-cased name is checked against keyword groups in a fixed order;
    the first group containing a keyword that occurs in the name wins.

    Parameters:
    column_name (str): Name of the variable

    Returns:
    str: 'Financial', 'Risk/Pricing', 'Temporal', 'Assessment', 'Categorical',
         or 'General' when no keyword matches
    """
    name = column_name.lower()

    # Ordered (category, keywords) rules: earlier categories take precedence
    rules = (
        ('Financial', ('amount', 'income', 'salary', 'balance')),
        ('Risk/Pricing', ('rate', 'interest', 'apr', 'percent')),
        ('Temporal', ('term', 'time', 'month', 'year', 'duration')),
        ('Assessment', ('grade', 'score', 'rating', 'status')),
        ('Categorical', ('purpose', 'type', 'category', 'reason')),
    )

    for category, keywords in rules:
        for keyword in keywords:
            if keyword in name:
                return category

    return 'General'

def get_target_keyword(column_name):
    """
    Return the first target-related keyword found in a column name.

    Keywords are tried in a fixed order against the lower-cased name, so a
    name containing several keywords yields the earliest one in that order.

    Parameters:
    column_name (str): Name of the variable

    Returns:
    str: The matching keyword, or 'target-related' when none matches
    """
    lowered = column_name.lower()
    hits = (
        candidate
        for candidate in ('status', 'grade', 'outcome', 'result', 'default', 'paid')
        if candidate in lowered
    )
    return next(hits, 'target-related')

# Status message printed once the function definitions in this cell have run
print("Research question framework functions defined successfully")

## 3. Load Data and Perform Key Variable Analysis

In [None]:
# Read the processed sample used for research question development
print("=== LOADING DATA FOR RESEARCH QUESTION ANALYSIS ===")
df_loans = load_analysis_data(sample_size='10000')

if df_loans is None:
    # The loader already printed the underlying error
    print("Failed to load dataset")
else:
    print(f"\nDataset ready for analysis: {df_loans.shape}")
    print("Sample of available columns:")

    # Preview at most the first 15 column names, numbered from 1
    available_columns = df_loans.columns
    for i, col in enumerate(available_columns[:15]):
        print(f"  {i+1:2d}. {col}")

    hidden_count = len(available_columns) - 15
    if hidden_count > 0:
        print(f"  ... and {hidden_count} more columns")

In [None]:
# Run the key variable analysis and print a condensed report of its findings.
# `research_foundation` ends up holding the analysis results, or an empty dict
# when no data is available or the analysis returned nothing.
if df_loans is None:
    print("Cannot perform variable analysis without data")
    research_foundation = {}
else:
    print("=== KEY VARIABLE ANALYSIS ===")
    key_variables = analyze_key_variables(df_loans)

    if not key_variables:
        print("Variable analysis failed")
        research_foundation = {}
    else:
        # Overall size and completeness of the dataset
        print("\n=== DATASET OVERVIEW ===")
        overview = key_variables['dataset_overview']
        total_loans = overview.get('total_loans', 0)
        total_features = overview.get('total_features', 0)
        completeness = overview.get('data_completeness', 0)
        print(f"Total loans: {total_loans:,}")
        print(f"Total features: {total_features}")
        print(f"Data completeness: {completeness:.1f}%")

        # First eight numeric variables that passed the quality filter
        print("\n=== TOP NUMERIC VARIABLES FOR RESEARCH ===")
        numeric_vars = key_variables.get('key_numeric_variables', [])
        for i, var in enumerate(numeric_vars[:8], start=1):
            print(f"{i:2d}. {var['variable']:25} | {var['business_relevance']:12} | {var['missing_pct']:4.1f}% missing")

        # First six categorical variables with moderate cardinality
        print("\n=== TOP CATEGORICAL VARIABLES FOR RESEARCH ===")
        cat_vars = key_variables.get('key_categorical_variables', [])
        for i, var in enumerate(cat_vars[:6], start=1):
            print(f"{i:2d}. {var['variable']:25} | {var['unique_values']:2d} categories | {var['business_relevance']:12}")

        # Columns whose names hint at a target variable
        print("\n=== POTENTIAL TARGET VARIABLES ===")
        target_vars = key_variables.get('potential_target_variables', [])
        if not target_vars:
            print("No obvious target variables identified - will use loan characteristics as analysis focus")
        else:
            for i, var in enumerate(target_vars, start=1):
                print(f"{i:2d}. {var['variable']:25} | {var['rationale']}")

        # Five highest-ranked correlated pairs
        print("\n=== STRONG CORRELATIONS FOR RESEARCH ===")
        correlations = key_variables.get('high_correlation_pairs', [])
        if not correlations:
            print("No strong correlations found with current threshold")
        else:
            for i, corr in enumerate(correlations[:5], start=1):
                print(f"{i:2d}. {corr['var1']} vs {corr['var2']} | r = {corr['correlation']:.3f} ({corr['strength']})")

        # Store results for research question development
        research_foundation = key_variables