# Financial Data Exploratory Data Analysis

This notebook demonstrates EDA for financial datasets using NeoCred's data science toolkit.

In [None]:
import sys

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Add the parent directory to the import path so the local `data_science`
# package can be imported (presumably the notebook lives one level below it).
sys.path.append('..')

from data_science.feature_engineering import FinancialFeatureEngineer
from data_science.visualization import FinancialVisualizer
from data_science.statistical_modeling import FinancialStatisticalModeler
from data_science.imbalanced_learning import ImbalancedLearningHandler

## 1. Generate Sample Financial Data

In [None]:
# Create a synthetic financial dataset. Seeding the global NumPy generator
# makes every draw below reproducible on a fresh kernel.
np.random.seed(42)
n_samples = 1000

# Parameters of the logistic curve that maps credit score to default
# probability: 50% at the midpoint, falling as the score rises.
default_score_midpoint = 400
default_score_scale = 50

# Feature columns. The dict is evaluated top to bottom, which fixes the order
# of the random draws; reordering the keys would change the generated values.
data = {
    'age': np.random.randint(18, 65, n_samples),
    'monthly_income': np.random.lognormal(10, 0.5, n_samples),
    'monthly_expenses': np.random.lognormal(9, 0.4, n_samples),
    'credit_score': np.random.randint(300, 850, n_samples),
    'credit_limit': np.random.lognormal(9, 0.6, n_samples),
    'credit_used': np.random.lognormal(8, 0.7, n_samples),
    'total_debt': np.random.lognormal(10, 0.8, n_samples)
}

df = pd.DataFrame(data)

# Draw the default label from a score-dependent probability instead of a hard
# `credit_score < 600` cut-off. A deterministic threshold makes the target an
# exact function of `credit_score` (perfect leakage when that column is used
# as a model feature) and flags more than half of the rows, leaving no
# minority "default" class to analyze. With the curve below roughly one row
# in five defaults, and low scores remain the main driver.
# The uniform draw comes after all feature draws, so feature values are the
# same as they would be without it.
default_probability = 1.0 / (
    1.0 + np.exp((df['credit_score'] - default_score_midpoint) / default_score_scale)
)
uniform_draws = np.random.random(n_samples)
df['default_risk'] = (default_probability > uniform_draws).astype(int)

print(f"Dataset shape: {df.shape}")
df.head()

## 2. Feature Engineering

In [None]:
# Initialize feature engineer
fe = FinancialFeatureEngineer()

# Derive income features first, then add credit features on top of them
df_features = fe.create_income_features(df)
df_features = fe.create_credit_features(df_features)

# Report the added columns in DataFrame column order. Iterating a set of
# strings has no guaranteed order (string hashing is randomized per process),
# so listing them from a set difference could print a different order after
# every kernel restart.
print("New features created:")
original_cols = set(df.columns)
new_cols = [col for col in df_features.columns if col not in original_cols]
for col in new_cols:
    print(f"- {col}")

df_features.head()

## 3. Data Visualization

In [None]:
# Visualizer instance; later cells reuse `viz` for their own plots
viz = FinancialVisualizer()

# Build all dashboard figures from the engineered feature table in one call
dashboard = viz.create_financial_dashboard(df_features)

# Render every figure, then confirm by name which plot was just shown
for plot_name, figure in dashboard.items():
    figure.show()
    print(f"\n{plot_name} plot displayed")

## 4. Statistical Modeling

In [None]:
# Statistical modeler used for the credit-risk summary below
sm = FinancialStatisticalModeler()

# Predictor columns for the risk analysis; `features` is reused by the later
# modeling cells. default_risk is generated from credit_score in the
# sample-data cell, so credit_score is expected to dominate the correlations.
features = [
    'age',
    'monthly_income',
    'credit_score',
    'debt_to_income',
    'credit_utilization',
]
credit_analysis = sm.credit_risk_modeling(df_features, 'default_risk', features)

# Headline figures returned by the analysis
print("Credit Risk Analysis Results:")
print(f"Default Rate: {credit_analysis['default_rate']:.2%}")
print(f"Sample Size: {credit_analysis['sample_size']}")
print(f"Positive Cases: {credit_analysis['positive_cases']}")

# One line per feature with its reported correlation to the default flag
print("\nFeature Correlations with Default:")
feature_correlations = credit_analysis['feature_correlations']
for feature, corr in feature_correlations.items():
    print(f"{feature}: {corr:.3f}")

## 5. Imbalanced Learning Analysis

In [None]:
# Handler for class-imbalance diagnostics; later cells reuse `imb`
imb = ImbalancedLearningHandler()

# Summarize how the target classes are distributed
target = df_features['default_risk']
imbalance_analysis = imb.analyze_imbalance(target)

# Print each entry of the returned summary
print("Class Imbalance Analysis:")
print(f"Class Counts: {imbalance_analysis['class_counts']}")
print(f"Class Percentages: {imbalance_analysis['class_percentages']}")
print(f"Imbalance Ratio: {imbalance_analysis['imbalance_ratio']:.2f}")
print(f"Minority Class: {imbalance_analysis['minority_class']}")

# Plot the same target column so the counts above can be checked visually
class_dist_fig = viz.plot_class_distribution(target, "Default Risk Distribution")
class_dist_fig.show()

## 6. Model Performance Comparison

In [None]:
# Model inputs and target. Missing feature values are imputed with 0.
X = df_features[features].fillna(0)
y = df_features['default_risk']

# Hold out 30% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both splits; the fixed random_state makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Compare sampling methods on the same train/test split
comparison_results = imb.compare_sampling_methods(X_train, y_train, X_test, y_test)

# Report the metrics returned for each sampling method
print("Sampling Methods Comparison:")
for method, results in comparison_results.items():
    print(f"\n{method.upper()}:")
    print(f"  ROC-AUC: {results['roc_auc']:.3f}")
    print(f"  F1-Score: {results['classification_report']['weighted avg']['f1-score']:.3f}")
    print(f"  Training Samples: {results['training_samples']}")

## 7. Feature Importance Analysis

In [None]:
# Train balanced random forest on the training split
brf_results = imb.train_balanced_random_forest(X_train, y_train)

print(f"Balanced Random Forest OOB Score: {brf_results['oob_score']:.3f}")

# Plot feature importance in the order the handler returned it
feature_importance = brf_results['feature_importance']
importance_fig = viz.plot_feature_importance(
    feature_importance['feature'].tolist(),
    feature_importance['importance'].tolist()
)
importance_fig.show()

# Rank explicitly rather than relying on the handler returning the table
# already sorted: `.head()` alone only yields the "top 5" if the rows happen
# to be in descending importance order.
print("\nTop 5 Most Important Features:")
print(feature_importance.nlargest(5, 'importance'))

## Summary

This notebook demonstrated:
1. **Feature Engineering**: Created financial ratios and categorical features
2. **Visualization**: Interactive plots for data exploration
3. **Statistical Modeling**: Credit risk analysis with statsmodels
4. **Imbalanced Learning**: Handling class imbalance in credit datasets
5. **Model Comparison**: Evaluated different sampling techniques
6. **Feature Importance**: Identified key predictors for credit risk

The toolkit provides comprehensive ML explainability and EDA capabilities for financial data analysis.