# Loan Data Exploratory Data Analysis (EDA)

This notebook performs comprehensive exploratory data analysis on the loan dataset, including:
- Data overview and statistics
- Distribution analysis
- Correlation analysis
- Visual insights and patterns

In [None]:
# Notebook setup: standard-library imports first, then third-party packages
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the plotting
# and data libraries used below.
warnings.filterwarnings('ignore')

# Global plot appearance: seaborn grid style, default figure size and font size
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

## 1. Data Loading and Overview

In [None]:
# Read the loan records from the CSV file into the notebook-wide DataFrame `df`
df = pd.read_csv('../data/loan_data.csv')

# Report the (rows, columns) shape, then preview the first five rows
print(f"Dataset Shape: {df.shape}")
print("\nFirst few records:")
df.head(5)

In [None]:
# Per-column dtype and non-null count; DataFrame.info() prints its report directly
info_heading = "Data Types and Missing Values:"
print(info_heading)
df.info()

In [None]:
# Descriptive statistics as produced by pandas' default describe()
print("Summary Statistics:")
summary_stats = df.describe()
summary_stats

## 2. Loan Status Distribution

In [None]:
# Loan status distribution: bar chart of counts beside a pie chart of shares
status_counts = df['loan_status'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
bar_ax, pie_ax = axes

# Left panel: number of loans in each status
bar_ax.bar(status_counts.index, status_counts.values, color='steelblue')
bar_ax.set_title('Loan Status Distribution', fontsize=14, fontweight='bold')
bar_ax.set(xlabel='Loan Status', ylabel='Count')
bar_ax.tick_params(axis='x', rotation=45)

# Right panel: the same counts expressed as percentages of the total
pie_ax.pie(
    status_counts.values,
    labels=status_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
pie_ax.set_title('Loan Status Percentage', fontsize=14, fontweight='bold')

fig.tight_layout()
plt.show()

# Text version of the same counts
print("\nLoan Status Statistics:")
print(status_counts)

## 3. Loan Amount Analysis

In [None]:
# Loan amount distribution: histogram with mean/median markers beside a box plot.
#
# Missing values are dropped once up front. pandas' mean()/median() already
# skip NaN, but matplotlib's boxplot does not: a single NaN in the input
# yields NaN quartiles and an empty box. Using the same NaN-free values for
# every element keeps the two panels consistent with each other.
loan_amounts = df['loan_amount'].dropna()
amount_values = loan_amounts.to_numpy()  # plain array: no index alignment surprises
mean_amount = loan_amounts.mean()
median_amount = loan_amounts.median()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Histogram with dashed reference lines at the mean and the median
axes[0].hist(amount_values, bins=20, color='green', alpha=0.7, edgecolor='black')
axes[0].axvline(mean_amount, color='red', linestyle='--', label=f'Mean: ${mean_amount:.2f}')
axes[0].axvline(median_amount, color='orange', linestyle='--', label=f'Median: ${median_amount:.2f}')
axes[0].set_title('Loan Amount Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Loan Amount ($)')
axes[0].set_ylabel('Frequency')
axes[0].legend()

# Box plot of the same (NaN-free) values
axes[1].boxplot(amount_values, vert=True)
axes[1].set_title('Loan Amount Box Plot', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Loan Amount ($)')

plt.tight_layout()
plt.show()

## 4. Interest Rate Analysis

In [None]:
# Interest rates: overall histogram (left) and per-status box plots (right)
mean_rate = df['interest_rate'].mean()

plt.figure(figsize=(12, 5))

# Left panel: histogram with a dashed line at the mean rate
hist_ax = plt.subplot(1, 2, 1)
hist_ax.hist(df['interest_rate'], bins=15, color='coral', alpha=0.7, edgecolor='black')
hist_ax.axvline(mean_rate, color='red', linestyle='--', label=f'Mean: {mean_rate:.2f}%')
hist_ax.set_title('Interest Rate Distribution', fontsize=14, fontweight='bold')
hist_ax.set_xlabel('Interest Rate (%)')
hist_ax.set_ylabel('Frequency')
hist_ax.legend()

# Right panel: one box per loan status
box_ax = plt.subplot(1, 2, 2)
sns.boxplot(data=df, x='loan_status', y='interest_rate', palette='Set2', ax=box_ax)
box_ax.set_title('Interest Rate by Loan Status', fontsize=14, fontweight='bold')
box_ax.set_xlabel('Loan Status')
box_ax.set_ylabel('Interest Rate (%)')
box_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 5. Credit Score Analysis

In [None]:
# Credit scores: overall histogram (left) and per-status box plots (right)
mean_score = df['credit_score'].mean()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_ax, box_ax = axes

# Left panel: histogram with a dashed line at the mean score
hist_ax.hist(df['credit_score'], bins=20, color='purple', alpha=0.7, edgecolor='black')
hist_ax.axvline(mean_score, color='red', linestyle='--', label=f'Mean: {mean_score:.0f}')
hist_ax.set_title('Credit Score Distribution', fontsize=14, fontweight='bold')
hist_ax.set(xlabel='Credit Score', ylabel='Frequency')
hist_ax.legend()

# Right panel: one box per loan status
sns.boxplot(data=df, x='loan_status', y='credit_score', palette='viridis', ax=box_ax)
box_ax.set_title('Credit Score by Loan Status', fontsize=14, fontweight='bold')
box_ax.set(xlabel='Loan Status', ylabel='Credit Score')
box_ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

## 6. Loan Purpose Analysis

In [None]:
# Loan purpose distribution: number of loans per stated purpose
purpose_counts = df['purpose'].value_counts()

plt.figure(figsize=(14, 6))
purpose_ax = plt.gca()
purpose_ax.bar(purpose_counts.index, purpose_counts.values, color='teal', alpha=0.7)
purpose_ax.set_title('Loan Distribution by Purpose', fontsize=14, fontweight='bold')
purpose_ax.set_xlabel('Loan Purpose')
purpose_ax.set_ylabel('Count')
# Rotate the labels and right-align them so each one ends under its own bar
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Text version of the same counts
print("\nLoan Purpose Statistics:")
print(purpose_counts)

## 7. Correlation Analysis

In [None]:
# Columns included in the pairwise correlation analysis
numerical_cols = [
    'loan_amount',
    'interest_rate',
    'loan_term',
    'monthly_payment',
    'credit_score',
    'annual_income',
    'employment_length',
    'debt_to_income',
]

# Pairwise correlations using pandas' default method
correlation_matrix = df[numerical_cols].corr()

# Annotated heatmap with the colour scale centred on zero
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix of Loan Features', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 8. Debt-to-Income Ratio Analysis

In [None]:
# Debt-to-income ratio: overall histogram (left) and per-status box plots (right)
mean_dti = df['debt_to_income'].mean()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_ax, box_ax = axes

# Left panel: histogram with a dashed line at the mean ratio
hist_ax.hist(df['debt_to_income'], bins=20, color='orange', alpha=0.7, edgecolor='black')
hist_ax.axvline(mean_dti, color='red', linestyle='--', label=f'Mean: {mean_dti:.2f}')
hist_ax.set_title('Debt-to-Income Ratio Distribution', fontsize=14, fontweight='bold')
hist_ax.set(xlabel='Debt-to-Income Ratio', ylabel='Frequency')
hist_ax.legend()

# Right panel: one box per loan status
sns.boxplot(data=df, x='loan_status', y='debt_to_income', palette='Set3', ax=box_ax)
box_ax.set_title('DTI by Loan Status', fontsize=14, fontweight='bold')
box_ax.set(xlabel='Loan Status', ylabel='Debt-to-Income Ratio')
box_ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

## 9. Loan Term Analysis

In [None]:
# Loan term analysis: loan count per term (left) and mean loan amount per term (right)
term_counts = df['loan_term'].value_counts().sort_index()
term_avg_amount = df.groupby('loan_term')['loan_amount'].mean()

# Plot the terms as categorical labels rather than numeric x positions.
# Bars have a default width of 0.8 data units, so when the term values are
# spaced many units apart (presumably month counts, per the axis label -
# confirm against the data) numeric placement shrinks every bar to a thin
# sliver. Both series are already sorted by term, so converting the index to
# strings keeps the bars in ascending term order.
count_labels = term_counts.index.astype(str)
amount_labels = term_avg_amount.index.astype(str)

plt.figure(figsize=(12, 5))

# Left panel: number of loans for each term
plt.subplot(1, 2, 1)
plt.bar(count_labels, term_counts.values, color='navy', alpha=0.7)
plt.title('Loan Term Distribution', fontsize=14, fontweight='bold')
plt.xlabel('Loan Term (months)')
plt.ylabel('Count')

# Right panel: mean loan amount for each term
plt.subplot(1, 2, 2)
plt.bar(amount_labels, term_avg_amount.values, color='darkgreen', alpha=0.7)
plt.title('Average Loan Amount by Term', fontsize=14, fontweight='bold')
plt.xlabel('Loan Term (months)')
plt.ylabel('Average Loan Amount ($)')

plt.tight_layout()
plt.show()

## 10. Income vs Loan Amount Analysis

In [None]:
# Income vs loan amount, with each point coloured by the borrower's credit score
incomes = df['annual_income']
amounts = df['loan_amount']
scores = df['credit_score']

plt.figure(figsize=(12, 6))
scatter = plt.scatter(incomes, amounts, c=scores, cmap='RdYlGn', alpha=0.6, s=50)
plt.colorbar(scatter, label='Credit Score')
plt.title(
    'Annual Income vs Loan Amount (colored by Credit Score)',
    fontsize=14,
    fontweight='bold',
)
plt.xlabel('Annual Income ($)')
plt.ylabel('Loan Amount ($)')
plt.tight_layout()
plt.show()

## 11. Employment Length Impact

In [None]:
# Per-employment-length aggregates: mean loan amount, mean interest rate,
# mean credit score, and the number of non-null loan_id values ("count")
emp_stats = df.groupby('employment_length').agg(
    loan_amount=('loan_amount', 'mean'),
    interest_rate=('interest_rate', 'mean'),
    credit_score=('credit_score', 'mean'),
    count=('loan_id', 'count'),
)

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
x_label = 'Employment Length (years)'

# Top-left: loan count as a bar chart
count_ax = axes[0, 0]
count_ax.bar(emp_stats.index, emp_stats['count'], color='skyblue')
count_ax.set_title('Loan Count by Employment Length', fontweight='bold')
count_ax.set_xlabel(x_label)
count_ax.set_ylabel('Count')

# Remaining panels: one line chart per averaged metric
# (axes, column, marker, colour, title, y-axis label)
line_panels = [
    (axes[0, 1], 'loan_amount', 'o', 'green',
     'Average Loan Amount by Employment Length', 'Average Loan Amount ($)'),
    (axes[1, 0], 'interest_rate', 's', 'red',
     'Average Interest Rate by Employment Length', 'Average Interest Rate (%)'),
    (axes[1, 1], 'credit_score', '^', 'purple',
     'Average Credit Score by Employment Length', 'Average Credit Score'),
]
for panel_ax, column, marker, line_color, panel_title, y_label in line_panels:
    panel_ax.plot(emp_stats.index, emp_stats[column], marker=marker, color=line_color, linewidth=2)
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()

## 12. Key Insights Summary

In [None]:
# Print a plain-text summary of the headline figures computed from `df`.
divider = "=" * 60
total_loans = len(df)
loan_amounts = df['loan_amount']
credit_scores = df['credit_score']
interest_rates = df['interest_rate']

print(divider)
print("KEY INSIGHTS FROM LOAN DATA ANALYSIS")
print(divider)

print("\n1. PORTFOLIO OVERVIEW:")
print(f"   - Total Loans: {total_loans}")
print(f"   - Total Loan Volume: ${loan_amounts.sum():,.2f}")
print(f"   - Average Loan Size: ${loan_amounts.mean():,.2f}")

print("\n2. LOAN STATUS:")
# Count every status in one pass. dropna=False keeps rows whose status is
# missing: comparing the column against a NaN taken from unique() never
# matches (NaN != NaN), which would report such rows with a count of 0.
# Statuses are listed from most to least frequent.
for status, count in df['loan_status'].value_counts(dropna=False).items():
    pct = count / total_loans * 100
    print(f"   - {status}: {count} ({pct:.1f}%)")

print("\n3. CREDIT QUALITY:")
print(f"   - Average Credit Score: {credit_scores.mean():.0f}")
print(f"   - Median Credit Score: {credit_scores.median():.0f}")

print("\n4. INTEREST RATES:")
print(f"   - Average Interest Rate: {interest_rates.mean():.2f}%")
print(f"   - Median Interest Rate: {interest_rates.median():.2f}%")
print(f"   - Range: {interest_rates.min():.2f}% - {interest_rates.max():.2f}%")

print("\n5. TOP LOAN PURPOSES:")
# Three most frequent purposes, most common first
for purpose, count in df['purpose'].value_counts().head(3).items():
    print(f"   - {purpose}: {count}")

print("\n" + divider)