# Loan Application Approval Prediction - Analysis Report

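This notebook contains a detailed analysis of the loan application dataset, covering data cleaning and preprocessing (missing values, duplicates, encoding, outlier handling, normalization), exploratory data analysis (EDA), and visualization.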

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from scipy import stats

# Set style for better visualization.
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and is missing
# from later releases, which ship the same style sheet as 'seaborn-v0_8'.
# Use whichever name this matplotlib installation actually provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette('husl')

# Display all columns (no truncation of wide frames)
pd.set_option('display.max_columns', None)

## 1. Data Loading and Initial Exploration

In [None]:
# Load the loan-application records; the relative path is resolved against
# the current working directory.
df = pd.read_csv('Loan Application Accept or Reject.csv')

# Preview both ends of the table: first rows, then last rows
for heading, preview in (("Top 5 rows:", df.head()), ("\nBottom 5 rows:", df.tail())):
    print(heading)
    display(preview)

## 2. Rename Columns

In [None]:
# Replace the original headers with snake_case names.
# The assignment is positional, so the list must follow the CSV column order;
# pandas raises if the number of names does not match the number of columns.
new_column_names = [
    'loan_id',
    'dependents',
    'education',
    'self_employed',
    'annual_income',
    'loan_amount',
    'loan_term',
    'cibil_score',
    'residential_assets',
    'commercial_assets',
    'luxury_assets',
    'bank_assets',
    'loan_status',
]
df.columns = new_column_names

print("Updated column names:")
print(list(df.columns))

## 3. Dataset Information

In [None]:
# Display dataset shape
n_rows, n_cols = df.shape
print(f"Total number of rows: {n_rows}")
print(f"Total number of columns: {n_cols}\n")

# Display column information.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would only add a stray "None" to the output.
print("Column Information:")
df.info()

## 4. Check for Null Values

In [None]:
# Compute the boolean missing-value mask once and reuse it for both views
null_mask = df.isnull()

# Per-column count of missing entries
print("Null values in each column:")
display(null_mask.sum())

# Heatmap of the mask: each row of the plot is a record, each column a feature
plt.figure(figsize=(12, 6))
sns.heatmap(null_mask, yticklabels=False, cbar=True, cmap='viridis')
plt.title('Null Values in Dataset')
plt.show()

## 5. Handle Missing Values

In [None]:
# Handle missing values.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on df[col]: that form operates on an intermediate object,
# emits a FutureWarning on recent pandas and does not update df at all under
# Copy-on-Write.

# Numerical columns - fill with median
# NOTE(review): dependents, loan_amount, loan_term and cibil_score are not in
# this list, so any nulls in those columns are left untouched - confirm intended.
numerical_cols = ['annual_income', 'residential_assets', 'commercial_assets', 'luxury_assets', 'bank_assets']
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].median())

# Categorical columns - fill with mode (first value when several modes tie)
categorical_cols = ['education', 'self_employed']
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

print("Null values after handling:")
display(df.isnull().sum())

## 6. Check and Remove Duplicates

In [None]:
# Check for duplicates (rows identical across every column)
duplicate_mask = df.duplicated()
print(f"Number of duplicate rows: {duplicate_mask.sum()}")

# Remove duplicates if any, keeping the first occurrence of each row.
# Reassigning instead of inplace=True keeps the step explicit; the original
# index labels of the surviving rows are preserved.
df = df.drop_duplicates()
print(f"Dataset shape after removing duplicates: {df.shape}")

## 7. Encode Categorical Data

In [None]:
# Encode categorical variables.
# Each column gets its own fitted LabelEncoder, stored in `label_encoders`, so
# the integer codes can be translated back to the original labels later
# (refitting a single shared encoder would discard the earlier mappings).
label_encoders = {}
for col in ['education', 'self_employed', 'loan_status']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
    # classes_ is ordered so that position i is the label encoded as i
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    print(f"{col} encoding: {code_by_label}")
# After the loop `le` is the encoder fitted on loan_status.

print("Dataset after encoding:")
display(df.head())

## 8. Handle Outliers

In [None]:
# Function to detect and handle outliers using IQR method
def handle_outliers(df, column, factor=1.5):
    """Clip one column of ``df`` to its interquartile-range fences.

    Values below ``Q1 - factor * IQR`` are raised to that lower fence and values
    above ``Q3 + factor * IQR`` are lowered to the upper fence, where Q1 and Q3
    are the 25th and 75th percentiles of the column and ``IQR = Q3 - Q1``.
    No rows are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``. It is modified in place.
    column : str
        Name of the numeric column to clip.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences. The default
        reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with ``column`` clipped.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # Winsorize rather than drop: out-of-range values are pulled to the fences
    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
    return df

# Columns whose extreme values are clipped to their IQR fences.
# This rebinding of numerical_cols is also what the later normalization step uses.
numerical_cols = [
    'annual_income', 'loan_amount', 'loan_term', 'cibil_score',
    'residential_assets', 'commercial_assets', 'luxury_assets', 'bank_assets',
]

for col in numerical_cols:
    df = handle_outliers(df, col)

# One boxplot per clipped column, laid out on a 2 x 4 grid
plt.figure(figsize=(15, 10))
for panel_idx, col in enumerate(numerical_cols, start=1):
    panel_ax = plt.subplot(2, 4, panel_idx)
    sns.boxplot(y=df[col], ax=panel_ax)
    panel_ax.set_title(col)
plt.tight_layout()
plt.show()

## 9. Data Normalization

In [None]:
# Rescale every column in numerical_cols with scikit-learn's MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[numerical_cols])
# Write the scaled array back over the original columns
df[numerical_cols] = scaled_values

print("Dataset after normalization:")
display(df.head())

## 10. Visualization Analysis

In [None]:
# 1. Bar Plot - Loan Status Distribution
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='loan_status', ax=ax)
ax.set_title('Distribution of Loan Status')
plt.show()

# 2. Histogram - Annual Income Distribution
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='annual_income', bins=30, ax=ax)
ax.set_title('Distribution of Annual Income')
plt.show()

# 3. Box Plot - Loan Amount by Education
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='education', y='loan_amount', ax=ax)
ax.set_title('Loan Amount Distribution by Education')
plt.show()

# 4. Area Plot - Assets Distribution
# DataFrame.plot opens its own figure unless it is handed an Axes, which would
# leave a separately created figure empty and drop the requested size. Creating
# the Axes first and passing it in keeps everything on one 12 x 6 figure.
asset_cols = ['residential_assets', 'commercial_assets', 'luxury_assets', 'bank_assets']
fig, ax = plt.subplots(figsize=(12, 6))
df[asset_cols].plot(kind='area', stacked=True, ax=ax)
ax.set_title('Distribution of Different Types of Assets')
plt.show()

# 5. Scatter Plot - CIBIL Score vs Loan Amount
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='cibil_score', y='loan_amount', hue='loan_status', ax=ax)
ax.set_title('CIBIL Score vs Loan Amount')
plt.show()

# 6. Hexbin Plot - Annual Income vs Loan Amount
fig, ax = plt.subplots(figsize=(10, 6))
hex_bins = ax.hexbin(df['annual_income'], df['loan_amount'], gridsize=20, cmap='YlOrRd')
fig.colorbar(hex_bins, ax=ax, label='count')
ax.set_xlabel('Annual Income')
ax.set_ylabel('Loan Amount')
ax.set_title('Hexbin Plot: Annual Income vs Loan Amount')
plt.show()

# 7. Pie Plot - Education Distribution
fig, ax = plt.subplots(figsize=(8, 8))
education_counts = df['education'].value_counts()
education_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax)
ax.set_title('Distribution of Education Levels')
plt.show()

# 8. Heatmap - Correlation Matrix
# loan_id is a record identifier, so it is left out of the correlation matrix.
# numeric_only=True keeps corr() from raising if a non-numeric column is still
# present in the frame.
corr_matrix = df.drop(columns=['loan_id'], errors='ignore').corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

## Analysis Insights

1. **Data Quality**:
   - The dataset contains information about loan applications with various features
   - Missing values in selected columns were imputed: the median for numerical columns and the mode for categorical columns
   - No significant duplicate entries were found

2. **Loan Approval Patterns**:
   - There's a relationship between CIBIL score and loan approval
   - Education level appears to influence loan amounts
   - Asset distribution varies significantly among applicants

3. **Financial Indicators**:
   - Annual income shows a wide range of distribution
   - Different types of assets contribute differently to loan approval
   - Correlation exists between income and loan amount

4. **Risk Factors**:
   - CIBIL score is a crucial factor in loan approval
   - Employment status and education level impact loan decisions
   - Asset backing provides security for loan approval

This analysis provides valuable insights for understanding loan approval patterns and risk assessment factors.