In [None]:
Import Required Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical functions
from scipy import stats

# Notebook-wide display and plotting defaults
pd.options.display.max_columns = None  # None lifts the column limit when rendering frames
pd.options.display.max_rows = 100
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})  # default figure size in inches (width, height)

print("Libraries imported successfully!")

Load the Dataset

In [None]:
# Read the loan approval data from the CSV file in the working directory
df = pd.read_csv('loan_approval_data.csv')

# Report the size of the loaded frame: (rows, columns), then each count separately
print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Number of rows: {len(df):,}")
print(f"Number of columns: {len(df.columns)}")

Initial Data Exploration

In [None]:
# Preview the first rows.
# display() is called explicitly: a bare expression is rendered only when it is
# the last statement of a cell, so a mid-cell `df.head()` would be silently dropped.
print("First 5 rows of the dataset:")
display(df.head())

# Column names, non-null counts and dtypes (DataFrame.info() prints its own report)
print("Column Information:")
df.info()

# Full list of column names as a plain Python list
print("All column names:")
print(df.columns.tolist())

Feature Selection for Classification Task 1

Dropping irrelevant features based on domain understanding

In [None]:
# Columns excluded from the classification feature set:
#   'id'               - unique identifier, not predictive
#   'max_allowed_loan' - target of the regression task, not a classification input
columns_to_drop = ['id', 'max_allowed_loan']

# Work on an independent copy so the originally loaded `df` keeps all its columns
df_classification = df.copy()

print(f"Columns before dropping: {len(df_classification.columns)}")
print(f"Dropping columns: {columns_to_drop}")

# Remove the excluded columns; drop() returns a new frame, which is rebound here
df_classification = df_classification.drop(labels=columns_to_drop, axis='columns')

print(f"\nColumns after dropping: {len(df_classification.columns)}")
print(f"Retained columns: {df_classification.columns.tolist()}")

Descriptive Statistics (Task 2)

Analysing retained variables

# Summary statistics for the numerical variables.
# display() is called explicitly so the table is rendered even when this
# statement is not the last expression of its cell.
print("Descriptive Statistics for Numerical Variables:")
display(df_classification.describe())


# Summary statistics (count, unique, top, freq) for the object-dtype variables
print("Descriptive Statistics for Categorical Variables:")
display(df_classification.describe(include='object'))

In [None]:
# Report a rough measurement scale for every retained variable
print("Variable Scale Types:")
print("=" * 60)

# Partition the columns by dtype: int64/float64 columns are treated as numerical,
# object columns as categorical. Other dtypes fall into neither list.
numerical_vars = (
    df_classification
    .select_dtypes(include=['int64', 'float64'])
    .columns
    .tolist()
)
categorical_vars = (
    df_classification
    .select_dtypes(include=['object'])
    .columns
    .tolist()
)

print(f"\nNumerical Variables (Continuous/Discrete): {len(numerical_vars)}")
for var in numerical_vars:
    unique_count = df_classification[var].nunique()
    # Heuristic: more than 20 distinct values is reported as continuous
    if unique_count > 20:
        var_type = "Continuous"
    else:
        var_type = "Discrete"
    print(f"  - {var}: {var_type} (Unique values: {unique_count})")

print(f"\nCategorical Variables (Nominal/Ordinal): {len(categorical_vars)}")
for var in categorical_vars:
    unique_count = df_classification[var].nunique()
    # Every object column is labelled nominal; no ordinal detection is attempted
    print(f"  - {var}: Nominal (Unique values: {unique_count})")

Distribution of Target Variables

# Class balance of the target: absolute counts first, then shares in percent
print("Target Variable: loan_approval_status")
print("=" * 60)
print("\nValue Counts:")
print(df_classification['loan_approval_status'].value_counts())
print("\nPercentage Distribution:")
print(df_classification['loan_approval_status'].value_counts(normalize=True).mul(100))

# Bar chart of the class counts, drawn on a dedicated 10x6 inch figure
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='loan_approval_status', data=df_classification, palette='Set2')
ax.set_title('Distribution of Loan Approval Status', fontsize=14, fontweight='bold')
ax.set_xlabel('Loan Approval Status (0=Rejected, 1=Approved)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

# Annotate each bar with its integer count, offset slightly above the bar
for container in ax.containers:
    ax.bar_label(container, fmt='%d', padding=3)

plt.tight_layout()
plt.show()