In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

In [None]:
# Read the loan-approval records from the CSV file into a DataFrame
DATASET_PATH = 'loan_approval_dataset.csv'
data = pd.read_csv(DATASET_PATH)

In [None]:
# Preprocess the data
# Handling missing values (if any): forward-fill, so each gap takes the last
# valid value above it in the same column.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` argument of fillna is deprecated in recent pandas releases.
data = data.ffill()

In [None]:
# Preview the first five rows of the frame
data.head(n=5)

In [None]:
# Drop loan_id (not useful for prediction).
# errors='ignore' keeps this cell safe to re-run after the column is already gone.
data = data.drop(columns=['loan_id'], errors='ignore')

In [None]:
# Remove the loan_id identifier column (not useful for prediction) when it is
# present; otherwise report that it could not be found.
if 'loan_id' not in data.columns:
    print("Column 'loan_id' not found in the DataFrame.")
else:
    data = data.drop(columns='loan_id')

In [None]:
# Handle missing values by forward-filling (swap in a median/mean fill per
# feature here if that suits the data better).
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` argument of fillna is deprecated in recent pandas releases.
data = data.ffill()

# Label encoder for the categorical variables
le = LabelEncoder()

In [None]:
# Inspect the column labels of the frame
column_labels = data.columns
print(column_labels)

In [None]:
# Normalise the headers: remove leading/trailing whitespace from every column name
data.columns = data.columns.str.strip()

# Integer-encode the two categorical feature columns; the single encoder is
# refitted on each column in turn.
le = LabelEncoder()
for column_name in ('education', 'self_employed'):
    data[column_name] = le.fit_transform(data[column_name])


In [None]:
# Integer-encode the target column 'loan_status': fit the encoder on the
# column's values, then apply the learned mapping to them.
status_values = data['loan_status']
data['loan_status'] = le.fit(status_values).transform(status_values)

In [None]:
# Make sure the column names carry no surrounding whitespace
data.columns = data.columns.str.strip()

# Encode the target variable only when the column is actually present;
# otherwise tell the user it is missing.
if 'loan_status' not in data.columns:
    print("Column 'loan_status' not found in the DataFrame. Please check your data.")
else:
    data['loan_status'] = le.fit_transform(data['loan_status'])

In [None]:
# Separate the target (y) from the predictor columns (X)
y = data['loan_status']
X = data.drop(columns=['loan_status'])

In [None]:
# Hold out 30% of the rows as the test set; the fixed random_state makes the
# split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardise the features (especially for Logistic Regression): the scaling
# statistics are learned from the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Preprocessing for Logistic Regression: scale the numeric columns and one-hot
# encode the categorical ones. scikit-learn is already imported at the top of
# the notebook, so no install step is needed in this cell.
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Columns treated as categorical
categorical_features = ['education']  # Add any other categorical columns

# Numeric columns, excluding the categorical ones. 'education' has already been
# integer-encoded, so select_dtypes() reports it as numeric; without this filter
# it would reach the model twice (once scaled, once one-hot encoded).
numerical_features = [
    col for col in X_train.select_dtypes(include=['number']).columns
    if col not in categorical_features
]

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features),
    ])

# Fit on the training split only, then apply the same transformation to the test split
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

In [None]:
# 1. Logistic Regression Model: fit on the preprocessed training set, then
#    predict class labels for the preprocessed test set.
lr_model = LogisticRegression().fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)

In [None]:
# Evaluate Logistic Regression
print("Logistic Regression Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

# ROC-AUC measures how well the model ranks positives above negatives, so it
# must be computed from continuous scores (probability of the positive class),
# not from the thresholded 0/1 predictions.
y_score_lr = lr_model.predict_proba(X_test_scaled)[:, 1]
print("ROC-AUC Score:", roc_auc_score(y_test, y_score_lr))

In [None]:
# 2. Decision Tree Model
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

# Columns treated as categorical
categorical_features = ['education']  # Add other categorical columns here as needed

# Numeric columns, excluding the categorical ones. 'education' has already been
# integer-encoded, so select_dtypes() reports it as numeric; without this filter
# it would appear twice in the encoded matrix (raw and one-hot encoded).
numerical_features = [
    col for col in X_train.select_dtypes(include=['number']).columns
    if col not in categorical_features
]

# Create preprocessing pipeline (trees do not need scaling, so numeric columns pass through)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),  # Passthrough for numerical features
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features),
    ])

# Fit the preprocessor on X_train and transform it
X_train_encoded = preprocessor.fit_transform(X_train)

# Transform X_test using the fitted preprocessor
X_test_encoded = preprocessor.transform(X_test)

# Get feature names after transformation
feature_names = preprocessor.get_feature_names_out(input_features=X_train.columns)

# Wrap the arrays back into DataFrames so the columns keep readable names and
# the rows keep their original index
X_train_encoded = pd.DataFrame(X_train_encoded, columns=feature_names, index=X_train.index)
X_test_encoded = pd.DataFrame(X_test_encoded, columns=feature_names, index=X_test.index)

# Train the Decision Tree. A fixed random_state (same seed as the train/test
# split) makes the fitted tree, and therefore the metrics, reproducible across runs.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_encoded, y_train)

y_pred_dt = dt_model.predict(X_test_encoded)

In [None]:
# Evaluate Decision Tree
print("\nDecision Tree Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

# ROC-AUC must be computed from scores (probability of the positive class),
# not from the thresholded 0/1 predictions.
y_score_dt = dt_model.predict_proba(X_test_encoded)[:, 1]
print("ROC-AUC Score:", roc_auc_score(y_test, y_score_dt))

In [None]:
# prompt: visualize the statistics

import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data' DataFrame is already created and preprocessed as in the previous code

# Visualize the distribution of the target variable
plt.figure(figsize=(6, 4))
sns.countplot(x='loan_status', data=data)
plt.title('Distribution of Loan Status')
plt.show()

# Visualize the relationship between 'loan_status' and 'education'
plt.figure(figsize=(8, 6))
sns.countplot(x='education', hue='loan_status', data=data)
plt.title('Loan Status vs. Education')
plt.show()


# Visualize the correlation matrix
plt.figure(figsize=(10, 8))
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix of Features')
plt.show()


# Box plots to visualize the relationship between numerical features and the target variable.
# The names follow this dataset's columns ('income_annum', 'loan_amount', ...);
# any name absent from `data` is skipped instead of raising inside seaborn.
numerical_features = [
    name for name in ('income_annum', 'loan_amount', 'cibil_score')
    if name in data.columns
]
for col in numerical_features:
    plt.figure(figsize=(8, 6))
    sns.boxplot(x='loan_status', y=col, data=data)
    plt.title(f'Box Plot of {col} by Loan Status')
    plt.show()

# Confusion Matrix visualization (for Logistic Regression)
cm_lr = confusion_matrix(y_test, y_pred_lr)
plt.figure(figsize=(6, 4))
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Predicted 0', 'Predicted 1'],
            yticklabels=['Actual 0', 'Actual 1'])
plt.title('Confusion Matrix (Logistic Regression)')
plt.show()


# You can create similar visualizations for the Decision Tree model.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Expects the preprocessed 'data' DataFrame plus y_test / y_pred_lr from the cells above

# Class balance of the target variable
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='loan_status', data=data, ax=ax)
ax.set_title('Distribution of Loan Status')
plt.show()

# Loan status broken down by education level
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='education', hue='loan_status', data=data, ax=ax)
ax.set_title('Loan Status vs. Education')
plt.show()

# Pairwise correlations between all columns, drawn as an annotated heatmap
correlation_matrix = data.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Features')
plt.show()

# Histogram (with KDE overlay) of the 'income_annum' column
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data['income_annum'], kde=True, ax=ax)
ax.set_title('Distribution of Applicant Income (Annum)')
plt.show()


# Confusion matrix of the Logistic Regression predictions on the test set
cm_lr = confusion_matrix(y_test, y_pred_lr)
predicted_labels = ['Predicted 0', 'Predicted 1']
actual_labels = ['Actual 0', 'Actual 1']
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues',
            xticklabels=predicted_labels,
            yticklabels=actual_labels,
            ax=ax)
ax.set_title('Confusion Matrix (Logistic Regression)')
plt.show()


In [None]:
# One box plot per numerical feature, grouped by the target variable 'loan_status'
numerical_features = [
    'income_annum',
    'loan_amount',
    'cibil_score',
    'residential_assets_value',
    'commercial_assets_value',
    'luxury_assets_value',
    'bank_asset_value',
]
for col in numerical_features:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(x='loan_status', y=col, data=data, ax=ax)
    ax.set_title(f'Box Plot of {col} by Loan Status')
    plt.show()
