# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve

# Data Collection and Analysis

In [None]:
# Lift the column-width limit so long text cells are shown in full
pd.set_option('display.max_colwidth', None)

# Read the column-description file that accompanies the loan data and show it
data_descriptions = pd.read_csv('./Datasets/LoanData/data_descriptions.csv')
data_descriptions

In [None]:
# Load the training data from train.csv and preview the first rows
loan_dataset = pd.read_csv("./Datasets/LoanData/train.csv")
loan_dataset.head()

In [None]:
# Shape of the dataset as (number of rows, number of columns)
loan_dataset.shape

In [None]:
# Basic information about the dataset: column names, non-null counts and dtypes
loan_dataset.info()

In [None]:
# Count the missing values in each column
loan_dataset.isna().sum()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns
loan_dataset.describe()

# Data Visualization

In [None]:
# Visualize the percentage of rows in each Default class
default_count = loan_dataset["Default"].value_counts()
default_pct = default_count / len(loan_dataset) * 100

# Build the wedge labels from the class values in the index instead of
# hard-coding their order: value_counts() sorts by frequency, so a fixed
# label list would be attached to the wrong wedges whenever class 1 is the
# more frequent one.
class_labels = {0: 'Loan Not Defaulted', 1: 'Loan Defaulted'}
pie_labels = [class_labels.get(cls, str(cls)) for cls in default_pct.index]

# Create pie chart
fig, ax = plt.subplots()
ax.pie(default_pct, labels=pie_labels, autopct='%1.1f%%')
ax.set_title('Loan Default Percentage')
plt.show()

In [None]:
# Bar chart of how many rows fall into each value of the target column "Default"
ax = sns.countplot(data=loan_dataset, x="Default")
ax.set(title="Default Distribution", xlabel="Default", ylabel="Count")
plt.show()

In [None]:
# Box plot of the Age column, one box per Default class
ax = sns.boxplot(data=loan_dataset, x='Default', y='Age')
ax.set(title='Age vs. Loan Default', xlabel='Default', ylabel='Age')
plt.show()

In [None]:
# Overlay the LoanAmount histograms of non-defaulted (0) and defaulted (1) loans
for default_flag, colour, legend_name in ((0, 'blue', 'No Default'), (1, 'red', 'Default')):
    amounts = loan_dataset[loan_dataset['Default'] == default_flag]['LoanAmount']
    amounts.hist(alpha=0.5, color=colour, bins=30, label=legend_name)

plt.title('Loan Amount Distribution by Default Status')
plt.xlabel('Loan Amount')
plt.ylabel('Frequency')
plt.legend()
plt.show()

In [None]:
# Row counts per 'Education' value, split by Default class
ax = sns.countplot(data=loan_dataset, x='Education', hue='Default')
ax.set(title='Education vs. Loan Default', xlabel='Education', ylabel='Count')
plt.show()

In [None]:
# Row counts per 'MaritalStatus' value, split by Default class
ax = sns.countplot(data=loan_dataset, x='MaritalStatus', hue='Default')
ax.set(title='Marital Status vs. Loan Default', xlabel='Marital Status', ylabel='Count')
plt.show()

In [None]:
# Row counts per 'EmploymentType' value, split by Default class
ax = sns.countplot(data=loan_dataset, x='EmploymentType', hue='Default')
ax.set(title='Employment Type vs. Loan Default', xlabel='Employment Type', ylabel='Count')
plt.show()

In [None]:
# Row counts per 'HasMortgage' value, split by Default class
ax = sns.countplot(data=loan_dataset, x='HasMortgage', hue='Default')
ax.set(title='Has Mortgage vs. Loan Default', xlabel='Has Mortgage', ylabel='Count')
plt.show()

In [None]:
# Row counts per 'HasDependents' value, split by Default class
ax = sns.countplot(data=loan_dataset, x='HasDependents', hue='Default')
ax.set(title='Has Dependents vs. Loan Default', xlabel='Has Dependents', ylabel='Count')
plt.show()

In [None]:
# Row counts per 'LoanPurpose' value, split by Default class
ax = sns.countplot(data=loan_dataset, x='LoanPurpose', hue='Default')
ax.set(title='Loan Purpose vs. Loan Default', xlabel='Loan Purpose', ylabel='Count')
plt.show()

In [None]:
# Row counts per 'HasCoSigner' value, split by Default class
ax = sns.countplot(data=loan_dataset, x='HasCoSigner', hue='Default')
ax.set(title='Has Co-Signer vs. Loan Default', xlabel='Has Co-Signer', ylabel='Count')
plt.show()

In [None]:
# Kernel-density curves of InterestRate, drawn separately for non-defaulted (0)
# and defaulted (1) loans on the same axes
plt.figure(figsize=(10,6))
for default_flag, colour, legend_name in ((0, 'blue', 'No Default'), (1, 'red', 'Default')):
    rates = loan_dataset[loan_dataset['Default'] == default_flag]['InterestRate']
    sns.kdeplot(rates, fill=True, color=colour, label=legend_name)

plt.title('Interest Rate Distribution by Default Status')
plt.xlabel('Interest Rate')
plt.ylabel('Density')
plt.legend()
plt.show()

## Encoding Data

In [None]:
# Convert categorical (object-dtype) columns to integer codes.
# One LabelEncoder is fitted per column and stored in `label_encoders`, keyed
# by column name, so every column's category -> code mapping stays available
# afterwards (e.g. via label_encoders[col].classes_). A single shared encoder
# that is refit for each column only remembers the last column it saw.
label_encoders = {}
for column in loan_dataset.columns:
    if loan_dataset[column].dtype == 'object':
        le = LabelEncoder()
        loan_dataset[column] = le.fit_transform(loan_dataset[column])
        label_encoders[column] = le

In [None]:
# Preview the first rows after the encoding step
loan_dataset.head()

## Correlation

In [None]:
# Remove the LoanID column (presumably a per-row identifier) before the
# correlation analysis and modelling
loan_dataset = loan_dataset.drop(columns="LoanID")

In [None]:
# Pairwise correlation between all remaining columns
loan_dataset.corr()

In [None]:
# Draw the pairwise correlation matrix as an annotated heatmap
corr_matrix = loan_dataset.corr()

plt.figure(figsize=(12,8))
ax = sns.heatmap(
    data=corr_matrix,
    annot=True,
    fmt=".3f",
    annot_kws={"size": 8},
    cmap="Blues",
)
ax.set_title("Correlation")
plt.show()

## Separate Data into Features and label

In [None]:
# y is the target column "Default"; x is every other column
y = loan_dataset["Default"]
x = loan_dataset.drop(columns="Default")

## Imbalanced Data Handling

In [None]:
# Number of rows per target class before resampling
y.value_counts()

In [None]:
# Bar chart of the class counts before resampling
class_counts = y.value_counts()
ax = class_counts.plot.bar()
ax.set_title("Value Counts (Before Smote)")
plt.show()

In [None]:
# SMOTE oversampler. The seed is fixed so the synthetic samples (and therefore
# every downstream metric) are reproducible, matching the random_state=42 used
# for the split and the models in the rest of the notebook.
smote = SMOTE(random_state=42)

In [None]:
# Resample the features and labels with SMOTE.
# NOTE(review): the whole dataset is resampled here, before the train/test
# split performed further down, so synthetic rows can end up in the test
# split — consider resampling only the training portion.
x_smote, y_smote = smote.fit_resample(x, y)

In [None]:
# Number of rows per target class after resampling
y_smote.value_counts()

In [None]:
# Bar chart of the class counts after resampling
resampled_counts = y_smote.value_counts()
ax = resampled_counts.plot.bar()
ax.set_title("Value Counts (After Smote)")
plt.show()

In [None]:
# From here on, work with the resampled features and labels
x, y = x_smote, y_smote

## Data Standardization

In [None]:
# Scaler used to standardize the features; it is fitted in the next cell
scaler = StandardScaler()

In [None]:
# Fit the scaler on the resampled features and replace x with the scaled values.
# NOTE(review): the scaler is fitted before the train/test split, so rows that
# later land in the test split also contribute to the scaling statistics.
x = scaler.fit_transform(x_smote)

In [None]:
# Display the standardized feature values
x

## Split Dataset Into Train and Test

In [None]:
# Hold out 20% of the rows as a test split, stratified on y, with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Train and Evaluate the Models

### Logistic Regression Model

In [None]:
# Train a logistic-regression classifier on the training split
logistic_model = LogisticRegression(random_state=42).fit(x_train, y_train)

# Predict both splits so train and test accuracy can be compared
y_pred_train_lr, y_pred_test_lr = (
    logistic_model.predict(features) for features in (x_train, x_test)
)

# Report accuracy on both splits and the per-class metrics on the test split
train_accuracy_lr = accuracy_score(y_train, y_pred_train_lr)
test_accuracy_lr = accuracy_score(y_test, y_pred_test_lr)
test_report_lr = classification_report(y_test, y_pred_test_lr)

print("Logistic Regression:")
print("Train Accuracy:", train_accuracy_lr)
print("Test Accuracy:", test_accuracy_lr)
print("Classification Report for Test Data:")
print(test_report_lr)

### Decision Tree Model

In [None]:
# Train a decision-tree classifier on the training split
dt_model = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)

# Predict both splits so train and test accuracy can be compared
y_pred_train_dt, y_pred_test_dt = (
    dt_model.predict(features) for features in (x_train, x_test)
)

# Report accuracy on both splits and the per-class metrics on the test split
train_accuracy_dt = accuracy_score(y_train, y_pred_train_dt)
test_accuracy_dt = accuracy_score(y_test, y_pred_test_dt)
test_report_dt = classification_report(y_test, y_pred_test_dt)

print("\nDecision Tree:")
print("Train Accuracy:", train_accuracy_dt)
print("Test Accuracy:", test_accuracy_dt)
print("Classification Report for Test Data:")
print(test_report_dt)

### Random Forest model

In [None]:
# Train a random-forest classifier on the training split
random_forest_model = RandomForestClassifier(random_state=42).fit(x_train, y_train)

# Predict both splits so train and test accuracy can be compared
y_pred_train_rf, y_pred_test_rf = (
    random_forest_model.predict(features) for features in (x_train, x_test)
)

# Report accuracy on both splits, then the per-class metrics and the
# confusion matrix on the test split
train_accuracy_rf = accuracy_score(y_train, y_pred_train_rf)
test_accuracy_rf = accuracy_score(y_test, y_pred_test_rf)
test_report_rf = classification_report(y_test, y_pred_test_rf)
test_confusion_rf = confusion_matrix(y_test, y_pred_test_rf)

print("\nRandom Forest:")
print("Train Accuracy:", train_accuracy_rf)
print("Test Accuracy:", test_accuracy_rf)
print("Classification Report for Test Data:")
print(test_report_rf)
print("Confusion Matrix for Test Data:")
print(test_confusion_rf)

#### Cross Validation

In [None]:
# 5-fold cross-validated accuracy of the Random Forest model on the full
# (resampled, standardized) data. This section evaluates random_forest_model;
# xgb_model is only created in the later XGBoost section.
cv_scores = cross_val_score(random_forest_model, x, y, cv=5, scoring='accuracy')

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

#### Hyperparameter Optimization

In [None]:
"""

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

grid_search = GridSearchCV(estimator=random_forest_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

# Train the model with the best parameters
random_forest_model = grid_search.best_estimator_
random_forest_model.fit(x_train, y_train)

# Evaluate the model
y_pred_train_rf = random_forest_model.predict(x_train)
y_pred_test_rf = random_forest_model.predict(x_test)

train_accuracy = accuracy_score(y_train, y_pred_train_rf)
test_accuracy = accuracy_score(y_test, y_pred_test_rf)

print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

"""

In [None]:
# Show the Random Forest test-split confusion matrix as an annotated heatmap
rf_confusion = confusion_matrix(y_test, y_pred_test_rf)
ax = sns.heatmap(data=rf_confusion, annot=True, fmt="d", cmap="Blues")
ax.set(title="Confusion Matrix Random Forest Model", xlabel="Predicted", ylabel="Actual")
plt.show()

In [None]:
# ROC curve of the Random Forest model on the test split.
# Column 1 of predict_proba is used as the score for the positive class.
y_test_prob = random_forest_model.predict_proba(x_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_test_prob)
fpr, tpr, _ = roc_curve(y_test, y_test_prob)

ax = plt.gca()
ax.plot(fpr, tpr, color='blue', label=f'ROC Curve (area = {roc_auc:.2f})')
# Diagonal reference line
ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve (Random Forest Model)')
ax.legend()
plt.show()

### XGBoost Model

In [None]:
# Train an XGBoost classifier on the training split
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(x_train, y_train)

# Predictions on both splits so train and test accuracy can be compared
y_pred_train_xgb = xgb_model.predict(x_train)
y_pred_test_xgb = xgb_model.predict(x_test)

# Accuracy, classification report and confusion matrix for XGBoost
print("\nXGBoost:")
print("Train Accuracy:", accuracy_score(y_train, y_pred_train_xgb))
print("Test Accuracy:", accuracy_score(y_test, y_pred_test_xgb))
print("Classification Report for Test Data:")
print(classification_report(y_test, y_pred_test_xgb))
print("Confusion Matrix for Test Data:")
print(confusion_matrix(y_test, y_pred_test_xgb))

#### Cross Validation

In [None]:
# 5-fold cross-validated accuracy of the XGBoost model on the full feature matrix
cv_scores = cross_val_score(
    xgb_model,
    x,
    y,
    scoring='accuracy',
    cv=5,
)
mean_cv_score = cv_scores.mean()

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {mean_cv_score}")

#### Hyperparameter Optimization

In [None]:
"""

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

xgb_model = grid_search.best_estimator_

# Evaluate the model with the best parameters
y_pred_train_xgb = xgb_model.predict(x_train)
y_pred_test_xgb = xgb_model.predict(x_test)

train_accuracy = accuracy_score(y_train, y_pred_train_xgb)
test_accuracy = accuracy_score(y_test, y_pred_test_xgb)

print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

"""

In [None]:
# Show the XGBoost test-split confusion matrix as an annotated heatmap
xgb_confusion = confusion_matrix(y_test, y_pred_test_xgb)
ax = sns.heatmap(data=xgb_confusion, annot=True, fmt="d", cmap="Blues")
ax.set(title="Confusion Matrix XGBoost Model", xlabel="Predicted", ylabel="Actual")
plt.show()

In [None]:
# ROC curve of the XGBoost model on the test split.
# Column 1 of predict_proba is used as the score for the positive class.
y_test_prob = xgb_model.predict_proba(x_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_test_prob)
fpr, tpr, _ = roc_curve(y_test, y_test_prob)

ax = plt.gca()
ax.plot(fpr, tpr, color='blue', label=f'ROC Curve (area = {roc_auc:.2f})')
# Diagonal reference line
ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve (XGBoost Model)')
ax.legend()
plt.show()

# Extract and visualize feature importance

In [None]:
# Take the column names from x_smote, the feature frame as it was before
# standardization (x itself was reassigned to the scaler output earlier)
feature_names = x_smote.columns
feature_names

In [None]:
# Wrap the standardized values in a DataFrame so the columns carry the
# original feature names again (used below to label the importances)
x = pd.DataFrame(x, columns=feature_names)

### Random Forest Model

In [None]:
# Pair each feature name with the importance reported by the Random Forest model
importances = random_forest_model.feature_importances_
feature_importances = pd.DataFrame(
    {
        'Feature': x.columns,
        'Importance': importances,
    }
)
# Printed in original column order, before sorting
print(feature_importances)

# Order from most to least important for plotting
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the sorted importances
plt.figure(figsize=(12, 8))
ax = sns.barplot(data=feature_importances, x='Importance', y='Feature', orient="h")
ax.set_title('Feature Importances in Random Forest Model')
plt.show()

### XGBoost Model 

In [None]:
# Pair each feature name with the importance reported by the XGBoost model
importances = xgb_model.feature_importances_
feature_importances = pd.DataFrame(
    {
        'Feature': x.columns,
        'Importance': importances,
    }
)
# Printed in original column order, before sorting
print(feature_importances)

# Order from most to least important for plotting
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the sorted importances
plt.figure(figsize=(12, 8))
ax = sns.barplot(data=feature_importances, x='Importance', y='Feature', orient="h")
ax.set_title('Feature Importances in XGBoost Model')
plt.show()

## Making a Predictive System

In [None]:
# Feature values for a single instance.
# NOTE(review): assumed to be listed in the same order as the training
# feature columns — confirm against x_smote.columns.
input_data = (33,65056,91351,423,42,2,3.87,60,0.32,2,0,1,1,1,1,0)

# Put the single instance into a one-row DataFrame that carries the training
# column names. The scaler was fitted on a DataFrame, so passing a bare array
# triggers scikit-learn's missing-feature-names warning; building the frame
# also fails loudly if the number of values does not match the feature count.
input_frame = pd.DataFrame([input_data], columns=x_smote.columns)

# standardize the input data with the already-fitted scaler
std_data = scaler.transform(input_frame)

prediction = xgb_model.predict(std_data)
print(prediction)

# Class 0 is treated as "no default"; any other prediction as "default"
if prediction[0] == 0:
    print("The person will not default on the loan.")
else:
    print("The person will default on the loan.")

## Make Predictions (test.csv)

In [None]:
# Load the test data from test.csv and preview the first rows
loan_test_dataset = pd.read_csv("./Datasets/LoanData/test.csv")
loan_test_dataset.head()

In [None]:
# Shape of the test dataset as (number of rows, number of columns)
loan_test_dataset.shape

In [None]:
# Count the missing values in each column of the test dataset
loan_test_dataset.isna().sum()

In [None]:
# Basic information about the test dataset: column names, non-null counts and dtypes
loan_test_dataset.info()

In [None]:
# Work on a version of the test frame without the LoanID column;
# loan_test_dataset itself keeps the column
loan_test_data = loan_test_dataset.drop(columns="LoanID")

In [None]:
# Encode the object-dtype columns of the test frame as integer codes.
# NOTE(review): the encoder is fitted on the test data itself rather than
# reusing a fit from the training data, so the codes only line up with the
# training codes if both files contain the same category values per column —
# confirm.
le = LabelEncoder()


def _encode_if_object(column):
    # Numeric columns pass through unchanged
    if column.dtype == 'object':
        return le.fit_transform(column)
    return column


loan_test_data = loan_test_data.apply(_encode_if_object)

In [None]:
# Preview the encoded test data
loan_test_data.head()

In [None]:
# Take an independent copy of the test features. Later cells add a
# "Predicted Defaults" column to loan_test_data; without the copy, x_new would
# be the same object and would gain that extra column too, breaking any
# re-run of the scaling/prediction cells.
x_new = loan_test_data.copy()

## Test Data Standardization

In [None]:
# Standardize the test features with the scaler that was already fitted on the
# training features. Refitting here (fit_transform) would scale test.csv with
# its own mean and standard deviation, so the models would receive inputs on a
# different scale from the one they were trained on.
x_test_new = scaler.transform(x_new)
x_test_new

## Test Data Prediction (Random Forest Model)

In [None]:
# Predictions of "test.csv" dataset
predictions = random_forest_model.predict(x_test_new)
predictions

In [None]:
# Shape of the Random Forest prediction array
predictions.shape

In [None]:
# Store the Random Forest predictions as a new column of the encoded test frame
loan_test_data["Predicted Defaults"] = predictions

In [None]:
# Preview the encoded test frame with the prediction column
loan_test_data.head()

In [None]:
# Number of rows per predicted class (Random Forest)
loan_test_data["Predicted Defaults"].value_counts()

In [None]:
# Add the Random Forest predictions as a "Predicted Defaults" column to the
# original (un-encoded) "loan_test_dataset"
loan_test_dataset["Predicted Defaults"] = predictions

In [None]:
# Preview the original test frame with the prediction column
loan_test_dataset.head()

## Test Data Prediction (XGBoost Model)

In [None]:
# Predict the class of every row in the standardized "test.csv" features
# with the XGBoost model (rebinds `predictions`)
predictions = xgb_model.predict(x_test_new)
predictions

In [None]:
# Shape of the XGBoost prediction array
predictions.shape

In [None]:
# Store the XGBoost predictions in the encoded test frame; this replaces the
# Random Forest values written to the same column earlier
loan_test_data["Predicted Defaults"] = predictions

In [None]:
# Preview the encoded test frame with the XGBoost prediction column
loan_test_data.head()

In [None]:
# Number of rows per predicted class (XGBoost)
loan_test_data["Predicted Defaults"].value_counts()

In [None]:
# Write the XGBoost predictions to the "Predicted Defaults" column of
# "loan_test_dataset"; this replaces the Random Forest values stored under the
# same column name earlier
loan_test_dataset["Predicted Defaults"] = predictions

In [None]:
# Preview the original test frame with the XGBoost prediction column
loan_test_dataset.head()