## Import libraries

In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import seaborn as sns
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, recall_score, roc_curve, auc
import warnings
from sklearn.preprocessing import StandardScaler
# !pip install imbalanced-learn
from imblearn.over_sampling import SMOTE

In [6]:
warnings.filterwarnings('ignore')

### Load and prepreprocess the data

In [7]:
data = pd.read_csv('/kaggle/input/dataset/accepted_2007_to_2018Q4.csv')

In [8]:
a = data['installment']
a

0          123.03
1          820.28
2          432.66
3          829.90
4          289.91
            ...  
2260696    859.56
2260697    564.56
2260698    329.33
2260699       NaN
2260700       NaN
Name: installment, Length: 2260701, dtype: float64

In [9]:
print("Missing Values in Each Column:")
print(data.isnull().sum())

Missing Values in Each Column:
id                             0
member_id                2260701
loan_amnt                     33
funded_amnt                   33
funded_amnt_inv               33
                          ...   
settlement_status        2226455
settlement_date          2226455
settlement_amount        2226455
settlement_percentage    2226455
settlement_term          2226455
Length: 151, dtype: int64


- Identifying unique values in columns to get more knowledge about the data, which will facilitate in performing one-hot encoding efficiently

In [10]:
print(data['home_ownership'].unique(),
data['verification_status'].unique(),
data['application_type'].unique(),
data['purpose'].unique(),
data['term'].unique(),
data['grade'].unique())

['MORTGAGE' 'RENT' 'OWN' 'ANY' nan 'NONE' 'OTHER'] ['Not Verified' 'Source Verified' 'Verified' nan] ['Individual' 'Joint App' nan] ['debt_consolidation' 'small_business' 'home_improvement' 'major_purchase'
 'credit_card' 'other' 'house' 'vacation' 'car' 'medical' 'moving'
 'renewable_energy' 'wedding' 'educational' nan] [' 36 months' ' 60 months' nan] ['C' 'B' 'F' 'A' 'E' 'D' 'G' nan]


#### Perform mean of mode interpolation for the Null values (as mentioned in the paper)

In [11]:
column_modes = data.mode(dropna=True).iloc[0]    
data = data.fillna(column_modes)
data = data.fillna(data.mode().iloc[0])

In [12]:
data.isnull().sum().agg

<bound method Series.aggregate of id                             0
member_id                2260701
loan_amnt                      0
funded_amnt                    0
funded_amnt_inv                0
                          ...   
settlement_status              0
settlement_date                0
settlement_amount              0
settlement_percentage          0
settlement_term                0
Length: 151, dtype: int64>

#### Identify categorical columns and performing one-hot encoding to perform the resursive feature elimination

In [13]:
#X = data.drop('loan_status', axis=1)
#y = data['loan_status']

# # Identify categorical columns (change according to your dataset)
#categorical_columns = X.select_dtypes(include=['object']).columns

In [14]:
#print(categorical_columns)
#X = pd.get_dummies(X, columns=categorical_columns)

#### RFE (Recursive feature elimination)
- perform appropriate scaling and Recursive feature elimination to extract top 30 features.
- Here, randomforestclassifier is used as ML model for performing the recursive feature elimination

In [15]:
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # Initialize the base classifier
# classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# # Initialize RFE with the classifier, to reduce features to 30
# selector = RFE(estimator=classifier, n_features_to_select=30, step=1)

# # Fit RFE on the scaled and encoded dataset
# selector.fit(X_scaled, y)

# # Get the support array (mask) of selected features
# selected_features = X.columns[selector.support_]

# # Filter the dataset to keep only selected features
# X_selected = X[selected_features]

# # Output the selected features
# print("Selected features:", selected_features.tolist())

In [16]:
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.expand_frame_repr', False)
print(list(data.dtypes))
print(list(data.columns))


[dtype('O'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('O'), dtype('float64'), dtype('float64'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('float64'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('float64'), dtype('float64'), dtype('O'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('O'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('O'), dtype('float64'), dtype('O'), dtype('O'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('O'), dtype('float64'), dtype('float64'), dtype('O'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('fl

#### Note: 
- On further analysing the data, there were one more categorical value in the loan_status column which needs to handle correclty, which was not mentioned in the paper. The given value corelates with fully paid so I added that value with 0 labelling.

In [17]:
# count_charged_off = data[data['loan_status'] == 'Does not meet the credit policy. Status:Fully Paid'].shape[0]
# print(count_charged_off)
# data['loan_status'].isnull().sum()

#### Conversion: Installment -> Installment_feat
conversion of installment to installment feat, which is fraction of monthly installment with respect to the monthly income. Here we need to handle extreme division in the data values and also calculating the monthly income based other column.

In [18]:
condition = data['annual_inc'] == 0.0
data = data.drop(data[condition].index)
data['monthly_income'] = data['annual_inc'] / 12
print(data['annual_inc'].max(), data['monthly_income'].min(), data['monthly_income'].max())

110000000.0 0.03 9166666.666666666


#### Convert specified categorical columns into binary nominal columns
- Due to limitation of resources and cpu. I need to manually form the one hot encoding on the few columns named home_ownership, verification_status, application_Type, loan_Status, purpose, and term.
- First I have extracted all the unique values and then map each value to a new binary column.
- Grade column is mapped to numerical values corresponds to its alphabetic values.
- At last 30 columns have been extracted, according to the dataset.

In [19]:
# Calculate installment_feat
data['monthly_income'] = data['annual_inc'] / 12
data['installment_feat'] = (data['installment'] / data['monthly_income']) * 100

# Convert specified categorical columns into binary nominal columns
# Home Ownership
data['home_ownership_MORTGAGE'] = (data['home_ownership'] == 'MORTGAGE').astype(int)
data['home_ownership_RENT'] = (data['home_ownership'] == 'RENT').astype(int)
data['home_ownership_OWN'] = (data['home_ownership'] == 'OWN').astype(int)
data['home_ownership_ANY'] = (data['home_ownership'] == 'ANY').astype(int)

# Verification Status
data['verification_status_Not_Verified'] = (data['verification_status'] == 'Not Verified').astype(int)
data['verification_status_Source_Verified'] = (data['verification_status'] == 'Source Verified').astype(int)
data['verification_status_Verified'] = (data['verification_status'] == 'Verified').astype(int)

# Application Type
data['application_type_Individual'] = (data['application_type'] == 'Individual').astype(int)
data['application_type_Joint_App'] = (data['application_type'] == 'Joint App').astype(int)

# Purpose
data['purpose_major_purchase'] = (data['purpose'] == 'major_purchase').astype(int)
data['purpose_renewable_energy'] = (data['purpose'] == 'renewable_energy').astype(int)
data['purpose_small_business'] = (data['purpose'] == 'small_business').astype(int)
data['purpose_vacation'] = (data['purpose'] == 'vacation').astype(int)

# Term
data['term'] = data['term'].str.strip().str.lower()
data['term_36_months'] = (data['term'] == '36 months').astype(int)
data['term_60_months'] = (data['term'] == '60 months').astype(int)

status_mapping = {
    'Current': 0, 'Fully Paid': 0, 'Issued': 0, 'Does not meet the credit policy. Status:Fully Paid':0,
    'Default': 1, 'Charged Off': 1, 'In Grace Period': 1, 'Does not meet the credit policy. Status:Charged Off': 1,
    'Late (16-30 days)': 1, 'Late (31-120 days)': 1
}

# Apply the mapping to the loan_status column
data['loan_status_encoded'] = data['loan_status'].map(status_mapping)


grade_mapping = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}

data['grade'] = data['grade'].map(grade_mapping)

# Select only the required columns (existing + newly created)
columns_to_keep = [
    'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'installment',
    'grade', 'open_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
    'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'open_act_il',
    'delinq_amnt', 'num_op_rev_tl', 'home_ownership_MORTGAGE',
    'home_ownership_RENT', 'home_ownership_OWN', 'home_ownership_ANY',
    'verification_status_Not_Verified', 'verification_status_Source_Verified',
    'verification_status_Verified', 'application_type_Individual',
    'application_type_Joint_App', 'purpose_major_purchase', 'purpose_renewable_energy',
    'purpose_small_business', 'purpose_vacation', 'term_36_months', 'term_60_months',
    'installment_feat','loan_status_encoded'
]

data_final = data[columns_to_keep]

- working with installment and installment feat.

In [20]:
data_final.drop(['installment'],axis =1, inplace = True)
len(data_final.columns)
print(data_final.dtypes)

loan_amnt                              float64
funded_amnt                            float64
funded_amnt_inv                        float64
grade                                    int64
open_acc                               float64
out_prncp                              float64
out_prncp_inv                          float64
total_pymnt                            float64
total_pymnt_inv                        float64
total_rec_prncp                        float64
total_rec_int                          float64
open_act_il                            float64
delinq_amnt                            float64
num_op_rev_tl                          float64
home_ownership_MORTGAGE                  int64
home_ownership_RENT                      int64
home_ownership_OWN                       int64
home_ownership_ANY                       int64
verification_status_Not_Verified         int64
verification_status_Source_Verified      int64
verification_status_Verified             int64
application_t

In [21]:
data_final.to_csv('30_columns.csv')

In [None]:
df = pd.read_csv("/kaggle/working/30_columns.csv")
print(data_final.shape)
df = data_final

#### SMOTE 
- As mentioned above, the target variable ‘loans status’ has a large difference in the number of normal and
default categories, which will cause trouble to model learning. The method of oversampling is used to handle
sample imbalance problem, we adopt SMOTE (Synthetic Minority Oversampling Technique) method in this
paper


In [None]:
import matplotlib.pyplot as plt

def plot_class_distribution_pie(y):
    normal_percentage = (sum(y == 0) / len(y)) * 100
    default_percentage = (sum(y == 1) / len(y)) * 100

    labels = 'Normal', 'Default'
    sizes = [normal_percentage, default_percentage]
    colors = ['blue', 'red']
    explode = (0.1, 0)  

    # Plot
    plt.figure(figsize=(6, 4))
    plt.pie(sizes, explode=explode, labels=labels, colors=colors,
            autopct='%1.2f%%', shadow=True, startangle=140)
    plt.axis('equal')  
    plt.title('Percentage of Each Loan Status Before SMOTE')
    plt.show()

plot_class_distribution_pie(data_final['loan_status_encoded'])


In [None]:
# # Check for infinite values
# print("Infinite values in X_train:", np.isinf(X_train).sum().sum())
# print("Infinite values in X_test:", np.isinf(X_test).sum().sum())
# # Optionally, drop rows with NaNs if imputation isn't suitable
# # Replace infinities with NaN
# X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
# X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

# X_train.dropna(inplace=True)
# y_train = y_train[X_train.index]  # Make sure to align y_train with X_train after dropping

# X_test.dropna(inplace=True)
# y_test = y_test[X_test.index]  # Align y_test as well

# # Check again for any infinities or NaNs
# print("NaNs in X_train after processing:", X_train.isnull().sum().sum())
# print("Infinities in X_train after processing:", np.isinf(X_train).sum().sum())


In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Separate features and target
X = df.drop(['loan_status_encoded'], axis=1)  
y = df['loan_status_encoded']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize SMOTE and resample the data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [None]:
plot_class_distribution_pie(y_train_resampled)

In [None]:
print(X.shape)

#### Pearson correlation graph
- Due to resource constraints, performing RFE for all columns in not feasible and appropriate code has been added previously. here we had manually implmented the correlation matrix for the30 columns which are mentioned in the research paper. 

In [None]:
selected_columns = [
    'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
    'grade', 'open_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
    'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'open_act_il',
    'delinq_amnt', 'num_op_rev_tl', 'home_ownership_MORTGAGE',
    'home_ownership_RENT', 'home_ownership_OWN', 'home_ownership_ANY',
    'verification_status_Not_Verified', 'verification_status_Source_Verified',
    'verification_status_Verified', 'application_type_Individual',
    'application_type_Joint_App', 'purpose_major_purchase', 'purpose_renewable_energy',
    'purpose_small_business', 'purpose_vacation', 'term_36_months', 'term_60_months',
    'installment_feat'
]
len(selected_columns)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
selected_data_30 = df[selected_columns]
selected_data_30_scaled = scaler.fit_transform(selected_data_30)
selected_data_30_scaled = pd.DataFrame(selected_data_30_scaled, columns=selected_columns)

correlation_matrix_30 = selected_data_30_scaled.corr()

plt.figure(figsize=(15, 12))
sns.heatmap(correlation_matrix_30, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Pearson Correlation of 30 Selected Features")
plt.show()

#### Pearson correlation graph to Identify and remove highly correlated features.
- We plotted the Pearson correlation graph of the 30 features. 
On the basis of the first dimension reduction, redundant features are selected and eliminated by Pearso 
correlation graph, the dimension of features is reduced from 30 to 15

In [None]:
columns_to_remove = set()
correlation_threshold = 0.8

for i in range(len(correlation_matrix_30.columns)):
    for j in range(i):
        if abs(correlation_matrix_30.iloc[i, j]) > correlation_threshold:
            # Identify the feature to remove based on a criterion, such as lower variance
            if selected_data_30[correlation_matrix_30.columns[i]].var() > selected_data_30[correlation_matrix_30.columns[j]].var():
                columns_to_remove.add(correlation_matrix_30.columns[j])
            else:
                columns_to_remove.add(correlation_matrix_30.columns[i])

removed_due_to = {col: correlation_matrix_30[col][correlation_matrix_30[col] > correlation_threshold].index.tolist() 
                  for col in columns_to_remove}

# Retain the first 15 features after removing highly correlated ones
reduced_columns = [col for col in selected_columns if col not in columns_to_remove][:15]

# Get the final dataset with reduced features
final_data_15 = selected_data_30[reduced_columns]


In [None]:
# Calculate and plot the Pearson correlation for the 15 features
correlation_matrix_15 = final_data_15.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix_15, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Pearson Correlation of 15 Reduced Features")
plt.show()


#### Manual feature extraction
- As mentioned above, RFE was not feasible. so the top 15 columns will not match with the paper. here we have manually extracted 15 feature as mentioned in the paper. Also ensure to perform all the preprocessing mentioned in the paper.


In [None]:
#15 features
selected_features = [
    'loan_amnt', 'installment', 'grade', 'open_acc', 'total_pymnt', 'total_rec_int',
    'home_ownership_MORTGAGE', 'home_ownership_ANY', 
    'verification_status_Not_Verified', 'application_type_Individual',
    'purpose_major_purchase', 'purpose_renewable_energy', 'purpose_small_business', 'purpose_vacation',
    'term_36_months'
]


In [None]:
# Calculate and plot the Pearson correlation for the 15 features
correlation_matrix_15 = data[selected_features].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix_15, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Pearson Correlation of 15 Reduced Features")
plt.show()


#### The ranking of the importance of features
- As mentioned in the paper, plotting the ranking of importance of features. we adopt the Random Forest algorithm to rank the importance of features and reduce the learning 
difficulty to achieve the purpose of optimizing the model calculation

In [None]:
selected_features.remove('installment')
selected_features

In [None]:
X = data_final[selected_features]
y = data_final['loan_status_encoded']
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize SMOTE and apply it on the training data

# X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Scale the data - important to scale after resampling to avoid data leakage
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Load your dataset
data = pd.read_csv('/kaggle/working/30_columns.csv')
data = data[selected_columns]
# Assuming 'loan_status_encoded' is your target and other preprocessing has been done
# X = data.drop(['loan_status_encoded'], axis=1)
# y = data['loan_status_encoded']
# X = df['selected_features']

# Optionally scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Predict the test set results
y_pred = rf.predict(X_test)
y_pred_proba = rf.predict_proba(X_test)[:, 1]  # Probabilities for ROC-AUC

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f'Accuracy of Random Forest Classifier: {accuracy:.2f}')
print(f'ROC-AUC Score: {roc_auc:.2f}')

# Feature Importance Visualization
importances = rf.feature_importances_
indices = pd.DataFrame(importances, index=X.columns).sort_values(0, ascending=False)

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 8))
plt.title('Feature Importances by Random Forest')
plt.barh(indices.index, indices[0], color='b', align='center')
plt.xlabel('Relative Importance')
plt.show()


In [None]:
global_metrics = {}

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Train the classifier
rf_classifier.fit(X_train_scaled, y_train)

# Predict on the test data
rf_predictions = rf_classifier.predict(X_test_scaled)
rf_proba = rf_classifier.predict_proba(X_test_scaled)[:, 1]

# Calculate the evaluation metrics
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_auc = roc_auc_score(y_test, rf_proba)
rf_f1 = f1_score(y_test, rf_predictions)
rf_recall = recall_score(y_test, rf_predictions)

print(f"Random Forest Accuracy: {rf_accuracy}")
print(f"Random Forest AUC: {rf_auc}")
print(f"Random Forest F1-Score: {rf_f1}")
print(f"Random Forest Recall: {rf_recall}")


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Train the classifier
dt_classifier.fit(X_train_scaled, y_train)

# Predict on the test data
dt_predictions = dt_classifier.predict(X_test_scaled)
dt_proba = dt_classifier.predict_proba(X_test_scaled)[:, 1]

# Calculate the evaluation metrics
dt_accuracy = accuracy_score(y_test, dt_predictions)
dt_auc = roc_auc_score(y_test, dt_proba)
dt_f1 = f1_score(y_test, dt_predictions)
dt_recall = recall_score(y_test, dt_predictions)

print(f"Decision Tree Accuracy: {dt_accuracy}")
print(f"Decision Tree AUC: {dt_auc}")
print(f"Decision Tree F1-Score: {dt_f1}")
print(f"Decision Tree Recall: {dt_recall}")


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, recall_score

# Initialize the SVM classifier
svm_classifier = SVC(kernel='linear', probability=True, random_state=42)

# Train the classifier
svm_classifier.fit(X_train_scaled, y_train)

# Predict on the test data
svm_predictions = svm_classifier.predict(X_test_scaled)
svm_proba = svm_classifier.predict_proba(X_test_scaled)[:, 1]

# Calculate the evaluation metrics
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_auc = roc_auc_score(y_test, svm_proba)
svm_f1 = f1_score(y_test, svm_predictions)
svm_recall = recall_score(y_test, svm_predictions)

print(f"SVM Accuracy: {svm_accuracy}")
print(f"SVM AUC: {svm_auc}")
print(f"SVM F1-Score: {svm_f1}")
print(f"SVM Recall: {svm_recall}")

In [None]:
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression classifier
logreg_classifier = LogisticRegression(random_state=42)

# Train the classifier
logreg_classifier.fit(X_train_scaled, y_train)

# Predict on the test data
logreg_predictions = logreg_classifier.predict(X_test_scaled)
logreg_proba = logreg_classifier.predict_proba(X_test_scaled)[:, 1]

# Calculate the evaluation metrics
logreg_accuracy = accuracy_score(y_test, logreg_predictions)
logreg_auc = roc_auc_score(y_test, logreg_proba)
logreg_f1 = f1_score(y_test, logreg_predictions)
logreg_recall = recall_score(y_test, logreg_predictions)

print(f"Logistic Regression Accuracy: {logreg_accuracy}")
print(f"Logistic Regression AUC: {logreg_auc}")
print(f"Logistic Regression F1-Score: {logreg_f1}")
print(f"Logistic Regression Recall: {logreg_recall}")

In [None]:
import pandas as pd

# Create DataFrame from the metrics dictionary
results_df = pd.DataFrame(global_metrics).T

# Display the DataFrame
print(results_df)