In [9]:
#Imports the pandas library for data manipulation and analysis. We use it to work with data in tabular form.
import pandas as pd 

#This function is used to split the data into training and testing sets.
from sklearn.model_selection import train_test_split 

#It is used for encoding categorical variables into numerical values.
from sklearn.preprocessing import LabelEncoder 

#It's used to handle missing values by imputing (filling in) them with a specified strategy (in this case, the median).
from sklearn.impute import SimpleImputer 

# It is a machine learning model that can be used for classification tasks.
from sklearn.ensemble import RandomForestClassifier

#Importing functions and tools for evaluating the model's performance, such as accuracy and generating classification reports.
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Read the CSV file 'credit_risk_dataset.csv' (path is relative to the
# notebook's working directory) into a pandas DataFrame named `Data`.
Data = pd.read_csv('credit_risk_dataset.csv')

In [4]:
# Display the DataFrame for a first look; as the recorded output shows,
# pandas truncates a long frame to its first and last few rows.
Data

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.10,N,26


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the recorded output shows max person_age = 144 and
# max person_emp_length = 123, which look like data-entry outliers — confirm
# whether they should be filtered before modelling.
Data.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32581.0,32581.0,31686.0,32581.0,29465.0,32581.0,32581.0,32581.0
mean,27.7346,66074.85,4.789686,9589.371106,11.011695,0.218164,0.170203,5.804211
std,6.348078,61983.12,4.14263,6322.086646,3.240459,0.413006,0.106782,4.055001
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38500.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,79200.0,7.0,12200.0,13.47,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


In [6]:
# Count missing values per column. In the recorded output only
# person_emp_length (895) and loan_int_rate (3116) have gaps.
Data.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [7]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


# Step 1: Data Preprocessing

In [11]:
# Handling missing values
# Fill the gaps in both numeric columns with each column's own median.
# A single fit over the two-column frame learns one median per column and
# returns a 2-D result whose columns line up with `columns_to_impute`, so no
# reshape is needed and the imputer keeps the statistics for both columns
# instead of being refit (and overwritten) once per column.
# NOTE(review): the medians are learned from the full dataset, i.e. before the
# train/test split in Step 3, so test rows influence the imputed values.
# For a leakage-free evaluation, fit the imputer on the training split only.
columns_to_impute = ['loan_int_rate', 'person_emp_length']
imputer = SimpleImputer(strategy='median')
Data[columns_to_impute] = imputer.fit_transform(Data[columns_to_impute])

Here, I handle missing values in two specific columns, 'loan_int_rate' and 'person_emp_length', using the SimpleImputer. The strategy chosen is 'median', which means missing values will be filled with the median of the respective column. The fit_transform method is used to apply the imputation to the specified columns.

# Step 2: Data Encoding

In [13]:
# Encode each categorical column as integer codes.
# Every column gets its own LabelEncoder, stored in `label_encoders`, so the
# label-to-code mapping of each column stays available after the loop
# (via `label_encoders[column].classes_` / `.inverse_transform`). Refitting a
# single shared encoder would keep only the last column's mapping.
# NOTE(review): the integer codes imply an ordering; this is presumably
# acceptable for the tree-based model used below — confirm before reusing
# the encoded frame with linear or distance-based models.
categorical_columns = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    Data[column] = encoder.fit_transform(Data[column])
    label_encoders[column] = encoder

Categorical variables (non-numeric data) are encoded into numeric values using the LabelEncoder. This is necessary because most machine learning algorithms require numerical inputs. The code iterates through a list of categorical columns, encodes them using fit_transform, and replaces the original columns with the encoded values.

# Step 3: Data Splitting

In [15]:
# Target: the column to predict.
y = Data['loan_status']
# Features: every remaining column.
X = Data.drop(columns=['loan_status'])
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

This part prepares the data for model training and evaluation. It splits the dataset into features (X) and the target variable (y). X contains all columns except 'loan_status', which is the column we want to predict. Then, the data is split into training and testing sets using train_test_split. The test_size parameter specifies the proportion of data to be used for testing (in this case, 30%), and random_state sets the random seed for reproducibility.

# Step 4: Model Selection

In [16]:
# Random forest classifier with default hyperparameters.
# The seed is fixed (same value as the train/test split) so that the
# bootstrap sampling and feature selection — and therefore the reported
# metrics and feature importances — are reproducible across re-runs.
model = RandomForestClassifier(random_state=42)

Here, I create an instance of the RandomForestClassifier model. This model will be used for predicting the 'loan_status' based on the input features.

# Step 5: Model Training

In [17]:
# Fit the classifier on the training split only; X_test / y_test are not
# touched until the evaluation step.
model.fit(X_train, y_train)

The line above trains the machine learning model. It uses the training data (X_train and y_train) to learn the patterns that the model will later use to make predictions.

# Step 6: Model Evaluation

In [18]:
# Predict the held-out rows, then score the predictions against the true labels.
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 / support, plus the overall accuracy.
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Accuracy line first, then the report on the following lines.
print(f'Accuracy: {accuracy}', report, sep='\n')

Accuracy: 0.9289002557544757
              precision    recall  f1-score   support

           0       0.92      0.99      0.96      7613
           1       0.96      0.70      0.81      2162

    accuracy                           0.93      9775
   macro avg       0.94      0.85      0.89      9775
weighted avg       0.93      0.93      0.92      9775



In [None]:
# In the evaluation cell above, the trained model is used to make predictions
# on the testing data (X_test), and the results are stored in y_pred.
# (Kept as a comment: bare prose in a code cell raises a SyntaxError when the
# notebook is re-run top to bottom.)

# Step 7: Feature Importance

In [19]:
# Pair each importance score from the fitted forest with its feature name
# (the column order of X matches the order of model.feature_importances_).
feature_importance = pd.Series(
    data=model.feature_importances_,
    index=X.columns,
)

# Heading on the first line, the labelled scores below it.
print('Feature Importance:', feature_importance, sep='\n')

Feature Importance:
person_age                    0.044059
person_income                 0.149415
person_home_ownership         0.099729
person_emp_length             0.062965
loan_intent                   0.073023
loan_grade                    0.120856
loan_amnt                     0.072086
loan_int_rate                 0.105740
loan_percent_income           0.224974
cb_person_default_on_file     0.011897
cb_person_cred_hist_length    0.035256
dtype: float64


The Step 6 evaluation cell assesses the model's performance on the test data. The accuracy_score function calculates the accuracy of the model's predictions, and classification_report generates a detailed report that includes metrics like precision, recall, F1-score, and support for each class. Finally, the code prints the accuracy and the classification report to show how well the model performs.