In [None]:
## CREDIT SCORING MODEL

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
import os
print(os.path.exists('credit_data.csv'))

True


In [3]:
data = pd.read_csv('credit_data.csv')

In [4]:
print(data.head())


  checking_balance  months_loan_duration credit_history    purpose  amount  \
0           < 0 DM                     6       critical   radio/tv    1169   
1       1 - 200 DM                    48         repaid   radio/tv    5951   
2          unknown                    12       critical  education    2096   
3           < 0 DM                    42         repaid  furniture    7882   
4           < 0 DM                    24        delayed  car (new)    4870   

  savings_balance employment_length  installment_rate personal_status  \
0         unknown           > 7 yrs                 4     single male   
1        < 100 DM         1 - 4 yrs                 2          female   
2        < 100 DM         4 - 7 yrs                 2     single male   
3        < 100 DM         4 - 7 yrs                 2     single male   
4        < 100 DM         1 - 4 yrs                 3     single male   

  other_debtors  ...                  property age  installment_plan  \
0          none  ...

In [5]:
print("the no of rows are {} and the no of cloumns are {}".format(data.shape[0],data.shape[1]))

the no of rows are 1000 and the no of cloumns are 21


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   checking_balance      1000 non-null   object
 1   months_loan_duration  1000 non-null   int64 
 2   credit_history        1000 non-null   object
 3   purpose               1000 non-null   object
 4   amount                1000 non-null   int64 
 5   savings_balance       1000 non-null   object
 6   employment_length     1000 non-null   object
 7   installment_rate      1000 non-null   int64 
 8   personal_status       1000 non-null   object
 9   other_debtors         1000 non-null   object
 10  residence_history     1000 non-null   int64 
 11  property              1000 non-null   object
 12  age                   1000 non-null   int64 
 13  installment_plan      1000 non-null   object
 14  housing               1000 non-null   object
 15  existing_credits      1000 non-null   i

In [7]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
months_loan_duration,1000.0,20.903,12.058814,4.0,12.0,18.0,24.0,72.0
amount,1000.0,3271.258,2822.736876,250.0,1365.5,2319.5,3972.25,18424.0
installment_rate,1000.0,2.973,1.118715,1.0,2.0,3.0,4.0,4.0
residence_history,1000.0,2.845,1.103718,1.0,2.0,3.0,4.0,4.0
age,1000.0,35.546,11.375469,19.0,27.0,33.0,42.0,75.0
existing_credits,1000.0,1.407,0.577654,1.0,1.0,1.0,2.0,4.0
default,1000.0,1.3,0.458487,1.0,1.0,1.0,2.0,2.0
dependents,1000.0,1.155,0.362086,1.0,1.0,1.0,1.0,2.0


In [8]:
if data.empty:
    print("dataset is empty")
else:
    print(data.isnull().sum())

checking_balance        0
months_loan_duration    0
credit_history          0
purpose                 0
amount                  0
savings_balance         0
employment_length       0
installment_rate        0
personal_status         0
other_debtors           0
residence_history       0
property                0
age                     0
installment_plan        0
housing                 0
existing_credits        0
default                 0
dependents              0
telephone               0
foreign_worker          0
job                     0
dtype: int64


In [9]:
numeric_columns = data.select_dtypes(include= [np.number]).columns 
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

In [10]:
categorical_columns = data.select_dtypes(include=[object]).columns
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

In [11]:
print(data.dtypes)

checking_balance        int64
months_loan_duration    int64
credit_history          int64
purpose                 int64
amount                  int64
savings_balance         int64
employment_length       int64
installment_rate        int64
personal_status         int64
other_debtors           int64
residence_history       int64
property                int64
age                     int64
installment_plan        int64
housing                 int64
existing_credits        int64
default                 int64
dependents              int64
telephone               int64
foreign_worker          int64
job                     int64
dtype: object


In [12]:
x = data.drop('credit_history', axis=1)
y = data['credit_history']

In [13]:
assert len(x) == len(y), "mismatch in number of rows between features and target variables"

In [14]:
try:
    x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.3, random_state=42)
    print("data split successfully!")
except Exception as e:
    print(f"error during data split: {e}")

data split successfully!


In [15]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)

In [16]:
try:
    clf.fit(x_train, y_train)
except ValueError as e:
    print(f"error during model training:{e}")

In [17]:
try:
    y_pred = clf.predict(x_test)
except  ValueError as e:
    print(f"error during prediction:{e}")

In [18]:
try:
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    print(f'accuracy: {accuracy}')
    print('confusion matrix:')
    print(conf_matrix)
    print('classification report:')
    print(class_report)
except ValueError as e:
    print(f"error during evaluation:{e}")

accuracy: 0.7333333333333333
confusion matrix:
[[ 62   2   0   0  18]
 [ 12   0   1   0  10]
 [  0   1   1   0   5]
 [  0   0   1   0  14]
 [ 14   1   0   1 157]]
classification report:
              precision    recall  f1-score   support

           0       0.70      0.76      0.73        82
           1       0.00      0.00      0.00        23
           2       0.33      0.14      0.20         7
           3       0.00      0.00      0.00        15
           4       0.77      0.91      0.83       173

    accuracy                           0.73       300
   macro avg       0.36      0.36      0.35       300
weighted avg       0.64      0.73      0.68       300

