In [15]:
import joblib
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [16]:
!pip install openpyxl



In [17]:
# Load the credit-scoring workbook (path is relative to the notebook's working directory).
dataset = pd.read_excel(io='a_Dataset_CreditScoring.xlsx')

In [18]:
# Number of rows, then number of columns, in the dataset (3000 rows and 30 columns).
dataset.shape

(3000, 30)

In [19]:
# Preview the first 5 records of the dataset.
dataset.head(n=5)

Unnamed: 0,TARGET,ID,DerogCnt,CollectCnt,BanruptcyInd,InqCnt06,InqTimeLast,InqFinanceCnt24,TLTimeFirst,TLTimeLast,...,TL50UtilCnt,TLBalHCPct,TLSatPct,TLDel3060Cnt24,TLDel90Cnt24,TLDel60CntAll,TLOpenPct,TLBadDerogCnt,TLDel60Cnt24,TLOpen24Pct
0,1,582,3,3,0,4,0.0,5,117,27,...,3.0,0.9179,0.2083,2,3,7,0.2083,4,4,0.0
1,1,662,15,9,0,3,1.0,3,14,14,...,1.0,0.8,0.0,0,0,0,1.0,12,0,1.0
2,1,805,0,0,0,1,5.0,1,354,7,...,5.0,0.3552,0.6538,0,1,1,0.7308,1,1,0.5263
3,1,1175,8,5,0,6,1.0,10,16,4,...,3.0,0.9127,0.25,1,1,1,0.75,7,1,1.3333
4,1,1373,3,1,0,9,0.0,8,130,52,...,1.0,1.2511,0.0,0,1,4,0.1429,3,1,0.0


In [20]:
# Drop the customer ID column so that only the relevant columns remain in the dataset.
dataset = dataset.drop(columns=['ID'])
# The column count goes from 30 to 29, confirming that exactly one column was discarded.
dataset.shape

(3000, 29)

In [21]:
# Count the 'not available' (NA) values per column, i.e. how many records (samples) are missing a value in each column.
dataset.isna().sum(axis=0)

TARGET               0
DerogCnt             0
CollectCnt           0
BanruptcyInd         0
InqCnt06             0
InqTimeLast        188
InqFinanceCnt24      0
TLTimeFirst          0
TLTimeLast           0
TLCnt03              0
TLCnt12              0
TLCnt24              0
TLCnt                3
TLSum               40
TLMaxSum            40
TLSatCnt             4
TLDel60Cnt           0
TLBadCnt24           0
TL75UtilCnt         99
TL50UtilCnt         99
TLBalHCPct          41
TLSatPct             4
TLDel3060Cnt24       0
TLDel90Cnt24         0
TLDel60CntAll        0
TLOpenPct            3
TLBadDerogCnt        0
TLDel60Cnt24         0
TLOpen24Pct          3
dtype: int64

In [22]:
# Replace every missing value with the mean of its own column.
# NOTE(review): the means are computed over the whole dataset, i.e. before the
# train/test split that is performed further down in the notebook.
column_means = dataset.mean()
dataset = dataset.fillna(column_means)

In [23]:
# Verify the imputation: count the NA values per column again.
# Every column should now report 0 missing values.
dataset.isnull().sum()

TARGET             0
DerogCnt           0
CollectCnt         0
BanruptcyInd       0
InqCnt06           0
InqTimeLast        0
InqFinanceCnt24    0
TLTimeFirst        0
TLTimeLast         0
TLCnt03            0
TLCnt12            0
TLCnt24            0
TLCnt              0
TLSum              0
TLMaxSum           0
TLSatCnt           0
TLDel60Cnt         0
TLBadCnt24         0
TL75UtilCnt        0
TL50UtilCnt        0
TLBalHCPct         0
TLSatPct           0
TLDel3060Cnt24     0
TLDel90Cnt24       0
TLDel60CntAll      0
TLOpenPct          0
TLBadDerogCnt      0
TLDel60Cnt24       0
TLOpen24Pct        0
dtype: int64

In [24]:
# Separate the target from the features ahead of the train/test split.
# y: the 'TARGET' column (values 0 or 1) for all rows, converted to a NumPy array via .values.
y = dataset['TARGET'].values
# x: every remaining column for all rows, converted to a NumPy array.
# The columns are selected by name instead of a hard-coded positional range (1:29),
# so the feature matrix stays correct if the number or position of columns changes.
# Each row of x is a data point and each column is a feature used for prediction.
x = dataset.drop(columns='TARGET').values

In [25]:
# Show the dimensions of the target vector 'y' and of the feature matrix 'x' (rows, columns).
print(y.shape, x.shape, sep="\n")


(3000,)
(3000, 28)


In [26]:
# Split the dataset into training and testing sets in a 75:25 ratio (test_size=0.25).
# stratify=y keeps the proportion of the two target classes the same in both subsets,
# and random_state=0 makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25,random_state=0,stratify=y)

In [27]:
# Z-score normalisation of the features with StandardScaler (from sklearn.preprocessing).
# The scaler is fitted on the training data only and then applied to both subsets.
# NOTE: this cell overwrites x_train / x_test, so re-running it on its own would
# standardise already-standardised data and refit the scaler on it; re-run from the
# split cell instead.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)


In [28]:
# Save the fitted scaler (the normalisation coefficients) for later use at prediction time.
# joblib is imported with the other imports at the top of the notebook.
# NOTE(review): the destination is an absolute, machine-specific path; consider making it configurable.
joblib.dump(sc, r'C:\Users\nahso\PycharmProjects\Credit Scoring Model\futureuse_normalisation_creditscoring.pkl')

['C:\\Users\\nahso\\PycharmProjects\\Credit Scoring Model\\futureuse_normalisation_creditscoring.pkl']

In [29]:
# Train the classifier and predict the labels of the test data.

# LogisticRegression comes from sklearn.linear_model. fit() trains it on the training
# features (x_train, the independent variables) and their labels (y_train, the dependent
# variable, here 0 or 1), and returns the fitted estimator.
classifier = LogisticRegression().fit(x_train, y_train)
# Predicted labels 'y_pred' for the held-out test features 'x_test'.
y_pred = classifier.predict(x_test)

In [30]:
# Save the trained logistic-regression classifier for later use in prediction.
# joblib does not need to be imported here: it is already imported earlier in the notebook.
# NOTE(review): the destination is an absolute, machine-specific path.
joblib.dump(
    value=classifier,
    filename=r'C:\Users\nahso\PycharmProjects\Credit Scoring Model\futureuse_Classifier_CreditScoring.pkl',
)


['C:\\Users\\nahso\\PycharmProjects\\Credit Scoring Model\\futureuse_Classifier_CreditScoring.pkl']

In [31]:
# Model performance evaluation.
# First, the confusion matrix of actual test labels versus predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))


[[610  15]
 [104  21]]


In [32]:
# Accuracy of the model on the test data: first as a fraction, then as a percentage.
# The score is computed once and reused for both lines.
accuracy = accuracy_score(y_test, y_pred)

print(f"The fractional accuracy is:{accuracy}")
print(f"The accuracy percentage is:{accuracy*100}")

The fractional accuracy is:0.8413333333333334
The accuracy percentage is:84.13333333333334


In [33]:
# Probability scores for each test datapoint, returned as an array with one row per
# datapoint and one column per class (class 0, then class 1). These are the scores behind
# the binary 1/0 classification made by the logistic-regression model.

predictions = classifier.predict_proba(x_test)
predictions

array([[0.93243837, 0.06756163],
       [0.91599714, 0.08400286],
       [0.91140794, 0.08859206],
       ...,
       [0.88593837, 0.11406163],
       [0.88062764, 0.11937236],
       [0.53335147, 0.46664853]])

In [34]:
# Writing the model output file.

# DataFrame built from the 'predictions' array of the cell above: the probabilities of each
# test sample belonging to class 0 and class 1, in columns 'Probability 0' and 'Probability 1'.
df_pred_prob = pd.DataFrame(predictions, columns = ['Probability 0', 'Probability 1'])

# DataFrame of the class labels predicted by the classifier (labels, not probabilities),
# in a single column 'Predicted Target'.
df_pred_target = pd.DataFrame(classifier.predict(x_test), columns = ['Predicted Target'])

# DataFrame of the actual class labels of the test data ('y_test'), in column 'Actual Outcome'.
df_test_dataset = pd.DataFrame(y_test,columns= ['Actual Outcome'])


# Concatenate the three DataFrames along the columns (axis=1) into a single DataFrame 'df_x'
# holding, for each test sample, the actual outcome, the predicted probabilities and the
# predicted class label.
df_x=pd.concat([df_test_dataset, df_pred_prob, df_pred_target], axis=1)

# Save 'df_x' as a comma-separated, UTF-8 encoded CSV file; the '.csv' extension of the
# file name matches the format produced by 'to_csv'.
# NOTE(review): the destination is an absolute, machine-specific path.
df_x.to_csv(r"C:\Users\nahso\PycharmProjects\Credit Scoring Model\Model_Prediction.csv", sep=',', encoding='UTF-8')
# Show the first 5 rows of the merged DataFrame.
df_x.head()

Unnamed: 0,Actual Outcome,Probability 0,Probability 1,Predicted Target
0,1,0.932438,0.067562,0
1,0,0.915997,0.084003,0
2,0,0.911408,0.088592,0
3,0,0.84825,0.15175,0
4,0,0.809511,0.190489,0


In [None]:
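#We now have the model's predictions for the test data. For instance, all of the first 5 samples shown above are predicted as class 0 (read here as not eligible for loans - NOTE(review): confirm which TARGET value means 'not eligible').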