In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

In [6]:
!pip install openpyxl



In [7]:
dataset=pd.read_excel('a_Dataset_CreditScoring.xlsx')

In [8]:
dataset.shape

(3000, 30)

In [9]:
dataset.isna().sum()

TARGET               0
ID                   0
DerogCnt             0
CollectCnt           0
BanruptcyInd         0
InqCnt06             0
InqTimeLast        188
InqFinanceCnt24      0
TLTimeFirst          0
TLTimeLast           0
TLCnt03              0
TLCnt12              0
TLCnt24              0
TLCnt                3
TLSum               40
TLMaxSum            40
TLSatCnt             4
TLDel60Cnt           0
TLBadCnt24           0
TL75UtilCnt         99
TL50UtilCnt         99
TLBalHCPct          41
TLSatPct             4
TLDel3060Cnt24       0
TLDel90Cnt24         0
TLDel60CntAll        0
TLOpenPct            3
TLBadDerogCnt        0
TLDel60Cnt24         0
TLOpen24Pct          3
dtype: int64

In [10]:
dataset=dataset.fillna(dataset.mean())

In [11]:
dataset.isna().sum()

TARGET             0
ID                 0
DerogCnt           0
CollectCnt         0
BanruptcyInd       0
InqCnt06           0
InqTimeLast        0
InqFinanceCnt24    0
TLTimeFirst        0
TLTimeLast         0
TLCnt03            0
TLCnt12            0
TLCnt24            0
TLCnt              0
TLSum              0
TLMaxSum           0
TLSatCnt           0
TLDel60Cnt         0
TLBadCnt24         0
TL75UtilCnt        0
TL50UtilCnt        0
TLBalHCPct         0
TLSatPct           0
TLDel3060Cnt24     0
TLDel90Cnt24       0
TLDel60CntAll      0
TLOpenPct          0
TLBadDerogCnt      0
TLDel60Cnt24       0
TLOpen24Pct        0
dtype: int64

In [12]:
y=dataset.iloc[:,0].values
x=dataset.iloc[:,1:29].values

In [13]:
print(y.shape)
print(x.shape)

(3000,)
(3000, 28)


In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25,random_state=0,stratify=y)

In [15]:
sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

In [16]:
import joblib
joblib.dump(sc, r'C:\Users\gowth\OneDrive\Desktop\CODE-ALPHA\futureuse_normalisation_creditscoring.pkl')

['C:\\Users\\gowth\\OneDrive\\Desktop\\CODE-ALPHA\\futureuse_normalisation_creditscoring.pkl']

In [17]:
classifier =  LogisticRegression()#This initialises an object 'classifier' from the sklearn library or the sklearn.linear_model , to be more specific
classifier.fit(x_train, y_train)#This is to train the classifier object on  training data  ,as,x_train contains the feature data (independent variables) and y_train contains the corresponding labels/targets (dependent variable,here 0 or 1).
#The fit method adjusts weights of the model and minimises the error in predicting y_train from x_train.It is a commonly used method
y_pred = classifier.predict(x_test)#

In [18]:
joblib.dump(classifier, r'C:\Users\gowth\OneDrive\Desktop\CODE-ALPHA\futureuse_Classifier_CreditScoring.pkl')


['C:\\Users\\gowth\\OneDrive\\Desktop\\CODE-ALPHA\\futureuse_Classifier_CreditScoring.pkl']

In [19]:
print(confusion_matrix(y_test,y_pred))

[[609  16]
 [105  20]]


In [20]:

print(f"The fractional accuracy is:{accuracy_score(y_test, y_pred)}")
print(f"The accuracy percentage is:{accuracy_score(y_test, y_pred)*100}")

The fractional accuracy is:0.8386666666666667
The accuracy percentage is:83.86666666666667


In [21]:

predictions = classifier.predict_proba(x_test)
predictions

array([[0.93489876, 0.06510124],
       [0.90853572, 0.09146428],
       [0.90968011, 0.09031989],
       ...,
       [0.87596545, 0.12403455],
       [0.87717214, 0.12282786],
       [0.53797816, 0.46202184]])

In [22]:
# writing model output file

df_pred_prob = pd.DataFrame(predictions, columns = ['Probability 0', 'Probability 1'])#This line creates a DataFrame from the 'predictions' array in the above cell, which contains the probabilities of each test sample belonging to class/category 0 and class/category 1. Columns are named 'Probability 0' and 'Probability 1'.

df_pred_target = pd.DataFrame(classifier.predict(x_test), columns = ['Predicted Target'])#This creates another DataFrame from the 'predictions' made by the classifier (object) (i.e. class labels, not the probabilities), with a single column as 'Predicted Target'.

df_test_dataset = pd.DataFrame(y_test,columns= ['Actual Outcome'])#This DataFrame is made from 'y_test, which contains the actual class 'labels' or 'mappings' for the test data, named 'Actual Outcome'.


df_x=pd.concat([df_test_dataset, df_pred_prob, df_pred_target], axis=1)#This specific line concatenates the three DataFrames along the columns (axis=1), resulting in a single DataFrame 'df_x' that includes the actual outcomes, the predicted probabilities, and the predicted class labels for each test datapoint/sample.

df_x.to_csv(r"C:\Users\gowth\OneDrive\Desktop\CODE-ALPHA\Model_Prediction.csv", sep=',', encoding='UTF-8')
#This saves the DataFrame 'df_x' to a CSV file. The specific path and file name indicate it is saved as an 'Excel' file, but since the method 'to_csv' has been used,the file will actually be in CSV format, not XLSX.
#So can also save it as '.csv' for more clarity.
df_x.head()#To print the first few rows(5 specifically) of the merged dataframe

Unnamed: 0,Actual Outcome,Probability 0,Probability 1,Predicted Target
0,1.0,0.934899,0.065101,0.0
1,0.0,0.908536,0.091464,0.0
2,0.0,0.90968,0.09032,0.0
3,0.0,0.834471,0.165529,0.0
4,0.0,0.813557,0.186443,0.0
