# Step 1 Import Libraries

In [1]:
import joblib
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Step 2 Importing dataset

In [2]:
# Load the scorecard table from a CSV addressed by a relative path, i.e. it is
# resolved against the notebook's current working directory.
dataset = pd.read_csv(filepath_or_buffer=r'3.scoredcard_etl.csv')

# Step 3 Building Model

In [3]:
# Separate the features from the target (the actual train/test split happens
# in the next cell).
# Features: positional columns 1..28 as a NumPy array.
# NOTE(review): the hard-coded upper bound 29 silently drops any column at
# position 29 or later - confirm the file has exactly 28 feature columns.
X = dataset.iloc[:, 1:29].values

# Target: the first column (position 0) as a NumPy array.
y = dataset.iloc[:, 0].values

In [4]:
# Hold out 20% of the rows as the test set (80:20 train/test ratio).
# random_state is pinned so the shuffle, and therefore the split, is
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [5]:
# Feature standardisation: StandardScaler centres every column to zero mean and
# rescales it to unit variance (it does not map values into the [0, 1] range).
# The scaler learns its statistics from the training split only and the same
# statistics are then applied to both splits.
# NOTE: X_train / X_test are overwritten here, so re-running this cell without
# first re-running the split would re-fit the scaler on already-scaled data.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [6]:
# Persist the fitted scaler so that inputs at prediction time can be
# standardised with the same training-set statistics.
# The file is written relative to the current working directory; joblib.dump
# returns the list of written file names, which the notebook displays.
# (joblib is imported once in the notebook's import cell.)
joblib.dump(sc, r'scoredcard_normalisation')

['C:\\Users\\Yanhong\\Desktop\\python_projects\\scoredcard_normalisation']

In [7]:
# Risk model: logistic regression trained on the standardised training split,
# with the solver's iteration cap set to 200. fit() returns the estimator
# itself, so construction and training are chained.
classifier = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Hard class predictions for the held-out split; consumed by the evaluation
# cells below.
y_pred = classifier.predict(X_test)

In [8]:
# Persist the trained logistic-regression classifier for later use in
# prediction. The file is written relative to the current working directory;
# joblib.dump returns the list of written file names, which the notebook
# displays.
# (joblib is imported once in the notebook's import cell.)
joblib.dump(classifier, r'classifier_scorecard')

['C:\\Users\\Yanhong\\Desktop\\python_projects\\classifier_scorecard']

In [9]:
# Evaluation: confusion matrix of the actual test labels against the predicted
# labels (rows are actual classes, columns are predicted classes).
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[480  18]
 [ 85  17]]


In [10]:
# Overall accuracy: fraction of test rows whose predicted label matches the
# actual label.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8283333333333334


In [11]:
# Class-membership probabilities for every test row: one column per class, in
# the order given by classifier.classes_.
predictions = classifier.predict_proba(X=X_test)
predictions

array([[0.0377684 , 0.9622316 ],
       [0.92235643, 0.07764357],
       [0.69070075, 0.30929925],
       ...,
       [0.96608   , 0.03392   ],
       [0.39252003, 0.60747997],
       [0.84614947, 0.15385053]])

In [12]:
# Build the model output file: actual label, per-class probabilities and the
# predicted label for every test row. The three frames all carry the default
# positional index, so the column-wise concat aligns them row by row.
df_test_dataset = pd.DataFrame(y_test, columns=['Actual Outcome'])

# NOTE(review): the two probability column names assume a binary target whose
# classes are ordered 0 then 1 - confirm against classifier.classes_.
df_prediction_prob = pd.DataFrame(predictions, columns=['prob_0', 'prob_1'])

df_prediction_target = pd.DataFrame(
    classifier.predict(X_test),
    columns=['predicted_TARGET'],
)

dfx = pd.concat(
    [df_test_dataset, df_prediction_prob, df_prediction_target],
    axis='columns',
)

# The row index is written too (to_csv default), as an unnamed first column.
dfx.to_csv(
    r"scoredcard_model_prediction.csv",
    sep=',',
    encoding='UTF-8',
)

# Preview the first rows of the exported table.
dfx.head()

Unnamed: 0,Actual Outcome,prob_0,prob_1,predicted_TARGET
0,1,0.037768,0.962232,1
1,0,0.922356,0.077644,0
2,0,0.690701,0.309299,0
3,0,0.902565,0.097435,0
4,0,0.878854,0.121146,0
