In [184]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import cross_val_score

In [185]:
df = pd.read_csv("../data/Credit Score Classification Dataset.csv")
df

Unnamed: 0,Age,Gender,Income,Education,Marital Status,Number of Children,Home Ownership,Credit Score
0,25,Female,50000,Bachelor's Degree,Single,0,Rented,High
1,30,Male,100000,Master's Degree,Married,2,Owned,High
2,35,Female,75000,Doctorate,Married,1,Owned,High
3,40,Male,125000,High School Diploma,Single,0,Owned,High
4,45,Female,100000,Bachelor's Degree,Married,3,Owned,High
...,...,...,...,...,...,...,...,...
159,29,Female,27500,High School Diploma,Single,0,Rented,Low
160,34,Male,47500,Associate's Degree,Single,0,Rented,Average
161,39,Female,62500,Bachelor's Degree,Married,2,Owned,High
162,44,Male,87500,Master's Degree,Single,0,Owned,High


In [186]:
df.drop(columns=["Education", "Number of Children"], inplace=True)
df.head()

Unnamed: 0,Age,Gender,Income,Marital Status,Home Ownership,Credit Score
0,25,Female,50000,Single,Rented,High
1,30,Male,100000,Married,Owned,High
2,35,Female,75000,Married,Owned,High
3,40,Male,125000,Single,Owned,High
4,45,Female,100000,Married,Owned,High


In [187]:
df["Gender Code"] = df["Gender"].map({"Male": 1, "Female": -1})
df.drop(columns=["Gender"], inplace=True)
df.head()

Unnamed: 0,Age,Income,Marital Status,Home Ownership,Credit Score,Gender Code
0,25,50000,Single,Rented,High,-1
1,30,100000,Married,Owned,High,1
2,35,75000,Married,Owned,High,-1
3,40,125000,Single,Owned,High,1
4,45,100000,Married,Owned,High,-1


In [188]:
df["Marital Status Code"] = df["Marital Status"].map({"Single": -1, "Married": 1})
df.drop(columns=["Marital Status"], inplace=True)
df.head()

Unnamed: 0,Age,Income,Home Ownership,Credit Score,Gender Code,Marital Status Code
0,25,50000,Rented,High,-1,-1
1,30,100000,Owned,High,1,1
2,35,75000,Owned,High,-1,1
3,40,125000,Owned,High,1,-1
4,45,100000,Owned,High,-1,1


In [189]:
df["Home Ownership Code"] = df["Home Ownership"].map({"Rented": -1, "Owned": 1})
df.drop(columns=["Home Ownership"], inplace=True)
df.head()

Unnamed: 0,Age,Income,Credit Score,Gender Code,Marital Status Code,Home Ownership Code
0,25,50000,High,-1,-1,-1
1,30,100000,High,1,1,1
2,35,75000,High,-1,1,1
3,40,125000,High,1,-1,1
4,45,100000,High,-1,1,1


In [190]:
df["Accept"] = df["Credit Score"].map({"High": 1, "Average": -1, "Low": -1})
df.drop(columns=["Credit Score"], inplace=True)
df

Unnamed: 0,Age,Income,Gender Code,Marital Status Code,Home Ownership Code,Accept
0,25,50000,-1,-1,-1,1
1,30,100000,1,1,1,1
2,35,75000,-1,1,1,1
3,40,125000,1,-1,1,1
4,45,100000,-1,1,1,1
...,...,...,...,...,...,...
159,29,27500,-1,-1,-1,-1
160,34,47500,1,-1,-1,-1
161,39,62500,-1,1,1,1
162,44,87500,1,-1,1,1


In [191]:
df["Income log"] = np.log1p(df["Income"])
upper = df["Income log"].quantile(0.99)
df["Income log"] = df["Income log"].clip(upper=upper)

scaler_income = StandardScaler()
df["Income log scaled"] = scaler_income.fit_transform(df[["Income log"]])
df.drop(columns=["Income", "Income log"], inplace=True)
df

Unnamed: 0,Age,Gender Code,Marital Status Code,Home Ownership Code,Accept,Income log scaled
0,25,-1,-1,-1,1,-1.015553
1,30,1,1,1,1,0.613179
2,35,-1,1,1,1,-0.062808
3,40,1,-1,1,1,1.137517
4,45,-1,1,1,1,0.613179
...,...,...,...,...,...,...
159,29,-1,-1,-1,-1,-2.420311
160,34,1,-1,-1,-1,-1.136079
161,39,-1,1,1,1,-0.491220
162,44,1,-1,1,1,0.299411


In [192]:
scaler_age = StandardScaler()

df["Age scaled"] = scaler_age.fit_transform(df[["Age"]])
df.drop(columns=["Age"], inplace=True)
df

Unnamed: 0,Gender Code,Marital Status Code,Home Ownership Code,Accept,Income log scaled,Age scaled
0,-1,-1,-1,1,-1.015553,-1.535320
1,1,1,1,1,0.613179,-0.943702
2,-1,1,1,1,-0.062808,-0.352085
3,1,-1,1,1,1.137517,0.239533
4,-1,1,1,1,0.613179,0.831151
...,...,...,...,...,...,...
159,-1,-1,-1,-1,-2.420311,-1.062026
160,1,-1,-1,-1,-1.136079,-0.470408
161,-1,1,1,1,-0.491220,0.121209
162,1,-1,1,1,0.299411,0.712827


In [193]:
X = df.drop(columns=["Accept"])
y = df["Accept"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [194]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

Accuracy: 0.9393939393939394
ROC AUC: 0.9956521739130435
              precision    recall  f1-score   support

          -1       0.90      0.90      0.90        10
           1       0.96      0.96      0.96        23

    accuracy                           0.94        33
   macro avg       0.93      0.93      0.93        33
weighted avg       0.94      0.94      0.94        33



In [195]:
scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
print(scores, scores.mean(), scores.std())

[0.97826087 1.         1.         0.99586777 0.99090909] 0.9930075458138699 0.008097378326571522


In [196]:
joblib.dump(model, "../data/model.joblib")
joblib.dump(scaler_income, "../data/scaler_income.joblib")
joblib.dump(scaler_age, "../data/scaler_age.joblib")
print("Model saved as ../data/model.joblib")
print("Scaler Income saved as ../data/scaler_income.joblib")
print("Scaler Age saved as ../data/scaler_age.joblib")

Model saved as ../data/model.joblib
Scaler Income saved as ../data/scaler_income.joblib
Scaler Age saved as ../data/scaler_age.joblib
