In [38]:
# Imports
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from scipy.stats import ks_2samp
import pandas as pd
import numpy as np

In [None]:

# Loading Data
df = pd.read_csv("credit_score.csv")

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 87 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   CUST_ID                  1000 non-null   object 
 1   INCOME                   1000 non-null   int64  
 2   SAVINGS                  1000 non-null   int64  
 3   DEBT                     1000 non-null   int64  
 4   R_SAVINGS_INCOME         1000 non-null   float64
 5   R_DEBT_INCOME            1000 non-null   float64
 6   R_DEBT_SAVINGS           1000 non-null   float64
 7   T_CLOTHING_12            1000 non-null   int64  
 8   T_CLOTHING_6             1000 non-null   int64  
 9   R_CLOTHING               1000 non-null   float64
 10  R_CLOTHING_INCOME        1000 non-null   float64
 11  R_CLOTHING_SAVINGS       1000 non-null   float64
 12  R_CLOTHING_DEBT          1000 non-null   float64
 13  T_EDUCATION_12           1000 non-null   int64  
 14  T_EDUCATION_6            

In [51]:
# Preprocessing
df = df[["INCOME", "SAVINGS", "DEBT", "CREDIT_SCORE", "DEFAULT"]]

In [52]:
df.head()

Unnamed: 0,INCOME,SAVINGS,DEBT,CREDIT_SCORE,DEFAULT
0,33269,0,532304,444,1
1,77158,91187,315648,625,0
2,30917,21642,534864,469,1
3,80657,64526,629125,559,0
4,149971,1172498,2399531,473,0


In [53]:
# Train and Test Datasets
X = df.drop(["CREDIT_SCORE"], axis=1)
y = df["CREDIT_SCORE"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [54]:
# Model Fitting
model = LinearRegression()
model.fit(X_train, y_train)

In [84]:
y_pred = model.predict(X_test)

In [122]:
def create_binned_data_psi(actual, predicted, n_bins = 10):
    freq_actual, bin_edges = np.histogram(actual)
    freq_predicted, bins = np.histogram(predicted, bins=bin_edges)
    bins = []
    for i in range(len(bin_edges) - 1):
        bins.append(str(bin_edges[i]) + " - " + str(bin_edges[i+1]))
    df1 = pd.DataFrame({ "Bins": bins, "Actual_freq": freq_actual, "Predicted_freq": freq_predicted})
    return df1
df1 = create_binned_data_psi(y_test, y_pred)
df2 = pd.concat([pd.DataFrame({"Actual": y_test, "Predicted": y_pred}), df1], axis=0)
df2.to_csv("PSI_METRICS.csv")
    

In [123]:
df[(df["CREDIT_SCORE"] >= 350) & (df["CREDIT_SCORE"] <= 400)].shape[0]

14

In [124]:
# PSI Statistics
def psi_stats(data):
    actual_freq = data["Actual_freq"].dropna().to_numpy()
    predicted_freq = data["Predicted_freq"].dropna().to_numpy()
    actual_frac = actual_freq / data["Actual"].dropna().shape[0]
    predicted_frac = predicted_freq / data["Actual"].dropna().shape[0] + 0.001
    diff = actual_frac - predicted_frac
    log = np.log(actual_frac / predicted_frac)
    prod = diff * log
    return sum(prod)

psi_stats(df2)

np.float64(0.46369318293255607)

In [126]:
# Train and Test Datasets
X = df.drop(["DEFAULT"], axis=1)
y = df["DEFAULT"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [134]:
# KS satistics
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=10, random_state=30)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)
ks_df = pd.DataFrame({
    "p": y_pred[:,1],
    "y": y_test
})
ks_df.sort_values(by="p", ascending=False, inplace=True)
ks, p_value = ks_2samp(ks_df.loc[ks_df.y==0,"p"], ks_df.loc[ks_df.y==1,"p"])
print(ks, p_value)
ks_df.to_csv("KS.csv")

0.21776469144890198 0.03525332162712704
