In [1]:
# Importing libraries
import pandas as pd
import numpy as np
%matplotlib inline

# Read the raw credit-risk records from the CSV file next to the notebook
df = pd.read_csv(filepath_or_buffer='credit_risk.csv')

# Importing libraries for preprocessing
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# The Id column is dropped so it is not passed to the model as a feature
df = df.drop(columns='Id')

# Column groups used by the steps below:
#   Scale_cols  - numeric features that receive the reciprocal transformation
#   impute_cols - columns routed through the imputation + scaling pipeline
#   encode_cols - every non-numeric column, to be one-hot encoded
Scale_cols = ['Age', 'Income', 'Amount', 'Percent_income', 'Cred_length']
impute_cols = ['Emp_length', 'Rate']
non_numeric = df.select_dtypes(exclude='number')
encode_cols = non_numeric.columns.tolist()

# Reciprocal (1/x) transformation of the numeric features.
# NOTE: despite its name, `df_log` holds reciprocals, not logarithms.
# Each column is first shifted by half of its smallest non-zero value so that
# rows containing 0 do not cause a division by zero.
reciprocal_features = {
    name: 1 / (df[name] + df.loc[df[name] != 0, name].min() / 2)
    for name in Scale_cols
}
df_log = df.assign(**reciprocal_features)
    
# Fill missing values with a 5-nearest-neighbour imputer, then robust-scale.
# NOTE(review): the name `imputer_mean` is misleading - the step is a KNN
# imputer, not a mean imputer. The name is kept unchanged here.
knn_impute_then_scale = [
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaling', RobustScaler()),
]
imputer_mean = Pipeline(steps=knn_impute_then_scale)

# Column-wise preprocessing:
#   impute_cols -> KNN imputation + robust scaling
#   encode_cols -> one-hot encoding (categories unseen during fit are ignored)
#   all others  -> passed through untouched
# NOTE(review): the columns in Scale_cols are part of the pass-through
# remainder, so they reach the SVC unscaled - confirm this is intended.
column_steps = [
    ('imputer', imputer_mean, impute_cols),
    ('encoder', OneHotEncoder(handle_unknown='ignore'), encode_cols),
]
Preprocessing = ColumnTransformer(transformers=column_steps, remainder="passthrough")


# Classification models
from sklearn.svm import SVC

# Model processing
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from tqdm import tqdm

# Target and features of the untransformed frame
y = df['Status']
x = df.drop(columns='Status')

# Target and features of the reciprocal-transformed frame
y_log = df_log['Status']
x_log = df_log.drop(columns='Status')

# Hold out 20% of each frame as test data, stratified on the target.
# Each split is stratified on the target of its OWN frame; the transformed
# split previously used `y` from the other frame, which only worked because
# the transformation leaves 'Status' untouched.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=24, stratify=y
)
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(
    x_log, y_log, test_size=0.2, random_state=24, stratify=y_log
)

# Full estimator: column preprocessing followed by a support-vector classifier
# with scikit-learn's default hyper-parameters.
model_steps = [
    ('preprocessing', Preprocessing),
    ('model', SVC()),
]
model = Pipeline(steps=model_steps)

# Training sets to compare, keyed by the label shown in the results table.
# Keeping label and data together guarantees the table rows cannot drift out
# of sync with the evaluation order.
experiments = {
    'SVC no trans': [X_train, y_train],
    'SVC with reciprocal trans': [X_train_log, y_train_log],
}

# `data` keeps its original [[X, y], ...] layout and ordering
data = list(experiments.values())

# One splitter shared by every experiment. With shuffle=True and an integer
# random_state, every call to split() yields the same folds, so building it
# once is equivalent to rebuilding it on each iteration.
crossval = KFold(n_splits=5, shuffle=True, random_state=24)

# Per-experiment summary of the cross-validated accuracy.
# (The former unused `accuracy_score` list was removed: it was never filled
# and its name collides with sklearn.metrics.accuracy_score.)
accuracy_mean = []
accuracy_std = []

# 5-fold cross-validation of the same pipeline on each training set
for features, target in tqdm(data):
    accuracy = cross_val_score(
        model,
        features,
        target,
        cv=crossval,
        scoring='accuracy',
        error_score='raise',  # surface fitting errors instead of scoring NaN
    )
    accuracy_mean.append(accuracy.mean())
    accuracy_std.append(accuracy.std())

# Evaluation metrics per experiment, best mean accuracy first
model_matrix = pd.DataFrame({
    'Model': list(experiments),
    'Accuracy Mean': accuracy_mean,
    'Accuracy Std': accuracy_std
})
model_matrix.sort_values(by='Accuracy Mean', ascending=False)

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [06:15<00:00, 188.00s/it]


Unnamed: 0,Model,Accuracy Mean,Accuracy Std
1,SVC with reciprocal trans,0.850291,0.004687
0,SVC no trans,0.8031,0.012257
