In [6]:
# Importing libraries
import pandas as pd
%matplotlib inline

# Read the credit-risk dataset; the path is relative to the notebook's working directory
df = pd.read_csv(filepath_or_buffer='credit_risk.csv')

# Importing libraries for preprocessing
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Drop Id column
df = df.drop(['Id'], axis=1)

# Define columns for preprocessing.
# These lists are defined BEFORE the offset loop below because the loop reads
# Scale_cols; defining them afterwards raises NameError on a fresh kernel
# (Restart & Run All) and only works when an earlier run left the name behind.
impute_cols = ['Emp_length', 'Rate']
encode_cols = df.select_dtypes(exclude='number').columns.to_list()
Scale_cols = ['Age','Income','Amount','Percent_income','Cred_length']

# Manipulation for zero values.
# Scale_cols are later fed to PowerTransformer(method='box-cox'), which only
# accepts strictly positive input. Every value in each column is therefore
# shifted by half of that column's smallest non-zero entry, which turns zeros
# into small positive numbers while preserving the ordering of the values.
# NOTE(review): the offset is computed on the full frame, before the
# train/test split — confirm this is acceptable for the evaluation.
for col in Scale_cols:
    smallest_nonzero = df.loc[df[col] != 0, col].min()
    df[col] = df[col] + (smallest_nonzero / 2)

# Sub-pipeline applied to impute_cols: KNN imputation (5 neighbours)
# followed by robust scaling.
imputer_mean = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaling', RobustScaler()),
])


def _shared_transformers():
    """Return the (name, transformer, columns) entries common to both preprocessors."""
    return [
        ('imputer', imputer_mean, impute_cols),
        ('encoder', OneHotEncoder(handle_unknown='ignore'), encode_cols),
    ]


# Variant 1: impute + one-hot encode; all other columns pass through untouched.
Preprocessing1 = ColumnTransformer(
    transformers=_shared_transformers(),
    remainder="passthrough",
)

# Variant 2: same as variant 1, plus a Box-Cox power transform on Scale_cols.
Preprocessing2 = ColumnTransformer(
    transformers=_shared_transformers() + [
        ('scaling', PowerTransformer(method='box-cox'), Scale_cols),
    ],
    remainder="passthrough",
)

# Classification models
from sklearn.svm import SVC
# Model processing
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from tqdm import tqdm

# Separate the feature columns (x) from the target column (y)
x = df.drop(columns=['Status'])
y = df['Status']

# Hold out 20% of the rows as test data, stratified on the target,
# with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=24,
    stratify=y,
)

# Build one SVC pipeline per preprocessing variant; each pipeline gets its
# own, freshly constructed SVC instance.
estimator1, estimator2 = (
    Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('model', SVC()),
    ])
    for preprocessor in (Preprocessing1, Preprocessing2)
)

# The models, in the order they are evaluated
models = [
    estimator1,  # without scaling (Preprocessing1)
    estimator2,  # with scaling (Preprocessing2)
]

# Display label for each entry of `models`, in the same order.
model_labels = ['SVC no normalization', 'SVC with power transform']

# Fail fast: a label/model mismatch would otherwise only surface as a
# ValueError from the DataFrame constructor after the slow cross-validation.
if len(model_labels) != len(models):
    raise ValueError(
        f"expected one label per model, got {len(model_labels)} labels "
        f"for {len(models)} models"
    )

# Cross validate split.
# The splitter is created once, outside the loop: with a fixed random_state
# it yields the same folds on every use, so all models are scored on
# identical train/validation partitions.
crossval = KFold(n_splits=5, shuffle=True, random_state=24)

# Per-model summary of the fold accuracies
accuracy_mean = []
accuracy_std = []

# Calculate each model's cross-validated accuracy
for model in tqdm(models):

    # error_score='raise' surfaces a failing fold immediately instead of
    # recording it as NaN
    accuracy = cross_val_score(
        model,
        X_train,
        y_train,
        cv=crossval,
        scoring='accuracy',
        error_score='raise'
    )

    accuracy_mean.append(accuracy.mean())
    accuracy_std.append(accuracy.std())

# Model Matrix Evaluation: one row per model, best mean accuracy first
model_matrix = pd.DataFrame({
    'Model': model_labels,
    'Accuracy Mean': accuracy_mean,
    'Accuracy Std': accuracy_std
})
model_matrix.sort_values(by='Accuracy Mean', ascending=False)


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]
 50%|█████████████████████████████████████████▌                                         | 1/2 [03:08<03:08, 188.89s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [05:25<00:00, 162.92s/it]


Unnamed: 0,Model,Accuracy Mean,Accuracy Std
1,SVC with power transform,0.894452,0.003853
0,SVC no normalization,0.801642,0.012816


In [5]:
# Summary statistics of the numeric columns, shown with one row per column.
# NOTE(review): the recorded output shows Percent_income with min 0.0, so it
# appears to have been produced before the zero-offset loop in the modelling
# cell ran; re-running the notebook top to bottom will show shifted values.
df.describe(include='number').transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,32581.0,27.7346,6.348078,20.0,23.0,26.0,30.0,144.0
Income,32581.0,66074.84847,61983.119168,4000.0,38500.0,55000.0,79200.0,6000000.0
Emp_length,31686.0,4.789686,4.14263,0.0,2.0,4.0,7.0,123.0
Amount,32581.0,9589.371106,6322.086646,500.0,5000.0,8000.0,12200.0,35000.0
Rate,29465.0,11.011695,3.240459,5.42,7.9,10.99,13.47,23.22
Status,32581.0,0.218164,0.413006,0.0,0.0,0.0,0.0,1.0
Percent_income,32581.0,0.170203,0.106782,0.0,0.09,0.15,0.23,0.83
Cred_length,32581.0,5.804211,4.055001,2.0,3.0,4.0,8.0,30.0
