In [1]:
#@title 1.3. IMPORT LIBRARY

import sys
sys.path.append('../utils')
from credit_base import *
from neural_net import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer, SimpleImputer

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchmetrics import Accuracy

from kan import *
torch.set_default_dtype(torch.float64)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)


cpu


In [2]:
df_credit = pd.read_csv('/workspaces/functional-kan/phase_1/data/german_credit_data.csv')

# Risk target
df_credit['Risk'] = df_credit['Risk'].map({'good': 1, 'bad': 0})

# Non-null savings and checking account
encoder = LabelEncoder()
non_nan_mask = df_credit['Saving accounts'].notna()
df_credit.loc[non_nan_mask, 'Saving accounts'] = encoder.fit_transform(
    df_credit.loc[non_nan_mask, 'Saving accounts']
)

non_nan_mask = df_credit['Checking account'].notna()
df_credit.loc[non_nan_mask, 'Checking account'] = encoder.fit_transform(
    df_credit.loc[non_nan_mask, 'Checking account']
)

# Sex 
df_credit['Sex'] = df_credit['Sex'].map({'male': 1, 'female': 0})

# Monthly pay
df_credit['Monthly pay'] = (df_credit["Credit amount"] / df_credit["Duration"])
df_credit['Monthly pay'] = np.log(df_credit['Monthly pay'])

# Age categorize
df_credit['Age'] = df_credit['Age'].apply(age_categorize)

# Housing categorize
df_credit['Housing'] = df_credit['Housing'].apply(housing_categorize)

# Purpose categorize
df_credit['Purpose'] = df_credit['Purpose'].apply(purpose_categorize)

# Credit amount
df_credit["Credit amount"] = np.log(df_credit["Credit amount"])

# Duration
df_credit['Duration'] = np.log(df_credit['Duration'])

X = df_credit.drop(columns='Risk')
y = df_credit['Risk']

# One-hot encoding for Age and Purpose
X = pd.get_dummies(X, columns=['Age', 'Purpose', 'Housing'], dtype='int')
X

# KNN imputer
# imputer = KNNImputer(n_neighbors=2)
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Standard scaler
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

df = pd.concat([X, y], axis=1)


In [8]:
from imblearn.over_sampling import SMOTE

# SMOTE Over-sampling
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Check class distribution
print("Original class distribution:", y.value_counts())
print("Resampled class distribution:", pd.Series(y_resampled).value_counts())

df = pd.concat([X_resampled, y_resampled], axis=1)


Original class distribution: Risk
1    700
0    300
Name: count, dtype: int64
Resampled class distribution: Risk
1    700
0    700
Name: count, dtype: int64


In [9]:
def train_acc():
    return torch.mean((torch.argmax(model(dataset['train_input']), dim=1) == dataset['train_label']).type(dtype))

def test_acc():
    return torch.mean((torch.argmax(model(dataset['test_input']), dim=1) == dataset['test_label']).type(dtype))

class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, logits, targets):
        probs = torch.softmax(logits, dim=1)  # Convert logits to probabilities
        probs_class_1 = probs[:, 1]  # Focus on class 1
        targets = targets.double()
        bce_loss = nn.BCELoss(reduction='none')(probs_class_1, targets)
        pt = torch.where(targets == 1, probs_class_1, 1 - probs_class_1)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * bce_loss
        return focal_loss.mean()

criterion = FocalLoss(alpha=0.25, gamma=2)


In [18]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
n = df.shape[1] - 1

train_accuracies = []
test_accuracies = []

# Perform 5-fold stratified cross-validation
for fold, (train_idx, test_idx) in enumerate(skf.split(df, df['Risk'])):
    print(f"Fold {fold+1}/{5}")
    
    # Split the data for this fold
    df_train, df_test = df.iloc[train_idx], df.iloc[test_idx]
    X_train, y_train = df_train.drop(columns='Risk'), df_train['Risk']
    X_test, y_test = df_test.drop(columns='Risk'), df_test['Risk']
    
    # Prepare the dataset
    dataset = dict()
    dtype = torch.get_default_dtype()
    dataset['train_input'] = torch.from_numpy(X_train.values).type(dtype).to(device)
    dataset['train_label'] = torch.from_numpy(y_train.values).type(torch.long).to(device)
    dataset['test_input'] = torch.from_numpy(X_test.values).type(dtype).to(device)
    dataset['test_label'] = torch.from_numpy(y_test.values).type(torch.long).to(device)

    # Initialize model
    model = KAN(width=[n, 6, 2], grid=5, k=2, seed=42, device=device)

    # Train the model and evaluate it
    results = model.fit(dataset, opt="Adam", steps=250, lr=0.002, lamb=0.001, metrics=(train_acc, test_acc), loss_fn=torch.nn.CrossEntropyLoss())

    print(f'Fold accuracy: {results['train_acc'][-1], results['test_acc'][-1]}')
    
    train_accuracies.append(results['train_acc'][-1])
    test_accuracies.append(results['test_acc'][-1])

# Output the average accuracies
print(f"Average Train Accuracy: {torch.mean(torch.tensor(train_accuracies)):.4f}")
print(f"Average Test Accuracy: {torch.mean(torch.tensor(test_accuracies)):.4f}")

Fold 1/5
checkpoint directory created: ./model
saving model version 0.0


| train_loss: 8.35e-01 | test_loss: 8.35e-01 | reg: 1.76e+01 | :   1%| | 3/250 [00:00<00:09, 26.17it

| train_loss: 5.53e-01 | test_loss: 7.07e-01 | reg: 2.86e+01 | : 100%|█| 250/250 [00:07<00:00, 31.90


saving model version 0.1
Fold accuracy: (0.8758928571428571, 0.7714285714285715)
Fold 2/5
checkpoint directory created: ./model
saving model version 0.0


| train_loss: 5.46e-01 | test_loss: 7.61e-01 | reg: 2.99e+01 | : 100%|█| 250/250 [00:07<00:00, 32.14


saving model version 0.1
Fold accuracy: (0.8892857142857142, 0.7642857142857142)
Fold 3/5
checkpoint directory created: ./model
saving model version 0.0


| train_loss: 5.55e-01 | test_loss: 7.05e-01 | reg: 3.02e+01 | : 100%|█| 250/250 [00:07<00:00, 31.46


saving model version 0.1
Fold accuracy: (0.86875, 0.7678571428571429)
Fold 4/5
checkpoint directory created: ./model
saving model version 0.0


| train_loss: 5.68e-01 | test_loss: 7.64e-01 | reg: 2.88e+01 | : 100%|█| 250/250 [00:07<00:00, 32.61


saving model version 0.1
Fold accuracy: (0.8758928571428571, 0.7107142857142857)
Fold 5/5
checkpoint directory created: ./model
saving model version 0.0


| train_loss: 5.80e-01 | test_loss: 7.26e-01 | reg: 2.88e+01 | : 100%|█| 250/250 [00:07<00:00, 31.44


saving model version 0.1
Fold accuracy: (0.8589285714285714, 0.7535714285714286)
Average Train Accuracy: 0.8737
Average Test Accuracy: 0.7536
