In [5]:
#@title 1.3. IMPORT LIBRARY

import sys
sys.path.append('../utils')
from credit_base import *
from neural_net import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchmetrics import Accuracy

from kan import *
# `torch` is referenced by name below but has no explicit import among the
# imports above; import it here rather than relying on a star-import to
# leak it into the notebook namespace.
import torch

# Make float64 the default floating-point dtype for tensors created from here on.
torch.set_default_dtype(torch.float64)

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)


cpu


In [31]:
# Load the raw German credit data.
# NOTE(review): this is an absolute, machine-specific path; consider deriving
# it from a configurable data directory so the notebook runs elsewhere.
df_credit = pd.read_csv('/workspaces/functional-kan/phase_1/data/german_credit_data.csv')

# Risk target: 'good' -> 1, 'bad' -> 0 (any other label maps to NaN).
df_credit['Risk'] = df_credit['Risk'].map({'good': 1, 'bad': 0})

# Integer-encode the observed account categories. Missing entries are left
# as NaN on purpose so the KNN imputer further down can fill them in.
encoder = LabelEncoder()
for account_col in ('Saving accounts', 'Checking account'):
    non_nan_mask = df_credit[account_col].notna()
    df_credit.loc[non_nan_mask, account_col] = encoder.fit_transform(
        df_credit.loc[non_nan_mask, account_col]
    )

# Sex: 'male' -> 1, 'female' -> 0.
df_credit['Sex'] = df_credit['Sex'].map({'male': 1, 'female': 0})

# Monthly pay = credit amount / duration. This must be computed here, before
# both source columns are log-transformed below.
df_credit['Monthly pay'] = df_credit["Credit amount"] / df_credit["Duration"]

# Bucket Age, Housing and Purpose with the project helpers
# (presumably provided by the `credit_base` star-import — TODO confirm).
df_credit['Age'] = df_credit['Age'].apply(age_categorize)
df_credit['Housing'] = df_credit['Housing'].apply(housing_categorize)
df_credit['Purpose'] = df_credit['Purpose'].apply(purpose_categorize)

# Log-transform credit amount and duration.
df_credit["Credit amount"] = np.log(df_credit["Credit amount"])
df_credit['Duration'] = np.log(df_credit['Duration'])

# Separate the features from the target.
X = df_credit.drop(columns='Risk')
y = df_credit['Risk']

# One-hot encode the bucketed Age and Purpose columns.
X = pd.get_dummies(X, columns=['Age', 'Purpose'], dtype='int')

# NOTE(review): the imputer and the scaler below are fit on every row, before
# any train/test split, so held-out folds contribute to the imputation
# neighbours and to the scaling statistics used for training.

# Fill the remaining NaNs (the account columns) from the 10 nearest rows.
# `index=X.index` keeps the rebuilt frame row-aligned with `y`, because the
# final `pd.concat(..., axis=1)` joins on index labels, not on position.
imputer = KNNImputer(n_neighbors=10)
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

# Standardize every feature column.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Final modelling frame: standardized features plus the 'Risk' target.
df = pd.concat([X, y], axis=1)
df


Unnamed: 0,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Monthly pay,Age_Middle,Age_Old,Age_Young,Purpose_Neutral,Purpose_Risky,Purpose_Safe,Risk
0,0.670280,0.146949,0.634448,0.047084,-1.214772,-0.933901,-1.864869,0.176948,-1.020204,3.414415,-0.835340,-1.330445,-0.296883,1.603567,1
1,-1.491914,0.146949,0.634448,-0.582377,0.665389,1.163046,1.708369,-0.284901,-1.020204,-0.292876,1.197117,-1.330445,-0.296883,1.603567,0
2,0.670280,-1.383771,0.634448,-0.582377,0.289357,-0.181559,-0.673790,0.045495,0.980196,-0.292876,-0.835340,-1.330445,3.368335,-0.623610,1
3,0.670280,0.146949,-1.576173,-0.582377,-1.214772,1.525148,1.478913,0.130233,0.980196,-0.292876,-0.835340,0.751628,-0.296883,-0.623610,1
4,0.670280,0.146949,-1.576173,-0.582377,-1.214772,0.904743,0.517289,0.229637,0.980196,-0.292876,-0.835340,0.751628,-0.296883,-0.623610,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-1.491914,-1.383771,0.634448,-0.582377,-0.650724,-0.424376,-0.673790,-0.150054,0.980196,-0.292876,-0.835340,0.751628,-0.296883,-0.623610,1
996,0.670280,1.677670,0.634448,-0.582377,-1.214772,0.604255,0.900731,-0.254998,0.980196,-0.292876,-0.835340,0.751628,-0.296883,-0.623610,1
997,0.670280,0.146949,0.634448,-0.582377,-0.274692,-1.416199,-0.673790,-0.656308,0.980196,-0.292876,-0.835340,-1.330445,-0.296883,1.603567,1
998,0.670280,0.146949,-1.576173,-0.582377,-1.214772,-0.345911,1.597468,-0.825784,-1.020204,-0.292876,1.197117,-1.330445,-0.296883,1.603567,0


In [14]:
def train_acc():
    """Return the model's accuracy on the training split as a scalar tensor.

    Takes no arguments: `model`, `dataset` and `dtype` are looked up in the
    notebook's global namespace at call time. The predicted class is the
    argmax over dim 1 of the model output for `dataset['train_input']`.
    """
    scores = model(dataset['train_input'])
    predicted = scores.argmax(dim=1)
    hits = predicted == dataset['train_label']
    return hits.type(dtype).mean()

def test_acc():
    return torch.mean((torch.argmax(model(dataset['test_input']), dim=1) == dataset['test_label']).type(dtype))

class FocalLoss(nn.Module):
    """Focal loss for two-class logits, computed on the class-1 probability.

    Each sample's binary cross-entropy is scaled by
    ``alpha * (1 - pt) ** gamma``, where ``pt`` is the probability the model
    assigns to the sample's target class. The scaled terms are averaged.

    Args:
        alpha: Constant multiplier applied to every sample's loss term.
        gamma: Exponent of the ``(1 - pt)`` modulating factor.
    """

    def __init__(self, alpha=1, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, logits, targets):
        """Return the mean focal loss over all samples.

        Args:
            logits: Tensor indexed as ``[sample, class]``; softmax is taken
                over dim 1 and column 1 is used as the positive-class
                probability.
            targets: One label per sample, 1 for the positive class and 0
                otherwise. Any dtype is accepted; it is cast to the dtype of
                the probabilities.

        Returns:
            Scalar tensor with the same floating-point dtype as ``logits``.
        """
        probs = torch.softmax(logits, dim=1)  # Convert logits to probabilities
        probs_class_1 = probs[:, 1]  # Focus on class 1
        # Match the probabilities' dtype instead of forcing float64: binary
        # cross-entropy requires input and target to share a dtype, so a
        # hard-coded double cast fails for float32 (or other) logits.
        targets = targets.to(probs_class_1.dtype)
        bce_loss = nn.functional.binary_cross_entropy(
            probs_class_1, targets, reduction='none'
        )
        # pt is the probability assigned to each sample's true class.
        pt = torch.where(targets == 1, probs_class_1, 1 - probs_class_1)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * bce_loss
        return focal_loss.mean()

# Focal-loss instance with alpha=0.25 and gamma=2.
# NOTE(review): the cross-validation cell visible in this notebook passes
# torch.nn.CrossEntropyLoss() to model.fit rather than this object — confirm
# whether `criterion` is meant to be used for training.
criterion = FocalLoss(alpha=0.25, gamma=2)


In [None]:
# Stratified k-fold cross-validation of a KAN classifier on the prepared frame `df`.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
n_splits = skf.get_n_splits()
n = df.shape[1] - 1  # number of input features: every column except 'Risk'

# Loop-invariant, so looked up once. It stays a notebook-level name because
# train_acc/test_acc read `dtype` from the global namespace.
dtype = torch.get_default_dtype()

train_accuracies = []
test_accuracies = []

for fold, (train_idx, test_idx) in enumerate(skf.split(df, df['Risk'])):
    print(f"Fold {fold+1}/{n_splits}")

    # Split the data for this fold
    df_train, df_test = df.iloc[train_idx], df.iloc[test_idx]
    X_train, y_train = df_train.drop(columns='Risk'), df_train['Risk']
    X_test, y_test = df_test.drop(columns='Risk'), df_test['Risk']

    # `dataset` and `model` are rebound at notebook level on every fold:
    # train_acc/test_acc are zero-argument functions that look both up by name.
    dataset = dict()
    dataset['train_input'] = torch.from_numpy(X_train.values).type(dtype).to(device)
    dataset['train_label'] = torch.from_numpy(y_train.values).type(torch.long).to(device)
    dataset['test_input'] = torch.from_numpy(X_test.values).type(dtype).to(device)
    dataset['test_label'] = torch.from_numpy(y_test.values).type(torch.long).to(device)

    # Fresh model for each fold (same seed every time)
    model = KAN(width=[n, 6, 2], grid=3, k=2, seed=42, device=device)

    # Train the model and evaluate it
    results = model.fit(dataset, opt="Adam", steps=500, lr=0.001, lamb=0.001, metrics=(train_acc, test_acc), loss_fn=torch.nn.CrossEntropyLoss())

    # Last recorded value of each metric
    # (presumably one entry per training step — TODO confirm against KAN.fit).
    final_train_acc = results['train_acc'][-1]
    final_test_acc = results['test_acc'][-1]

    # Printed as a tuple to keep the output unchanged. The subscripts are
    # pulled out of the f-string because reusing the outer quote character
    # inside it is a SyntaxError before Python 3.12.
    print(f"Fold accuracy: {(final_train_acc, final_test_acc)}")

    train_accuracies.append(final_train_acc)
    test_accuracies.append(final_test_acc)

# Output the average accuracies
print(f"Average Train Accuracy: {torch.mean(torch.tensor(train_accuracies)):.4f}")
print(f"Average Test Accuracy: {torch.mean(torch.tensor(test_accuracies)):.4f}")