In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter

In [2]:
# Load the raw transactions table from the CSV file shipped with the notebook.
csv_path = 'data/creditcard.csv'
df = pd.read_csv(csv_path)

In [3]:
# Min-max scale every column (including the 0/1 `Class` label) into [0, 1].
# NOTE(review): the min/max are taken over the full frame, i.e. before the
# train/test split performed later, so held-out rows influence the scaling.
col_min = df.min()
col_span = df.max() - col_min
# A constant column has zero span; divide by 1 instead so it maps to 0 rather than NaN.
col_span = col_span.where(col_span != 0, 1)
df = (df - col_min) / col_span

In [4]:
# Separate the label column from the predictors.
y = df['Class']
X = df.drop(columns=['Class'])

# Stratified 70/30 hold-out split so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Class-weighted random forest; it is fitted here only to rank the features.
forest_params = dict(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
    class_weight='balanced',
)
clf = RandomForestClassifier(**forest_params)
clf.fit(X_train, y_train)

# One importance score per input column, listed from most to least important.
importances = clf.feature_importances_
feat_imp = (
    pd.DataFrame({'feature': X.columns, 'importance': importances})
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)

print(feat_imp)

# Optional bar chart of the ranking (disabled):
# plt.figure(figsize=(8, 10))
# plt.barh(feat_imp['feature'], feat_imp['importance'])
# plt.gca().invert_yaxis()
# plt.title('Feature Importances')
# plt.xlabel('Importance')
# plt.tight_layout()
# plt.show()

   feature  importance
0      V14    0.185077
1       V4    0.120695
2      V10    0.116924
3      V12    0.101880
4      V17    0.089537
5       V3    0.064627
6      V11    0.046515
7      V16    0.042677
8       V2    0.036178
9       V9    0.026259
10     V21    0.022648
11      V7    0.013823
12     V18    0.011518
13     V19    0.011364
14  Amount    0.009923
15      V6    0.008869
16      V5    0.008617
17     V27    0.008098
18     V13    0.007603
19     V28    0.007559
20     V26    0.007484
21     V22    0.006798
22      V8    0.006705
23     V15    0.006655
24      V1    0.006035
25     V20    0.005969
26     V25    0.005746
27     V23    0.005162
28    Time    0.004622
29     V24    0.004432


In [5]:
# Keep the six highest-ranked features; the network defined later takes six inputs.
features = feat_imp['feature'].head(6).tolist()
# Same list plus the label column, used to subset the frame.
features_with_class = [*features, 'Class']
print(features, features_with_class, sep='\n')

['V14', 'V4', 'V10', 'V12', 'V17', 'V3']
['V14', 'V4', 'V10', 'V12', 'V17', 'V3', 'Class']


In [6]:
# List the distinct values present in the label column.
df['Class'].unique()

array([0., 1.])

In [7]:
# Count the rows per label value to see how imbalanced the two classes are.
df['Class'].value_counts()

Class
0.0    284315
1.0       492
Name: count, dtype: int64

In [8]:
# Reduce the frame to the selected features plus the label; the copy makes the
# result independent of the full frame.
df = df.loc[:, features_with_class].copy()

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column of df.
df.describe()

Unnamed: 0,V14,V4,V10,V12,V17,V3,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,0.646053,0.25193,0.508722,0.704193,0.73113,0.837414,0.001727
std,0.032231,0.062764,0.022528,0.03766,0.024678,0.026275,0.041527
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.631744,0.214311,0.497644,0.688907,0.717074,0.821985,0.0
50%,0.647755,0.25105,0.5068,0.709471,0.729221,0.84053,0.0
75%,0.662635,0.284882,0.518113,0.727494,0.742743,0.855213,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import pytorch_lightning as pl

from torch import nn
from torchvision.datasets import MNIST
from torchvision import transforms
from torch.utils.data import DataLoader, random_split, Dataset
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold, train_test_split

In [39]:
device = 'cpu'
# NOTE(review): batch_size=1 means one optimizer step per sample, which makes an
# epoch very slow on a frame of this size; raise it once the pipeline runs.
batch_size = 1
precision_model = '32'

# Maps the Trainer precision flag to the dtype used for the dataset tensors.
PRECISION_TO_DTYPE = {
    'bf16-mixed': torch.float16,
    '16': torch.float16,
    '32-mixed': torch.float32,
    '32': torch.float32,
    '64': torch.float64,
}
# Fail loudly on an unknown flag instead of silently falling back to float16.
if precision_model not in PRECISION_TO_DTYPE:
    raise ValueError(
        f"Unsupported precision_model {precision_model!r}; "
        f"expected one of {sorted(PRECISION_TO_DTYPE)}"
    )
data_precision_torch = PRECISION_TO_DTYPE[precision_model]

# NOTE(review): DataLoader workers are separate processes; under the "spawn" start
# method they cannot unpickle classes defined in notebook cells. If workers exit
# unexpectedly, move the Dataset class into an importable .py module.
num_workers = 7
epochs = 10
# name_batch_precision_epochs
# The prefix names this notebook's model so the weights file cannot be confused
# with (or overwrite) weights saved by a different project.
model_name = f'fraud_{batch_size}_{precision_model}_{epochs}.pt'

In [14]:
# Print the column dtypes, non-null counts and memory usage of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   V14     284807 non-null  float64
 1   V4      284807 non-null  float64
 2   V10     284807 non-null  float64
 3   V12     284807 non-null  float64
 4   V17     284807 non-null  float64
 5   V3      284807 non-null  float64
 6   Class   284807 non-null  float64
dtypes: float64(7)
memory usage: 15.2 MB


In [15]:
# List the distinct label values again (same check as earlier in the notebook).
df['Class'].unique()

array([0., 1.])

In [41]:
class IsFraud(pl.LightningModule):
    """Fully connected binary classifier that outputs one logit per sample.

    Args:
        in_features: Number of input features per sample. Defaults to 6, the
            number of feature columns kept in the notebook.
        lr: Learning rate passed to the Adam optimizer.
    """

    def __init__(self, in_features=6, lr=1e-3):
        super().__init__()
        self.lr = lr
        self.layer_1 = nn.Linear(in_features, 128)
        self.layer_2 = nn.Linear(128, 256)
        self.layer_3 = nn.Linear(256, 512)
        self.layer_4 = nn.Linear(512, 1)

    def forward(self, x):
        """Return one raw logit per row of ``x`` as a 1-D tensor."""
        # Flatten everything after the batch dimension.
        x = x.view(x.size(0), -1)
        x = F.leaky_relu(self.layer_1(x))
        x = F.leaky_relu(self.layer_2(x))
        x = F.leaky_relu(self.layer_3(x))
        # (batch, 1) -> (batch,) so the logits line up with the 1-D targets.
        return self.layer_4(x).squeeze(1)

    def training_step(self, batch, batch_idx):
        """Compute and log the binary cross-entropy loss for one batch."""
        x, y = batch
        logits = self(x)
        loss = F.binary_cross_entropy_with_logits(logits, y)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log validation loss and accuracy for one batch.

        Accuracy alone says little on this data: the label counts shown in the
        notebook are 284315 vs 492, so predicting the majority class everywhere
        already scores about 99.8%. The loss is logged as a second signal.
        """
        x, y = batch
        logits = self(x)
        loss = F.binary_cross_entropy_with_logits(logits, y)
        # sigmoid(logit) > 0.5 is the positive-class decision.
        preds = torch.sigmoid(logits) > 0.5
        acc = (preds.float() == y).float().mean()
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", acc, prog_bar=True)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [42]:
class IsFraudDataset(Dataset):
    """In-memory dataset pairing feature rows with their targets.

    Args:
        data: Object exposing ``.values`` with the feature matrix (e.g. a DataFrame).
        targets: Object exposing ``.values`` with one target per row (e.g. a Series).
        data_precision: Torch dtype applied to both the features and the targets.
    """

    def __init__(self, data, targets, data_precision=torch.float64):
        feature_array, target_array = data.values, targets.values
        # Both tensors are materialised once, up front, with the same dtype.
        self.data = torch.tensor(feature_array, dtype=data_precision)
        self.targets = torch.tensor(target_array, dtype=data_precision)

    def __len__(self):
        """Number of samples, taken from the targets tensor."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        sample = self.data[idx]
        label = self.targets[idx]
        return sample, label

In [None]:
class IsFraudDataModule(pl.LightningDataModule):
    """Lightning data module serving one cross-validation fold of a labelled frame.

    ``setup`` holds out 30% of the rows (stratified on ``Class``) as test data and
    splits the remaining rows into ``n_splits`` stratified folds; fold ``fold_idx``
    becomes the validation set and the other folds the training set.

    Args:
        dataset: DataFrame with the feature columns and a ``Class`` label column.
        batch_size: Batch size used by every dataloader.
        num_workers: Worker processes per dataloader; ``0`` loads in the main process.
        n_splits: Number of cross-validation folds.
        fold_idx: Index of the fold used for validation.
        shuffle_train: Passed as ``shuffle`` to the training dataloader.
        data_precision: Torch dtype for the feature and target tensors.
    """

    def __init__(self,
                 dataset,
                 batch_size=64,
                 num_workers=8,
                 n_splits=5,
                 fold_idx=0,
                 shuffle_train=True,
                 data_precision=torch.float64):
        super().__init__()
        self.dataset = dataset
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.n_splits = n_splits
        self.fold_idx = fold_idx
        self.shuffle_train = shuffle_train
        self.data_precision = data_precision

    def prepare_data(self):
        """No-op: the frame is handed to the constructor already loaded."""

    def setup(self, stage=None):
        """Build the train/validation datasets and the held-out test tensors."""
        # Imported here so the class does not depend on the notebook's import cell.
        from sklearn.model_selection import StratifiedKFold

        frame = self.dataset
        X = frame.drop(columns=["Class"])
        y = frame["Class"]

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.3,
                                                            stratify=y,
                                                            random_state=42)

        # Stratify the folds as well, so every validation fold keeps the class
        # ratio of the (already stratified) training split.
        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        train_idx, val_idx = list(skf.split(X_train, y_train))[self.fold_idx]

        self.train_dataset = IsFraudDataset(X_train.iloc[train_idx],
                                            y_train.iloc[train_idx],
                                            data_precision=self.data_precision)
        self.val_dataset = IsFraudDataset(X_train.iloc[val_idx],
                                          y_train.iloc[val_idx],
                                          data_precision=self.data_precision)

        # The test loader yields bare feature tensors; the matching labels are kept
        # alongside so predictions on the test split can be scored.
        self.test_dataset = torch.tensor(X_test.values, dtype=self.data_precision)
        self.test_targets = torch.tensor(y_test.values, dtype=self.data_precision)

    def _make_loader(self, dataset, shuffle=False):
        """Create a DataLoader with the module's shared settings."""
        return DataLoader(dataset,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers,
                          # DataLoader rejects persistent_workers=True when num_workers == 0.
                          persistent_workers=self.num_workers > 0,
                          shuffle=shuffle)

    def train_dataloader(self):
        """Dataloader over the training folds."""
        return self._make_loader(self.train_dataset, shuffle=self.shuffle_train)

    def val_dataloader(self):
        """Dataloader over the validation fold."""
        return self._make_loader(self.val_dataset)

    def test_dataloader(self):
        """Dataloader over the held-out test features (no labels in the batches)."""
        return self._make_loader(self.test_dataset)

In [48]:
# List the distinct label values of the frame that is handed to the data module.
df['Class'].unique()

array([0., 1.])

In [49]:
model = IsFraud()
logger = TensorBoardLogger("tb_logs", name="my_model")

# Keep the three checkpoints with the highest validation accuracy.
# val_acc is a higher-is-better metric, so the monitor mode must be "max";
# with "min" the callback would retain the worst-scoring epochs instead.
checkpoint_callback = ModelCheckpoint(
    monitor="val_acc",
    dirpath="checkpoints/",
    filename="best-checkpoint",
    save_top_k=3,
    mode="max",
)

dm = IsFraudDataModule(
    dataset=df,
    batch_size=batch_size,
    num_workers=num_workers,
    data_precision=data_precision_torch,
)

# Gradient accumulation is left at the Trainer default (1): tying it to
# batch_size would multiply the effective batch size by batch_size again.
trainer = pl.Trainer(
    logger=logger,
    callbacks=[checkpoint_callback],
    max_epochs=epochs,
    # Use the GPU only when it is both requested via `device` and available.
    accelerator='gpu' if torch.cuda.is_available() and device == 'cuda' else 'cpu',
    precision=precision_model,
    detect_anomaly=False,
)
trainer.fit(model, datamodule=dm)

# Persist the weights of the model as it stands after the final epoch.
torch.save(model.state_dict(), model_name)
print(f"Model saved to {model_name}")

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name    | Type   | Params | Mode 
-------------------------------------------
0 | layer_1 | Linear | 896    | train
1 | layer_2 | Linear | 33.0 K | train
2 | layer_3 | Linear | 131 K  | train
3 | layer_4 | Linear | 513    | train
-------------------------------------------
166 K     Trainable params
0         Non-trainable params
166 K     Total params
0.664     Total estimated model params size (MB)
4         Modules in train mode
0         Modules in eval mode


Sanity Checking: |                                                                 | 0/? [00:00<?, ?it/s]

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/miniconda3/envs/ds/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/ds/lib/python3.12/multiprocessing/spawn.py", line 132, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'IsFraudDataset' on <module '__main__' (<class '_frozen_importlib.BuiltinImporter'>)>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/miniconda3/envs/ds/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/ds/lib/python3.12/multiprocessing/spawn.py", line 132, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

RuntimeError: DataLoader worker (pid(s) 20131, 20132, 20134, 20136, 20137, 20138, 20139) exited unexpectedly

In [None]:
# Start from a freshly initialised network, then overwrite its parameters with
# the weights stored under model_name.
model = IsFraud()
saved_weights = torch.load(model_name)
model.load_state_dict(saved_weights)

In [None]:
# Rebuild the network and restore the saved weights for inference.
# IsFraud is the model class defined in this notebook and the one whose
# state_dict is written to model_name by the training cell.
model = IsFraud()
# map_location places the loaded tensors on the configured device.
model.load_state_dict(torch.load(model_name, map_location=device))
model.to(device)
# Switch to evaluation mode before running predictions.
model.eval()

In [None]:
# NOTE(review): this cell works on Titanic columns (Survived, Pclass, Sex, ...) and
# relies on `data_path`, `test_path` and `TitanicDataset`, none of which are defined
# in the visible notebook cells — it looks like a leftover from another notebook.
# The frame is kept under its own name so it does not overwrite the `df` used by
# the earlier cells.
titanic_df = pd.read_csv(data_path)
# .copy() makes the column assignments below act on an independent frame
# instead of a slice of the raw one.
titanic_df = titanic_df[["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]].copy()
titanic_df["Sex"] = titanic_df["Sex"].map({"male": 0, "female": 1})
titanic_df["Age"] = titanic_df["Age"].fillna(titanic_df["Age"].median())
titanic_df["Fare"] = titanic_df["Fare"].fillna(titanic_df["Fare"].median())

# setup target
X = titanic_df.drop(columns=["Survived"])
y = titanic_df["Survived"]

# crossvalidation K-Fold: only the first of the five folds is used
kf = KFold(n_splits=5, shuffle=True, random_state=42)
splits = list(kf.split(X))
train_idx, val_idx = splits[0]
train_dataset = TitanicDataset(X.iloc[train_idx], y.iloc[train_idx], data_precision=data_precision_torch)

test_df = pd.read_csv(test_path)
# Labels for the test rows come from a separate file.
test_df_surv = pd.read_csv('titanic/gender_submission.csv')
test_df = test_df[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]].copy()
test_df["Sex"] = test_df["Sex"].map({"male": 0, "female": 1})
# Missing test values are filled with the medians of the training frame.
test_df["Age"] = test_df["Age"].fillna(titanic_df["Age"].median())
test_df["Fare"] = test_df["Fare"].fillna(titanic_df["Fare"].median())
test_dataset = torch.tensor(test_df.values, dtype=data_precision_torch)

# One row per batch; the metrics loop thresholds a single logit at a time.
test_loader = DataLoader(
    test_dataset,
    batch_size=1,
    num_workers=num_workers
)

In [None]:
# simple test
# Hand-written sample rows (presumably "label -> feature row"):
# 0 -> [3, 0, 22, 1, 0, 7.25]
# 1 -> [1, 1, 38, 1, 0, 71.2833]
# NOTE(review): these look like Titanic passenger rows rather than the features
# selected in this notebook — confirm.
sample = torch.tensor([[3, 0, 22, 1, 0, 7.25]], device=device)
with torch.no_grad():
    out = model(sample)
# A negative logit maps to class 0, anything else to class 1.
preds = int(not (out < 0))  # torch.argmax(out, dim=0)
print(out, preds, sep='\n')

In [None]:
# make metrics
# Score the model one sample at a time (test_loader uses batch_size=1) and compare
# the thresholded logits with the labels stored in test_df_surv.
theshold = 0.0  # cut-off applied to the raw logit
y_true, y_pred = [], []
idx = 0

with torch.no_grad():
    for idx, x in enumerate(test_loader):
        x = x.to(device).type(torch.float32)
        outputs = model(x)
        preds = 0 if outputs < theshold else 1
        # Labels are looked up by running position in the label frame.
        y_true.append(test_df_surv['Survived'][idx])
        y_pred.append(preds)

# Leave idx equal to the number of scored samples.
idx = len(y_pred)

for label, metric in (
    ("Accuracy:", accuracy_score),
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
):
    print(label, metric(y_true, y_pred))