# Fraud Detection | Credit Card Transactions | Data Analysis & Exploration
-----

Dataset link: https://www.kaggle.com/datasets/kelvinobiri/credit-card-transactions

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

import warnings
warnings.filterwarnings('ignore')

# Load the raw transactions table; the path is relative to the notebook's
# working directory.
df = pd.read_csv('data/transactions.csv')
# Preview the first rows to check the columns parsed as expected.
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,8,CASH_OUT,158007.12,C424875646,0.0,0.0,C1298177219,474016.32,1618631.97,0
1,236,CASH_OUT,457948.3,C1342616552,0.0,0.0,C1323169990,2720411.37,3178359.67,0
2,37,CASH_IN,153602.99,C900876541,11160428.67,11314031.67,C608741097,3274930.56,3121327.56,0
3,331,CASH_OUT,49555.14,C177696810,10865.0,0.0,C462716348,0.0,49555.14,0
4,250,CASH_OUT,29648.02,C788941490,0.0,0.0,C1971700992,56933.09,86581.1,0


In [4]:
# Row count of the raw frame (one row per transaction record).
n_obs = len(df)
print(f"Number of observations in the dataset: {n_obs}")

Number of observations in the dataset: 199999


In [5]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199999 entries, 0 to 199998
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   step            199999 non-null  int64  
 1   type            199999 non-null  object 
 2   amount          199999 non-null  float64
 3   nameOrig        199999 non-null  object 
 4   oldbalanceOrg   199999 non-null  float64
 5   newbalanceOrig  199999 non-null  float64
 6   nameDest        199999 non-null  object 
 7   oldbalanceDest  199999 non-null  float64
 8   newbalanceDest  199999 non-null  float64
 9   isFraud         199999 non-null  int64  
dtypes: float64(5), int64(2), object(3)
memory usage: 15.3+ MB


In [7]:
# Missing values per column.
df.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
dtype: int64

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) of `step`.
# NOTE(review): `step` looks like a discrete time index - confirm its unit
# against the dataset description.
df['step'].describe()

count    199999.000000
mean        243.289836
std         141.800473
min           1.000000
25%         156.000000
50%         238.000000
75%         334.000000
max         741.000000
Name: step, dtype: float64

In [9]:
# Number of transactions per distinct `step` value, most frequent first.
df['step'].value_counts()

step
19     1624
187    1618
18     1520
163    1501
235    1487
       ... 
511       1
391       1
90        1
627       1
712       1
Name: count, Length: 524, dtype: int64

In [14]:
# Feature matrix: everything except the target and the two name columns
# (presumably high-cardinality account identifiers - confirm).
# `.copy()` keeps later edits to X from touching `df`.
X = df.drop(['isFraud', 'nameOrig', 'nameDest'], axis=1).copy()
# Binary target: 1 = fraudulent transaction.
y = df['isFraud']

In [15]:
X.columns

Index(['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
       'oldbalanceDest', 'newbalanceDest'],
      dtype='object')

In [19]:
# Compress the monetary columns with log(1 + x); the sample rows show values
# ranging from 0 to tens of millions.
#
# The transform reads from the untouched `df` rather than from `X`, so
# re-running this cell always yields the same X instead of logging the
# already-logged values a second time. `np.log1p` is the vectorized,
# numerically accurate form of log(1 + x).
monetary_columns = [
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'newbalanceDest',
    'oldbalanceDest',
]
X[monetary_columns] = np.log1p(df[monetary_columns])

# Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer 
from sklearn.metrics import log_loss, average_precision_score, auc
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [26]:
# Column-wise preprocessing:
#   * 'step' is standardized,
#   * 'type' is one-hot encoded (categories unseen at fit time encode as all zeros),
#   * every other column is passed through unchanged (remainder='passthrough').
# sparse_threshold=0 forces a dense array so the result can be wrapped in a DataFrame.
# The transformer names become prefixes of the output column names.
preprocessor = ColumnTransformer(transformers=[
    ('num_features', StandardScaler(), ['step']),
    ('cat_features', OneHotEncoder(handle_unknown='ignore'), ['type']),
], sparse_threshold=0, remainder='passthrough')

In [27]:
# Fit the preprocessor on X and transform it in a single pass, then wrap the
# dense result in a DataFrame labelled with the generated feature names.
# NOTE(review): the scaler/encoder are fitted on the full data set, before the
# train/test split performed in a later cell, so test-set statistics feed the
# scaling of 'step'. Consider fitting on the training rows only.
X_transformed_df = pd.DataFrame(
    data=preprocessor.fit_transform(X),
    columns=preprocessor.get_feature_names_out(),
)

X_transformed_df.head()

Unnamed: 0,num_features__step,cat_features__type_CASH_IN,cat_features__type_CASH_OUT,cat_features__type_DEBIT,cat_features__type_PAYMENT,cat_features__type_TRANSFER,remainder__amount,remainder__oldbalanceOrg,remainder__newbalanceOrig,remainder__oldbalanceDest,remainder__newbalanceDest
0,-1.659306,0.0,1.0,0.0,0.0,0.0,11.970402,0.0,0.0,13.068999,14.297093
1,-0.051409,0.0,1.0,0.0,0.0,0.0,13.034514,0.0,0.0,14.816294,14.971876
2,-1.454793,1.0,0.0,0.0,0.0,0.0,11.942133,16.227885,16.241554,15.001808,14.953769
3,0.618548,0.0,1.0,0.0,0.0,0.0,10.810861,9.293394,0.0,0.0,10.810861
4,0.047321,0.0,1.0,0.0,0.0,0.0,10.297184,0.0,0.0,10.94965,11.368848


In [28]:
# 80/20 hold-out split. `stratify=y` keeps the (very low) fraud rate the same
# in both parts; `random_state` makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed_df, y, stratify=y, random_state=42, test_size=0.2
)

In [30]:
# Baseline: logistic regression with class weights inversely proportional to
# class frequency ('balanced'), to counter the heavy class imbalance.
# NOTE(review): max_iter is left at its default and warnings are silenced
# globally at the top of the notebook, so a ConvergenceWarning would not be
# visible here - verify that the solver actually converged.
lr_model = LogisticRegression(fit_intercept=True, class_weight='balanced')
lr_model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,100


In [41]:
def get_model_metrics(model_train_preds, model_test_preds,
                      train_labels=None, test_labels=None):
    """Print log loss and PR-AUC (average precision) for the train and test splits.

    Parameters
    ----------
    model_train_preds : array-like
        Predicted probability of the positive class for the training rows.
    model_test_preds : array-like
        Predicted probability of the positive class for the test rows.
    train_labels : array-like, optional
        True labels for the training rows. Defaults to the notebook-level
        ``y_train`` so existing two-argument calls keep working.
    test_labels : array-like, optional
        True labels for the test rows. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # Fall back to the notebook globals only when the caller did not pass
    # labels explicitly; this keeps the function usable with other splits.
    if train_labels is None:
        train_labels = y_train
    if test_labels is None:
        test_labels = y_test

    train_log_loss = log_loss(y_pred=model_train_preds, y_true=train_labels)
    test_log_loss = log_loss(y_pred=model_test_preds, y_true=test_labels)
    train_pr_auc = average_precision_score(y_score=model_train_preds, y_true=train_labels)
    test_pr_auc = average_precision_score(y_score=model_test_preds, y_true=test_labels)
    message = f"""
        Log loss --------------------------------\n 
        Train: {train_log_loss:.6f} \t Test: {test_log_loss:.6f}\n
        PR-AUC ----------------------------------\n
        Train: {train_pr_auc:.6f} \t Test: {test_pr_auc:.6f}
    """
    print(message)

In [42]:
# predict_proba returns one column per class; column 1 is P(isFraud == 1).
lr_model_train_preds = lr_model.predict_proba(X_train)[:,1]
lr_model_test_preds = lr_model.predict_proba(X_test)[:,1]

# Report log loss and PR-AUC for both splits.
get_model_metrics(lr_model_train_preds, lr_model_test_preds)


        Log loss --------------------------------
 
        Train: 0.077369 	 Test: 0.080718

        PR-AUC ----------------------------------

        Train: 0.495921 	 Test: 0.609343
    


In [38]:
# Reference point for PR-AUC: the share of positive (fraud) rows in the data.
# A classifier that scores at random is expected to reach an average
# precision of about this prevalence.
N = len(X)
# isFraud is 0/1, so its sum is the number of fraud rows.
n_pos = df['isFraud'].sum()
trivial_pr_auc = n_pos / N
trivial_pr_auc

np.float64(0.0014100070500352503)

In [45]:
# Attach the logistic-regression fraud probability (rounded to 6 decimals) to
# every row of the raw frame for manual inspection.
# Note: this scores ALL rows, i.e. both the training and the test part, and
# it adds a column to `df` in place.
df['lr_model_probs'] = np.round(lr_model.predict_proba(X_transformed_df)[:,1], 6)
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,lr_model_probs
0,8,CASH_OUT,158007.12,C424875646,0.0,0.0,C1298177219,474016.32,1618631.97,0,0.0
1,236,CASH_OUT,457948.3,C1342616552,0.0,0.0,C1323169990,2720411.37,3178359.67,0,0.0
2,37,CASH_IN,153602.99,C900876541,11160428.67,11314031.67,C608741097,3274930.56,3121327.56,0,0.0
3,331,CASH_OUT,49555.14,C177696810,10865.0,0.0,C462716348,0.0,49555.14,0,0.091841
4,250,CASH_OUT,29648.02,C788941490,0.0,0.0,C1971700992,56933.09,86581.1,0,0.0


In [46]:
# All rows labelled as fraud, with their model scores.
df_pos = df[df['isFraud'] == 1]
df_pos.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,lr_model_probs
86,125,CASH_OUT,222097.11,C1167153094,222097.11,0.0,C744862329,383533.13,605630.23,1,0.940785
1060,735,CASH_OUT,243459.96,C1148037845,243459.96,0.0,C387251817,9459523.04,9702983.01,1,0.975906
1428,59,CASH_OUT,394388.09,C1802030448,394388.09,0.0,C707965723,1260564.49,1654952.58,1,0.886886
1832,350,TRANSFER,640648.06,C1231531045,640648.06,0.0,C1282254449,0.0,0.0,1,1.0
2142,495,TRANSFER,1301090.39,C1386844753,1301090.39,0.0,C808323881,0.0,0.0,1,1.0


In [47]:
# Fraud rows the logistic regression scores below 0.5, i.e. the false
# negatives at a 0.5 decision threshold.
df_pos[df_pos['lr_model_probs'] < 0.5]

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,lr_model_probs
14411,227,CASH_OUT,210.92,C332921972,210.92,0.0,C814473635,3437310.4,3437521.31,1,0.062965
171545,702,CASH_OUT,0.0,C1461113533,0.0,0.0,C1382150537,107777.02,107777.02,1,0.221227


In [51]:
# Non-fraud rows scored at or above 0.5, i.e. the false positives at a 0.5
# decision threshold.
df[(df['lr_model_probs'] >= 0.5) & (df['isFraud'] == 0)]

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,lr_model_probs
45,253,CASH_OUT,32295.19,C1769812322,659582.00,627286.81,C2115080811,0.00,32295.19,0,0.612448
79,305,CASH_OUT,7089.89,C645168383,180813.00,173723.11,C598228047,6200.00,13289.89,0,0.972656
109,402,TRANSFER,58108.25,C219128708,22340.00,0.00,C289496053,4374.73,62482.98,0,0.713840
132,372,CASH_OUT,41643.50,C1810627627,25482.00,0.00,C1503279690,0.00,41643.50,0,0.890173
163,374,CASH_OUT,14962.03,C1647261588,13587.00,0.00,C667140364,0.00,14962.03,0,0.980085
...,...,...,...,...,...,...,...,...,...,...,...
199850,138,CASH_OUT,35785.50,C347108043,34827.00,0.00,C704021390,2270740.59,2306526.10,0,0.621266
199869,213,CASH_OUT,63122.66,C768091065,61450.00,0.00,C48199427,72612.90,135735.56,0,0.967590
199907,228,CASH_IN,256.57,C306190295,5747300.28,5747556.85,C399130089,118755.71,118499.14,0,0.995741
199947,178,CASH_OUT,176546.74,C969398462,100058.00,0.00,C1079208050,25195.72,201742.46,0,0.784877


In [52]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [90]:
class CreditCardData(Dataset):
    """Map-style dataset pairing each feature row with its label.

    Parameters
    ----------
    X : pandas.DataFrame or array-like, shape (n_samples, n_features)
        Numeric feature table.
    y : pandas.Series or array-like, shape (n_samples,)
        Labels, aligned with ``X`` by position.

    Items are ``(features, label)`` pairs of ``torch.float`` tensors: a 1-D
    feature vector and a 0-dim label. They are views into tensors built once
    at construction time, so later changes to ``X`` / ``y`` are not reflected.
    """

    def __init__(self, X, y):
        # Keep the original inputs available under their original names.
        self.X = X
        self.y = y
        # Convert once up front. Doing a pandas `.iloc` lookup and building a
        # new tensor for every sample in __getitem__ dominates data-loading
        # time when the loader visits every row each epoch.
        self._features = torch.tensor(np.asarray(X), dtype=torch.float)
        self._labels = torch.tensor(np.asarray(y), dtype=torch.float)

    def __len__(self):
        return len(self._features)

    def __getitem__(self, index):
        # Positional lookup, matching the positional pairing of X and y.
        return self._features[index], self._labels[index]

# Mini-batch size shared by both loaders.
batch_size = 64

# Wrap the already-preprocessed train/test splits as torch datasets.
train_dataset = CreditCardData(X_train, y_train)
test_dataset = CreditCardData(X_test, y_test)

# Shuffle only the training data; the evaluation order is kept fixed.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [91]:
# Scratch: pull one training batch to inspect tensor shapes.
X_, y_ = next(iter(train_dataloader))

In [93]:
# Scratch: labels arrive as a 1-D batch; unsqueeze adds a trailing dimension
# so they become a column.
y_.unsqueeze(dim=1)

tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]])

In [113]:
class NeuralNetwork(nn.Module):
    """Small feed-forward binary classifier.

    Architecture: Linear(in_features -> 16) -> ReLU -> Linear(16 -> 1) -> Sigmoid,
    so the forward pass returns one probability in (0, 1) per input row.
    """

    def __init__(self, in_features):
        super().__init__()
        hidden_units = 16
        # Layers are created in network order and kept under the same
        # attribute name, so parameter names and the printed repr are unchanged.
        layers = [
            nn.Linear(in_features, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
            nn.Sigmoid(),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the sigmoid output of the layer stack for ``x``."""
        return self.linear_relu_stack(x)
    
# Seed PyTorch's global RNG before the weights are initialised so the run can
# be reproduced. Training-batch shuffling is also seeded from the global RNG
# when the loader has no explicit generator (as here). 42 matches the
# random_state used for the train/test split.
torch.manual_seed(42)

# One input unit per preprocessed feature column.
model = NeuralNetwork(in_features=X_transformed_df.shape[1])
print(model)

NeuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=11, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=1, bias=True)
    (3): Sigmoid()
  )
)


In [114]:
# Binary cross-entropy on probabilities; the model ends in a Sigmoid layer,
# so its outputs are already in (0, 1).
loss_fn = nn.BCELoss()
# Plain stochastic gradient descent with a fixed learning rate of 0.01.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

In [115]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    For every batch: forward pass, loss, backward pass, optimizer step, then
    gradient reset. The loss of every 1000th batch (starting with the first)
    is printed together with a running sample count. Returns nothing.
    """
    size = len(dataloader.dataset)
    model.train()
    for batch_idx, (features, targets) in enumerate(dataloader):
        # Labels get a trailing dimension before being compared with the
        # model output (presumably shaped (batch, 1) - see the model definition).
        loss = loss_fn(model(features), targets.unsqueeze(dim=1))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Only report on every 1000th batch.
        if batch_idx % 1000:
            continue
        seen = (batch_idx + 1) * len(features)
        print(f"loss: {loss.item():>7f} [{seen:>5d}/{size:>5d}]")


def test(dataloader, model, loss_fn):
    # size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval() # evaluation mode
    test_loss = 0
    with torch.no_grad():
        for X, y in dataloader:
            y = y.unsqueeze(dim=1)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            # correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    print(f"Test Error Avg loss: {test_loss:>8f}\n")

In [116]:
# Number of full passes over the training data.
epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1} \n ---------------------------------")
    # One optimisation pass, then report the average loss on the held-out split.
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1 
 ---------------------------------
loss: 0.891929 [   64/159999]
loss: 0.002381 [64064/159999]
loss: 0.002538 [128064/159999]
Test Error Avg loss: 0.012224

Epoch 2 
 ---------------------------------
loss: 0.003369 [   64/159999]
loss: 0.002181 [64064/159999]
loss: 0.069341 [128064/159999]
Test Error Avg loss: 0.010425

Epoch 3 
 ---------------------------------
loss: 0.105722 [   64/159999]
loss: 0.002236 [64064/159999]
loss: 0.001184 [128064/159999]
Test Error Avg loss: 0.009397

Epoch 4 
 ---------------------------------
loss: 0.002318 [   64/159999]
loss: 0.001538 [64064/159999]
loss: 0.002290 [128064/159999]
Test Error Avg loss: 0.008804

Epoch 5 
 ---------------------------------
loss: 0.003317 [   64/159999]
loss: 0.001440 [64064/159999]
loss: 0.003431 [128064/159999]
Test Error Avg loss: 0.008292

Epoch 6 
 ---------------------------------
loss: 0.001581 [   64/159999]
loss: 0.000887 [64064/159999]
loss: 0.000345 [128064/159999]
Test Error Avg loss: 0.007983

Epoc

In [117]:
# Score both splits with the trained network in a single forward pass each
# (evaluation mode, no gradient tracking), then print the same metrics as for
# the logistic-regression baseline.
model.eval()
with torch.no_grad():
    my_train_x = torch.tensor(X_train.values, dtype=torch.float)
    my_test_x = torch.tensor(X_test.values, dtype=torch.float)
    y_train_pred, y_test_pred = model(my_train_x), model(my_test_x)

get_model_metrics(model_train_preds=y_train_pred, model_test_preds=y_test_pred)


        Log loss --------------------------------
 
        Train: 0.007413 	 Test: 0.006823

        PR-AUC ----------------------------------

        Train: 0.057132 	 Test: 0.202364
    


In [83]:
# Scratch: confirm the reshaped label batch is (batch_size, 1).
# NOTE(review): this cell's execution count predates the cells above it -
# consider moving it next to the other batch-inspection cells or removing it.
y_.unsqueeze(dim=1).shape

torch.Size([64, 1])