In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the PaySim transaction log and show which columns it provides.
paysim_csv = "/kaggle/input/paysim1/PS_20174392719_1491204439457_log.csv"
df = pd.read_csv(paysim_csv)
df.columns

step - maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (30 days simulation).

type - CASH_IN, CASH_OUT, DEBIT, PAYMENT and TRANSFER.

amount - amount of the transaction in local currency.

nameOrig - customer who started the transaction

oldbalanceOrg - initial balance before the transaction

newbalanceOrig - new balance after the transaction

nameDest - customer who is the recipient of the transaction

oldbalanceDest - initial balance of the recipient before the transaction. Note that there is no information for customers whose names start with M (Merchants).

newbalanceDest - new balance of the recipient after the transaction. Note that there is no information for customers whose names start with M (Merchants).

isFraud - These are the transactions made by the fraudulent agents inside the simulation. In this specific dataset, the fraudulent agents aim to profit by taking control of customers' accounts and trying to empty the funds by transferring them to another account and then cashing out of the system.

isFlaggedFraud - The business model aims to control massive transfers from one account to another and flags illegal attempts. An illegal attempt in this dataset is an attempt to transfer more than 200.000 in a single transaction.

About 0.1% of the recorded transactions are fraudulent.

In [None]:
# Share of all transactions that are labelled as fraud.
# Rows are counted directly: DataFrame.count() returns one non-null count per
# column, which turned the ratio into a Series of repeated values.
fraudcount = int((df["isFraud"] == 1).sum())
totalcount = len(df)
fraudcount / totalcount

In the first 17 days, only 0.07% of the transactions are fraudulent.

In [None]:
# Fraud rate restricted to the first 17 days (one step is one hour).
# The bound is inclusive so the last hour of day 17 is kept, matching the
# train/test split later in the notebook, which treats step <= 15*24 as the
# first 15 days.
first17 = df[df["step"] <= 17 * 24]
fraudcount = int((first17["isFraud"] == 1).sum())
totalcount = len(first17)
fraudcount / totalcount

Fraudulent transactions only occur in the types 'TRANSFER' and 'CASH_OUT'.

In [None]:
# Keep only the rows labelled as fraud and list the transaction types they use.
is_fraud_row = df["isFraud"].eq(1)
frauds = df[is_fraud_row]
frauds["type"].unique()

In [None]:
# Number of fraudulent transactions per simulated day.
# Floor division keeps the 24 hourly steps of a day in the same bucket;
# round(step / 24) put the second half of each day into the following one.
# Assumes steps are numbered from 1 — TODO confirm against df["step"].min().
# assign() returns a new frame, so nothing is written into a slice of df.
frauds = frauds.assign(day=(frauds["step"] - 1) // 24 + 1)
frauds_by_day = frauds.groupby("day").size().to_frame("count")
frauds_by_day.plot(kind="bar", figsize=(10, 5), title="Number of frauds per day")

We observed that the flagged frauds do not include any "CASH_OUT" transactions, so we hope to detect those with our model.

In [None]:
# Transactions that were flagged by the business rule and are truly fraudulent.
flagged_and_fraud = df["isFlaggedFraud"].eq(1) & df["isFraud"].eq(1)
correctlyFlaggedFraud = df[flagged_and_fraud]
correctlyFlaggedFraud["type"].unique()

All flagged frauds are actual fraud cases.

In [None]:
# Transactions that were flagged by the business rule but are not fraudulent.
flagged_not_fraud = df["isFlaggedFraud"].eq(1) & df["isFraud"].eq(0)
incorrectlyFlaggedFraud = df[flagged_not_fraud]
incorrectlyFlaggedFraud["type"].unique()

# Distribution of Transaction type column
(credits to https://www.kaggle.com/pritampaul360/credit-card-fraud-detection-random-forest)


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_theme(style="darkgrid")

# Count the transactions per type once and reuse the result for the printout
# and for the bar chart.
type_counts = df.type.value_counts()
print(type_counts)

f, ax = plt.subplots(1, 1, figsize=(8, 8))
type_counts.plot(kind='bar', title="Transaction type", ax=ax, figsize=(8,8))
plt.ticklabel_format(style='plain', axis='y')
# Write the thousands-separated count at the top-left corner of each bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f"{int(bar_height):,d}", (bar.get_x(), bar_height))
plt.show()

# Feature Engineering

In [None]:
# Working set: only TRANSFER and CASH_OUT rows, the two types in which fraud occurs.
# .copy() makes df_new an independent frame, so the column assignments below and
# in later cells modify df_new itself instead of a slice of df
# (avoids SettingWithCopyWarning and ambiguous writes).
df_new = df.loc[df["type"].isin(["TRANSFER", "CASH_OUT"])].copy()

# Balance discrepancies: each is zero when the recorded old/new balances are
# consistent with the transferred amount.
df_new["errorbalanceOrg"] = df_new["newbalanceOrig"] + df_new["amount"] - df_new["oldbalanceOrg"]
df_new["errorbalanceDest"] = df_new["oldbalanceDest"] + df_new["amount"] - df_new["newbalanceDest"]

In [None]:
# Destination-side balances next to the derived error column, rounded to 5 decimals.
dest_balance_cols = ["oldbalanceDest", "amount", "newbalanceDest", "errorbalanceDest"]
df_new[dest_balance_cols].round(5)

In [None]:
# Correlation between the origin-side balance error and the fraud label.
error_org = df_new["errorbalanceOrg"]
error_org.corr(df_new["isFraud"])

In [None]:
# Correlation between the destination-side balance error and the fraud label.
error_dest = df_new["errorbalanceDest"]
error_dest.corr(df_new["isFraud"])

In [None]:
# Rows whose origin-side balance error is non-zero, counted per fraud label.
dfErrorsOrg = df_new[df_new["errorbalanceOrg"].ne(0)]
dfErrorsOrg.groupby("isFraud")[["errorbalanceOrg"]].count()

In [None]:
# Rows whose origin-side balance error is exactly zero, counted per fraud label.
dfErrorsOrg = df_new[df_new["errorbalanceOrg"].eq(0)]
dfErrorsOrg.groupby("isFraud")[["errorbalanceOrg"]].count()

In [None]:
# Share of each fraud label within the current dfErrorsOrg selection.
org_labels = dfErrorsOrg['isFraud']
org_labels.value_counts() / org_labels.count()

In [None]:
# Rows whose destination-side balance error is non-zero, counted per fraud label.
dfErrorsDest = df_new[df_new["errorbalanceDest"].ne(0)]
dfErrorsDest.groupby("isFraud")[["errorbalanceDest"]].count()

In [None]:
# Rows whose destination-side balance error is strictly positive, counted per fraud label.
dfErrorsDest = df_new[df_new["errorbalanceDest"].gt(0)]
dfErrorsDest.groupby("isFraud")[["errorbalanceDest"]].count()

In [None]:
# Share of each fraud label within the current dfErrorsDest selection.
dest_labels = dfErrorsDest['isFraud']
dest_labels.value_counts() / dest_labels.count()

Interestingly, when there is NO error balance, the likelihood of the transaction being a fraud case is higher.

In [None]:
# Rows whose destination-side balance error is exactly zero, counted per fraud label.
dfErrorsDest = df_new[df_new["errorbalanceDest"].eq(0)]
dfErrorsDest.groupby("isFraud")[["errorbalanceDest"]].count()

It was observed that frauds occur when: Amount == oldbalanceOrg

In [None]:
# Flag transactions whose amount equals the origin's balance before the
# transaction, then measure how strongly that flag correlates with fraud.
amount_equals_balance = df_new["amount"].eq(df_new["oldbalanceOrg"])
df_new["isEqual"] = amount_equals_balance
df_new['isEqual'].corr(df_new['isFraud'])

In [None]:
# Fraud-label shares among transactions whose amount is a whole multiple of 1000.
is_round_amount = df_new['amount'] % 1000 == 0
round_amt = df_new[is_round_amount]
round_labels = round_amt['isFraud']
round_labels.value_counts() / round_labels.count()

In [None]:
# Rule-based prediction: flag a transaction when the amount equals the origin's
# old balance, or when the amount is a whole multiple of 1000.
# (A | (B & not A) is logically the same as A | B, so the rule is written in
# the shorter form.)
matches_balance = df_new['isEqual'] == 1
is_round_amount = df_new['amount'] % 1000 == 0
df_new['rule_pred'] = matches_balance | is_round_amount
df_new['rule_pred'].corr(df_new['isFraud'])

In [None]:
# Fraudulent transactions that the isEqual rule does not catch (isEqual is false).
nonEqualFraud = df_new[(df_new['isFraud'] == 1) & (df_new['isEqual'] == 0)]
# Of those, keep the ones whose amount is a whole multiple of 1000.
round_amt = nonEqualFraud[nonEqualFraud['amount']%1000 == 0]
# NOTE(review): every row in round_amt was already filtered to isFraud == 1, so
# this ratio looks like it can only report 100% fraud. If the goal was the share
# of round amounts among the non-equal frauds, len(round_amt) / len(nonEqualFraud)
# may be what was intended — confirm with the author.
round_amt['isFraud'].value_counts()/round_amt['isFraud'].count()

zeroBalance is a weak indicator of fraud

In [None]:
# Transactions that leave the origin account with a zero balance, counted per fraud label.
zeroBalance = df_new[df_new["newbalanceOrig"].eq(0.0)]
zeroBalance.groupby("isFraud")[["newbalanceOrig"]].count()

In [None]:
# Hour within the day, derived from the hourly step counter (step modulo 24).
df_new["hour_of_day"] = df_new['step'].mod(24)

Are frauds happening on specific hours of a day?

% of frauds?

In [None]:
# Number of fraudulent transactions for each hour of the day.
# The count column keeps the name 'isFraud' because the merge cell below reads it.
fraud = df_new[df_new['isFraud'] == 1]
fraud_hours = fraud[['isFraud', 'hour_of_day']].groupby(['hour_of_day']).count()
# Title corrected from the misspelled "Fradulation transactions across the day".
fraud_hours.plot(kind="bar", color="red", title="Fraudulent transactions across the day")

In [None]:
# Number of non-fraudulent transactions for each hour of the day, stored in a
# column named "normal".
nofraud = df_new[df_new['isFraud'] == 0]
normal_hours = (
    nofraud.groupby('hour_of_day')[['isFraud']]
    .count()
    .rename(columns={"isFraud": "normal"})
)
normal_hours.plot(kind="bar")

In [None]:
# Combine the hourly fraud and non-fraud counts and plot the fraud proportion.
mergeHours = pd.merge(fraud_hours, normal_hours, on="hour_of_day")
# A proportion divides by ALL transactions in the hour (fraud + normal).
# Dividing by the normal count alone gives the odds, which overstate the share
# in hours where fraud is a large part of the traffic.
hourly_total = mergeHours['isFraud'] + mergeHours['normal']
mergeHours["fraudProportion"] = mergeHours['isFraud'] / hourly_total
mergeHours["fraudProportion"].plot(kind="bar", title="Proportion of fraudulent transactions across the day")

New feature of probability of destination

In [None]:
# Per-destination counts of non-fraud (column 0) and fraud (column 1) transactions,
# one row per nameDest.
# The label column is deliberately NOT recast to a categorical dtype: that cast
# changed df_new in place and leaked into every later cell that reads isFraud
# (correlations, y_train / y_test). Grouping on the integer label produces the
# same table without that side effect.
fraudDest = df_new.groupby(['nameDest', 'isFraud']).size().unstack(fill_value=0)
fraudDest

In [None]:
# Fraction of each destination's transactions that are fraudulent.
fraudDest1 = fraudDest.reset_index()
# .copy() gives fraudDest2 its own data, so adding fraud_prob below does not
# write into a slice of fraudDest1 (avoids SettingWithCopyWarning).
fraudDest2 = fraudDest1[['nameDest', 0, 1]].copy()
fraudDest2['fraud_prob'] = fraudDest2[1] / (fraudDest2[0] + fraudDest2[1])

In [None]:
# Transactions per destination account; only the first 200 destinations are plotted.
dest_counts = df_new.groupby("nameDest")[['step']].count()
dest = dest_counts.rename(columns={'step': 'count'})
dest.head(200).plot(kind="bar", figsize=(20,5), title="Number of Transactions per Destination")

In [None]:
# dest_prob: the share of all transactions in df_new that go to each destination,
# mapped back onto every row through its nameDest.
total_destinations = df_new["nameDest"].count()
dest_counts = df_new.groupby("nameDest")[['step']].count()
dest_counts["dest_prob"] = dest_counts['step'] / total_destinations
dest_dict = dest_counts["dest_prob"].to_dict()
df_new["dest_prob"] = df_new["nameDest"].map(dest_dict)

Correlation between commonality of destination and probability of fraudulent transaction is 0.06 (low)

In [None]:
# Join per-destination fraud probability with per-destination transaction counts
# and correlate the two.
destCounts = dest_counts.reset_index()
mergeDest = fraudDest2.merge(destCounts, on="nameDest")
mergeDest['fraud_prob'].corr(mergeDest['step'])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the textual transaction type with integer codes.
enc = LabelEncoder()
enc.fit(df_new['type'])
df_new['type'] = enc.transform(df_new['type'])

# Modelling

In [None]:
pip install pycaret==2.3.5


In [None]:
from pycaret.anomaly import *

How is the distribution of transactions across days?

In [None]:
# Number of transactions per simulated day.
# Floor division keeps the 24 hourly steps of a day in the same bucket;
# round(step / 24) put the second half of each day into the following one.
# Assumes steps are numbered from 1 — TODO confirm against df["step"].min().
df['day'] = (df['step'] - 1) // 24 + 1
dayCount = df.groupby('day')[['step']].count()
dayCount.plot(kind = 'bar', title = "Distribution of Transactions across days")

Manual train/test split: the first 15 days are used for training and the remaining days for testing.

In [None]:
from sklearn import preprocessing

# Feature columns fed to the anomaly detector, defined once so the training and
# test matrices cannot drift apart. (A wider variant additionally used
# "errorbalanceOrg", "errorbalanceDest", "hour_of_day" and "dest_prob".)
feature_cols = ["amount", "oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest", "isEqual", "type"]

# Time-based split: steps up to 15*24 train the model, later steps are held out.
df_train = df_new[df_new["step"] <= 15*24]
x_train = preprocessing.normalize(df_train[feature_cols])
# astype(int) yields a plain 0/1 column vector even if isFraud carries a
# categorical dtype at this point.
y_train = df_train[["isFraud"]].astype(int).values

df_test = df_new[df_new["step"] > 15*24]
# The test rows get the same row-wise normalisation as the training rows.
# Previously x_test was left on the raw scale, so the detector scored inputs
# from a different feature space than the one it was fitted on.
x_test = preprocessing.normalize(df_test[feature_cols])
y_test = df_test[["isFraud"]].astype(int).values

In [None]:
# Initialise the PyCaret anomaly-detection experiment on the normalised training
# features. session_id fixes PyCaret's random seed so the fitted detector, and
# therefore every metric below, is reproducible across Restart & Run All.
anom = setup(data = pd.DataFrame(x_train),
             silent = True,
             session_id = 42)

In [None]:
# Create the 'iforest' detector through PyCaret's create_model.
# NOTE(review): fraction = 0.001 presumably mirrors the ~0.1% overall fraud rate
# noted at the top of the notebook; the rate inside df_new (TRANSFER / CASH_OUT
# only) may differ — confirm.
iso_model = create_model(model = 'iforest', fraction = 0.001)

In [None]:
# Predict a label for every held-out row; the evaluation cell compares these
# predictions against y_test.
predictionIF = iso_model.predict(x_test)


In [None]:
from sklearn.metrics import average_precision_score
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score, roc_curve, auc, precision_score

# Evaluate the detector on the held-out period.
# Labels and predictions are flattened to 1-D integer vectors, the shape the
# sklearn metric functions expect (y_test is an (n, 1) column).
y_true = np.asarray(y_test).ravel().astype(int)
y_pred = np.asarray(predictionIF).ravel().astype(int)

cmIF = confusion_matrix(y_true, y_pred)
CR = classification_report(y_true, y_pred)

# A ROC curve needs a continuous score. Feeding it the hard 0/1 predictions
# collapses the curve to a single operating point, so the AUC is computed from
# the detector's raw anomaly scores (larger = more anomalous) instead.
anomaly_scores = iso_model.decision_function(x_test)
fpr, recall, thresholds = roc_curve(y_true, anomaly_scores)
AUC = auc(fpr, recall)

results = {"Confusion Matrix":cmIF,"Classification Report":CR,"Area Under Curve":AUC}

for measure, value in results.items():
    print(measure,": \n",value)

In [None]:
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline

# Draw the confusion matrix as an annotated heatmap on an explicit Axes object.
fig, ax = plt.subplots(figsize=(8, 5))
sn.heatmap(cmIF, annot=True, cmap='Blues', fmt='g', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')

plt.show()