## Import modules

In [1]:
import seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Import data

In [2]:
# Read the raw transaction records into a DataFrame.
# NOTE(review): the file name is spelled "dard" — confirm it matches the file on disk.
data_path = "credit_dard_data.csv"
transactions = pd.read_csv(data_path)

In [3]:
# Peek at the first five rows to see the column layout
transactions.head(5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


## Clean data

In [20]:
# Null check: with show_counts=True every column reports its non-null count,
# which should equal the total number of rows if nothing is missing.
transactions.info(show_counts=True)

Data contains no null values

In [22]:
# Duplicate check: count rows that exactly repeat an earlier row.
# A result of 0 means there is nothing for drop_duplicates() to remove.
transactions.duplicated().sum()

Data contains no duplicates

## Summary statistics

In [11]:
# Show floats with five fixed decimals instead of scientific notation
def _fixed_five_decimals(value):
    """Format a number as a fixed-point string with five decimal places."""
    return '%.5f' % value


pd.set_option('display.float_format', _fixed_five_decimals)

In [14]:
# Summary statistics (count, mean, std, quartiles, min/max) for the amount column
transactions["amount"].describe()

count    6362620.00000
mean      179861.90355
std       603858.23146
min            0.00000
25%        13389.57000
50%        74871.94000
75%       208721.47750
max     92445516.64000
Name: amount, dtype: float64

In [23]:
# List the distinct transaction types present in the data
transactions["type"].unique()

array(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'],
      dtype=object)

In [33]:
# Show the column names currently on the frame.
# NOTE(review): the recorded output lists isPayment and isMovement, which are only
# created in later cells, so this cell was executed out of order; on a fresh
# top-to-bottom run those two columns would not appear here yet.
transactions.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud', 'isPayment', 'isMovement'],
      dtype='object')

## Define columns

Column stating whether a transaction is a payment (type `PAYMENT` or `DEBIT`)

In [30]:
# 1 when the transaction type is PAYMENT or DEBIT, otherwise 0.
# A vectorised membership test replaces the per-row Python lambda; the cast to
# int64 keeps the same 0/1 integer column that the apply-based version produced.
transactions["isPayment"] = transactions["type"].isin(["PAYMENT", "DEBIT"]).astype("int64")

Column stating whether a transaction is a movement of funds (type `TRANSFER` or `CASH_OUT`)

In [31]:
# 1 when the transaction type is TRANSFER or CASH_OUT, otherwise 0.
# A vectorised membership test replaces the per-row Python lambda; the cast to
# int64 keeps the same 0/1 integer column that the apply-based version produced.
transactions["isMovement"] = transactions["type"].isin(["TRANSFER", "CASH_OUT"]).astype("int64")

Column stating the absolute difference between the origin and destination accounts' balances before the transaction

In [35]:
# Gap between the origin and destination balances before the transaction, with the sign dropped
balance_gap = transactions["oldbalanceOrg"].sub(transactions["oldbalanceDest"])
transactions["accDiff"] = balance_gap.abs()

## Define features and labels

In [38]:
# Model inputs: the raw amount plus the three engineered columns
feature_columns = ["amount", "isPayment", "isMovement", "accDiff"]
features = transactions[feature_columns]

In [39]:
# Prediction target: the isFraud column
label = transactions.loc[:, "isFraud"]

## Split data into training and test sets

In [40]:
# Hold out 30% of the rows for testing. random_state pins the shuffle so the
# split, and every score reported further down, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.3, random_state=42
)

## Normalize the data

In [44]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so no test-set information leaks into the fit.
# NOTE: X_train and X_test are overwritten with their scaled versions; re-running
# this cell without re-running the split would rescale already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Fit logistic regression model

In [45]:
# Fit a logistic regression with default settings on the scaled training data;
# fit() returns the estimator itself, so it can be assigned directly.
model = LogisticRegression().fit(X_train, y_train)
model

LogisticRegression()

## Score model

In [46]:
# Mean accuracy on the scaled training rows
train_accuracy = model.score(X_train, y_train)
print(train_accuracy)

0.9986937097341302


The model scored about 99.87% accuracy on the training data

In [47]:
# Mean accuracy on the held-out, scaled test rows
test_accuracy = model.score(X_test, y_test)
print(test_accuracy)

0.9986944581529831


The model scored about 99.87% accuracy on the test data

## Evaluate model coefficients

In [51]:
# Fitted coefficients, one per feature column
coefficients = model.coef_
print(coefficients)

[[ 0.21975937 -0.99219857  3.63369013 -0.66699752]]


In [52]:
# Feature names, in the same order as the coefficient values printed by model.coef_
features.columns

Index(['amount', 'isPayment', 'isMovement', 'accDiff'], dtype='object')

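Because the features were standardized before fitting, the coefficient magnitudes can be compared directly:

- isMovement had the largest effect, with a coefficient of about 3.6
- amount had the smallest effect, with a coefficient of about 0.2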

## Predict with the model

In [56]:
# Four hand-written sample transactions; each row is ordered as
# [amount, isPayment, isMovement, accDiff] to match the feature columns.
transaction1, transaction2, transaction3, transaction4 = (
    np.array(row)
    for row in (
        [123456.78, 0.0, 1.0, 54670.1],
        [98765.43, 1.0, 0.0, 8524.75],
        [543678.31, 1.0, 0.0, 510025.5],
        [11111.11, 0.0, 1.0, 42069.11],
    )
)

In [62]:
# Stack the four 1-D samples row-wise into a single 2-D array (one transaction per row)
sample_rows = (transaction1, transaction2, transaction3, transaction4)
sample_transactions = np.stack(sample_rows, axis=0)

As the model was trained on scaled feature data, we must also scale the data we're making predictions on.

In [63]:
#scale prediction data
# Wrap the raw array in a DataFrame that carries the training column names, so a
# scaler fitted on a DataFrame receives matching feature names instead of a bare array.
# NOTE: this overwrites sample_transactions; re-running this cell on its own
# would apply the scaling a second time.
sample_transactions = scaler.transform(
    pd.DataFrame(sample_transactions, columns=features.columns)
)

In [64]:
# Predicted class for each scaled sample transaction
predicted_labels = model.predict(sample_transactions)
predicted_labels

array([0, 0, 0, 0])

In [65]:
# View the per-class probabilities behind those predictions
# (one row per sample; columns follow the order of model.classes_)
class_probabilities = model.predict_proba(sample_transactions)
class_probabilities

array([[9.96828381e-01, 3.17161866e-03],
       [9.99999741e-01, 2.58600154e-07],
       [9.99999721e-01, 2.79155571e-07],
       [9.96946789e-01, 3.05321143e-03]])

None of the sample transactions were flagged as fraud; the highest predicted fraud probability was about 0.3%.