Loading the Required Libraries

In [15]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Load the data and examine rows and datatypes

In [16]:
transactions = pd.read_csv("transactions_mod.csv")
print(transactions.head(5))
print(transactions.info(5))

print(f"Fraudulent Transactions: {transactions.isFraud.sum()}")

   step      type      amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0   206  CASH_OUT    62927.08   C473782114           0.00            0.00   
1   380   PAYMENT    32851.57  C1915112886           0.00            0.00   
2   570  CASH_OUT  1131750.38  C1396198422     1131750.38            0.00   
3   184  CASH_OUT    60519.74   C982551468       60519.74            0.00   
4   162   CASH_IN    46716.01  C1759889425     7668050.60      7714766.61   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isPayment  \
0  C2096898696       649420.67       712347.75        0          0   
1   M916879292            0.00            0.00        0          1   
2  C1612235515       313070.53      1444820.92        1          0   
3  C1378644910        54295.32       182654.50        1          0   
4  C2059152908      2125468.75      2078752.75        0          0   

   isMovement  accountDiff  
0           1    649420.67  
1           0         0.00  
2           1    818679.85  


Summary statistics on the amount column

In [17]:
transactions['amount'].describe()

count    1.000000e+03
mean     5.373080e+05
std      1.423692e+06
min      0.000000e+00
25%      2.933705e+04
50%      1.265305e+05
75%      3.010378e+05
max      1.000000e+07
Name: amount, dtype: float64

Create new columns
1. isPayment = 1 when type is PAYMENT or DEBIT else 0.
2. isMovement = 1 when type is either CASH_OUT or TRANSFER, 0 otherwise.
3. accountDiff = absolute difference between oldbalanceOrg and oldbalanceDest

In [18]:
transactions['isPayment'] = transactions.apply(lambda row: 1 if row.type == "PAYMENT" or "DEBIT" else 0, axis=1)
transactions['isMovement'] = transactions.apply(lambda row: 1 if row.type == "CASH_OUT" or "TRANSFER" else 0, axis=1)
transactions['accountDiff'] = np.absolute(transactions.oldbalanceDest - transactions.oldbalanceOrg)

Set the features and label for the logistic regression model

In [19]:
features = transactions[['amount', 'isPayment', 'isMovement', 'accountDiff']]
label = transactions.isFraud

Splitting the Data to Training set to train the model and Test set to Test the accuracy of the model.
Train:Test = 7:3

In [20]:
x_train, x_test, y_train, y_test = train_test_split(features, label, train_size=0.7, test_size=0.3, random_state=6)

Normalizing the Feature Variables.

In [21]:
stdslr = StandardScaler()
train_features= stdslr.fit_transform(x_train)
test_features = stdslr.fit_transform(x_test)

Fitting the model to the training set.

In [22]:
logistic_model = LogisticRegression()
logistic_model.fit(train_features, y_train)

Scoring the model on Training set and Test set.

In [23]:
print(f"Model's Score on training data: {logistic_model.score(train_features, y_train)}")
print(f"Model's Score on test data: {logistic_model.score(test_features, y_test)}")

Model's Score on training data: 0.8314285714285714
Model's Score on test data: 0.8266666666666667


In [24]:
print(logistic_model.coef_)

[[ 3.80998231  0.          0.         -1.29931768]]


Creating new sample Transactions.

In [25]:
# new transaction data
transaction1 = np.array([123456.78, 0.0, 1.0, 54670.1])
transaction2 = np.array([98765.43, 1.0, 0.0, 8524.75])
transaction3 = np.array([543678.31, 1.0, 0.0, 510025.5])
transaction4 = np.array([694200.11, 1.0, 0.0, 120.0])

sample_transactions = np.stack([transaction1, transaction2, transaction3, transaction4])
print(sample_transactions)

[[1.2345678e+05 0.0000000e+00 1.0000000e+00 5.4670100e+04]
 [9.8765430e+04 1.0000000e+00 0.0000000e+00 8.5247500e+03]
 [5.4367831e+05 1.0000000e+00 0.0000000e+00 5.1002550e+05]
 [6.9420011e+05 1.0000000e+00 0.0000000e+00 1.2000000e+02]]


Predicting which sample transactions are fraudulent.

In [26]:
sample_transactions = stdslr.fit_transform(sample_transactions)

sample_predicted = logistic_model.predict(sample_transactions)
print(sample_predicted)
print(logistic_model.predict_proba(sample_transactions))

[0 0 0 1]
[[0.97508685 0.02491315]
 [0.97697064 0.02302936]
 [0.56968832 0.43031168]
 [0.00641164 0.99358836]]
