In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

In [2]:
data = pd.read_csv('transactions_train.csv')

In [3]:
# Removing the null values
df = data.dropna()

# Checking for null values
print(df.isnull().sum())

# Shows the shape of the dataset
print("(No. of rows, No. of columns) -->", df.shape)

# Runs the first five rows of the dataset
df.head()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrig    0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
dtype: int64
(No. of rows, No. of columns) --> (6351193, 10)


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrig,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0


In [4]:
# Check the distribution of data
print (df['isFraud'].value_counts(),'\n')
print(pd.value_counts(df.isFraud, normalize=True))

0    6343476
1       7717
Name: isFraud, dtype: int64 

0    0.998785
1    0.001215
Name: isFraud, dtype: float64


In [None]:
!pip install imblearn


In [5]:
# Undersampling the normal transactions data
from imblearn.under_sampling import RandomUnderSampler

X = df.drop(['isFraud', 'type', 'nameOrig', 'nameDest'], axis = 1)
Y = df.isFraud

# Reducing the majority class equal to minority class 
rus = RandomUnderSampler(sampling_strategy=1)

X_res, Y_res = rus.fit_resample(X, Y)

print(X_res.shape, Y_res.shape)
print(pd.value_counts(Y_res))

ModuleNotFoundError: No module named 'imblearn'

In [13]:
# Defining the training, validation, & test split
def train_validation_test_split(
    X, Y, train_size=0.8, val_size=0.1, test_size=0.1, 
    random_state=None, shuffle=True):
  
    assert int(train_size + val_size + test_size + 1e-7) == 1
    
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state, shuffle=shuffle)
    
    X_train, X_val, Y_train, Y_val = train_test_split(
        X_train_val, Y_train_val,    test_size=val_size/(train_size+val_size), 
        random_state=random_state, shuffle=shuffle)
    
    return X_train, X_val, X_test, Y_train, Y_val, Y_test

In [14]:
X_train, X_val, X_test, Y_train, Y_val, Y_test = train_validation_test_split(
    X_res, Y_res, train_size=0.8, val_size=0.1, test_size=0.1, random_state=1)
model = LogisticRegression()
model.fit(X_train, Y_train)

LogisticRegression()

In [15]:
Y_pred = model.predict(X_val)
print(classification_report(Y_val, Y_pred))
print('Validation Dataset:','\n')
print('Accuracy score: ', accuracy_score(Y_val, Y_pred))
print('ROC AUC Score: ', roc_auc_score(Y_val, Y_pred))

              precision    recall  f1-score   support

           0       0.91      0.92      0.92       775
           1       0.92      0.91      0.91       769

    accuracy                           0.92      1544
   macro avg       0.92      0.92      0.92      1544
weighted avg       0.92      0.92      0.92      1544

Validation Dataset: 

Accuracy score:  0.9151554404145078
ROC AUC Score:  0.9151264734258987


In [16]:
# Testing the model on the test dataset
Y_pred = model.predict(X_test)
print(classification_report(Y_test, Y_pred))
print('Test Dataset:','\n')
print('Accuracy score on test data: ', accuracy_score(Y_test, Y_pred))
print('ROC AUC score:', roc_auc_score(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.89      0.92      0.91       770
           1       0.92      0.89      0.91       774

    accuracy                           0.91      1544
   macro avg       0.91      0.91      0.91      1544
weighted avg       0.91      0.91      0.91      1544

Test Dataset: 

Accuracy score on test data:  0.9067357512953368
ROC AUC score: 0.9067821067821067


In [17]:
# Testing the model to full (unsampled) dataset
Y_pred = model.predict(X_test)
print(classification_report(Y_test, Y_pred))
print('Full (Unsampled) Dataset:','\n')
print('Accuracy score on unsampled data: ', accuracy_score(Y_test, Y_pred))
print('ROC AUC score:', roc_auc_score(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.89      0.92      0.91       770
           1       0.92      0.89      0.91       774

    accuracy                           0.91      1544
   macro avg       0.91      0.91      0.91      1544
weighted avg       0.91      0.91      0.91      1544

Full (Unsampled) Dataset: 

Accuracy score on unsampled data:  0.9067357512953368
ROC AUC score: 0.9067821067821067


In [18]:
import pickle 
pickle.dump(model,open('logistic_regression_model.sav','wb'))