In [36]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

In [37]:
data = pd.read_csv('transactions_train.csv')

In [38]:
# Removing the null values
df = data.dropna()

# Checking for null values
print(df.isnull().sum())

# Shows the shape of the dataset
print("(No. of rows, No. of columns) -->", df.shape)

# Runs the first five rows of the dataset
df.head()

Unnamed: 0        0
step              0
amount            0
oldbalanceOrig    0
newbalanceOrig    0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
dtype: int64
(No. of rows, No. of columns) --> (15434, 8)


Unnamed: 0.1,Unnamed: 0,step,amount,oldbalanceOrig,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,0,373,120744.93,68997.0,0.0,0.0,120744.93,0
1,1,93,67903.23,634.0,68537.23,3528462.96,3460559.73,0
2,2,96,11196.39,1355806.66,1367003.05,493018.52,481822.13,0
3,3,354,1665.34,171563.59,169898.25,0.0,0.0,0
4,4,282,74813.74,0.0,0.0,1112970.42,1187784.16,0


In [39]:
# Check the distribution of data
print (df['isFraud'].value_counts(),'\n')
print(pd.value_counts(df.isFraud, normalize=True))

1    7717
0    7717
Name: isFraud, dtype: int64 

1    0.5
0    0.5
Name: isFraud, dtype: float64


In [40]:
df
X = df.drop(['Unnamed: 0','isFraud'],axis=1)
Y=df['isFraud']

In [41]:
# Defining the training, validation, & test split
def train_validation_test_split(
    X, Y, train_size=0.8, val_size=0.1, test_size=0.1, 
    random_state=None, shuffle=True):
  
    assert int(train_size + val_size + test_size + 1e-7) == 1
    
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state, shuffle=shuffle)
    
    X_train, X_val, Y_train, Y_val = train_test_split(
        X_train_val, Y_train_val,    test_size=val_size/(train_size+val_size), 
        random_state=random_state, shuffle=shuffle)
    
    return X_train, X_val, X_test, Y_train, Y_val, Y_test

In [42]:
X_train, X_val, X_test, Y_train, Y_val, Y_test = train_validation_test_split(
    X, Y, train_size=0.8, val_size=0.1, test_size=0.1, random_state=1)
model = LogisticRegression()
model.fit(X_train, Y_train)

LogisticRegression()

In [43]:
Y_pred = model.predict(X_val)
print(classification_report(Y_val, Y_pred))
print('Validation Dataset:','\n')
print('Accuracy score: ', accuracy_score(Y_val, Y_pred))
print('ROC AUC Score: ', roc_auc_score(Y_val, Y_pred))

              precision    recall  f1-score   support

           0       0.91      0.90      0.91       775
           1       0.90      0.91      0.91       769

    accuracy                           0.91      1544
   macro avg       0.91      0.91      0.91      1544
weighted avg       0.91      0.91      0.91      1544

Validation Dataset: 

Accuracy score:  0.9060880829015544
ROC AUC Score:  0.9060992491295775


In [44]:
# Testing the model on the test dataset
Y_pred = model.predict(X_test)
print(classification_report(Y_test, Y_pred))
print('Test Dataset:','\n')
print('Accuracy score on test data: ', accuracy_score(Y_test, Y_pred))
print('ROC AUC score:', roc_auc_score(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.89      0.90      0.90       770
           1       0.90      0.89      0.90       774

    accuracy                           0.90      1544
   macro avg       0.90      0.90      0.90      1544
weighted avg       0.90      0.90      0.90      1544

Test Dataset: 

Accuracy score on test data:  0.8957253886010362
ROC AUC score: 0.8957431457431457


In [45]:
# Testing the model to full (unsampled) dataset
Y_pred = model.predict(X_test)
print(classification_report(Y_test, Y_pred))
print('Full (Unsampled) Dataset:','\n')
print('Accuracy score on unsampled data: ', accuracy_score(Y_test, Y_pred))
print('ROC AUC score:', roc_auc_score(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.89      0.90      0.90       770
           1       0.90      0.89      0.90       774

    accuracy                           0.90      1544
   macro avg       0.90      0.90      0.90      1544
weighted avg       0.90      0.90      0.90      1544

Full (Unsampled) Dataset: 

Accuracy score on unsampled data:  0.8957253886010362
ROC AUC score: 0.8957431457431457


In [46]:
import pickle 
pickle.dump(model,open('logistic_regression_model.sav','wb'))