## Fraud Detection using Machine Learning
##### using a logistic regression model to predict binary outcome of wheteher fraud is detected in a statement

In [2]:
import pandas as pd
import joblib

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [4]:
# USING THE FOLLOWING FEATURES:
# amt, gender, state, city_pop, job, dob, is_fraud

data = pd.read_csv('./data/fraudTrain.csv')

In [5]:
# Split data into training and test sets
X = data[['amt', 'gender', 'state', 'city_pop', 'job', 'age']]
y = data['is_fraud']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)


In [6]:
# Separate the features that require encoding
string_features = ['gender', 'job', 'state']
numeric_features = ['amt', 'city_pop', 'age']

# Create column transformer to encode

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('str', OneHotEncoder(handle_unknown='ignore'), string_features)
    ]
)
preprocessor.fit(X_train)
joblib.dump(preprocessor, 'preprocessor.pkl')

X_train_processed = preprocessor.transform(X_train)
X_val_processed = preprocessor.transform(X_val)
print(X_train_processed)


  (0, 0)	-0.4375949400472785
  (0, 1)	-0.29312495132245125
  (0, 2)	-1.3086119749306617
  (0, 4)	1.0
  (0, 336)	1.0
  (0, 516)	1.0
  (1, 0)	-0.4108434284736928
  (1, 1)	-0.20241725804365865
  (1, 2)	-0.6178910278309626
  (1, 3)	1.0
  (1, 309)	1.0
  (1, 549)	1.0
  (2, 0)	-0.41026327521065126
  (2, 1)	-0.21646343056437217
  (2, 2)	-0.5027708699810127
  (2, 4)	1.0
  (2, 102)	1.0
  (2, 514)	1.0
  (3, 0)	-0.29268554723421686
  (3, 1)	0.08944128992208188
  (3, 2)	-0.5603309489059877
  (3, 3)	1.0
  (3, 73)	1.0
  (3, 544)	1.0
  (4, 0)	-0.28759309081418494
  :	:
  (907667, 547)	1.0
  (907668, 0)	-0.437917247415635
  (907668, 1)	-0.27699369664149986
  (907668, 2)	0.5333105506685357
  (907668, 3)	1.0
  (907668, 275)	1.0
  (907668, 505)	1.0
  (907669, 0)	0.30944907832931007
  (907669, 1)	-0.2939656034949916
  (907669, 2)	-0.2725305542811131
  (907669, 3)	1.0
  (907669, 444)	1.0
  (907669, 531)	1.0
  (907670, 0)	-0.4168383455251229
  (907670, 1)	-0.29452824471283356
  (907670, 2)	1.799632287017984


In [26]:
# Get the number of features after preprocessing
num_features = X_train_processed.shape[1]

# Build the neural network model
model = Sequential([
    Input(shape=(num_features,)),  # Define input layer explicitly
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [20]:
# Define early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True) # Stop if no large change in 5 consecutive epochs

# Train the model
history = model.fit(
    X_train_processed, y_train,
    epochs=10,  # Number of epochs
    batch_size=32,  # Batch size for training
    validation_data=(X_val_processed, y_val),
    callbacks=[early_stopping]  # Use early stopping
)

Epoch 1/10
[1m28365/28365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 2ms/step - accuracy: 0.9942 - loss: 0.0279 - val_accuracy: 0.9949 - val_loss: 0.0197
Epoch 2/10
[1m28365/28365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 2ms/step - accuracy: 0.9953 - loss: 0.0191 - val_accuracy: 0.9950 - val_loss: 0.0192
Epoch 3/10
[1m28365/28365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 2ms/step - accuracy: 0.9954 - loss: 0.0180 - val_accuracy: 0.9950 - val_loss: 0.0185
Epoch 4/10
[1m28365/28365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 2ms/step - accuracy: 0.9954 - loss: 0.0174 - val_accuracy: 0.9951 - val_loss: 0.0184
Epoch 5/10
[1m28365/28365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 2ms/step - accuracy: 0.9957 - loss: 0.0171 - val_accuracy: 0.9954 - val_loss: 0.0181
Epoch 6/10
[1m28365/28365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 2ms/step - accuracy: 0.9958 - loss: 0.0165 - val_accuracy: 0.9953 - val_loss: 0.018

In [21]:
# Evaluate the model on validation data
val_loss, val_accuracy = model.evaluate(X_val_processed, y_val, verbose=2)

print(f'Validation Loss: {val_loss}')
print(f'Validation Accuracy: {val_accuracy}')

# Predict on the validation data
y_pred_prob = model.predict(X_val_processed)
y_pred = (y_pred_prob > 0.5).astype(int)  # Convert probabilities to binary predictions

# Evaluate the model
print('Confusion Matrix:')
print(tf.math.confusion_matrix(y_val, y_pred))

print('\nClassification Report:')
print(classification_report(y_val, y_pred))

print('Accuracy:', accuracy_score(y_val, y_pred))

12157/12157 - 9s - 703us/step - accuracy: 0.9954 - loss: 0.0178
Validation Loss: 0.017841769382357597
Validation Accuracy: 0.9953547716140747
[1m12157/12157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1ms/step
Confusion Matrix:
tf.Tensor(
[[386276    442]
 [  1365    920]], shape=(2, 2), dtype=int32)

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    386718
           1       0.68      0.40      0.50      2285

    accuracy                           1.00    389003
   macro avg       0.84      0.70      0.75    389003
weighted avg       0.99      1.00      0.99    389003

Accuracy: 0.995354791608291


In [22]:
# Save the model
model.save('./model/fraud_detection_model.keras')