## Fraud Detection using Machine Learning
##### using a logistic regression model to predict binary outcome of wheteher fraud is detected in a statement

In [38]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [25]:
# USING THE FOLLOWING FEATURES:
# category, amt, gender, state, city_pop, job, dob, is_fraud

data = pd.read_csv('fraudTrain.csv')

In [26]:
# Convert dob to age

data['dob'] = pd.to_datetime(data['dob'])

# Calculate age
current_year = pd.to_datetime('now').year
data['age'] = current_year - data['dob'].dt.year

# Drop 'dob' and replace with 'age' for column clarity
data.drop(['dob'], axis=1, inplace=True)

In [27]:
# Split data into training and test sets
X = data[['category', 'amt', 'gender', 'state', 'city_pop', 'job', 'age']]
y = data['is_fraud']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)


In [30]:
# Separate the features that require encoding
string_features = ['category', 'gender', 'job', 'state']
numeric_features = ['amt', 'city_pop', 'age']

# Create column transformer to encode

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('str', OneHotEncoder(handle_unknown='ignore'), string_features)
    ]
)

X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)


In [33]:
# Get the number of features after preprocessing
num_features = X_train_processed.shape[1]

# Build the neural network model
model = Sequential([
    Input(shape=(num_features,)),  # Define input layer explicitly
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [35]:
# Define early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True) # Stop if no large change in 5 consecutive epochs

# Train the model
history = model.fit(
    X_train_processed, y_train,
    epochs=10,  # Number of epochs
    batch_size=32,  # Batch size for training
    validation_data=(X_val_processed, y_val),
    callbacks=[early_stopping]  # Use early stopping
)

Epoch 1/10
[1m28365/28365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 1ms/step - accuracy: 0.9969 - loss: 0.0112 - val_accuracy: 0.9967 - val_loss: 0.0113
Epoch 2/10
[1m28365/28365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 1ms/step - accuracy: 0.9973 - loss: 0.0102 - val_accuracy: 0.9972 - val_loss: 0.0105
Epoch 3/10
[1m28365/28365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 1ms/step - accuracy: 0.9975 - loss: 0.0097 - val_accuracy: 0.9970 - val_loss: 0.0109
Epoch 4/10
[1m28365/28365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 1ms/step - accuracy: 0.9976 - loss: 0.0090 - val_accuracy: 0.9972 - val_loss: 0.0105
Epoch 5/10
[1m28365/28365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 1ms/step - accuracy: 0.9978 - loss: 0.0087 - val_accuracy: 0.9969 - val_loss: 0.0121
Epoch 6/10
[1m28365/28365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 1ms/step - accuracy: 0.9977 - loss: 0.0086 - val_accuracy: 0.9973 - val_loss: 0.010

In [39]:
# Evaluate the model on validation data
val_loss, val_accuracy = model.evaluate(X_val_processed, y_val, verbose=2)

print(f'Validation Loss: {val_loss}')
print(f'Validation Accuracy: {val_accuracy}')

# Predict on the validation data
y_pred_prob = model.predict(X_val_processed)
y_pred = (y_pred_prob > 0.5).astype(int)  # Convert probabilities to binary predictions

# Evaluate the model
print('Confusion Matrix:')
print(tf.math.confusion_matrix(y_val, y_pred))

print('\nClassification Report:')
print(classification_report(y_val, y_pred))

print('Accuracy:', accuracy_score(y_val, y_pred))

12157/12157 - 6s - 506us/step - accuracy: 0.9972 - loss: 0.0101
Validation Loss: 0.010088072158396244
Validation Accuracy: 0.9972365498542786
[1m12157/12157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 867us/step
Confusion Matrix:
tf.Tensor(
[[386349    369]
 [   706   1579]], shape=(2, 2), dtype=int32)

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    386718
           1       0.81      0.69      0.75      2285

    accuracy                           1.00    389003
   macro avg       0.90      0.85      0.87    389003
weighted avg       1.00      1.00      1.00    389003

Accuracy: 0.9972365251681864
