In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


# Load dataset from CSV file
file_path = 'Datasets.csv' 
df = pd.read_csv(file_path)

# Handle missing values
numerical_columns = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
imputer = SimpleImputer(strategy='mean')
df[numerical_columns] = imputer.fit_transform(df[numerical_columns])

# Drop rows with missing values in the target variable (isFraud)
df.dropna(subset=['isFraud'], inplace=True)

# Separate features (X) and target variable (y)
X = df.drop(columns=['isFraud'])  
y = df['isFraud']  # Target variable

# Define preprocessing steps for different types of columns
numeric_features = X.select_dtypes(include=['float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Apply preprocessing transformations using ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess and transform the training data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Display the shapes of the preprocessed training and testing sets
print(f"X_train_preprocessed shape: {X_train_preprocessed.shape}")
print(f"X_test_preprocessed shape: {X_test_preprocessed.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")


X_train_preprocessed shape: (8100, 13529)
X_test_preprocessed shape: (2025, 13529)
y_train shape: (8100,)
y_test shape: (2025,)


In [2]:
# Dictionary to store models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Neural Network': MLPClassifier()
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train_preprocessed, y_train)
    y_pred = model.predict(X_test_preprocessed)
    
    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    
    print(f"--- {name} Results ---")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-score: {f1}")
    print(f"ROC-AUC score: {roc_auc}")
    print("\n")


--- Logistic Regression Results ---
Accuracy: 0.9940740740740741
Precision: 1.0
Recall: 0.07692307692307693
F1-score: 0.14285714285714288
ROC-AUC score: 0.5384615384615384




  _warn_prf(average, modifier, msg_start, len(result))


--- Random Forest Results ---
Accuracy: 0.9935802469135803
Precision: 0.0
Recall: 0.0
F1-score: 0.0
ROC-AUC score: 0.5


--- Gradient Boosting Results ---
Accuracy: 0.9935802469135803
Precision: 0.5
Recall: 0.07692307692307693
F1-score: 0.13333333333333336
ROC-AUC score: 0.5382130295152164


--- Neural Network Results ---
Accuracy: 0.9930864197530864
Precision: 0.3333333333333333
Recall: 0.07692307692307693
F1-score: 0.125
ROC-AUC score: 0.5379645205688943




## Determine the best performing model

In [3]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Define a dictionary of classifiers to test
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

# Evaluate each classifier using cross-validation
results = {}

for name, clf in classifiers.items():
    scoring = {
        'Accuracy': 'accuracy',
        'Precision': 'precision',
        'Recall': 'recall',
        'F1-score': 'f1',
        'ROC-AUC': 'roc_auc'
    }
    scores = cross_validate(clf, X_train_preprocessed, y_train, cv=5, scoring=scoring)
    results[name] = {
        'Accuracy': scores['test_Accuracy'].mean(),
        'Precision': scores['test_Precision'].mean(),
        'Recall': scores['test_Recall'].mean(),
        'F1-score': scores['test_F1-score'].mean(),
        'ROC-AUC': scores['test_ROC-AUC'].mean()
    }

# Print results
print("Cross-validation scores:")
for name, scores in results.items():
    print(f"{name}:")
    for metric, score in scores.items():
        print(f"{metric}: {score:.4f}")
    print()


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Choose the best-performing model 
model = GradientBoostingClassifier()
model.fit(X_train_preprocessed, y_train)
y_pred = model.predict(X_test_preprocessed)

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Not Fraud', 'Fraud'], yticklabels=['Not Fraud', 'Fraud'])
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc

# Calculate ROC curve and ROC AUC score
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()


In [None]:
import joblib

# Save the trained model
joblib.dump(model, 'fraud_detection_model.pkl')
