In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
import mlflow
import mlflow.sklearn
import joblib

# Load the datasets
fraud_data = pd.read_csv('C:/Users/bam/Documents/Data/Fraud_Data.csv')
creditcard_data = pd.read_csv('C:/Users/bam/Documents/Data/creditcard.csv')

# Specify target and features for each dataset
X_fraud = fraud_data.drop(columns=['class'])
y_fraud = fraud_data['class']
X_creditcard = creditcard_data.drop(columns=['Class'])
y_creditcard = creditcard_data['Class']

# Convert date columns to datetime and extract year, month, day, hour 
for df in [X_fraud, X_creditcard]:
    for col in df.columns:
        if pd.api.types.is_string_dtype(df[col]) and 'date' in col.lower():
            df[col] = pd.to_datetime(df[col], errors='coerce')
            df['year'] = df[col].dt.year
            df['month'] = df[col].dt.month
            df['day'] = df[col].dt.day
            df['hour'] = df[col].dt.hour
            df.drop(columns=[col], inplace=True)

# Train-Test Split
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(X_fraud, y_fraud, test_size=0.3, random_state=42)
X_cc_train, X_cc_test, y_cc_train, y_cc_test = train_test_split(X_creditcard, y_creditcard, test_size=0.3, random_state=42)

# Normalize/Scale features
scaler = StandardScaler()
X_fraud_train = scaler.fit_transform(X_fraud_train.select_dtypes(include='number'))
X_fraud_test = scaler.transform(X_fraud_test.select_dtypes(include='number'))
X_cc_train = scaler.fit_transform(X_cc_train.select_dtypes(include='number'))
X_cc_test = scaler.transform(X_cc_test.select_dtypes(include='number'))

# Initialize Models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Multi-Layer Perceptron": MLPClassifier(max_iter=500)
}

# MLflow Tracking
mlflow.set_experiment("Fraud Detection Project")

# Function to save evaluation metrics to a CSV file
def save_metrics_to_csv(model_name, metrics, filename):
    df = pd.DataFrame(metrics, index=[0])
    df.to_csv(filename, mode='a', header=not pd.io.common.file_exists(filename), index=False)

# Function to save predictions to a CSV file
def save_predictions_to_csv(model_name, y_pred, filename):
    df = pd.DataFrame(y_pred, columns=['Predictions'])
    df['Model'] = model_name
    df.to_csv(filename, mode='a', header=not pd.io.common.file_exists(filename), index=False)

# Training and Evaluation Function
def train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name):
    with mlflow.start_run():
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # Prepare metrics for saving
        metrics = {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        }

        # Log model and metrics to MLflow
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("model_type", type(model).__name__)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

        # Prepare input example for model logging
        input_example = X_test[0:1]  # Use the first row as an example
        
        mlflow.sklearn.log_model(model, model_name, input_example=input_example)

        # Save the trained model to a joblib file
        joblib_file_path = f"C:/Users/bam/Desktop/Week-8/notebooks/trained_model_{model_name}.joblib"
        joblib.dump(model, joblib_file_path)

        # Save metrics to a CSV file
        metrics_file_path = "C:/Users/bam/Desktop/Week-8/notebooks/model_metrics.csv"
        save_metrics_to_csv(model_name, metrics, metrics_file_path)

        # Save predictions to a CSV file
        predictions_file_path = "C:/Users/bam/Desktop/Week-8/notebooks/predictions.csv"
        save_predictions_to_csv(model_name, y_pred, predictions_file_path)

        print(f"Model: {model_name}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"Saved model to: {joblib_file_path}")
        print(f"Saved metrics to: {metrics_file_path}")
        print(f"Saved predictions to: {predictions_file_path}")
        print("-" * 30)

# Train and evaluate models on Credit Card Data
print("Training on Credit Card Data...")
for model_name, model in models.items():
    train_and_evaluate(model, X_cc_train, X_cc_test, y_cc_train, y_cc_test, model_name)

# Train and evaluate models on Fraud Data
print("Training on Fraud Data...")
for model_name, model in models.items():
    train_and_evaluate(model, X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test, model_name)


Training on Credit Card Data...


  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 636.66it/s] 


Model: Logistic Regression
Accuracy: 0.9993
Precision: 0.8673
Recall: 0.6250
F1 Score: 0.7265
Saved model to: C:/Users/bam/Desktop/Week-8/notebooks/trained_model_Logistic Regression.joblib
Saved metrics to: C:/Users/bam/Desktop/Week-8/notebooks/model_metrics.csv
Saved predictions to: C:/Users/bam/Desktop/Week-8/notebooks/predictions.csv
------------------------------


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 350.19it/s]


Model: Decision Tree
Accuracy: 0.9992
Precision: 0.7333
Recall: 0.8088
F1 Score: 0.7692
Saved model to: C:/Users/bam/Desktop/Week-8/notebooks/trained_model_Decision Tree.joblib
Saved metrics to: C:/Users/bam/Desktop/Week-8/notebooks/model_metrics.csv
Saved predictions to: C:/Users/bam/Desktop/Week-8/notebooks/predictions.csv
------------------------------


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 466.93it/s] 


Model: Random Forest
Accuracy: 0.9996
Precision: 0.9244
Recall: 0.8088
F1 Score: 0.8627
Saved model to: C:/Users/bam/Desktop/Week-8/notebooks/trained_model_Random Forest.joblib
Saved metrics to: C:/Users/bam/Desktop/Week-8/notebooks/model_metrics.csv
Saved predictions to: C:/Users/bam/Desktop/Week-8/notebooks/predictions.csv
------------------------------


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 466.94it/s]


Model: Gradient Boosting
Accuracy: 0.9986
Precision: 0.8947
Recall: 0.1250
F1 Score: 0.2194
Saved model to: C:/Users/bam/Desktop/Week-8/notebooks/trained_model_Gradient Boosting.joblib
Saved metrics to: C:/Users/bam/Desktop/Week-8/notebooks/model_metrics.csv
Saved predictions to: C:/Users/bam/Desktop/Week-8/notebooks/predictions.csv
------------------------------


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 212.24it/s]


Model: Multi-Layer Perceptron
Accuracy: 0.9995
Precision: 0.8974
Recall: 0.7721
F1 Score: 0.8300
Saved model to: C:/Users/bam/Desktop/Week-8/notebooks/trained_model_Multi-Layer Perceptron.joblib
Saved metrics to: C:/Users/bam/Desktop/Week-8/notebooks/model_metrics.csv
Saved predictions to: C:/Users/bam/Desktop/Week-8/notebooks/predictions.csv
------------------------------
Training on Fraud Data...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 538.75it/s]


Model: Logistic Regression
Accuracy: 0.9070
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
Saved model to: C:/Users/bam/Desktop/Week-8/notebooks/trained_model_Logistic Regression.joblib
Saved metrics to: C:/Users/bam/Desktop/Week-8/notebooks/model_metrics.csv
Saved predictions to: C:/Users/bam/Desktop/Week-8/notebooks/predictions.csv
------------------------------


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 291.84it/s]


Model: Decision Tree
Accuracy: 0.8884
Precision: 0.4132
Recall: 0.4759
F1 Score: 0.4424
Saved model to: C:/Users/bam/Desktop/Week-8/notebooks/trained_model_Decision Tree.joblib
Saved metrics to: C:/Users/bam/Desktop/Week-8/notebooks/model_metrics.csv
Saved predictions to: C:/Users/bam/Desktop/Week-8/notebooks/predictions.csv
------------------------------


Downloading artifacts: 100%|██████████| 7/7 [00:04<00:00,  1.65it/s] 


Model: Random Forest
Accuracy: 0.9342
Precision: 0.8815
Recall: 0.3386
F1 Score: 0.4893
Saved model to: C:/Users/bam/Desktop/Week-8/notebooks/trained_model_Random Forest.joblib
Saved metrics to: C:/Users/bam/Desktop/Week-8/notebooks/model_metrics.csv
Saved predictions to: C:/Users/bam/Desktop/Week-8/notebooks/predictions.csv
------------------------------


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 636.67it/s] 


Model: Gradient Boosting
Accuracy: 0.9070
Precision: 0.6000
Recall: 0.0014
F1 Score: 0.0028
Saved model to: C:/Users/bam/Desktop/Week-8/notebooks/trained_model_Gradient Boosting.joblib
Saved metrics to: C:/Users/bam/Desktop/Week-8/notebooks/model_metrics.csv
Saved predictions to: C:/Users/bam/Desktop/Week-8/notebooks/predictions.csv
------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 500.21it/s] 


Model: Multi-Layer Perceptron
Accuracy: 0.9070
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
Saved model to: C:/Users/bam/Desktop/Week-8/notebooks/trained_model_Multi-Layer Perceptron.joblib
Saved metrics to: C:/Users/bam/Desktop/Week-8/notebooks/model_metrics.csv
Saved predictions to: C:/Users/bam/Desktop/Week-8/notebooks/predictions.csv
------------------------------
