In [1]:
# Import necessary libraries
# Standard library
import os
import sys

# Third-party
import pandas as pd
from sklearn.model_selection import train_test_split

# Add the 'src' directory to the Python path for module imports.
# The path is resolved relative to the current working directory, so this
# expects the notebook to be run from a sibling directory of 'src'.
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once in the same kernel session.
src_dir = os.path.abspath(os.path.join('..', 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Project-local modules (assumed to live in src/ and to be resolved through
# the path entry added above, so they must be imported after it).
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'tensorflow'" -- presumably one of
# these modules imports tensorflow; confirm it is installed in the kernel.
from model_training import ModelPipeline  # custom training/evaluation pipeline class
from logger import SetupLogger
from data_preprocessing import LoadData

# Configure logging: one logger object, built with
# log_file='../logs/model_training.log', is shared by the cells below.
logger = SetupLogger(log_file='../logs/model_training.log').get_logger()

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Build one LoadData instance per CSV file; both receive the shared logger.
# The fraud file is read from data/processed, the credit-card file from data/raw.
fraud_data_init = LoadData(
    filepath='../data/processed/processed_fraud_data.csv',
    logger=logger,
)
creditcard_data_init = LoadData(
    filepath='../data/raw/creditcard.csv',
    logger=logger,
)

# Load both datasets. The fraud frame is re-indexed by its 'user_id' column;
# the credit-card frame is used exactly as the loader returns it.
fraud_data = fraud_data_init.load_dataset().set_index('user_id')
credit_data = creditcard_data_init.load_dataset()

# Preview: print a heading and a separator line, then render the first rows
# of each frame (fraud first, credit card second).
for heading, frame in (
    ("Fraud_Data.csv Head:", fraud_data),
    ("\nCreditcard.csv Head:", credit_data),
):
    print(heading)
    print("=========================")
    display(frame.head())

In [None]:
# Separate each dataset into a feature frame (X_*) and a label column (y_*).
# The label column is spelled differently in the two files:
# lowercase 'class' in the fraud data, capitalised 'Class' in the credit-card data.
X_fraud, y_fraud = fraud_data.drop(columns=['class']), fraud_data['class']

X_creditcard, y_creditcard = credit_data.drop(columns=['Class']), credit_data['Class']

In [None]:
# Create train-test splits for both datasets (test_size=0.2, random_state=42).
# stratify=<labels> makes each split preserve the label proportions of the
# full dataset. An unstratified random split can leave a rare class
# under- or over-represented in the held-out set, which skews evaluation.
# Note: train_test_split raises ValueError when stratifying if any class has
# fewer than two samples.

# Fraud_Data.csv split
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(
    X_fraud,
    y_fraud,
    test_size=0.2,
    random_state=42,
    stratify=y_fraud,
)

# creditcard.csv split
X_creditcard_train, X_creditcard_test, y_creditcard_train, y_creditcard_test = train_test_split(
    X_creditcard,
    y_creditcard,
    test_size=0.2,
    random_state=42,
    stratify=y_creditcard,
)

In [None]:
# Model Training for Fraud_Data.csv
# Hand the four fraud split pieces to a ModelPipeline instance.
fraud_pipeline = ModelPipeline(
    X_train=X_fraud_train,
    X_test=X_fraud_test,
    y_train=y_fraud_train,
    y_test=y_fraud_test,
)

# train_and_evaluate() returns a pair, unpacked here as (model, model name).
# Presumably this is the best-scoring candidate -- confirm in model_training.
best_fraud_model, best_fraud_model_name = fraud_pipeline.train_and_evaluate()

# Pass that model and its name to save_best_models, tagged with the
# dataset name 'fraud_data'.
fraud_pipeline.save_best_models(
    best_fraud_model,
    best_fraud_model_name,
    dataset_name='fraud_data',
)

In [None]:
# Model Training for creditcard.csv
# Hand the four credit-card split pieces to a ModelPipeline instance.
creditcard_pipeline = ModelPipeline(
    X_train=X_creditcard_train,
    X_test=X_creditcard_test,
    y_train=y_creditcard_train,
    y_test=y_creditcard_test,
)

# train_and_evaluate() returns a pair, unpacked here as (model, model name).
# Presumably this is the best-scoring candidate -- confirm in model_training.
best_creditcard_model, best_creditcard_model_name = creditcard_pipeline.train_and_evaluate()

# Pass that model and its name to save_best_models, tagged with the
# dataset name 'creditcard'.
creditcard_pipeline.save_best_models(
    best_creditcard_model,
    best_creditcard_model_name,
    dataset_name='creditcard',
)

In [None]:
# Review evaluation metrics for both datasets.
# get_results() returns a pair for each pipeline; the first element is
# rendered below, the second is kept under the *_probs name and is not
# used in this cell.

# --- Fraud_Data.csv ---
fraud_metrics, fraud_probs = fraud_pipeline.get_results()
print("Fraud_Data.csv Model Metrics:")
display(fraud_metrics)

# --- creditcard.csv ---
creditcard_metrics, creditcard_probs = creditcard_pipeline.get_results()
print("\ncreditcard.csv Model Metrics:")
display(creditcard_metrics)