In [2]:
import os
# Step one directory up from the notebook's location so that the relative
# paths used by later cells resolve against the parent directory
# (presumably the project root — confirm against the printed working directory).
# NOTE(review): this is not idempotent; every re-run of the cell climbs one
# more level, so run it exactly once per kernel session.
os.chdir("../")
%pwd

'/Users/tapankheni/Data_Science/Data Science Projects/Credit_Card_Fault_Prediction'

In [16]:
from dataclasses import dataclass
from pathlib import Path
from typing import List, Dict

@dataclass
class ModelTrainerConfig:
    """Settings consumed by the model-training stage.

    Apart from ``root_dir`` every field is declared as a list of strings;
    ``ModelTrainer.train`` reads only the first element of each list.
    """
    # Directory that receives the performance report and the saved model.
    root_dir: Path
    # Path(s) of the training feature array (loaded with ``np.load``).
    x_train_data_path: List[str]
    # Path(s) of the training label array (loaded with ``np.load``).
    y_train_data_path: List[str]
    # Path(s) of the validation feature array (loaded with ``np.load``).
    x_val_data_path: List[str]
    # Path(s) of the validation label array (loaded with ``np.load``).
    y_val_data_path: List[str]
    # File name(s) for the persisted model, joined onto ``root_dir``.
    model_name: List[str]

In [5]:
from CreditCardFraudDetection.constants import PARAMS_YAML_FILE_PATH, CONFIG_YAML_FILE_PATH, SCHEMA_YAML_FILE_PATH
from CreditCardFraudDetection.utils.common import read_yaml, create_directories

In [18]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    The three YAML files are parsed once in ``__init__`` via ``read_yaml`` and
    kept on the instance as ``params``, ``config`` and ``schema``. The parsed
    objects are accessed with attribute syntax (``self.config.model_trainer``).
    """

    def __init__(self,
                 params_yaml_file_path = PARAMS_YAML_FILE_PATH,
                 config_yaml_file_path = CONFIG_YAML_FILE_PATH,
                 schema_yaml_file_path = SCHEMA_YAML_FILE_PATH):
        """Read the YAML files and make sure the artifacts root exists.

        Args:
            params_yaml_file_path: Location of the parameters YAML file.
            config_yaml_file_path: Location of the pipeline configuration YAML file.
            schema_yaml_file_path: Location of the schema YAML file.
        """
        self.params = read_yaml(params_yaml_file_path)
        self.config = read_yaml(config_yaml_file_path)
        self.schema = read_yaml(schema_yaml_file_path)

        # Side effect: the top-level artifacts directory is created on construction.
        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the ``ModelTrainerConfig`` from the ``model_trainer`` config section.

        Also creates the stage's ``root_dir`` on disk. Only the ``model_trainer``
        section of the config file is read; nothing from the params file is
        needed here, so no params section is required to exist.

        Returns:
            ModelTrainerConfig: populated with the paths and model name found in
            the ``model_trainer`` section.
        """
        trainer_section = self.config.model_trainer

        create_directories([trainer_section.root_dir])

        return ModelTrainerConfig(
            root_dir = Path(trainer_section.root_dir),
            x_train_data_path = trainer_section.x_train_data_path,
            y_train_data_path = trainer_section.y_train_data_path,
            x_val_data_path = trainer_section.x_val_data_path,
            y_val_data_path = trainer_section.y_val_data_path,
            model_name = trainer_section.model_name
        )

In [19]:
import pandas as pd
import numpy as np  
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib
import os
import time
from CreditCardFraudDetection import logger

In [20]:
class ModelTrainer:
    """Fit a fixed set of classifiers, rank them by recall and persist the winner.

    Attributes:
        config: Stage settings (input array paths, output directory, model file name).
        models: Mapping of display name -> unfitted estimator with default
            hyper-parameters. ``train`` fits these objects in place.
    """

    # Column order of the performance table written to disk.
    _PERFORMANCE_COLUMNS = ["accuracy", "precision", "recall", "f1_score",
                            "training_time", "prediction_time", "total_time"]

    def __init__(self, config: "ModelTrainerConfig"):
        """Store the config and instantiate the candidate estimators."""
        self.config = config
        self.models = {
            'SVC': SVC(),
            'RandomForestClassifier': RandomForestClassifier(),
            'GradientBoostingClassifier': GradientBoostingClassifier(),
            'AdaBoostClassifier': AdaBoostClassifier(),
            'DecisionTreeClassifier': DecisionTreeClassifier(),
            'KNeighborsClassifier': KNeighborsClassifier()
        }

    def evaluate_model(self, true, predicted):
        """Score predictions against ground truth.

        Args:
            true: Ground-truth labels.
            predicted: Labels predicted by a model.

        Returns:
            tuple: ``(accuracy, precision, recall, f1)``.
        """
        accuracy = accuracy_score(true, predicted)
        precision = precision_score(true, predicted)
        recall = recall_score(true, predicted)
        f1 = f1_score(true, predicted)

        return accuracy, precision, recall, f1

    def model_training(self, X_train, y_train, X_test, y_test, models):
        """Fit and score every estimator, save the report and pick the best one.

        Each estimator in ``models`` is fitted **in place** on the training data
        and scored on the test data. The per-model metrics and timings are
        written to ``<root_dir>/fraudulent_model_performance.json``.

        Args:
            X_train: Training features.
            y_train: Training labels.
            X_test: Evaluation features.
            y_test: Evaluation labels.
            models: Mapping of name -> estimator exposing ``fit`` and ``predict``.

        Returns:
            tuple: ``(best_score, best_model_name)`` where "best" means highest
            recall; on ties the first model in ``models`` order wins.

        Raises:
            ValueError: If ``models`` is empty.
        """
        if not models:
            raise ValueError("`models` must contain at least one estimator")

        names, rows = [], []
        for name, model in models.items():
            # perf_counter is monotonic, so durations cannot be skewed by
            # system clock adjustments during a long fit.
            start_time = time.perf_counter()
            model.fit(X_train, y_train)
            end_training = time.perf_counter()

            y_pred = model.predict(X_test)
            end_prediction = time.perf_counter()

            accuracy, precision, recall, f1 = self.evaluate_model(y_test, y_pred)

            names.append(name)
            rows.append([accuracy, precision, recall, f1,
                         end_training - start_time,
                         end_prediction - end_training,
                         end_prediction - start_time])

        # Build the table once instead of enlarging a frame row by row.
        model_performance = pd.DataFrame(rows, index=names, columns=self._PERFORMANCE_COLUMNS)

        os.makedirs(self.config.root_dir, exist_ok=True)
        model_performance.to_json(os.path.join(self.config.root_dir, "fraudulent_model_performance.json"))

        # NOTE(review): ranking on recall alone also rewards a model that labels
        # everything positive — confirm this is the intended selection metric.
        best_score = model_performance["recall"].max()
        best_model_name = model_performance["recall"].idxmax()

        return best_score, best_model_name


    def train(self):
        """Load the arrays, benchmark all candidate models and save the best one.

        Reads the first path of every ``*_data_path`` list in the config, runs
        ``model_training`` on ``self.models`` and dumps the winning estimator to
        ``<root_dir>/<model_name[0]>`` with joblib.
        """
        logger.info("Training the model...")
        X_train_path = self.config.x_train_data_path[0]
        y_train_path = self.config.y_train_data_path[0]
        X_val_path = self.config.x_val_data_path[0]
        y_val_path = self.config.y_val_data_path[0]

        X_train = np.load(X_train_path)
        y_train = np.load(y_train_path)
        X_val = np.load(X_val_path)
        y_val = np.load(y_val_path)
        logger.info("Data loaded successfully")

        logger.info(f"shape of X_train: {X_train.shape}")
        logger.info(f"shape of y_train: {y_train.shape}")
        logger.info(f"shape of X_val: {X_val.shape}")
        logger.info(f"shape of y_val: {y_val.shape}")

        best_model_score, best_model_name = self.model_training(X_train, y_train, X_val, y_val, self.models)
        logger.info(f"Best model name: {best_model_name}")
        logger.info(f"Best model score: {best_model_score}")

        # model_training already fitted this very object on (X_train, y_train).
        # Fitting it again would repeat the most expensive step and, for
        # unseeded stochastic estimators, save a model different from the one
        # whose metrics were just recorded.
        model = self.models[best_model_name]

        model_path = os.path.join(self.config.root_dir, self.config.model_name[0])
        joblib.dump(model, model_path)


In [21]:
# Run the model-training stage end to end: load configuration, build the
# trainer, then benchmark the candidate models and persist the best one.
try:
    config_manager = ConfigurationManager()
    model_trainer_config = config_manager.get_model_trainer_config()
    model_trainer = ModelTrainer(config = model_trainer_config)
    model_trainer.train()

except Exception as e:
    # Log the message together with the traceback, then re-raise the original
    # exception unchanged so the cell still fails visibly.
    # NOTE(review): assumes `logger` is a standard `logging.Logger` — confirm.
    logger.exception(e)
    raise

[2024-07-05 19:11:42,873: INFO: common: yaml file: params.yaml loaded successfully]
[2024-07-05 19:11:42,877: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-05 19:11:42,881: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-07-05 19:11:42,881: INFO: common: created directory at: artifacts]
[2024-07-05 19:11:42,882: INFO: common: created directory at: artifacts/model_trainer]
[2024-07-05 19:11:42,882: INFO: 364367102: Training the model...]
[2024-07-05 19:11:42,911: INFO: 364367102: Data loaded successfully]
[2024-07-05 19:11:42,920: INFO: 364367102: shape of X_train: (278628, 29)]
[2024-07-05 19:11:42,921: INFO: 364367102: shape of y_train: (278628,)]
[2024-07-05 19:11:42,926: INFO: 364367102: shape of X_val: (119413, 29)]
[2024-07-05 19:11:42,927: INFO: 364367102: shape of y_val: (119413,)]




[2024-07-05 19:22:43,391: INFO: 364367102: Best model name: RandomForestClassifier]
[2024-07-05 19:22:43,391: INFO: 364367102: Best model score: 1.0]
