In [2]:
import os
# Move one directory up from wherever the kernel started; `%pwd` then echoes
# the resulting working directory as the cell output.
# NOTE(review): the move is relative, so re-running this cell without
# restarting the kernel climbs one more level each time.
os.chdir("../")
%pwd

'/Users/tapankheni/Data_Science/Data Science Projects/Credit_Card_Fault_Prediction'

In [3]:
from dataclasses import dataclass
from pathlib import Path
from typing import List

@dataclass
class ModelTrainerConfig:
    """Settings consumed by the model-training stage.

    ``ConfigurationManager.get_model_trainer_config`` fills these fields from
    the ``model_trainer`` section of the loaded configuration.
    """
    # Directory the trainer writes into (performance JSON and the saved model).
    root_dir: Path
    # Candidate file paths for the training features; ``ModelTrainer.train``
    # loads the entry at index 1 with ``np.load``.
    # NOTE(review): what the other entries represent is not visible here.
    x_train_data_path: List[str]
    # Candidate file paths for the training labels (index 1 is loaded).
    y_train_data_path: List[str]
    # Candidate file paths for the validation features (index 1 is loaded).
    x_val_data_path: List[str]
    # Candidate file paths for the validation labels (index 1 is loaded).
    y_val_data_path: List[str]
    # Candidate output file names; index 1 is joined onto ``root_dir`` to form
    # the path the fitted model is dumped to.
    model_name: List[str]

In [4]:
from CreditCardFraudDetection.constants import PARAMS_YAML_FILE_PATH, CONFIG_YAML_FILE_PATH, SCHEMA_YAML_FILE_PATH
from CreditCardFraudDetection.utils.common import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    Args:
        params_yaml_file_path: Path of the parameters YAML file.
        config_yaml_file_path: Path of the main configuration YAML file.
        schema_yaml_file_path: Path of the schema YAML file.

    Constructing an instance reads all three files and creates the directory
    named by ``artifacts_root`` in the main configuration.
    """

    def __init__(self,
                 params_yaml_file_path = PARAMS_YAML_FILE_PATH,
                 config_yaml_file_path = CONFIG_YAML_FILE_PATH,
                 schema_yaml_file_path = SCHEMA_YAML_FILE_PATH):

        self.params = read_yaml(params_yaml_file_path)
        self.config = read_yaml(config_yaml_file_path)
        self.schema = read_yaml(schema_yaml_file_path)

        # Every stage writes below this directory, so create it up front.
        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the configuration object for the model-training stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            ModelTrainerConfig: populated from the ``model_trainer`` section
            of the main configuration file.
        """
        # Only the ``model_trainer`` section is needed: none of the
        # ModelTrainerConfig fields come from the params file, so it is not
        # consulted here.
        config = self.config.model_trainer

        create_directories([config.root_dir])

        return ModelTrainerConfig(
            root_dir = Path(config.root_dir),
            x_train_data_path = config.x_train_data_path,
            y_train_data_path = config.y_train_data_path,
            x_val_data_path = config.x_val_data_path,
            y_val_data_path = config.y_val_data_path,
            model_name = config.model_name
        )

In [6]:
import pandas as pd
import numpy as np  
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import joblib
import os
import time
from CreditCardFraudDetection import logger

In [13]:
class ModelTrainer:
    """Fit a set of default classifiers, pick the best by F1 and save it.

    Args:
        config: Paths and file names for the training stage. Quoted as a
            forward reference so defining this class does not require the
            config class to exist yet.
    """

    def __init__(self, config: "ModelTrainerConfig"):
        self.config = config
        # Candidate estimators, all with library-default hyper-parameters.
        self.models = {
            'SVC': SVC(),
            'RandomForestClassifier': RandomForestClassifier(),
            'GradientBoostingClassifier': GradientBoostingClassifier(),
            'AdaBoostClassifier': AdaBoostClassifier(),
            'DecisionTreeClassifier': DecisionTreeClassifier(),
            'KNeighborsClassifier': KNeighborsClassifier(),
            'LGBMClassifier': LGBMClassifier(),
            'XGBClassifier': XGBClassifier(),
            'CatBoostClassifier' : CatBoostClassifier()
        }

    def evaluate_model(self, true, predicted):
        """Score predictions against the ground truth.

        Each metric is called with its default settings.

        Args:
            true: Ground-truth labels.
            predicted: Labels predicted by a model.

        Returns:
            tuple: ``(accuracy, precision, recall, f1)``.
        """
        accuracy = accuracy_score(true, predicted)
        precision = precision_score(true, predicted)
        recall = recall_score(true, predicted)
        f1 = f1_score(true, predicted)

        return accuracy, precision, recall, f1

    def model_training(self, X_train, y_train, X_test, y_test, models):
        """Fit every candidate, score it, and report the best one by F1.

        The per-model metrics and timings are written to
        ``default_model_performance.json`` inside ``config.root_dir``. The
        estimators in ``models`` are fitted in place, so after this call each
        of them is a trained model.

        Args:
            X_train: Training features.
            y_train: Training labels.
            X_test: Features used for scoring.
            y_test: Labels used for scoring.
            models: Mapping of model name to an estimator exposing
                ``fit`` and ``predict``.

        Returns:
            tuple: ``(best_score, best_model_name)``. On ties, the model that
            appears first in ``models`` wins.

        Raises:
            ValueError: If ``models`` is empty.
        """
        if not models:
            raise ValueError("`models` must contain at least one estimator to compare")

        columns = ["accuracy", "precision", "recall", "f1_score", "training_time", "prediction_time", "total_time"]
        rows = {}

        for name, model in models.items():
            # perf_counter is monotonic, so the measured durations cannot be
            # distorted by system clock adjustments.
            start_time = time.perf_counter()
            model.fit(X_train, y_train)
            end_training = time.perf_counter()

            y_pred = model.predict(X_test)
            end_prediction = time.perf_counter()

            accuracy, precision, recall, f1 = self.evaluate_model(y_test, y_pred)

            rows[name] = [
                accuracy, precision, recall, f1,
                end_training - start_time,
                end_prediction - end_training,
                end_prediction - start_time,
            ]

        # Build the table once from the collected rows instead of enlarging a
        # DataFrame row by row; this also yields numeric column dtypes.
        model_performance = pd.DataFrame.from_dict(rows, orient="index", columns=columns)

        os.makedirs(self.config.root_dir, exist_ok=True)
        model_performance.to_json(os.path.join(self.config.root_dir, "default_model_performance.json"))

        f1_scores = model_performance["f1_score"]
        best_score = f1_scores.max()
        # idxmax returns the first label holding the maximum.
        best_model_name = f1_scores.idxmax()

        return best_score, best_model_name


    def train(self):
        """Load the data, compare all candidates and save the best model.

        Reads the four arrays named in the config with ``np.load``, runs
        ``model_training`` on them, and dumps the winning fitted estimator
        with ``joblib`` to ``root_dir / model_name[1]``.
        """
        logger.info("Training the model...")
        # NOTE(review): index 1 of each path list is used; what index 0 holds
        # is not visible here - confirm against the config file.
        X_train_path = self.config.x_train_data_path[1]
        y_train_path = self.config.y_train_data_path[1]
        X_val_path = self.config.x_val_data_path[1]
        y_val_path = self.config.y_val_data_path[1]

        X_train = np.load(X_train_path)
        y_train = np.load(y_train_path)
        X_val = np.load(X_val_path)
        y_val = np.load(y_val_path)
        logger.info("Data loaded successfully")

        logger.info(f"shape of X_train: {X_train.shape}")
        logger.info(f"shape of y_train: {y_train.shape}")
        logger.info(f"shape of X_val: {X_val.shape}")
        logger.info(f"shape of y_val: {y_val.shape}")

        best_model_score, best_model_name = self.model_training(X_train, y_train, X_val, y_val, self.models)
        logger.info(f"Best model name: {best_model_name}")
        logger.info(f"Best model score: {best_model_score}")

        # model_training already fitted this exact estimator on
        # (X_train, y_train). Fitting it again would double the cost and, for
        # unseeded estimators, save a model different from the one that
        # produced the logged score - so the fitted instance is reused.
        model = self.models[best_model_name]
        logger.info("Model trained successfully")

        model_path = os.path.join(self.config.root_dir, self.config.model_name[1])
        joblib.dump(model, model_path)
        logger.info(f"Model saved at: {model_path}")


In [14]:
# Run the training stage end to end: load the configuration, build the
# trainer, then train and save the best model.
try:
    config_manager = ConfigurationManager()
    model_trainer_config = config_manager.get_model_trainer_config()
    model_trainer = ModelTrainer(config = model_trainer_config)
    model_trainer.train()

except Exception as e:
    # Log the failure, then re-raise the active exception unchanged so the
    # notebook still stops on the error.
    logger.error(e)
    raise

[2024-07-05 19:52:54,892: INFO: common: yaml file: params.yaml loaded successfully]
[2024-07-05 19:52:54,895: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-05 19:52:54,899: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-07-05 19:52:54,899: INFO: common: created directory at: artifacts]
[2024-07-05 19:52:54,900: INFO: common: created directory at: artifacts/model_trainer]
[2024-07-05 19:52:54,900: INFO: 2903018702: Training the model...]
[2024-07-05 19:52:54,904: INFO: 2903018702: Data loaded successfully]
[2024-07-05 19:52:54,905: INFO: 2903018702: shape of X_train: (22425, 18)]
[2024-07-05 19:52:54,905: INFO: 2903018702: shape of y_train: (22425,)]
[2024-07-05 19:52:54,906: INFO: 2903018702: shape of X_val: (9611, 18)]
[2024-07-05 19:52:54,906: INFO: 2903018702: shape of y_val: (9611,)]




[LightGBM] [Info] Number of positive: 11213, number of negative: 11212
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000362 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2322
[LightGBM] [Info] Number of data points in the train set: 22425, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500022 -> initscore=0.000089
[LightGBM] [Info] Start training from score 0.000089
Learning rate set to 0.038877
0:	learn: 0.6815233	total: 4.29ms	remaining: 4.28s
1:	learn: 0.6705118	total: 7.93ms	remaining: 3.96s
2:	learn: 0.6587522	total: 11.6ms	remaining: 3.85s
3:	learn: 0.6489448	total: 14.9ms	remaining: 3.72s
4:	learn: 0.6402751	total: 18.8ms	remaining: 3.74s
5:	learn: 0.6321334	total: 22.9ms	remaining: 3.8s
6:	learn: 0.6245219	total: 26.4ms	remaining: 3.74s
7:	learn: 0.6176862	total: 30.3ms	remaining: 3.75s
8:	lear