In [1]:
import os

In [2]:
%pwd

'c:\\Users\\PASCAL\\Student_Performance_Prediction\\research'

In [3]:
# Move from the notebook folder up to the project root so the relative paths
# used by later cells resolve against it. The guard makes the cell idempotent:
# an unconditional os.chdir("../") climbs one level higher on every re-run.
# NOTE(review): "research" is the notebook folder name shown by the %pwd output
# above — adjust if the notebook is relocated.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Read-only bundle of the four path settings used by the model-trainer stage.

    The dataclass is frozen, so fields cannot be reassigned after construction.
    Field types are annotations only; nothing here converts or validates values.
    """

    # Stage working directory (created by ConfigurationManager.get_model_trainer_config).
    root_dir: Path
    # Input data location — presumably the dataset consumed by this stage; TODO confirm.
    data_path: Path
    # Model location setting from the config file — exact meaning not visible here; TODO confirm.
    model_path: Path
    # Model file location setting from the config file — presumably the pickle target; TODO confirm.
    model_file_path: Path

In [5]:
from studentPerformance.constants import *
from studentPerformance.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    On construction both YAML files are read and the directory named by
    ``artifacts_root`` in the main config is created.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read the config and params files and create the artifacts root.

        Args:
            config_filepath: Path of the main YAML config file.
            params_filepath: Path of the YAML parameters file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the model-trainer stage configuration.

        Creates the stage's ``root_dir`` and copies the four path settings from
        the ``model_trainer`` section of the loaded config.

        Returns:
            ModelTrainerConfig: values are passed through exactly as loaded
            from YAML (no conversion to ``Path`` is performed here).
        """
        config = self.config.model_trainer

        create_directories([config.root_dir])

        # The original call was missing the comma after ``model_path``, which
        # made the whole cell a SyntaxError.
        return ModelTrainerConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_path=config.model_path,
            model_file_path=config.model_file_path,
        )

In [7]:
import os
import sys
from dataclasses import dataclass

from catboost import CatBoostRegressor
from sklearn.ensemble import (
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from src.studentPerformance.logger import logging
from src.studentPerformance.utils.common import evaluate_models
from src.studentPerformance.components.data_transformation import DataTransformation
import pickle

In [8]:
# Basic Import
import numpy as np
import pandas as pd

import os
import sys
from dataclasses import dataclass

# Modelling
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from src.studentPerformance.logger import logging
from src.studentPerformance.utils.common import evaluate_models
from src.studentPerformance.utils.common import print_evaluated_results
from src.studentPerformance.utils.common import model_metrics
from src.studentPerformance.utils.common import save_object
from src.studentPerformance.components.data_transformation import DataTransformation
import pickle

In [25]:

class ModelTrainer:
    """Compare candidate regressors, build a tuned voting ensemble and pickle it.

    The stage configuration is kept on ``self.config``; its ``model_file_path``
    is the location the model pickle is written to.
    """

    # Lowest score the best candidate may have before training is aborted.
    _MIN_ACCEPTABLE_SCORE = 0.6
    # Visual divider printed between report sections.
    _SEPARATOR = '\n====================================================================================\n'

    def __init__(self, config: ModelTrainerConfig):
        """Store the stage configuration.

        Args:
            config: Path settings for this stage; ``model_file_path`` is read
                when the model is saved.
        """
        self.config = config

    def _tune_catboost(self, xtrain, ytrain, random_state=None):
        """Randomized-search a CatBoost regressor and return the best estimator."""
        logging.info('Hyperparameter tuning started for catboost')

        catboost = CatBoostRegressor(verbose=False)
        search_space = {'depth'          : [4,5,6,7,8,9, 10],
                        'learning_rate' : [0.01,0.02,0.03,0.04],
                        'iterations'    : [300,400,500,600]}

        # random_state=None (the default) keeps the original unseeded sampling.
        search = RandomizedSearchCV(
            catboost, search_space, scoring='r2', cv=5, n_jobs=-1,
            random_state=random_state,
        )
        search.fit(xtrain, ytrain)

        print(f'Best Catboost parameters : {search.best_params_}')
        print(f'Best Catboost Score : {search.best_score_}')
        print(self._SEPARATOR)

        logging.info('Hyperparameter tuning complete for Catboost')
        return search.best_estimator_

    def _tune_knn(self, xtrain, ytrain):
        """Grid-search ``n_neighbors`` in 2..30 for KNN and return the best estimator."""
        logging.info('Hyperparameter tuning started for KNN')

        param_grid = {'n_neighbors': list(range(2, 31))}
        search = GridSearchCV(KNeighborsRegressor(), param_grid, cv=5, scoring='r2', n_jobs=-1)
        search.fit(xtrain, ytrain)

        print(f'Best KNN Parameters : {search.best_params_}')
        print(f'Best KNN Score : {search.best_score_}')
        print(self._SEPARATOR)

        logging.info('Hyperparameter tuning Complete for KNN')
        return search.best_estimator_

    def initiate_model_trainer(self, train_array, test_array, random_state=None):
        """Run the full training stage and return the ensemble's test metrics.

        Steps: split features/target, score every candidate model via
        ``evaluate_models``, pick the highest-scoring one, tune CatBoost and
        KNN, fit a weighted ``VotingRegressor`` (CatBoost, XGBoost, KNN with
        weights 3/2/1), save it and evaluate it on the test split.

        Args:
            train_array: 2-D array; the last column is the target, all other
                columns are features.
            test_array: 2-D array with the same column layout as ``train_array``.
            random_state: Optional seed forwarded to the CatBoost randomized
                search. ``None`` (default) leaves the search unseeded, as before.

        Returns:
            tuple: ``(mae, rmse, r2)`` as returned by ``model_metrics`` for the
            ensemble's predictions on the test split.

        Raises:
            Exception: ``'No Best Model Found'`` when the best candidate scores
                below 0.6. Any other error raised during training is logged
                and re-raised unchanged.
        """
        try:
            logging.info('Splitting Dependent and Independent variables from train and test data')
            xtrain, ytrain = train_array[:, :-1], train_array[:, -1]
            xtest, ytest = test_array[:, :-1], test_array[:, -1]

            models = {
                "Linear Regression": LinearRegression(),
                "Lasso": Lasso(),
                "Ridge": Ridge(),
                "K-Neighbors Regressor": KNeighborsRegressor(),
                "Decision Tree": DecisionTreeRegressor(),
                "Random Forest Regressor": RandomForestRegressor(),
                "XGBRegressor": XGBRegressor(),
                "CatBoosting Regressor": CatBoostRegressor(verbose=False),
                "GradientBoosting Regressor": GradientBoostingRegressor(),
                "AdaBoost Regressor": AdaBoostRegressor(),
            }

            # Maps model name -> score (logged below as an R2 score).
            model_report: dict = evaluate_models(xtrain, ytrain, xtest, ytest, models)

            print(model_report)
            print(self._SEPARATOR)
            logging.info(f'Model Report : {model_report}')

            # Highest-scoring entry; on ties the first name in dict order wins,
            # matching the previous values().index(...) lookup.
            best_model_name = max(model_report, key=model_report.get)
            best_model_score = model_report[best_model_name]
            best_model = models[best_model_name]

            if best_model_score < self._MIN_ACCEPTABLE_SCORE:
                logging.info('Best model has r2 Score less than 60%')
                raise Exception('No Best Model Found')

            # Save the best single model. The target comes from the stage
            # config rather than a hard-coded path, and its directory is
            # created first so open() cannot fail on a missing folder.
            # NOTE(review): the voting ensemble is later saved to this same
            # path and replaces this file — confirm whether the single best
            # model should be kept under a separate name.
            model_file_path = self.config.model_file_path
            os.makedirs(os.path.dirname(model_file_path) or ".", exist_ok=True)
            with open(model_file_path, "wb") as file:
                pickle.dump(best_model, file)

            print(f'Best Model Found , Model Name : {best_model_name} , R2 Score : {best_model_score}')
            print(self._SEPARATOR)
            logging.info(f'Best Model Found , Model Name : {best_model_name} , R2 Score : {best_model_score}')

            best_cbr = self._tune_catboost(xtrain, ytrain, random_state)
            best_knn = self._tune_knn(xtrain, ytrain)

            logging.info('Voting Regressor model training started')

            # Final ensemble: tuned CatBoost, default XGBoost, tuned KNN.
            er = VotingRegressor(
                [('cbr', best_cbr), ('xgb', XGBRegressor()), ('knn', best_knn)],
                weights=[3, 2, 1],
            )
            er.fit(xtrain, ytrain)
            print('Final Model Evaluation :\n')
            print_evaluated_results(xtrain, ytrain, xtest, ytest, er)
            logging.info('Voting Regressor Training Completed')

            save_object(
                file_path=model_file_path,
                obj=er
            )

            logging.info('Model pickle file saved')
            # Evaluate the voting regressor on the held-out test data.
            ytest_pred = er.predict(xtest)

            mae, rmse, r2 = model_metrics(ytest, ytest_pred)
            logging.info(f'Test MAE : {mae}')
            logging.info(f'Test RMSE : {rmse}')
            logging.info(f'Test R2 Score : {r2}')
            logging.info('Final Model Training Completed')

            return mae, rmse, r2

        except Exception:
            # Log at ERROR level with the traceback, then re-raise the original
            # exception untouched.
            logging.exception('Exception occured at Model Training')
            raise

In [26]:
# Run the model-trainer stage end to end: load the configuration, transform
# the data, then train, evaluate and persist the model.
# The previous try/except only re-raised the caught exception, so it has been
# removed; errors propagate exactly as before.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
# NOTE(review): DataTransformation is given the *model trainer* config here —
# confirm it does not expect its own stage config.
data_transformation = DataTransformation(model_trainer_config)
train_arr, test_arr, _ = data_transformation.initiate_data_transformation()

model_trainer = ModelTrainer(model_trainer_config)
# Bind the returned metrics so they stay inspectable in later cells.
test_mae, test_rmse, test_r2 = model_trainer.initiate_model_trainer(train_arr, test_arr)


[2023-08-24 05:44:20,759: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-08-24 05:44:20,762: INFO: common: yaml file: params.yaml loaded successfully]
[2023-08-24 05:44:20,764: INFO: common: created directory at: artifacts]
[2023-08-24 05:44:20,765: INFO: common: created directory at: artifacts/model_trainer]
[2023-08-24 05:44:20,766: INFO: data_transformation: Read train and test data completed]
[2023-08-24 05:44:20,766: INFO: data_transformation: Obtaining preprocessing object]
[2023-08-24 05:44:20,767: INFO: data_transformation: Categorical columns: ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']]
[2023-08-24 05:44:20,768: INFO: data_transformation: Numerical columns: ['writing_score', 'reading_score']]
[2023-08-24 05:44:20,832: INFO: data_transformation: Saved preprocessing object.]
[2023-08-24 05:44:20,833: INFO: data_transformation: Transformation of the data is completed]
[2023-08-24 05:44:20,834: INFO: 4989

{'Linear Regression': 0.8795158595242263, 'Lasso': 0.8564278149469471, 'Ridge': 0.8805671790733921, 'K-Neighbors Regressor': 0.47561174068704326, 'Decision Tree': 0.7454573548605563, 'Random Forest Regressor': 0.8529852512962283, 'XGBRegressor': 0.8210206583029993, 'CatBoosting Regressor': 0.8523560006768236, 'GradientBoosting Regressor': 0.8723268951354977, 'AdaBoost Regressor': 0.8438670697740134}


[2023-08-24 05:44:23,957: INFO: 498946074: Model Report : {'Linear Regression': 0.8795158595242263, 'Lasso': 0.8564278149469471, 'Ridge': 0.8805671790733921, 'K-Neighbors Regressor': 0.47561174068704326, 'Decision Tree': 0.7454573548605563, 'Random Forest Regressor': 0.8529852512962283, 'XGBRegressor': 0.8210206583029993, 'CatBoosting Regressor': 0.8523560006768236, 'GradientBoosting Regressor': 0.8723268951354977, 'AdaBoost Regressor': 0.8438670697740134}]
Best Model Found , Model Name : Ridge , R2 Score : 0.8805671790733921


[2023-08-24 05:44:23,959: INFO: 498946074: Best Model Found ,

In [None]:
# NOTE(review): unexecuted scratch fragment (the cell has no execution count).
# It refers to `self` and `best_model`, neither of which is defined at cell
# level, and its indentation is inconsistent, so it cannot run as a standalone
# cell — it looks like a snippet intended for ModelTrainer.initiate_model_trainer.
# What it does: ensure <root_dir>/artifacts/model_trainer exists, then pickle
# `best_model` to model.pkl inside it.
# The log output earlier in this notebook shows "artifacts/model_trainer" being
# created for this stage; if root_dir is that directory, the join below would
# nest the same path a second time — TODO confirm before reusing.
output_dir = os.path.join(self.config.root_dir, "artifacts/model_trainer")
            os.makedirs(output_dir, exist_ok=True)

            output_file = os.path.join(output_dir, "model.pkl")
            with open(output_file, "wb") as file:
                pickle.dump(best_model, file)