In [1]:
import os

In [2]:
%pwd

'c:\\Users\\PASCAL\\flight_price_prediction\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\PASCAL\\flight_price_prediction'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    data_path: Path
    model_path: Path
    ## model_file_path: Path
    

In [6]:
from src.flightprice.constants import *
from src.flightprice.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
        root_dir=config.root_dir,
        data_path=config.data_path,
        model_path=config.model_path
        ## model_file_path=config.model_file_path
    )
        return model_trainer_config

In [8]:
# Basic Import
import numpy as np
import pandas as pd

import os
import sys
from dataclasses import dataclass

# Modelling
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from src.flightprice.logger import logging
from src.flightprice.components.data_transformation import DataTransformation
from src.flightprice.utils.common import evaluate_models
from src.flightprice.utils.common import print_evaluated_results
from src.flightprice.utils.common import model_metrics
from src.flightprice.utils.common import save_object
from src.flightprice.components.data_transformation import DataTransformation
import pickle

In [9]:

class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def initiate_model_trainer(self, train_array, test_array):
        try:
            logging.info('Splitting Dependent and Independent variables from train and test data')
            xtrain, ytrain, xtest, ytest = (
                train_array[:,:-1],
                train_array[:,-1],
                test_array[:,:-1],
                test_array[:,-1]
            )
            
            models = {
                "Linear Regression": LinearRegression(),
                "Lasso": Lasso(),
                "Ridge": Ridge(),
                "K-Neighbors Regressor": KNeighborsRegressor(),
                "Decision Tree": DecisionTreeRegressor(),
                "Random Forest Regressor": RandomForestRegressor(),
                "XGBRegressor": XGBRegressor(), 
                "CatBoosting Regressor": CatBoostRegressor(verbose=False),
                "GradientBoosting Regressor":GradientBoostingRegressor(),
                "AdaBoost Regressor": AdaBoostRegressor()
            }

            model_report:dict = evaluate_models(xtrain,ytrain,xtest,ytest,models)

            print(model_report)
            print('\n====================================================================================\n')
            logging.info(f'Model Report : {model_report}')
            # To get best model score from dictionary 
            best_model_score = max(sorted(model_report.values()))

            best_model_name = list(model_report.keys())[
                list(model_report.values()).index(best_model_score)
            ]
            best_model = models[best_model_name]

            if best_model_score < 0.6 :
                logging.info('Best model has r2 Score less than 60%')
                raise Exception('No Best Model Found')
            
            # Save model object
            model_file_path = os.path.join("artifacts", "model_trainer", "model.pkl")
            with open(model_file_path, "wb") as file:
                pickle.dump(best_model, file)

            print(f'Best Model Found , Model Name : {best_model_name} , R2 Score : {best_model_score}')
            print('\n====================================================================================\n')
            logging.info(f'Best Model Found , Model Name : {best_model_name} , R2 Score : {best_model_score}')
            logging.info('Hyperparameter tuning started for catboost')

            # Hyperparameter tuning on Catboost
            # Initializing catboost
            cbr = CatBoostRegressor(verbose=False)

            # Creating the hyperparameter grid
            param_dist = {'depth'          : [4,5,6,7,8,9, 10],
                          'learning_rate' : [0.01,0.02,0.03,0.04],
                          'iterations'    : [300,400,500,600]}

            #Instantiate RandomSearchCV object
            rscv = RandomizedSearchCV(cbr , param_dist, scoring='r2', cv =5, n_jobs=-1)

            # Fit the model
            rscv.fit(xtrain, ytrain)

            # Print the tuned parameters and score
            print(f'Best Catboost parameters : {rscv.best_params_}')
            print(f'Best Catboost Score : {rscv.best_score_}')
            print('\n====================================================================================\n')

            best_cbr = rscv.best_estimator_

            logging.info('Hyperparameter tuning complete for Catboost')

            logging.info('Hyperparameter tuning started for KNN')

            # Initialize knn
            knn = KNeighborsRegressor()

            # parameters
            k_range = list(range(2, 31))
            param_grid = dict(n_neighbors=k_range)

            # Fitting the cvmodel
            grid = GridSearchCV(knn, param_grid, cv=5, scoring='r2',n_jobs=-1)
            grid.fit(xtrain, ytrain)

            # Print the tuned parameters and score
            print(f'Best KNN Parameters : {grid.best_params_}')
            print(f'Best KNN Score : {grid.best_score_}')
            print('\n====================================================================================\n')

            best_knn = grid.best_estimator_

            logging.info('Hyperparameter tuning Complete for KNN')

            logging.info('Voting Regressor model training started')

            # Creating final Voting regressor
            er = VotingRegressor([('cbr',best_cbr),('xgb',XGBRegressor()),('knn',best_knn)], weights=[3,2,1])
            er.fit(xtrain, ytrain)
            print('Final Model Evaluation :\n')
            print_evaluated_results(xtrain,ytrain,xtest,ytest,er)
            logging.info('Voting Regressor Training Completed')

            save_object(
                file_path=model_file_path,
                obj=er
            )

            logging.info('Model pickle file saved')
            # Evaluating Ensemble Regressor (Voting Classifier on test data)
            ytest_pred = er.predict(xtest)

            mae, rmse, r2 = model_metrics(ytest, ytest_pred)
            logging.info(f'Test MAE : {mae}')
            logging.info(f'Test RMSE : {rmse}')
            logging.info(f'Test R2 Score : {r2}')
            logging.info('Final Model Training Completed')
            
            return mae, rmse, r2 
        
        except Exception as e:
            logging.info('Exception occured at Model Training')
            raise e

In [27]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    data_transformation = DataTransformation(model_trainer_config)
    train_arr, test_arr, _ = data_transformation.initiate_data_transformation()

    model_trainer = ModelTrainer(model_trainer_config)
    model_trainer.initiate_model_trainer(train_arr, test_arr)
except Exception as e:
    raise e


2023-09-27 04:15:35,928 - INFO - yaml file: config\config.yaml loaded successfully
2023-09-27 04:15:35,935 - INFO - yaml file: params.yaml loaded successfully
2023-09-27 04:15:35,938 - INFO - created directory at: artifacts
2023-09-27 04:15:35,942 - INFO - created directory at: artifacts/model_trainer
2023-09-27 04:15:35,944 - INFO - Read train and test data completed
2023-09-27 04:15:35,947 - INFO - Obtaining preprocessing object


2023-09-27 04:15:36,141 - INFO - Read train and test data completed
2023-09-27 04:15:36,156 - INFO - Train Dataframe Head : 
       Airline   Source Destination                        Route Total_Stops Additional_Info  Price  Journey_day  Journey_month  Dep_hour  Dep_min  Arrival_hour  Arrival_min  Duration_hours  Duration_mins
0       IndiGo  Kolkata    Banglore                    CCU → BLR    non-stop         No info   4174           18              4        21       25             0            5               2             40
1  Jet Airways    Delhi      Cochin              DEL → BOM → COK      1 stop         No info  14714           27              6         7        5            12           35               5             30
2     Air Asia  Kolkata    Banglore              CCU → BBI → BLR      1 stop         No info   5162            9              5         6       50            10           30               3             40
3    Air India    Delhi      Cochin  DEL → RPR → NAG → 

{'Linear Regression': 0.3954290484659, 'Lasso': 0.3954498841925457, 'Ridge': 0.3954298615428563, 'K-Neighbors Regressor': 0.8541628375503538, 'Decision Tree': 0.9872829388316675, 'Random Forest Regressor': 0.983485112554469, 'XGBRegressor': 0.9670367466068637, 'CatBoosting Regressor': 0.9537768722732313, 'GradientBoosting Regressor': 0.8302342163747802, 'AdaBoost Regressor': 0.2731788657871862}


Best Model Found , Model Name : Decision Tree , R2 Score : 0.9872829388316675




2023-09-27 04:18:37,560 - INFO - Hyperparameter tuning complete for Catboost
2023-09-27 04:18:37,561 - INFO - Hyperparameter tuning started for KNN


Best Catboost parameters : {'learning_rate': 0.04, 'iterations': 600, 'depth': 9}
Best Catboost Score : 0.9501775120270196




2023-09-27 04:20:19,632 - INFO - Hyperparameter tuning Complete for KNN
2023-09-27 04:20:19,634 - INFO - Voting Regressor model training started


Best KNN Parameters : {'n_neighbors': 2}
Best KNN Score : 0.929448041113063


Final Model Evaluation :



2023-09-27 04:20:38,920 - INFO - Voting Regressor Training Completed
2023-09-27 04:20:39,071 - INFO - Model pickle file saved


Model performance for Training set
- Root Mean Squared Error: 696.1546
- Mean Absolute Error: 417.8627
- R2 Score: 0.9776
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 791.1942
- Mean Absolute Error: 479.8558
- R2 Score: 0.9685


2023-09-27 04:20:40,012 - INFO - Test MAE : 479.85577155209876
2023-09-27 04:20:40,014 - INFO - Test RMSE : 791.1942399326281
2023-09-27 04:20:40,016 - INFO - Test R2 Score : 0.9684960291662228
2023-09-27 04:20:40,018 - INFO - Final Model Training Completed


In [10]:
import pickle
import pandas as pd

In [None]:
file_path = 

In [11]:
with open('artifacts/model_trainer/model.pkl', 'rb') as file:
    data = pickle.load(file)

In [21]:
import pandas as pd
import re
from io import StringIO

# Your log text
log_text = """
2023-09-27 04:15:35,928 - INFO - yaml file: config\config.yaml loaded successfully
# ... (other log messages)
2023-09-27 04:15:36,156 - INFO - Train Dataframe Head : 
       Airline   Source Destination                        Route Total_Stops Additional_Info  Price  Journey_day  Journey_month  Dep_hour  Dep_min  Arrival_hour  Arrival_min  Duration_hours  Duration_mins
0       IndiGo  Kolkata    Banglore                    CCU → BLR    non-stop         No info   4174           18              4        21       25             0            5               2             40
1  Jet Airways    Delhi      Cochin              DEL → BOM → COK      1 stop         No info  14714           27              6         7        5            12           35               5             30
2     Air Asia  Kolkata    Banglore              CCU → BBI → BLR      1 stop         No info   5162            9              5         6       50            10           30               3             40
# ... (more rows)
2023-09-27 04:15:36,166 - INFO - Test Dataframe Head  : 
             Airline   Source Destination                  Route Total_Stops              Additional_Info  Price  Journey_day  Journey_month  Dep_hour  Dep_min  Arrival_hour  Arrival_min  Duration_hours  Duration_mins
0  Multiple carriers    Delhi      Cochin        DEL → BOM → COK      1 stop                      No info   7670           15              5         6        0            21            0              15              0
1        Jet Airways  Kolkata    Banglore        CCU → DEL → BLR      1 stop                      No info  14151            1              5        20       25            22            5              25             40
2             IndiGo   Mumbai   Hyderabad              BOM → HYD    non-stop                      No info   2754            1              5        19        5            20           35               1             30
# ... (more rows)
"""

# Split the log text into lines
log_lines = log_text.split('\n')

# Initialize a list to store the lines of tabular data
tabular_data = []

# Flag to indicate whether we are inside the tabular data section
inside_tabular_data = False

# Iterate through the log lines
for line in log_lines:
    if "Train Dataframe Head :" in line:
        inside_tabular_data = True
        continue  # Skip the header line
    elif "Test Dataframe Head  :" in line:
        inside_tabular_data = False
        continue  # Skip the header line
    elif inside_tabular_data and line.strip():
        tabular_data.append(line)

# Join the tabular data lines to form a single string
tabular_data_text = '\n'.join(tabular_data)

# Use regular expression to split on multiple spaces
delimiter = r'\s+'
# Create a DataFrame from the tabular data
df = pd.read_csv(StringIO(tabular_data_text), sep=delimiter)

# Display the DataFrame
print(df)


ParserError: Error tokenizing data. C error: Expected 19 fields in line 3, saw 23
