In [1]:
import os

In [2]:
%pwd

'd:\\ERP_Sales_Forecasting\\research'

In [3]:
# Move the working directory up one level so the relative paths used below are
# resolved from the parent folder (the %pwd outputs before and after this cell
# show the switch from ...\research to ...\ERP_Sales_Forecasting).
# NOTE(review): this is relative to the current directory, so running the cell
# a second time in the same kernel moves up one more level.
os.chdir('../')

In [4]:
%pwd

'd:\\ERP_Sales_Forecasting'

In [5]:
from dataclasses import dataclass
from pathlib import Path

In [6]:
@dataclass(frozen=True)
class ModelTrainingConfig:
    """Read-only bundle of the paths used by the model-training stage.

    Instances are built by ``ConfigurationManager.get_model_training_config``
    from the ``model_training``, ``model_validation`` and ``best_model``
    sections of the loaded configuration.
    """

    # Folder where ModelTraining writes one "<model_name>.csv" comparison file per model.
    root_dir: Path
    # CSV file that ModelTraining.data_preprocessing loads as the input dataset.
    data_file: Path
    # CSV file holding one row of metrics (ModelName, MSE, MAE, R2) per model.
    result_file: Path
    # Taken from the best_model section of the config.
    # NOTE(review): presumably the destination of the saved best model - confirm.
    model_path: Path

In [7]:
from ERPsalesForecasting.constants import *
from ERPsalesForecasting.utils.common import read_yaml, create_directories
from ERPsalesForecasting import logger

In [8]:
class ConfigurationManager:
    """Load the YAML configuration and hand out per-stage config objects."""

    def __init__(self, config_file_path=CONFIG_FILE_PATH, param_file_path=PARAMS_FILE_PATH):
        """Read the config and params YAML files and create the artifacts root.

        Args:
            config_file_path: Location of the main configuration YAML.
            param_file_path: Location of the parameters YAML.
        """
        self.config = read_yaml(config_file_path)
        self.param = read_yaml(param_file_path)

        create_directories([self.config.artifacts_root])

    def get_model_training_config(self):
        """Prepare the folders and results file for training and return its config.

        Creates the root directories of the ``model_training``,
        ``model_validation`` and ``best_model`` sections, and seeds the results
        file with a header row when it is missing or zero bytes long.

        Returns:
            ModelTrainingConfig: Paths collected from the three config sections.
        """
        training_section = self.config.model_training
        validation_section = self.config.model_validation
        best_model_section = self.config.best_model

        create_directories([
            training_section.root_dir,
            validation_section.root_dir,
            best_model_section.root_dir,
        ])

        header_line = "ModelName,MSE,MAE,R2\n"
        result_path = validation_section.result_file

        # A results file counts as usable only if it exists and is non-empty.
        has_results_file = os.path.exists(result_path) and os.path.getsize(result_path) != 0
        if has_results_file:
            logger.info(f"{result_path} is already exists")
        else:
            with open(result_path, "w") as handle:
                handle.write(header_line)
                logger.info(f"Creating empty file: {result_path}")

        return ModelTrainingConfig(
            root_dir=training_section.root_dir,
            data_file=training_section.data_file,
            result_file=result_path,
            model_path=best_model_section.model_path,
        )

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
import numpy as np

In [10]:
from ERPsalesForecasting import logger

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import csv
import dill
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [12]:
class ModelTraining:
    """Build features, fit candidate regressors and keep track of their scores.

    All file locations come from the ``ModelTrainingConfig`` passed to the
    constructor: ``data_file`` is the input CSV, ``result_file`` is the metrics
    table (columns ``ModelName, MSE, MAE, R2``) and ``root_dir`` receives one
    actual-vs-predicted CSV per evaluated model.
    """

    def __init__(self, config: "ModelTrainingConfig") -> None:
        self.config = config

    def data_preprocessing(self):
        """Load the dataset and derive the one-hot and numeric feature frames.

        Returns:
            tuple: ``(product_encoded_df, numeric_features, df)`` where
            ``product_encoded_df`` holds the one-hot encoded ``ProductID``
            (columns ``Product_0``, ``Product_1``, ...), ``numeric_features``
            holds every numeric column except ``SelledQTY`` and
            ``ProductTotalQty``, and ``df`` is the raw frame as read from disk.
        """
        df = pd.read_csv(self.config.data_file)

        # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4
        # dropped the old keyword, so try the new spelling first and fall back
        # for older installations.
        try:
            encoder = OneHotEncoder(sparse_output=False)
        except TypeError:
            encoder = OneHotEncoder(sparse=False)

        product_encoded = encoder.fit_transform(df[['ProductID']])
        product_encoded_df = pd.DataFrame(
            product_encoded,
            columns=[f'Product_{i}' for i in range(product_encoded.shape[1])],
            # Share df's index so the later column-wise concat lines rows up.
            index=df.index,
        )

        # NOTE(review): if ProductID is stored as a number it stays in
        # numeric_features as well as being one-hot encoded above.
        numeric_features = df.select_dtypes(include=[np.number]).drop(
            columns=['SelledQTY', 'ProductTotalQty'])
        return (product_encoded_df, numeric_features, df)

    def splitData(self, numeric_features, product_encoded_df, df):
        """Assemble X / y and split them 80/20 together with dates and product ids.

        Args:
            numeric_features: Numeric feature columns.
            product_encoded_df: One-hot encoded product columns.
            df: Raw frame; must contain ``SelledQTY``, ``Date`` and ``ProductID``.

        Returns:
            tuple: ``(X_train, X_test, y_train, y_test, dates_train, dates_test,
            product_ids_train, product_ids_test)``.
        """
        X = pd.concat([numeric_features, product_encoded_df], axis=1)
        y = df['SelledQTY']

        dates = df['Date']
        product_ids = df['ProductID']

        # Splitting all four objects in one call keeps their rows aligned.
        return tuple(train_test_split(
            X, y, dates, product_ids, test_size=0.2, random_state=42))

    def dataScaling(self, X_train, X_test):
        """Standardise the features, fitting the scaler on the training split only.

        Returns:
            tuple: ``(X_train_scaled, X_test_scaled)``.
        """
        # NOTE(review): the fitted scaler is not kept anywhere in this class;
        # confirm that inference code has another way to reproduce the scaling.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        return X_train_scaled, X_test_scaled

    def get_regression_model(self, model_type='linear'):
        """Return a fresh, unfitted regressor for ``model_type``.

        Args:
            model_type: One of ``random_forest``, ``gradient_boosting``,
                ``lasso``, ``ridge``, ``elastic_net``, ``decision_tree``,
                ``knn``, ``svr``, ``linear_regression`` or its alias ``linear``
                (the default, which previously matched no branch).

        Raises:
            ValueError: If ``model_type`` is not a known name. The method used
                to return ``None`` in that case, which only failed later at
                ``model.fit``.
        """
        def _linear_regression():
            # Imported lazily, as in the original implementation.
            from sklearn.linear_model import LinearRegression
            return LinearRegression()

        # Factories are called lazily so only the requested estimator is built.
        factories = {
            'random_forest': lambda: RandomForestRegressor(n_estimators=100, random_state=42),
            'gradient_boosting': lambda: GradientBoostingRegressor(n_estimators=100, random_state=42),
            'lasso': lambda: Lasso(alpha=1.0, random_state=42),
            'ridge': lambda: Ridge(alpha=1.0, random_state=42),
            'elastic_net': lambda: ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42),
            'decision_tree': lambda: DecisionTreeRegressor(random_state=42),
            'knn': lambda: KNeighborsRegressor(n_neighbors=5),
            'svr': lambda: SVR(),
            'linear_regression': _linear_regression,
            'linear': _linear_regression,
        }

        if model_type not in factories:
            raise ValueError(
                f"Unknown model_type {model_type!r}; expected one of: "
                f"{', '.join(sorted(factories))}")
        return factories[model_type]()

    def _record_result(self, model_name, mse, mae, r2):
        """Insert or overwrite ``model_name``'s metrics row and save the results CSV."""
        new_data = pd.DataFrame({
            'ModelName': [model_name],
            'MSE': [mse],
            'MAE': [mae],
            'R2': [r2]
        })

        result_df = pd.read_csv(self.config.result_file)

        is_same_model = result_df['ModelName'] == model_name
        if is_same_model.any():
            result_df.loc[is_same_model, ['MSE', 'MAE', 'R2']] = [mse, mae, r2]
        elif result_df.empty:
            # Header-only file: the new row is the whole table.
            result_df = new_data
        else:
            # DataFrame.append no longer exists in pandas 2.x.
            result_df = pd.concat([result_df, new_data], ignore_index=True)

        # Without this write the update never reached disk, so best_model()
        # could not see the metrics computed in this run.
        result_df.to_csv(self.config.result_file, index=False)

    def model_evaluation(self, model, model_name, X_test_scaled, y_test, dates_test, product_ids_test):
        """Score a fitted model on the test split and persist the outcome.

        Computes MSE, MAE and R2, stores them in the results file (one row per
        model name, overwritten on re-evaluation) and writes an
        actual-vs-predicted table to ``<root_dir>/<model_name>.csv``.

        Args:
            model: Fitted estimator exposing ``predict``.
            model_name: Key used for the results row and the output file name.
            X_test_scaled: Scaled test features.
            y_test: True target values for the test rows.
            dates_test: ``Date`` values of the test rows.
            product_ids_test: ``ProductID`` values of the test rows.
        """
        y_pred = model.predict(X_test_scaled)

        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # The same text goes to stdout and to the log; the log call used to be
        # missing its f-prefix and printed the placeholders literally.
        summary = f'Model: {model_name}, MSE: {mse}, MAE: {mae}, R2 Score: {r2}'
        print(summary)
        logger.info(summary)

        self._record_result(model_name, mse, mae, r2)
        logger.info(f'Model: {model_name}, evaluation data has been updated to results file')

        comparison_df = pd.DataFrame({
            'Date': dates_test,
            'ProductId': product_ids_test,
            'ActualSales': y_test,
            'PredictedSales': y_pred
        }).sort_values(by=['Date', 'ProductId'])

        logger.info(f'{comparison_df.head()}')

        comparison_path = f'{self.config.root_dir}/{model_name}.csv'
        comparison_df.to_csv(comparison_path)

        logger.info(f'test result of {model_name} is saved to: {comparison_path}')

    def best_model(self):
        """Return the name of the model with the lowest MSE (ties broken by MAE).

        Raises:
            ValueError: If the results file contains no rows yet.
        """
        df = pd.read_csv(self.config.result_file)
        if df.empty:
            raise ValueError(
                f"No evaluation results found in {self.config.result_file}")

        # Ascending sort puts the lowest-error model in the first row.
        best_name = df.sort_values(by=['MSE', 'MAE']).iloc[0]['ModelName']

        print("Best model based on lowest MSE and MAE:")
        print(best_name)

        logger.info(f"{best_name} is the best model")

        return best_name
        
        

In [13]:
def train(model_training, X_train_scaled, y_train, X_test_scaled, y_test, dates_test, product_ids_test, model_name="linear_regression"):
    """Fit the regressor named ``model_name``, evaluate it and return it.

    Args:
        model_training: Object providing ``get_regression_model`` and
            ``model_evaluation``.
        X_train_scaled: Training features.
        y_train: Training targets.
        X_test_scaled: Test features passed on to the evaluation step.
        y_test: Test targets passed on to the evaluation step.
        dates_test: Dates of the test rows, passed on to the evaluation step.
        product_ids_test: Product ids of the test rows, passed on likewise.
        model_name: Name understood by ``get_regression_model``.

    Returns:
        The fitted estimator.
    """
    logger.info(f"{model_name} Training started")

    estimator = model_training.get_regression_model(model_name)
    estimator.fit(X_train_scaled, y_train)

    # Evaluation takes the fitted estimator followed by the test-split data.
    evaluation_args = (estimator, model_name, X_test_scaled, y_test, dates_test, product_ids_test)
    model_training.model_evaluation(*evaluation_args)

    logger.info(f"{model_name} Training and evaluation completed successfully")
    return estimator

    

In [14]:
# End-to-end training run: load the configuration, build the features, fit
# every candidate model, pick the best one from the results file and save it.
# The previous ``try: ... except Exception as e: raise e`` wrapper only
# re-raised, so it was removed; errors propagate exactly as before.
config = ConfigurationManager()
model_training_config = config.get_model_training_config()
model_training = ModelTraining(config=model_training_config)
product_encoded_df, numeric_features, df = model_training.data_preprocessing()

# Negative stock-after-sale values are floored at zero.
numeric_features['AvailableQtyAfterSell'] = numeric_features['AvailableQtyAfterSell'].clip(lower=0)

# Keyword arguments: the positional call had the first two frames swapped
# relative to splitData's signature.
(X_train, X_test, y_train, y_test,
 dates_train, dates_test,
 product_ids_train, product_ids_test) = model_training.splitData(
    numeric_features=numeric_features,
    product_encoded_df=product_encoded_df,
    df=df,
)
X_train_scaled, X_test_scaled = model_training.dataScaling(X_train, X_test)

# Candidate models, trained and evaluated in this order.
MODEL_NAMES = [
    "linear_regression",
    "decision_tree",
    "svr",
    "knn",
    "gradient_boosting",
    "random_forest",
    "ridge",
    "lasso",
    "elastic_net",
]

# Train each model and store it in the dictionary
models = {
    name: train(model_training, X_train_scaled, y_train, X_test_scaled, y_test,
                dates_test, product_ids_test, name)
    for name in MODEL_NAMES
}

best_model_name = model_training.best_model()

print(best_model_name)

# The results file can contain rows written by earlier runs, so the winner is
# not guaranteed to be one of the models fitted just now.
if best_model_name not in models:
    raise KeyError(
        f"Best model {best_model_name!r} from the results file was not trained in this run; "
        f"trained models: {', '.join(models)}")

# Write to the configured model_path instead of a hard-coded copy of it.
# NOTE(review): confirm best_model.model_path in config.yaml points at the
# intended file (previously 'best_model/model.dill' was written directly).
with open(model_training_config.model_path, 'wb') as file:
    dill.dump(models[best_model_name], file)

logger.info(f"{best_model_name} model saved in .dill format.")

[2023-12-22 21:19:26,149]: INFO: common : 31: yaml file: config\config.yaml loaded successfully
[2023-12-22 21:19:26,151]: INFO: common : 31: yaml file: params.yaml loaded successfully
[2023-12-22 21:19:26,152]: INFO: common : 50: created directory at: artifacts
[2023-12-22 21:19:26,152]: INFO: common : 50: created directory at: artifacts/model_training
[2023-12-22 21:19:26,153]: INFO: common : 50: created directory at: artifacts/model_validation
[2023-12-22 21:19:26,154]: INFO: common : 50: created directory at: best_model
[2023-12-22 21:19:26,154]: INFO: 3593018843 : 23: artifacts/model_validation/result.csv is already exists
[2023-12-22 21:19:26,164]: INFO: 3019820898 : 2: linear_regression Training started
Model: linear_regression, MSE: 3.277926994842912e+28, MAE: 107702031516463.9, R2 Score: -1.772964275907369e+25
[2023-12-22 21:19:26,168]: INFO: 3586992906 : 72: Model: {model_name}, MSE: {mse}, MAE: {mae}, R2 Score: {r2}
[2023-12-22 21:19:26,171]: INFO: 3586992906 : 99: Model: {m



[2023-12-22 21:19:26,335]: INFO: 3019820898 : 9: knn Training and evaluation completed successfully
[2023-12-22 21:19:26,336]: INFO: 3019820898 : 2: gradient_boosting Training started
Model: gradient_boosting, MSE: 2609.9066145967927, MAE: 29.007438453889364, R2 Score: -0.4116455913796848
[2023-12-22 21:19:26,370]: INFO: 3586992906 : 72: Model: {model_name}, MSE: {mse}, MAE: {mae}, R2 Score: {r2}
[2023-12-22 21:19:26,373]: INFO: 3586992906 : 99: Model: {model_name}, evaluation data has been updated to results file
[2023-12-22 21:19:26,376]: INFO: 3586992906 : 110:           Date  ProductId  ActualSales  PredictedSales
0   2023-09-25          1          105        5.392483
10  2023-09-26          2            1        2.064800
22  2023-09-26          4            1        2.999059
30  2023-09-29         45           11        8.128843
49  2023-09-30        396           12       10.108183
[2023-12-22 21:19:26,378]: INFO: 3586992906 : 114: test result of gradient_boosting is saved to: ar