In [1]:
import os

In [2]:
%pwd

'd:\\ERP_Sales_Forecasting\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'd:\\ERP_Sales_Forecasting'

In [5]:
from dataclasses import dataclass
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [6]:
from ERPsalesForecasting import logger
from ERPsalesForecasting.utils.common import read_yaml, create_directories, get_size
from ERPsalesForecasting.constants import *

In [7]:
@dataclass
class DataProcessingConfig:
    root_dir: Path
    data_file: Path
    preprocessed_file: Path
    isValid: bool

In [8]:
class ConfigurationManager:
    def __init__(self, config_file_path=CONFIG_FILE_PATH, param_file_path=PARAMS_FILE_PATH):
        self.config = read_yaml(config_file_path)
        self.param = read_yaml(param_file_path)

        create_directories([self.config.artifacts_root])

    def get_data_processing_config(self):
        config = self.config.data_processing

        create_directories([config.root_dir])

        data_processing_config = DataProcessingConfig(
            root_dir=config.root_dir,
            data_file=config.data_file,
            preprocessed_file=config.preprocessed_file,
            isValid=False,
        )

        return data_processing_config

In [9]:
class DataProcessing:

    def __init__(self, config: DataProcessingConfig) -> None:
        self.config = config

    def data_validation(self) -> None:
        dataFilePath = Path(self.config.data_file)

        try:
            df = pd.read_excel(dataFilePath)

            if (get_size(dataFilePath) != ""):
                logger.info(f"Dataset is available at: {dataFilePath}")
                logger.info(f"Dataset size: {df.shape}")
                logger.info(f"Columns in Dataset {df.columns}")
                self.config.isValid = True
        except Exception as e:
            raise e

    

    def prepare_data(self) -> None:

        create_directories([self.config.root_dir])

        try:
            if (self.config.isValid):
                df = pd.read_excel(self.config.data_file)
                columns = df.columns
                print(columns)

                df['Date'] = pd.to_datetime(df['billCreatedDateTime'], dayfirst = True)
                df.sort_values('Date', inplace=True)

                df.reset_index(inplace=True)

                product_id_column = 'ProductID'
                date_column = 'Date'
                quantity_sold_column = 'SelledQTY'
                product_name_column = 'ProductName'
                total_qty_column = 'ProductTotalQty'

                df[date_column] = pd.to_datetime(df[date_column]).dt.date

                # Calculate the cumulative quantity sold for each product
                df['CumulativeSelledQTY'] = df.groupby([product_id_column])[quantity_sold_column].cumsum()

                # Group by ProductID and Date, and calculate required fields
                merged_data = df.groupby([product_id_column, date_column]).agg({
                    product_name_column: 'first',
                    total_qty_column: 'max',
                    quantity_sold_column: 'sum',
                    'CumulativeSelledQTY': 'max'
                }).reset_index()

                # Calculate AvailableQtyAfterSell
                merged_data['AvailableQtyAfterSell'] = merged_data[total_qty_column] - merged_data['CumulativeSelledQTY']

                # Drop the CumulativeSelledQTY column
                merged_data.drop(columns=['CumulativeSelledQTY'], inplace=True)

                # Save file
                merged_data.to_csv(self.config.preprocessed_file, index=False)

                logger.info(
                    f"Preprocessed sales file saved at {self.config.preprocessed_file}")

            else:
                logger.info("Data is not available")

        except Exception as e:
            raise e

In [10]:
try:
    config = ConfigurationManager()
    data_processing_config = config.get_data_processing_config()
    data_processing = DataProcessing(config=data_processing_config)
    data_processing.data_validation()
    data_processing.prepare_data()

except Exception as e:
    raise e

[2023-12-22 13:31:48,972]: INFO: common : 31: yaml file: config\config.yaml loaded successfully
[2023-12-22 13:31:48,976]: INFO: common : 31: yaml file: params.yaml loaded successfully
[2023-12-22 13:31:48,979]: INFO: common : 50: created directory at: artifacts
[2023-12-22 13:31:48,980]: INFO: common : 50: created directory at: artifacts/data_processing
[2023-12-22 13:31:49,341]: INFO: 2892683706 : 13: Dataset is available at: artifacts\data_ingestion\data.csv
[2023-12-22 13:31:49,341]: INFO: 2892683706 : 14: Dataset size: (163, 8)
[2023-12-22 13:31:49,349]: INFO: 2892683706 : 15: Columns in Dataset Index(['ProductID', 'BillId', 'ProductName', 'SelledQTY', 'ProductTotalQty',
       'OverAllAvailableQty', 'AvailableQtyAfterSell', 'billCreatedDateTime'],
      dtype='object')
[2023-12-22 13:31:49,349]: INFO: common : 50: created directory at: artifacts/data_processing
Index(['ProductID', 'BillId', 'ProductName', 'SelledQTY', 'ProductTotalQty',
       'OverAllAvailableQty', 'AvailableQty

KeyError: 'Date'