### Data Preprocessing

<p>First, we load the raw data using the project's <code>DataLoader</code>.</p>

In [1]:
# Show the kernel's current working directory before any chdir happens.
%pwd

'c:\\Arjun_Works\\SalesNexus\\research'

In [2]:
import os 
os.chdir('../')
%pwd

'c:\\Arjun_Works\\SalesNexus'

<p> Loading the Data </p>

In [3]:
from pathlib import Path
from ml_service.components.data_loader import DataLoader
from ml_service.config.configuration import ConfigurationManager
from ml_service.constants import *

# Read the project YAML config and pull out the data-acquisition settings.
config_manager = ConfigurationManager(CONFIG_FILE_PATH)
data_acquisition_config = config_manager.get_data_acquisition_config()

# Point the loader at the local data directory and the configured source/files.
loader = DataLoader(
    data_dir=Path(data_acquisition_config.local_dir),
    source=data_acquisition_config.source,
    data_files=data_acquisition_config.data_files,
    dataset_name=data_acquisition_config.dataset_name
)

# Fetch the raw files. In the recorded run everything was already on disk, so the
# download step was skipped.
loader.download()

[2025-06-23 21:52:56,588: INFO: main_utils: yaml file: config\config.yaml loaded successfully]
[2025-06-23 21:52:56,591: INFO: main_utils: created directory at: artifacts]
[2025-06-23 21:52:56,593: INFO: main_utils: created directory at: artifacts/data_acquisition]
✅ All files already present in: artifacts\data_acquisition. Skipping download.


In [4]:
# Read every raw table through the shared loader, in the same order as before,
# and bind each result to its own frame in a single unpacking.
(
    train_df,
    test_df,
    stores_df,
    oil_df,
    holidays_events_df,
    transactions_df,
) = (
    loader.load(table_name)
    for table_name in (
        "train",
        "test",
        "stores",
        "oil",
        "holidays_events",
        "transactions",
    )
)

📥 Loading: artifacts\data_acquisition\train.csv
📥 Loading: artifacts\data_acquisition\test.csv
📥 Loading: artifacts\data_acquisition\stores.csv
📥 Loading: artifacts\data_acquisition\oil.csv
📥 Loading: artifacts\data_acquisition\holidays_events.csv
📥 Loading: artifacts\data_acquisition\transactions.csv


In [5]:
# Preview the first rows of the raw training table.
train_df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [6]:
# Preview the first rows of the raw test table (the recorded output has no `sales` column).
test_df.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


In [7]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Immutable bundle of filesystem locations for the preprocessing stage.

    Attributes:
        root_dir: Directory that belongs to the preprocessing stage.
        train_file: Location of the preprocessed training file.
        test_file: Location of the preprocessed test file.
    """
    root_dir: Path
    train_file: Path
    test_file: Path

In [8]:
from ml_service.constants import *
from ml_service.utils.main_utils import read_yaml, create_directories

In [9]:
class ConfigurationManager:
    """Load the project YAML configuration and expose typed stage configs.

    NOTE(review): defining this class in the notebook replaces the
    ``ConfigurationManager`` imported from ``ml_service.config.configuration``
    in an earlier cell until that import is executed again.
    """

    def __init__(self, config_filepath: str):
        """Read the main configuration and make sure the artifacts root exists.

        Args:
            config_filepath (str): Path to the main configuration file (YAML).
        """
        self.config = read_yaml(config_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_preprocessing_config(self) -> DataPreprocessingConfig:
        """Build the preprocessing config from the ``data_preprocessing`` section.

        The stage's root directory is created as a side effect, and both file
        locations are resolved relative to it.

        Returns:
            DataPreprocessingConfig: Root directory plus train/test file paths.
        """
        section = self.config.data_preprocessing
        create_directories([section.root_dir])

        # Resolve the root once and derive both file paths from it.
        stage_root = Path(section.root_dir)
        return DataPreprocessingConfig(
            root_dir=stage_root,
            train_file=stage_root / section.train_file,
            test_file=stage_root / section.test_file,
        )


<p>Let’s merge the train, stores, and transactions data (and similarly for the test set) for better clarity.</p>

In [10]:
# Enrich each split with store metadata (keyed by store) and with the daily
# transaction counts (keyed by store and date). Left joins keep every sales row.
train_merge_df = (
    train_df
    .merge(stores_df, on='store_nbr', how='left')
    .merge(transactions_df, on=['store_nbr', 'date'], how='left')
)

test_merge_df = (
    test_df
    .merge(stores_df, on='store_nbr', how='left')
    .merge(transactions_df, on=['store_nbr', 'date'], how='left')
)


In [11]:
# Inspect the training frame after the store and transaction joins.
train_merge_df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,transactions
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13,
1,1,2013-01-01,1,BABY CARE,0.0,0,Quito,Pichincha,D,13,
2,2,2013-01-01,1,BEAUTY,0.0,0,Quito,Pichincha,D,13,
3,3,2013-01-01,1,BEVERAGES,0.0,0,Quito,Pichincha,D,13,
4,4,2013-01-01,1,BOOKS,0.0,0,Quito,Pichincha,D,13,


<p>Now let’s merge the holidays_events and oil data into a single date-level table, then join that table onto both the train and test sets.</p>

In [12]:
# Build a date-level table: oil prices on the left, holiday events joined on `date`.
# NOTE(review): with oil as the left table, dates that have no oil row (the preview
# below jumps from 2013-01-04 to 2013-01-07) cannot pick up holiday rows, and a date
# with several holiday events yields several rows — confirm this is intended.
holidays_events_oil_merge_df = oil_df.merge(holidays_events_df, on='date', how='left')

In [13]:
# Preview the combined oil + holiday table.
holidays_events_oil_merge_df.head()

Unnamed: 0,date,dcoilwtico,type,locale,locale_name,description,transferred
0,2013-01-01,,Holiday,National,Ecuador,Primer dia del ano,False
1,2013-01-02,93.14,,,,,
2,2013-01-03,92.97,,,,,
3,2013-01-04,93.12,,,,,
4,2013-01-07,93.2,,,,,


In [14]:
# Attach the date-level oil/holiday table to both splits with the same left join.
train_final, test_final = (
    split_df.merge(holidays_events_oil_merge_df, on='date', how='left')
    for split_df in (train_merge_df, test_merge_df)
)

In [15]:
# Preview the fully merged training frame; the store and holiday `type` columns
# clash, so they show up as `type_x` and `type_y`.
train_final.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,type_x,cluster,transactions,dcoilwtico,type_y,locale,locale_name,description,transferred
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13,,,Holiday,National,Ecuador,Primer dia del ano,False
1,1,2013-01-01,1,BABY CARE,0.0,0,Quito,Pichincha,D,13,,,Holiday,National,Ecuador,Primer dia del ano,False
2,2,2013-01-01,1,BEAUTY,0.0,0,Quito,Pichincha,D,13,,,Holiday,National,Ecuador,Primer dia del ano,False
3,3,2013-01-01,1,BEVERAGES,0.0,0,Quito,Pichincha,D,13,,,Holiday,National,Ecuador,Primer dia del ano,False
4,4,2013-01-01,1,BOOKS,0.0,0,Quito,Pichincha,D,13,,,Holiday,National,Ecuador,Primer dia del ano,False


In [16]:
# Count missing values per column after all merges.
train_final.isna().sum()

id                    0
date                  0
store_nbr             0
family                0
sales                 0
onpromotion           0
city                  0
state                 0
type_x                0
cluster               0
transactions     248358
dcoilwtico       933768
type_y          2680128
locale          2680128
locale_name     2680128
description     2680128
transferred     2680128
dtype: int64

###  Dropping Low‑Value Features

We drop the following columns:

* **`description`**, **`locale_name`**, **`locale`**, and **`transferred`**

#### ✅ Why?

* These fields contain textual or highly sparse information that **doesn’t directly affect sales prediction**.
* They introduce **noise** and **high cardinality**, making it harder for the model to learn meaningful patterns.
* Similar holiday information is already captured by the holiday **`type`** column (named `type_y` after the merge), making these columns redundant.

#### 💡 Result

By removing these columns, we:

* **Improve training efficiency** (smaller, cleaner dataset).
* Reduce the risk of overfitting.
* Maintain focus on the features that matter for forecasting sales.


In [17]:
# Holiday text fields judged low-value above. The list is defined once so the
# train and test frames always lose exactly the same columns.
holiday_text_cols = ['description', 'locale_name', 'locale', 'transferred']

# Rebind the result instead of mutating with inplace=True: the step reads as a
# plain transformation and can be chained or inspected like any other.
train_final = train_final.drop(columns=holiday_text_cols)
test_final = test_final.drop(columns=holiday_text_cols)

In [18]:
# Confirm the four holiday text columns are gone.
train_final.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,type_x,cluster,transactions,dcoilwtico,type_y
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13,,,Holiday
1,1,2013-01-01,1,BABY CARE,0.0,0,Quito,Pichincha,D,13,,,Holiday
2,2,2013-01-01,1,BEAUTY,0.0,0,Quito,Pichincha,D,13,,,Holiday
3,3,2013-01-01,1,BEVERAGES,0.0,0,Quito,Pichincha,D,13,,,Holiday
4,4,2013-01-01,1,BOOKS,0.0,0,Quito,Pichincha,D,13,,,Holiday


In [19]:
# NOTE(review): the recorded output reports 3,032,964 rows, while the test ids
# start at 3,000,888 — the holiday join appears to repeat sales rows for dates
# with more than one holiday event. Confirm before using this frame for training.
train_final.shape

(3032964, 13)

In [20]:
# Re-check missing values per column after dropping the holiday text fields.
train_final.isna().sum()

id                    0
date                  0
store_nbr             0
family                0
sales                 0
onpromotion           0
city                  0
state                 0
type_x                0
cluster               0
transactions     248358
dcoilwtico       933768
type_y          2680128
dtype: int64

<p>We keep <code>transactions</code> (to be imputed later) because of its high predictive potential, and drop the sparse, low-information <code>dcoilwtico</code> and <code>type_y</code> columns.</p>

In [21]:
# `dcoilwtico` and `type_y` only exist on the date-enriched frames, so that is
# where they are dropped. train_merge_df/test_merge_df never received these
# columns; they need no change and remain the frames inspected and saved below.
# No errors='ignore' here: a missing column should fail loudly rather than turn
# the cell into a silent no-op.
sparse_cols = ['dcoilwtico', 'type_y']
train_final = train_final.drop(columns=sparse_cols)
test_final = test_final.drop(columns=sparse_cols)

In [22]:
# Missing-value counts for the frame that is carried forward and saved below.
train_merge_df.isna().sum()

id                   0
date                 0
store_nbr            0
family               0
sales                0
onpromotion          0
city                 0
state                0
type                 0
cluster              0
transactions    245784
dtype: int64

### Data Cleaning

Note: Since transactions is the only column with missing values, we'll apply an imputation technique (such as filling with median or other suitable method) during the Data Transformation phase, where we'll also handle scaling, encoding, and splitting of the dataset. This ensures a clean and robust workflow.

<p>Now let's check for duplicate rows and handle any we find.</p>

In [23]:
# Count rows that are exact duplicates of an earlier row.
train_merge_df.duplicated().sum()

0

<p>The data is clean: there are no duplicate rows and no missing values other than in the <code>transactions</code> column, which we'll impute later. We can now move on to the next phase of the pipeline.</p>

In [24]:
# Normalise to Path so the `/` joins below work whether the config exposes
# local_dir as a plain string or as a Path (other cells apply the same coercion).
data_folder = Path(data_acquisition_config.local_dir)

# NOTE(review): the files are written into the acquisition directory, not into
# the preprocessing root_dir described by DataPreprocessingConfig — confirm.
train_merge_df.to_csv(data_folder / "train_merge.csv", index=False)
test_merge_df.to_csv(data_folder / "test_merge.csv", index=False)

In [25]:
from pathlib import Path
from typing import Dict
import pandas as pd

class DataProcessor:
    """Merge, trim, and export the Store Sales time-series tables.

    The methods are designed to be chained in this order::

        load() -> merge_train_test() -> merge_holidays_and_oil()
               -> drop_irrelevant_columns() -> save()

    Each step stores its result in ``self.data`` and returns ``self``.
    """

    def __init__(self, data_dir: Path, data_files: Dict[str, str]) -> None:
        """Remember where the CSV files live.

        Args:
            data_dir: Directory holding the input files; the merged outputs are
                written to the same directory.
            data_files: Mapping of table name (e.g. ``"train"``) to a file name
                relative to ``data_dir``.
        """
        # Coerce so a plain string also works with the `/` joins used below.
        self.data_dir = Path(data_dir)
        self.data_files = data_files
        self.data: Dict[str, pd.DataFrame] = {}  # table name -> DataFrame

    def load(self) -> "DataProcessor":
        """Read every configured CSV into ``self.data`` under its table name."""
        self.data = {
            name: pd.read_csv(self.data_dir / path)
            for name, path in self.data_files.items()
        }
        return self

    def _attach_store_context(self, split: str) -> pd.DataFrame:
        """Left-join store metadata and daily transactions onto one split."""
        return (
            self.data[split]
            .merge(self.data["stores"], on="store_nbr", how="left")
            .merge(self.data["transactions"], on=["store_nbr", "date"], how="left")
        )

    def merge_train_test(self) -> "DataProcessor":
        """Create ``train_merged`` / ``test_merged`` from the raw splits."""
        self.data["train_merged"] = self._attach_store_context("train")
        self.data["test_merged"] = self._attach_store_context("test")
        return self

    def merge_holidays_and_oil(self) -> "DataProcessor":
        """Create ``train_final`` / ``test_final`` by adding date-level features.

        Oil prices are left-joined with holiday events on ``date`` and the
        result is left-joined onto both merged splits. A date that carries
        several holiday events produces one output row per event, so sales
        rows for such dates are repeated in the result.
        """
        holidays_oil_merged = self.data["oil"].merge(
            self.data["holidays_events"], on="date", how="left"
        )
        for split in ("train", "test"):
            self.data[f"{split}_final"] = self.data[f"{split}_merged"].merge(
                holidays_oil_merged, on="date", how="left"
            )
        return self

    def drop_irrelevant_columns(self) -> "DataProcessor":
        """Drop the low-value date-level columns and the row copies they caused.

        Once the holiday columns are removed, the rows that the holiday join
        repeated (one per event on the same date) are exact copies of each
        other. They are collapsed so each original sales row appears once.

        NOTE(review): the store type column stays named ``type_x`` (the merge
        suffix); rename downstream if a plain ``type`` column is expected.
        """
        drop_cols = ["description", "locale_name", "locale", "transferred", "dcoilwtico", "type_y"]
        for split in ("train_final", "test_final"):
            self.data[split] = (
                self.data[split]
                .drop(columns=drop_cols, errors="ignore")
                .drop_duplicates(ignore_index=True)
            )
        return self

    def save(self) -> "DataProcessor":
        """Write the final train and test frames as CSV into ``data_dir``."""
        self.data["train_final"].to_csv(self.data_dir / "train_merge.csv", index=False)
        self.data["test_final"].to_csv(self.data_dir / "test_merge.csv", index=False)

        print(f"✅ Final train and test files saved to: {self.data_dir}")
        return self


In [26]:
from pathlib import Path
from ml_service.config.configuration import ConfigurationManager

config_manager = ConfigurationManager(CONFIG_FILE_PATH)
data_acquisition_config = config_manager.get_data_acquisition_config()

data_dir = Path(data_acquisition_config.local_dir)
files = data_acquisition_config.data_files

# Run the whole pipeline as one parenthesised chain (no backslash continuations).
# Binding the result stops the cell from echoing the processor's repr and keeps
# the processed frames reachable through `processor.data`.
processor = (
    DataProcessor(data_dir, files)
    .load()
    .merge_train_test()
    .merge_holidays_and_oil()
    .drop_irrelevant_columns()
    .save()
)


[2025-06-23 21:53:16,159: INFO: main_utils: yaml file: config\config.yaml loaded successfully]
[2025-06-23 21:53:16,161: INFO: main_utils: created directory at: artifacts]
[2025-06-23 21:53:16,163: INFO: main_utils: created directory at: artifacts/data_acquisition]
✅ Final train and test files saved to: artifacts\data_acquisition


<__main__.DataProcessor at 0x1bceff41b70>