In [1]:
# Import necessary libraries
import pandas as pd
import logging
import os
import sys

# Make the sibling 'scripts' directory importable (the path is resolved
# relative to the current working directory). The membership check keeps the
# cell idempotent: re-running it does not pile up duplicate sys.path entries.
scripts_dir = os.path.abspath(os.path.join('..', 'scripts'))
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)
# Import data preprocessor class
from preprocess import DataPreprocessor

# Logging setup: INFO threshold, each record rendered as
# "<timestamp> - <level> - <message>".
_log_format = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_log_format, level=logging.INFO)

# Logger shared by the remaining cells of this notebook.
logger = logging.getLogger(__name__)
logger.info("Imported libraries and configured logging.")

2024-09-24 16:22:21,803 - INFO - Imported libraries and configured logging.


In [2]:
# Load, preprocess, and persist the train and test datasets.
#
# No `if __name__ == "__main__":` guard is used here: inside a notebook kernel
# the condition is always true, and the cells below read train_df / test_df
# unconditionally, so a guard could only turn a skipped run into a NameError.
train_path = '../data/train_cleaned.csv'  # Path to train dataset
test_path = '../data/test_cleaned.csv'  # Path to test dataset
# Sample-submission file, passed as the third constructor argument.
# NOTE(review): presumably the source of the test-set Ids - confirm in DataPreprocessor.
test_id = '../data/sample_submission.csv'

logger.info("Preprocessing the train and test datasets...")

# Create instance of the class
preprocessor = DataPreprocessor(train_path, test_path, test_id)
# Run the pipeline; it returns the processed train and test frames
train_df, test_df = preprocessor.preprocess()
# Save preprocessed data
preprocessor.save_data()

# Report success only once the work has actually finished, so a failure in
# preprocess() or save_data() is never preceded by a misleading "done" message.
logger.info("Preprocessed both the test and train datasets")


2024-09-24 16:22:21,819 - INFO - Preprocessed both the test and train datasets


Cleaning data...
Extracting datetime features...
Performing feature engineering...
Encoding categorical data...
Preprocessing complete.
Processed data saved to ../data/train_processed.csv and ../data/test_processed.csv.


In [3]:
# Show the column labels of the train frame returned by preprocessor.preprocess()
train_df.columns

Index(['Store', 'DayOfWeek', 'Sales', 'Open', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'Year', 'Month', 'Day', 'StoreType', 'Assortment',
       'CompetitionDistance', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear',
       'IsWeekday', 'IsWeekend', 'IsBeginningOfMonth', 'IsMidMonth',
       'IsEndOfMonth', 'IsHoliday', 'Promo_duration'],
      dtype='object')

In [4]:
# Show the column labels of the test frame returned by preprocessor.preprocess()
test_df.columns

Index(['Store', 'DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'Year', 'Month', 'Day', 'StoreType', 'Assortment',
       'CompetitionDistance', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear',
       'IsWeekday', 'IsWeekend', 'IsBeginningOfMonth', 'IsMidMonth',
       'IsEndOfMonth', 'IsHoliday', 'Promo_duration'],
      dtype='object')

In [5]:
# (rows, columns) of the processed train and test frames, displayed as a pair
(train_df.shape, test_df.shape)

((821291, 23), (40080, 22))

In [7]:
# Peek at the last three rows of the processed train frame
train_df.tail(3)

Unnamed: 0_level_0,Store,DayOfWeek,Sales,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,...,Promo2,Promo2SinceWeek,Promo2SinceYear,IsWeekday,IsWeekend,IsBeginningOfMonth,IsMidMonth,IsEndOfMonth,IsHoliday,Promo_duration
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,769,1,5035.0,1.0,0.0,1,1.0,2013,1,1,...,1,48.0,2012.0,1,0,1,0,0,1,360.0
2013-01-01,948,1,4491.0,1.0,0.0,1,1.0,2013,1,1,...,0,0.0,0.0,1,0,1,0,0,1,355.0
2013-01-01,1097,1,5961.0,1.0,0.0,1,1.0,2013,1,1,...,0,0.0,0.0,1,0,1,0,0,1,360.0


In [8]:
# Peek at the first three rows of the processed test frame
test_df.head(3)

Unnamed: 0_level_0,Store,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,StoreType,...,Promo2,Promo2SinceWeek,Promo2SinceYear,IsWeekday,IsWeekend,IsBeginningOfMonth,IsMidMonth,IsEndOfMonth,IsHoliday,Promo_duration
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,3,1.0,1.0,0,0.0,2015,9,17,2,...,0,0.0,0.0,1,0,0,1,0,0,1.0
2,3,3,1.0,1.0,0,0.0,2015,9,17,0,...,1,14.0,2011.0,1,0,0,1,0,0,1.0
3,7,3,1.0,1.0,0,0.0,2015,9,17,0,...,0,0.0,0.0,1,0,0,1,0,0,1.0
