In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os, sys

In [2]:
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

In [3]:
from scripts.logging import setup_logger
logger = setup_logger()

In [4]:
from scripts.load_data import load_datasets
try:
    train_df, store_df = load_datasets('../Data/train.csv', '../Data/store.csv', logger)
except FileNotFoundError:
    logger.critical("Dataset loading failed due to missing files.")
except Exception as e:
    logger.critical(f"Unexpected error: {e}")

2024-09-24 14:00:13,912 - INFO - Loading datasets.
  train_df = pd.read_csv(train_file)
2024-09-24 14:00:14,342 - INFO - Datasets loaded successfully.


In [5]:
logger.info("Checking for missing values in train.csv dataset.")
missing_values = train_df.isnull().sum()
logger.info(f"Missing values found:\n{missing_values}")

2024-09-24 14:02:15,104 - INFO - Checking for missing values in train.csv dataset.
2024-09-24 14:02:15,184 - INFO - Missing values found:
Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64


In [7]:
# Merge datasets
df = pd.merge(train_df, store_df, on='Store', how='left')

In [8]:
# Handle missing values, e.g., fill missing CompetitionDistance with median
df['CompetitionDistance'].fillna(df['CompetitionDistance'].median(), inplace=True)

# Fill missing Promo2 related columns (like Promo2SinceWeek) with default values
df['Promo2SinceWeek'].fillna(0, inplace=True)
df['Promo2SinceYear'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['CompetitionDistance'].fillna(df['CompetitionDistance'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Promo2SinceWeek'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate ob

In [9]:
df.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,
1,2,5,2015-07-31,6064,625,1,1,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,5,2015-07-31,8314,821,1,1,0,1,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,5,2015-07-31,13995,1498,1,1,0,1,c,c,620.0,9.0,2009.0,0,0.0,0.0,
4,5,5,2015-07-31,4822,559,1,1,0,1,a,a,29910.0,4.0,2015.0,0,0.0,0.0,


In [10]:
print(df.columns)

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'PromoInterval'],
      dtype='object')


In [11]:
# Convert 'StateHoliday' to numerical values (One-hot encoding)
df = pd.get_dummies(df, columns=['StateHoliday', 'Assortment', 'StoreType'], drop_first=True)

# # Alternative: Label encode categorical variables
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# df['StateHoliday'] = le.fit_transform(df['StateHoliday'])

In [13]:
from scripts.create_datetime_features import create_datetime_features
df = create_datetime_features(df)