In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging

In [8]:
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("rossmann_sales_forecasting.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger()

In [9]:
# Set global visualization style
sns.set(style="whitegrid", palette="muted")

In [10]:
# Load datasets
try:
    train = pd.read_csv("../data/train.csv")
    test = pd.read_csv("../data/test.csv")
    store = pd.read_csv("../data/store.csv")
    logger.info("Datasets successfully loaded.")
except Exception as e:
    logger.error(f"Error loading datasets: {e}")
    raise

  train = pd.read_csv("../data/train.csv")
2025-01-05 16:32:24,450 - INFO - Datasets successfully loaded.


In [11]:
# Initial inspection
logger.info(f"Train data preview:\n{train.head()}")
logger.info(f"Test data preview:\n{test.head()}")
logger.info(f"Store data preview:\n{store.head()}")

2025-01-05 16:32:49,146 - INFO - Train data preview:
   Store  DayOfWeek        Date  Sales  Customers  Open  Promo StateHoliday  \
0      1          5  2015-07-31   5263        555     1      1            0   
1      2          5  2015-07-31   6064        625     1      1            0   
2      3          5  2015-07-31   8314        821     1      1            0   
3      4          5  2015-07-31  13995       1498     1      1            0   
4      5          5  2015-07-31   4822        559     1      1            0   

   SchoolHoliday  
0              1  
1              1  
2              1  
3              1  
4              1  
2025-01-05 16:32:49,150 - INFO - Test data preview:
   Id  Store  DayOfWeek        Date  Open  Promo StateHoliday  SchoolHoliday
0   1      1          4  2015-09-17   1.0      1            0              0
1   2      3          4  2015-09-17   1.0      1            0              0
2   3      7          4  2015-09-17   1.0      1            0              

In [12]:
# Check for missing values
logger.info(f"Missing values in train dataset:\n{train.isnull().sum()}")
logger.info(f"Missing values in test dataset:\n{test.isnull().sum()}")
logger.info(f"Missing values in store dataset:\n{store.isnull().sum()}")

2025-01-05 16:35:52,257 - INFO - Missing values in train dataset:
Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64
2025-01-05 16:35:52,261 - INFO - Missing values in test dataset:
Id                0
Store             0
DayOfWeek         0
Date              0
Open             11
Promo             0
StateHoliday      0
SchoolHoliday     0
dtype: int64
2025-01-05 16:35:52,262 - INFO - Missing values in store dataset:
Store                          0
StoreType                      0
Assortment                     0
CompetitionDistance            3
CompetitionOpenSinceMonth    354
CompetitionOpenSinceYear     354
Promo2                         0
Promo2SinceWeek              544
Promo2SinceYear              544
PromoInterval                544
dtype: int64


In [13]:
# Check data types
logger.info(f"Data types in train dataset:\n{train.dtypes}")
logger.info(f"Data types in test dataset:\n{test.dtypes}")
logger.info(f"Data types in store dataset:\n{store.dtypes}")

2025-01-05 16:36:36,944 - INFO - Data types in train dataset:
Store             int64
DayOfWeek         int64
Date             object
Sales             int64
Customers         int64
Open              int64
Promo             int64
StateHoliday     object
SchoolHoliday     int64
dtype: object
2025-01-05 16:36:36,946 - INFO - Data types in test dataset:
Id                 int64
Store              int64
DayOfWeek          int64
Date              object
Open             float64
Promo              int64
StateHoliday      object
SchoolHoliday      int64
dtype: object
2025-01-05 16:36:36,948 - INFO - Data types in store dataset:
Store                          int64
StoreType                     object
Assortment                    object
CompetitionDistance          float64
CompetitionOpenSinceMonth    float64
CompetitionOpenSinceYear     float64
Promo2                         int64
Promo2SinceWeek              float64
Promo2SinceYear              float64
PromoInterval                 object
d

In [14]:
# Handle missing values
test['Open'].fillna(test['Open'].mode()[0], inplace=True)
store['CompetitionDistance'].fillna(store['CompetitionDistance'].median(), inplace=True)
store['CompetitionOpenSinceMonth'].fillna(0, inplace=True)
store['CompetitionOpenSinceYear'].fillna(0, inplace=True)
store['Promo2SinceWeek'].fillna(0, inplace=True)
store['Promo2SinceYear'].fillna(0, inplace=True)
store['PromoInterval'].fillna("None", inplace=True)
logger.info("Missing values in store dataset handled.")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['Open'].fillna(test['Open'].mode(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  store['CompetitionDistance'].fillna(store['CompetitionDistance'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the 