### Add scripts to the notebook

In [1]:
import sys
import os

current_dir = os.getcwd()
print(current_dir)

# Get the parent directory
parent_dir = os.path.dirname(current_dir)

scripts_path = os.path.join(parent_dir, 'scripts')

# Insert the path to the parent directory
sys.path.insert(0, parent_dir)

# Insert the path to the Scripts directory
sys.path.insert(0, scripts_path)

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join('..')))

d:\KifiyaAIM-Course\Week - 4\Rossman-Pharma-Sales-Prediction\notebooks


### Import statments

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
sns.set_theme()

### Load the data

path to csv files

In [4]:
# path of the store data
STORE_PATH  = '../data/store.csv'

# path of the pre-compiled train data
TRAIN_PATH = '../data/train.csv'

# path of the pre-compiled test data
TEST_DATA = '../data/test.csv'

In [5]:
# load the store data
store_df = pd.read_csv(STORE_PATH)

# load the pre-compiled train data
train_pre_df = pd.read_csv(TRAIN_PATH)

# load the pre-compiled test data
test_pre_df = pd.read_csv(TEST_DATA)

  train_pre_df = pd.read_csv(TRAIN_PATH)


### Preprocessing

1) Handling missing data

In [6]:
# empty data in the training portion
train_pre_df.isna().mean()

Store            0.0
DayOfWeek        0.0
Date             0.0
Sales            0.0
Customers        0.0
Open             0.0
Promo            0.0
StateHoliday     0.0
SchoolHoliday    0.0
dtype: float64

In [7]:
# empty data in the testing portion
test_pre_df.isna().mean()

Id               0.000000
Store            0.000000
DayOfWeek        0.000000
Date             0.000000
Open             0.000268
Promo            0.000000
StateHoliday     0.000000
SchoolHoliday    0.000000
dtype: float64

In [8]:
# empty data in the store data
store_df.isna().mean()

Store                        0.000000
StoreType                    0.000000
Assortment                   0.000000
CompetitionDistance          0.002691
CompetitionOpenSinceMonth    0.317489
CompetitionOpenSinceYear     0.317489
Promo2                       0.000000
Promo2SinceWeek              0.487892
Promo2SinceYear              0.487892
PromoInterval                0.487892
dtype: float64

While the training and testing data don't have any missing values, there are missing values within the store data.
- Significant missing values:
    - Promo2SinceWeek : 48.7892%
    - Promo2SinceYear : 48.7892%
    - PromoInterval : 48.7892%
    - CompetitionOpenSinceMonth: 31.7489%
    - CompetitionSinceYear: 31.7489%
- Insignificant missing values:
    - CompetitionDistance: 0.2691%

I didn't want to drop the NA rows because they (as in the NA values) give information about the store, so I decided to replace the NA values with an integer or a string for categorical values that still holds the information portraid by the NA values

In [9]:
# Fill significant missing values
store_df['Promo2SinceWeek'] = store_df['Promo2SinceWeek'].fillna(-1)  # use -1 to indicate non-participation
store_df['Promo2SinceYear'] = store_df['Promo2SinceYear'].fillna(-1)  # use -1 to indicate non-participation
store_df['PromoInterval'] = store_df['PromoInterval'].fillna('None') # use none to show there was no interval

In [10]:
# Fill competition-related missing values
store_df['CompetitionDistance'] = store_df['CompetitionDistance'].fillna(float('inf'))  # Infinite distance means no competition
store_df['CompetitionOpenSinceMonth'] = store_df['CompetitionOpenSinceMonth'].fillna(0)       # 0 to indicate no competitors
store_df['CompetitionOpenSinceYear'] = store_df['CompetitionOpenSinceYear'].fillna(0)        # 0 to indicate no competitors

In [11]:
# empty data in the store data
store_df.isna().mean()

Store                        0.0
StoreType                    0.0
Assortment                   0.0
CompetitionDistance          0.0
CompetitionOpenSinceMonth    0.0
CompetitionOpenSinceYear     0.0
Promo2                       0.0
Promo2SinceWeek              0.0
Promo2SinceYear              0.0
PromoInterval                0.0
dtype: float64

2) Breakdown the date values in the training and testing sets into more granural date indicators

In [12]:
# convert the date into date objects
train_pre_df['Date'] = pd.to_datetime(train_pre_df['Date'])

# break the date object into the date, the month and the year
train_pre_df['Day'] = train_pre_df['Date'].dt.day
train_pre_df['Month'] = train_pre_df['Date'].dt.month
train_pre_df['Year'] = train_pre_df['Date'].dt.year
train_pre_df['DayName'] = train_pre_df['Date'].dt.day_name()
train_pre_df['WeekDay'] = train_pre_df['Date'].dt.day_of_week

# determine if it is month beginning, mid-month, and end of the month
train_pre_df['MonthStart'] = train_pre_df['Date'].dt.is_month_start
train_pre_df['MonthEnd'] = train_pre_df['Date'].dt.is_month_end
train_pre_df['MonthMid'] = (train_pre_df['MonthStart'] == False) & (train_pre_df['MonthEnd'] == False)

# determine if the day is a weekday or not
train_pre_df['WeekEnd'] = (train_pre_df['WeekDay'] == 5) | (train_pre_df['WeekDay'] == 6)