# Imports

In [2]:
import pandas as pd

from itertools import product

# Data preparation

In [48]:
# Load the raw CSV tables from ./data. Every table except `stores` is read with
# its `date` column parsed to datetime so the later date-range arithmetic works.
DATE_COLUMNS = ['date']

stores = pd.read_csv('./data/stores.csv')

train = pd.read_csv('./data/train.csv', parse_dates=DATE_COLUMNS)
test = pd.read_csv('./data/test.csv', parse_dates=DATE_COLUMNS)

holidays_events = pd.read_csv('./data/holidays_events.csv', parse_dates=DATE_COLUMNS)
oil = pd.read_csv('./data/oil.csv', parse_dates=DATE_COLUMNS)
transactions = pd.read_csv('./data/transactions.csv', parse_dates=DATE_COLUMNS)

### Check if there are missing dates in the train data

In [49]:
# Daily calendar spanning the first to the last date present in train
first_train_day, last_train_day = train['date'].min(), train['date'].max()
date_range = pd.date_range(start=first_train_day, end=last_train_day, freq='D')

# Distinct dates that actually occur in train
train_dates = train['date'].unique()

# Calendar days with no train rows at all
missing_dates = date_range.difference(pd.DatetimeIndex(train_dates))

# Single print call; sep="\n" reproduces the one-line-per-message output
print(
    f"Total dates in range: {len(date_range)}",
    f"Unique dates in train: {len(train_dates)}",
    f"Missing dates: {len(missing_dates)}",
    f"\nMissing dates:\n{missing_dates}",
    sep="\n",
)


Total dates in range: 1688
Unique dates in train: 1684
Missing dates: 4

Missing dates:
DatetimeIndex(['2013-12-25', '2014-12-25', '2015-12-25', '2016-12-25'], dtype='datetime64[ns]', freq=None)


In [50]:
# Rebuild `train` on a dense (date, store_nbr, family) grid so that calendar days
# with no rows at all become explicit records with zero sales and zero promotions.

# Daily calendar spanning the first to the last date present in train
date_range = pd.date_range(start=train['date'].min(), end=train['date'].max(), freq='D')

# Distinct stores and product families observed in train
stores_list = train['store_nbr'].unique()
unique_product_families = train['family'].unique()

# Cartesian product of date x store x family.
# MultiIndex.from_product builds the index straight from the three level arrays
# (same row order as itertools.product: date outermost, family innermost) instead
# of materialising one Python tuple per row and re-inferring the levels from them.
complete_index = pd.MultiIndex.from_product(
    [date_range, stores_list, unique_product_families],
    names=['date', 'store_nbr', 'family']
)

# Turn the index levels into ordinary columns so they can serve as merge keys
complete_df = pd.DataFrame(index=complete_index).reset_index()

# Left merge keeps every grid row; grid rows absent from train get NaN in all
# non-key columns
train_complete = complete_df.merge(
    train,
    on=['date', 'store_nbr', 'family'],
    how='left'
)

# Only sales and onpromotion are filled with 0; any other column brought in from
# train keeps NaN on the added rows
train_complete['sales'] = train_complete['sales'].fillna(0)
train_complete['onpromotion'] = train_complete['onpromotion'].fillna(0)

# 4 missing dates, 54 stores, 33 product families i.e. 4 * 54 * 33 = 7128
print(f"Added records: {len(train_complete) - len(train)}")

# Rebinds `train`, so re-running this cell on its own reports 0 added records
train = train_complete

Added records: 7128


### Check missing oil dates

In [51]:
# Daily calendar from the first train date through the last test date
calendar_start, calendar_end = train['date'].min(), test['date'].max()
date_range = pd.date_range(start=calendar_start, end=calendar_end, freq='D')

# Distinct dates that have an oil record
oil_dates = oil['date'].unique()

# Calendar days with no oil record
missing_dates = date_range.difference(pd.DatetimeIndex(oil_dates))

# Single print call; sep="\n" reproduces the one-line-per-message output
print(
    f"Total dates in range: {len(date_range)}",
    f"Unique dates in oil: {len(oil_dates)}",
    f"Missing dates: {len(missing_dates)}",
    sep="\n",
)

Total dates in range: 1704
Unique dates in oil: 1218
Missing dates: 486


In [52]:
# Daily calendar from the first train date through the last test date
date_range = pd.date_range(start=train['date'].min(), end=test['date'].max(), freq='D')

# One row per calendar day, with the known oil records attached where they exist
oil_complete = pd.DataFrame({'date': date_range}).merge(oil, on='date', how='left')

# Carry the last known price forward over gaps, then back-fill whatever is still
# NaN at the very start of the calendar
oil_price = oil_complete['dcoilwtico']
oil_complete['dcoilwtico'] = oil_price.ffill().bfill()

# Report sizes before `oil` is rebound to the completed frame
print(f"Number of records after filling missing oil data: {len(oil_complete)}")
print(f"Added records: {len(oil_complete) - len(oil)}")

# Update oil dataframe
oil = oil_complete

Number of records after filling missing oil data: 1704
Added records: 486


### Check transactions

In [53]:
# Fill gaps in `transactions` so there is one row per (date, store_nbr) pair that
# exists in train.

# Total sales per (date, store_nbr). Aggregated once and reused for both the gap
# count and the merge below, instead of grouping the full train frame twice.
store_sales = train.groupby(['date', 'store_nbr'])['sales'].sum().reset_index()

# Total transactions count: one row is expected per (date, store_nbr) pair in train
total_transactions_count = len(store_sales)

print(f"Missing records from transactions: {total_transactions_count - len(transactions)}")

# Merge transactions with sales; the outer join keeps pairs that appear in only
# one of the two frames, leaving NaN in the column coming from the other side
transactions_complete = transactions.merge(
    store_sales,
    on=['date', 'store_nbr'],
    how='outer'
).sort_values(by=['date', 'store_nbr'], ignore_index=True)

# For dates where sales are 0, set transactions to 0
# (this also overwrites any recorded transactions value on such rows)
transactions_complete.loc[transactions_complete["sales"].eq(0), "transactions"] = 0

# Interpolate the remaining missing transactions linearly within each store;
# rows are already sorted by date, so neighbours are adjacent days
transactions_complete['transactions'] = transactions_complete.groupby('store_nbr')['transactions'].transform(
    lambda x: x.interpolate(method='linear')
)

# Round transactions to remove fractions from interpolation.
# NOTE(review): with default settings linear interpolation should leave NaNs that
# precede a store's first known value untouched, and astype(int) raises on NaN.
# The recorded run succeeded, so none remained for this data - confirm if inputs change.
transactions_complete['transactions'] = transactions_complete['transactions'].round().astype(int)

# Drop the helper sales column; it was only needed to find zero-sales days
transactions_complete = transactions_complete.drop('sales', axis=1)

print(f"Added transactions: {len(transactions_complete) - len(transactions)}")

# Rebinds `transactions`, so re-running this cell on its own reports 0 added rows
transactions = transactions_complete

Missing records from transactions: 7664
Added transactions: 7664
