Import packages and training data:

In [6]:
import seaborn as sns
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import datetime

# Every relative path in this notebook ('data/Store_Sales/...',
# 'Kaggle-Store_Sales/...') is resolved against this working directory.
# Set the STORE_SALES_ROOT environment variable to run the notebook from a
# different checkout; when it is unset the original hard-coded location is used.
PROJECT_ROOT = os.environ.get("STORE_SALES_ROOT", r"c:\Users\JosephVovrosh\personalgit")
os.chdir(PROJECT_ROOT)

def get_missing_coordinates(df):
    """Return a list of (row_index, column_name) pairs locating every NaN in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan; a cell counts as missing when ``pd.isna`` flags it.

    Returns
    -------
    list of tuple
        One ``(row label, column label)`` pair per missing cell, ordered row
        by row. Empty when nothing is missing.
    """
    # np.where reports *positional* offsets; translate them into the frame's
    # own index/column labels so the result matches the documented contract
    # (previously the second element was a column position, not its name).
    row_pos, col_pos = np.where(pd.isna(df))
    return list(zip(df.index[row_pos], df.columns[col_pos]))

In [7]:
# Training rows; 'date' is parsed to datetime64 so it can serve as a merge key.
df_train = pd.read_csv('data/Store_Sales/train.csv')
df_train = df_train.assign(date=pd.to_datetime(df_train['date']))

# Store metadata is used as-is.
stores = pd.read_csv('data/Store_Sales/stores.csv')

# Oil prices, indexed by date so the series can be expanded to a daily calendar.
oil_prices = pd.read_csv('data/Store_Sales/oil.csv')
oil_prices = oil_prices.assign(date=pd.to_datetime(oil_prices['date'])).set_index('date')

# One entry per calendar day between the first and last date in the oil file.
full_index = pd.date_range(
    start=oil_prices.index.min(),
    end=oil_prices.index.max(),
    freq='D',
)

# Reindexing inserts NaN rows for days absent from the file; the unnamed index
# comes back from reset_index() as a column called 'index', hence the rename.
# Linear interpolation in both directions then fills every gap, including
# leading and trailing ones.
oil_prices = (
    oil_prices
    .reindex(full_index)
    .reset_index()
    .rename(columns={'index': 'date'})
    .interpolate('linear', limit_direction='both')
)

transactions = pd.read_csv('data/Store_Sales/transactions.csv')
transactions = transactions.assign(date=pd.to_datetime(transactions['date']))

holidays = pd.read_csv('data/Store_Sales/holidays_events.csv')
holidays = holidays.assign(date=pd.to_datetime(holidays['date']))

df_test = pd.read_csv('data/Store_Sales/test.csv')
df_test = df_test.assign(date=pd.to_datetime(df_test['date']))

In [8]:
# Attach store metadata to every training row.
df_train = df_train.merge(
    stores[['store_nbr', 'cluster', 'type', 'city', 'state']],
    on='store_nbr',
    how='left'
)

# Attach the transaction count for each (date, store) pair; pairs with no
# record in the transactions table are filled with 0.
df_train = df_train.merge(
    transactions[['date', 'store_nbr', 'transactions']],
    on=['date', 'store_nbr'],
    how='left'
)
df_train['transactions'] = df_train['transactions'].fillna(0)

# Attach the oil price for each date.
df_train = df_train.merge(
    oil_prices[['date','dcoilwtico']],
    on='date',
    how='left'
)

# Split the holiday table by locale. 'type' and 'transferred' are renamed per
# locale so the three merges do not collide with each other or with the
# store 'type' column already present in df_train. For local/regional
# holidays 'locale_name' is renamed to the df_train column it is joined on.
holiday_local = holidays[holidays['locale']=='Local'].copy()
holiday_local = holiday_local.rename(columns={'locale_name': 'city', 'type': 'type_local', 'transferred': 'transferred_local'})

holiday_regional = holidays[holidays['locale']=='Regional'].copy()
holiday_regional = holiday_regional.rename(columns={'locale_name': 'state', 'type': 'type_regional', 'transferred': 'transferred_regional'})

holiday_national = holidays[holidays['locale']=='National'].copy()
holiday_national = holiday_national.rename(columns={'type': 'type_national', 'transferred': 'transferred_national'})

# A left merge repeats a training row once per matching right-hand row, so a
# date (or date/city, date/state) listed more than once in the holiday table
# would silently duplicate training rows. The right-hand side is therefore
# reduced to one row per merge key, and validate='many_to_one' enforces that
# invariant. The first listed entry per key is kept.
# NOTE(review): keeping the first entry is an arbitrary choice -- confirm
# whether a priority rule between holiday types is wanted instead.
df_train = df_train.merge(
    holiday_local[['date', 'city', 'type_local', 'transferred_local']]
    .drop_duplicates(subset=['date', 'city'], keep='first'),
    on=['date', 'city'],
    how='left',
    validate='many_to_one'
)

df_train = df_train.merge(
    holiday_regional[['date', 'state', 'type_regional', 'transferred_regional']]
    .drop_duplicates(subset=['date', 'state'], keep='first'),
    on=['date', 'state'],
    how='left',
    validate='many_to_one'
)

df_train = df_train.merge(
    holiday_national[['date', 'type_national', 'transferred_national']]
    .drop_duplicates(subset=['date'], keep='first'),
    on='date',
    how='left',
    validate='many_to_one'
)

# Rows with no matching holiday get the sentinel -1 in every holiday column.
holiday_cols = [
    'type_local', 'transferred_local',
    'type_regional', 'transferred_regional',
    'type_national', 'transferred_national'
]
df_train[holiday_cols] = df_train[holiday_cols].fillna(-1)


In [9]:
# DataFrame.drop returns a new frame; the result must be assigned back,
# otherwise 'id' stays in df_train (and in the exported CSV).
df_train = df_train.drop(columns='id')

# Replace each categorical column with its integer category code.
categorical_cols = [
    'family', 'type', 'city', 'state',
    'type_local', 'transferred_local',
    'type_regional', 'transferred_regional',
    'type_national', 'transferred_national',
]
for col in categorical_cols:
    df_train[col] = df_train[col].astype('category').cat.codes

In [11]:
# Write the merged, integer-encoded training frame to disk without the
# positional index column.
output_path = 'Kaggle-Store_Sales/training_data.csv'
df_train.to_csv(output_path, index=False)