# Kaggle getting started store sales

Just for exercise purposes, I did this entirely without looking at any of the discussions or other code in the contest. I'll take a look at those once I have something reasonable here.

In [None]:
import utility as utl 
# My little collection of useful functions, updated frequently.
# https://www.kaggle.com/code/beezus666/utility

import lightgbm as lgb
from lightgbm.callback import early_stopping
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import optuna
import plotly.express as px
from itertools import islice
import re
from sklearn.metrics import mean_squared_log_error
import itertools
import warnings
warnings.filterwarnings('ignore')


# True only inside an interactive Kaggle session. `.get()` is used so the
# notebook also runs where KAGGLE_KERNEL_RUN_TYPE is not set (e.g. locally);
# in that case the flag is simply False instead of raising KeyError.
IS_INTERACTIVE = os.environ.get('KAGGLE_KERNEL_RUN_TYPE') == 'Interactive'

In [None]:
# Read every CSV under /kaggle/input into a notebook-level DataFrame named
# "<file stem>_df" (for example train.csv -> train_df).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        csv_path = os.path.join(dirname, filename)
        print(csv_path)
        stem, ext = os.path.splitext(filename)
        if ext.lower() != '.csv':
            # Still listed above, but only CSV files can be handed to read_csv.
            continue
        # Spaces and dashes are not valid in a Python identifier.
        no_ext = f'{stem}_df'.replace(" ", "_").replace("-", "_")
        globals()[no_ext] = pd.read_csv(csv_path)


# Observations of DFs
Just looking at the tables below, some quick observations:
- Train and test cover the same stores
- Train goes from Jan 2013 - Aug 15 2017
- Test is from Aug 16 2017 - Aug 31 2017
- Transactions are the total transactions per store per day, but only for the training data time period
- Oil prices go all the way to the end of test time period though


In [None]:
# Names of all DataFrames defined at notebook level, sorted alphabetically.
# Plain-Python equivalent of the IPython magic `%who_ls DataFrame`, so the cell
# also works when the notebook is exported and run as a script. Names starting
# with "_" (IPython's output-history variables) are skipped, as the magic does.
dfs = sorted(
    name
    for name, value in list(globals().items())
    if isinstance(value, pd.DataFrame) and not name.startswith('_')
)
for df_name in dfs:
    # Helper output from the `utility` module, labelled with the variable name.
    utl.df_info(globals()[df_name], df_name)
    utl.summary(globals()[df_name])

# Transactions EDA

The cell below aggregates transactions across all stores by day, looking for trends.
- Huge range around Christmas and NYE. Smaller upticks around other holidays, particularly in Early May.
- Weekly trends shown for uptick on Saturday/Sunday and downtick for Thursday/Friday.


## Notes from EDA
- Yearly trends: 
    - Spikes on days leading up to Christmas and New Year's eve. 
    - Drop off on New year's day, almost zero sales on New Year's day.
    - Smaller spikes in May 9th
- Weekly trends:
    - Saturday is biggest sales weekly, sunday 2nd
    - Thurs/Friday lowest weekly sales
- Overwhelming zeros
    - Many zeros, try tweedie distribution?

In [None]:
# Parse the date strings so the .dt accessors used below are available.
transactions_df['date'] = pd.to_datetime(transactions_df['date'])

# One row per calendar day (oldest first): transactions summed over all
# stores, plus the weekday name and a hover label of the form
# "YYYY-MM-DD (Weekday): <count>".
daily_transactions = (
    transactions_df.groupby('date')
    .transactions.sum()
    .reset_index()
    .sort_values(by='date', ascending=True)
    .assign(day_of_week=lambda d: d['date'].dt.day_name())
    .assign(
        hover_text=lambda d: d['date'].dt.strftime('%Y-%m-%d')
        + " (" + d['day_of_week'] + "): "
        + d['transactions'].astype(str)
    )
)

# Line chart of the daily totals; the custom label is shown on hover.
fig = px.line(
    daily_transactions,
    x='date',
    y='transactions',
    title='Transactions Trend Across All Stores by Date',
    hover_name='hover_text',
)
fig.show()


In [None]:
# Independent copy of the training data with sales rounded to whole units,
# so the raw train_df is left untouched by this plot.
copy_train = train_df.assign(sales=train_df['sales'].round())

# Keep only rows with 0 <= sales <= 10000 (both ends inclusive).
copy_train = copy_train[copy_train['sales'].between(0, 10000)]

# Distribution of the rounded sales values.
fig = px.histogram(copy_train, x='sales').update_layout(
    title='Sales aggregated and rounded',
    xaxis_title='Number of Sales',
    yaxis_title='Frequency',
)
fig.show()


# Feature engineering

In [None]:
# Validation cut-off: rows dated on/after this day are held out of training,
# i.e. the final 15 days of the training data (Aug 1 - Aug 15 2017).
train_test_date = '2017-08-01' # we're training, so leave last 15 days for evaluation

# Standard time series features 

Lag and offsets are very standard time series features. The one tricky thing with this problem is that we have to predict 15 days out for the test set.

### Features we're making:
- Calendar date-part features
    - the `add_datepart` function takes the date column and explodes it into separate calendar columns
- target encode DOW
- Target encode holidays


In [None]:
def process_df_pd(df, holidays_df, stores_df, cut_off_date, is_train = True):
    """Build the model feature frame from raw sales rows.

    Steps, in order:
      1. keep an untouched copy of ``id`` / ``sales`` / ``date``;
      2. expand ``date`` into calendar columns via ``utl.add_datepart``;
      3. attach the description of non-transferred national holidays as
         ``national_holiday``;
      4. add ``sales_{7,14,28}_days_ago`` lag columns built only from rows
         dated strictly before ``cut_off_date``;
      5. when ``is_train`` is true, blank ``sales`` on/after ``cut_off_date``;
      6. convert the columns reported by ``utl.get_columns_by_type(df,
         'object')`` to ``category`` dtype.

    Args:
        df: rows with at least ``id``, ``sales``, ``date``, ``store_nbr`` and
            ``family`` columns.
        holidays_df: holiday table with ``date``, ``transferred``, ``locale``
            and ``description`` columns.
        stores_df: currently unused; kept so existing callers keep working.
        cut_off_date: first date treated as "not yet known".
        is_train: if true, ``sales`` from ``cut_off_date`` onward is set to NaN
            in the returned feature frame.

    Returns:
        ``(df, sales_df)``: the feature frame and the untouched
        ``id`` / ``sales`` / ``date`` copy taken before any processing.

    NOTE(review): ``utl.add_datepart`` and the ``date`` assignment below may
    modify the frame passed in by the caller; confirm against the utility
    module if that matters.
    """
    # Untouched copy of the target, taken before sales can be blanked below.
    sales_df = df[['id', 'sales', 'date']].copy()

    # Explode out dates
    df = utl.add_datepart(df, 'date', drop=False)
    utl.df_info(df, 'Exploded dates added')

    # National holidays that were not moved to another day. `.copy()` gives an
    # independent frame, so the date conversion below does not write into a
    # slice of holidays_df (SettingWithCopyWarning).
    no_transferred_national = holidays_df[
        (holidays_df['transferred'] == False) & (holidays_df['locale'] == 'National')
    ].copy()
    no_transferred_national['date'] = pd.to_datetime(no_transferred_national['date'])
    # Keep one description per date (the first). With several holiday rows on
    # the same date the left merge would duplicate every sales row of that day,
    # and the lag merges below would then multiply those duplicates further.
    no_transferred_national = no_transferred_national.drop_duplicates(subset='date')

    df['date'] = pd.to_datetime(df['date'])
    df = pd.merge(df, no_transferred_national[['date', 'description']], on='date', how='left')
    df = df.rename(columns={'description': 'national_holiday'})

    series_keys = ['store_nbr', 'family']

    def shift_sales(df, days):
        """Left-join the sales observed `days` days earlier as a new column."""
        lag_col = f'sales_{days}_days_ago'
        # Only rows before the cut-off may feed a lag, so nothing from the
        # held-out period leaks into the features.
        known = df.loc[df['date'] < cut_off_date, series_keys + ['date', 'sales']].copy()
        known['date'] += pd.Timedelta(days=days)
        known = known.rename(columns={'sales': lag_col})
        return df.merge(known, on=series_keys + ['date'], how='left')

    # lag features
    for days in [7, 14, 28]:
        df = shift_sales(df, days)
        lag_col = f'sales_{days}_days_ago'
        # Carry the last known lag forward for at most `days` rows *within each
        # store/family series*. Filling the bare column instead would copy the
        # value of whichever unrelated series happens to sit on the previous
        # row. Assigning the result (rather than a chained inplace fillna) also
        # works under pandas copy-on-write.
        # Assumes rows are in chronological order within each series, as in the
        # raw CSVs - TODO confirm if the input is ever re-sorted.
        df[lag_col] = df.groupby(series_keys, observed=True)[lag_col].ffill(limit=days)
        print(f'Adding {days} lags.')
        if days == 7: utl.df_info(df, '7 days lags added') #take a look at it while processing

    # Hide the target for the held-out period so later target encodings
    # computed from this frame cannot see it.
    if is_train: df.loc[df['date'] >= cut_off_date, 'sales'] = np.nan

    # convert all object columns to categories
    object_columns = utl.get_columns_by_type(df, 'object')
    for col in object_columns:
        df[col] = df[col].astype('category')
    utl.df_info(sales_df, 'sales and id only')

    return df, sales_df  # Return both the modified df and the new sales_df

In [None]:
%%time
# Convert the cut-off to a Timestamp before it is passed on as cut_off_date.
train_test_date = pd.to_datetime(train_test_date)
# Feature-engineer the training data; is_train defaults to True, so sales on or
# after the cut-off are blanked in feat_eng_df while sales_id_df keeps them.
feat_eng_df, sales_id_df = process_df_pd(train_df, holidays_events_df, stores_df, train_test_date)


In [None]:
def dow_mean_encoding(df, min_date='2016-01-01'):
    """Target-encode day of week: mean sales per store, family and weekday.

    The means are computed only from rows that are
      * dated on or after ``min_date``, and
      * outside the Christmas / New Year window (Dec 12 - Jan 5),
    and are then joined back onto every row of ``df`` as ``dow_mean_sales``.
    Rows whose ``sales`` is NaN do not contribute to the mean.

    Args:
        df: frame with ``date``, ``store_nbr``, ``family``, ``Dayofweek`` and
            ``sales`` columns. It is not modified.
        min_date: earliest date used when computing the means.

    Returns:
        ``(merged_df, aggregated_df)``: a copy of ``df`` (with ``date`` as
        datetime) plus the ``dow_mean_sales`` column, and the lookup table with
        one row per observed (store_nbr, family, Dayofweek).
    """
    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    df = df.assign(date=pd.to_datetime(df['date']))

    dates = df['date']
    in_holiday_window = (
        ((dates.dt.month == 12) & (dates.dt.day >= 12))
        | ((dates.dt.month == 1) & (dates.dt.day <= 5))
    )
    filtered_df = df[~in_holiday_window & (dates >= min_date)]

    keys = ['store_nbr', 'family', 'Dayofweek']
    # observed=True: if a key is categorical, only combinations that actually
    # occur are returned, independent of the pandas version's default.
    aggregated_df = (
        filtered_df.groupby(keys, observed=True)['sales']
        .mean()
        .reset_index()
        .rename(columns={'sales': 'dow_mean_sales'})
    )

    # Left join keeps every input row, including those excluded from the mean.
    merged_df = pd.merge(df, aggregated_df, on=keys, how='left')

    return merged_df, aggregated_df


In [None]:
# Add the day-of-week mean-sales column and keep the lookup table
# (dow_encoded_df) so the same values can be joined onto the test rows later.
feat_eng_df, dow_encoded_df = dow_mean_encoding(feat_eng_df)
utl.summary(feat_eng_df)
utl.df_info(dow_encoded_df, "DOW means")

In [None]:
# Spot-check one series (PRODUCE at store 41) from July 2017 onward.
feat_eng_df[(feat_eng_df['family'] == 'PRODUCE') & (feat_eng_df['store_nbr'] == 41)& (feat_eng_df['date'] >= '2017-07-01')].head(100)


# Holiday target encodes

In [None]:
def holiday_mean_encoding(df):
    """Target-encode national holidays: mean sales per store, family and holiday.

    Args:
        df: frame with ``store_nbr``, ``family``, ``national_holiday`` and
            ``sales`` columns. It is not modified.

    Returns:
        ``(merged_df, aggregated_df)``: ``df`` with an added
        ``holiday_mean_sales`` column, and the lookup table with one row per
        observed (store_nbr, family, national_holiday). Rows without a
        national holiday get NaN, because groupby drops NaN keys by default
        and so no mean exists for them.
    """
    keys = ['store_nbr', 'family', 'national_holiday']

    # observed=True: `national_holiday` is typically a categorical column here.
    # Without it, pandas versions whose default is observed=False return every
    # combination of key levels (padded with NaN means), so the size of the
    # lookup table would depend on the pandas version.
    aggregated_df = (
        df.groupby(keys, observed=True)['sales']
        .mean()
        .reset_index()
        .rename(columns={'sales': 'holiday_mean_sales'})
    )

    # Left join keeps every input row; non-holiday rows simply find no match.
    merged_df = pd.merge(df, aggregated_df, on=keys, how='left')

    return merged_df, aggregated_df


In [None]:
# Add the holiday mean-sales column and keep the lookup table
# (holiday_encoded_df) so the same values can be joined onto the test rows later.
feat_eng_df, holiday_encoded_df = holiday_mean_encoding(feat_eng_df)

utl.df_info(feat_eng_df, "with holiday encode")
utl.df_info(holiday_encoded_df, "holidays average")

In [None]:
# Spot-check: PRODUCE at store 41 since July 2016, only rows that received a
# holiday encoding.
feat_eng_df[(feat_eng_df['family'] == 'PRODUCE') & (feat_eng_df['store_nbr'] == 41)& (feat_eng_df['date'] >= '2016-07-01') & 
            pd.notna(feat_eng_df['holiday_mean_sales'])]


# LGBM model

In [None]:
# Last rows of the feature frame, as a final look before modelling.
feat_eng_df.tail()

In [None]:
# LightGBM training configuration.
params = dict(
    boosting_type='gbdt',
    # Tweedie instead of plain 'regression': the target contains a very large
    # share of zeros (see the EDA notes and the CV log).
    objective='tweedie',
    # RMSE is monitored during training; RMSLE is computed separately afterwards.
    metric='rmse',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    seed=666,
)

In [None]:
# dfs for train
# Training window: 2017-01-15 up to (but excluding) the validation cut-off.
train_rows = (feat_eng_df['date'] >= '2017-01-15') & (feat_eng_df['date'] < train_test_date)
train_window = feat_eng_df[train_rows]
X_train = train_window.drop(columns=['sales', 'date', 'id'])
y_train = train_window['sales']

# Hold-out window: everything from the cut-off onward.
holdout = feat_eng_df[feat_eng_df['date'] >= train_test_date]
min_id = holdout['id'].min()
X_test = holdout.drop(columns=['sales', 'date', 'id'])
# Sales were blanked in feat_eng_df for the hold-out period, so the true target
# is taken from the untouched sales_id_df, selecting ids from the first
# hold-out id onward.
y_test = sales_id_df.loc[sales_id_df['id'] >= min_id, 'sales']

In [None]:
# Overview of the hold-out feature matrix.
utl.df_info(X_test, 'X_test')

In [None]:
# Sanity check: features and target should have the same number of rows.
X_train.shape, y_train.shape

In [None]:
%%time
# Wrap the frames for LightGBM; the hold-out set is declared with the training
# set as its `reference`, as LightGBM expects for validation data.
lgb_train_data = lgb.Dataset(X_train, label=y_train)
lgb_test_data = lgb.Dataset(X_test, label=y_test, reference=lgb_train_data)

# Up to 1000 boosting rounds, stopping once the hold-out metric has not
# improved for 50 consecutive rounds.
num_round = 1000
callbacks = [early_stopping(stopping_rounds=50)]

# should take about 40 seconds to train
bst = lgb.train(
    params,
    lgb_train_data,
    num_boost_round=num_round,
    valid_sets=[lgb_test_data],
    callbacks=callbacks,
)

# Predict with the best iteration found by early stopping, then floor at zero
# (RMSLE is undefined for negative values).
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
y_pred = np.where(y_pred <= 0, 0, y_pred)


def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of the predictions."""
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)


# Score the hold-out predictions and report to four decimals.
error = rmsle(y_test, y_pred)
rounded_error = round(error, 4)
print(f"RMSLE: {rounded_error}")

# CV Training log
1. RMSLE: 1.8687 Base
2. RMSLE: 1.8609 Removed Christmas from DOW mean sales encodes
3. RMSLE: 0.9247 Tweedie distro - lots of 0's...
4. RMSLE: 0.9202 removed id..
5. RMSLE: 0.8519 Removed elapsed
6. RMSLE: 0.9003 ~~Removed 'Is_month_end', 'Is_month_start', 'Is_quarter_end'~~
7. RMSLE: 0.8095 increased early stopping to 50. Besides improving the score... the model was unstable with early stopping = 10, the score changed (improved) when run a 2nd time.
8. **RMSLE: 0.4884!! removed everything prior to 2016 for DOW encoding!!!**
9. RMSLE: 0.5183 Added holiday encoding and the score got worse (~0.78); removing all training data prior to Jan 2017 then improved it to this score

In [None]:
# Lets look at Feature importance of the model
# Feature importance by total gain, most important first.
feature_imp_gain = pd.DataFrame(
    {
        'Feature': bst.feature_name(),
        'Importance (Gain)': bst.feature_importance(importance_type='gain'),
    }
).sort_values(by='Importance (Gain)', ascending=False)

# Feature importance by number of splits, most important first.
feature_imp_split = pd.DataFrame(
    {
        'Feature': bst.feature_name(),
        'Importance (Split)': bst.feature_importance(importance_type='split'),
    }
).sort_values(by='Importance (Split)', ascending=False)

# One horizontal bar chart per importance type.
fig_gain = px.bar(
    feature_imp_gain,
    x='Importance (Gain)',
    y='Feature',
    orientation='h',
    title='LightGBM Feature Importance (Gain)',
    labels={'Importance (Gain)': 'Feature Importance', 'Feature': 'Feature Name'},
    width=800,
    height=600,
)

fig_split = px.bar(
    feature_imp_split,
    x='Importance (Split)',
    y='Feature',
    orientation='h',
    title='LightGBM Feature Importance (Split)',
    labels={'Importance (Split)': 'Feature Importance', 'Feature': 'Feature Name'},
    width=800,
    height=600,
)

fig_gain.show()
fig_split.show()

# Process test_df, predict and submit

- Use the same defs made above for train
    - Except for target encodes, use the resulting df
- Predict
    - Quick and dirty, just use model from CV
    - When ready: 
        - reprocess train_df with whole data, retrain model
        - reprocess train_df

In [None]:
# Peek at the holiday lookup table that is joined onto the test rows below.
holiday_encoded_df.head()

In [None]:
%%time
# Parse dates on both raw frames; the first test date is the cut-off below.
train_df['date'] = pd.to_datetime(train_df['date'])
test_df['date'] = pd.to_datetime(test_df['date'])
min_test_date = test_df['date'].min()

# Stack train on top of test so the 7/14/28-day lags of the test rows can be
# looked up in the training history.
concatenated_df = pd.concat([train_df, test_df], ignore_index=True)

# Same feature pipeline as for training. is_train=False: sales are not blanked
# after the cut-off.
feat_eng_test_df, sales_df_delete_me = process_df_pd(
    concatenated_df,
    holidays_events_df,
    stores_df,
    min_test_date,
    is_train=False,
)

# Keep only the test-period rows (drop the training history again), then join
# the day-of-week and holiday target encodings that were computed on the
# training data.
feat_eng_test_df = (
    feat_eng_test_df[feat_eng_test_df['date'] >= min_test_date]
    .merge(dow_encoded_df, on=['store_nbr', 'family', 'Dayofweek'], how='left')
    .merge(holiday_encoded_df, on=['store_nbr', 'family', 'national_holiday'], how='left')
)

utl.df_info(feat_eng_test_df, 'df for predictions')


In [None]:
# Spot-check one test-period series (PRODUCE at store 9).
feat_eng_test_df[(feat_eng_test_df['family'] == 'PRODUCE') & (feat_eng_test_df['store_nbr'] == 9)].head(100)

In [None]:
# Dfs for test
# Keep the ids for the submission file; the model itself only sees the
# feature columns (no target, raw date or id).
X_test_ids = feat_eng_test_df['id']
X_test_sub = feat_eng_test_df.drop(columns=['sales', 'date', 'id'])

In [None]:
# Overview of the feature matrix used for the submission predictions.
utl.df_info(X_test_sub, "df for predict for submission")

In [None]:
%%time
# Score the submission rows with the booster trained in the CV cell. Nothing is
# trained here, so the `num_round` / early-stopping callback definitions that
# used to sit in this cell were unused and have been removed.
y_pred_sub = bst.predict(X_test_sub, num_iteration=bst.best_iteration)
# Floor predictions at zero.
y_pred_sub[y_pred_sub <= 0] = 0

# Quick look at the first predictions.
y_pred_sub[:20]

In [None]:
# Pair each test-row id with its predicted sales.
data = {'id': X_test_ids, 'sales': y_pred_sub}

# Create a DataFrame
sub_df = pd.DataFrame(data)
# Show the last rows, for comparison with the sample submission.
sub_df.tail()

In [None]:
# Last rows of sample_submission_df, to compare its layout with sub_df.
sample_submission_df.tail()

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
sub_df.to_csv('submission.csv', index=False)

# Leaderboard log
1. RMSLE: 0.51891... pretty encouraging that it's very close to my CV. Not a terrible score, didn't even put holidays in yet.
2. RMSLE: 0.77593 added holiday encodes