In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import sklearn
from sklearn.model_selection import GridSearchCV, PredefinedSplit
import lightgbm as lgb
import scipy.sparse 
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
import pickle, gc
from tqdm import tqdm_notebook
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import warnings
from itertools import product
warnings.filterwarnings('ignore')
%matplotlib inline 

# Show up to 600 rows and 50 columns when a DataFrame is displayed.
pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', 50)
# Default figure size (width, height) for every seaborn/matplotlib plot below.
sns.set(rc={'figure.figsize':(20, 10)})

In [None]:
# Directory holding the competition CSV files; defined once so the location
# only has to be changed in a single place.
DATA_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

sales = pd.read_csv(f'{DATA_DIR}/sales_train.csv')
shops = pd.read_csv(f'{DATA_DIR}/shops.csv')
items = pd.read_csv(f'{DATA_DIR}/items.csv')
item_cats = pd.read_csv(f'{DATA_DIR}/item_categories.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

## EDA
- the total sales by each month
    - Sales peak in months 11 and 23.
- shop and items combinations per month
    - The number of shop-item combinations with sales is highest in months 0, 2, 11 and 23.
- total sales from each shop
    - Shop 31 has over 300,000 sales, and shop 25 has over 200,000.
- total amount sold to customers per item
    - item id 20949 has the highest sales.
- total items sold in each category
    - Item categories 40 and 30 have the most sales.
    
    

In [None]:
# Total number of items sold in each month (`date_block_num`).
sns.set_context("talk", font_scale=1.4)
# Select `item_cnt_day` *before* aggregating: summing the whole frame would also
# aggregate every other column of `sales`, which is wasteful and fragile if any
# of them is non-numeric.
sales_month = (
    sales.groupby('date_block_num')['item_cnt_day']
    .sum()
    .reset_index(name='sum_items_sold')
)
sns.barplot(x='date_block_num', y='sum_items_sold', data=sales_month);
# Overlay the same totals as a line on top of the bars.
plt.plot(sales_month['sum_items_sold'])
plt.title('Distribution of the sum of sales per month')
del sales_month

In [None]:
# Count the distinct (shop, item) pairs that appear in `sales` for each month.
pairs_per_month = sales[['date_block_num', 'shop_id', 'item_id']].drop_duplicates()
comb_shop_item = (
    pairs_per_month.groupby('date_block_num')
    .size()
    .reset_index(name='item-shop_comb')
)
del pairs_per_month
sns.barplot(data=comb_shop_item, x='date_block_num', y='item-shop_comb');
plt.plot(comb_shop_item['item-shop_comb']);
plt.title('Number of combinations shop-it with sales per month')
del comb_shop_item

In [None]:
# Total number of items sold by each shop over the whole history.
sns.set_context("talk", font_scale=1)
# Aggregate only `item_cnt_day`; summing the full frame first would needlessly
# aggregate every other column of `sales`.
sales_month_shop_id = (
    sales.groupby('shop_id')['item_cnt_day']
    .sum()
    .reset_index(name='sum_sales')
)
sns.barplot(x='shop_id', y='sum_sales', data=sales_month_shop_id, palette='Paired')
plt.title('Distribution of sales per shop');
del sales_month_shop_id

In [None]:
# Total number of items sold per item id over the whole history.
sns.set_context("talk", font_scale=1.4)
# Aggregate only `item_cnt_day`; summing the full frame first would needlessly
# aggregate every other column of `sales`. `to_frame()` keeps `sales_item_id`
# a one-column DataFrame indexed by item_id.
sales_item_id = sales.groupby('item_id')['item_cnt_day'].sum().to_frame()
plt.plot(sales_item_id)
plt.xlabel('item id')
plt.ylabel('sales');

In [None]:
# Total items sold per item category; the category of each sale is looked up in `items`.
sns.set_context("talk", font_scale=0.8)
sales_with_category = sales.merge(items, how='left', on='item_id')
sales_item_cat = sales_with_category.groupby('item_category_id')['item_cnt_day'].sum()
del sales_with_category
sns.barplot(
    data=sales_item_cat.reset_index(),
    x='item_category_id',
    y='item_cnt_day',
    palette='Paired',
);
del sales_item_cat

## Data Leakage

Only about 41% of the rows in the training sales data belong to an item-shop combination that also appears in the test set.

We model in order to take advantage of this data leakage as much as possible.

In [None]:
# (item_id, shop_id) pair of every row in the training sales and in the test set.
tuples_df = pd.Series(list(sales[['item_id', 'shop_id']].itertuples(index=False, name=None)))
tuples_test = pd.Series(list(test[['item_id', 'shop_id']].itertuples(index=False, name=None)))
# Share of training rows whose pair also occurs in the test set (mean of a
# boolean mask == matches / total rows).
share_in_test = tuples_df.isin(tuples_test).mean()
# Scale first, then format: `round(x, 2) * 100` can print binary float noise
# such as 56.99999999999999.
print(f'{share_in_test * 100:.0f}%')

## Get a feature matrix
- generate a dataframe from all shop/item combinations for each month
- features: 
    - target_shop, target_item, target (total sales per month)
    - mean encoding features: item_target_enc
    - lagged features of 1, 2, 3 months: target, target_shop, target_item


In [None]:
# Preview of the monthly target: total `item_cnt_day` per (shop, item, month).
sales.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day'].sum()

In [None]:
def get_feature_matrix(sales, test, items, list_lags, date_block_threshold):
    """Build the modelling table: one row per (shop, item, month) with lagged targets.

    Parameters
    ----------
    sales : pd.DataFrame
        Daily sales; the columns ``date_block_num``, ``shop_id``, ``item_id``
        and ``item_cnt_day`` are read.
    test : pd.DataFrame
        Submission pairs; ``shop_id`` and ``item_id`` are read. Its rows are
        added to the grid as month 34. The frame itself is not modified.
    items : pd.DataFrame
        Item catalogue; ``item_id`` and ``item_category_id`` are read.
    list_lags : list of int
        Month offsets for which lagged copies of the monthly aggregates are
        created (columns named ``<aggregate>_lag_<offset>``).
    date_block_threshold : int
        Rows with ``date_block_num`` below this value are dropped after the
        lags have been computed.

    Returns
    -------
    list
        ``[all_data, to_drop_cols]``: the feature table, and the names of the
        columns to drop before fitting (the current-month aggregates, the item
        mean encoding and ``date_block_num``).

    Notes
    -----
    Prints the head of the intermediate table and the list of lagged columns.
    """
    index_cols = ['shop_id', 'item_id', 'date_block_num']

    # For every month build the cartesian product of the shops and items seen
    # in that month. Items that also occur in the test set are carried over to
    # the following iterations, so once seen they stay in every later grid.
    grid = []
    carried_items = np.array([], dtype=sales['item_id'].dtype)
    for block_num in sales['date_block_num'].unique():
        in_month = sales['date_block_num'] == block_num
        cur_shops = sales.loc[in_month, 'shop_id'].unique()
        # pd.concat replaces Series.append, which no longer exists in pandas 2.
        cur_items = pd.concat(
            [sales.loc[in_month, 'item_id'], pd.Series(carried_items)]
        ).unique()
        carried_items = cur_items[pd.Series(cur_items).isin(test['item_id']).to_numpy()]
        grid.append(np.array(list(product(cur_shops, cur_items, [block_num])), dtype='int32'))

    # Turn the grid into a dataframe
    grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

    # Add the submission shop/item pairs as month 34 so they receive lag
    # features too. `assign` works on a copy, so the caller's `test` frame is
    # left untouched.
    submission_rows = test[['shop_id', 'item_id']].assign(date_block_num=34)
    grid = pd.concat([grid, submission_rows])

    # Monthly totals of item_cnt_day at three granularities, joined to the grid.
    # Combinations without sales get 0.
    all_data = grid
    aggregates = (
        (index_cols, 'target'),                          # shop-item-month
        (['shop_id', 'date_block_num'], 'target_shop'),  # shop-month
        (['item_id', 'date_block_num'], 'target_item'),  # item-month
    )
    for keys, name in aggregates:
        gb = sales.groupby(keys)['item_cnt_day'].sum().reset_index(name=name)
        all_data = pd.merge(all_data, gb, how='left', on=keys).fillna(0)

    # Mean encoding of the target per item. Every item_id in all_data has a
    # group mean, so the 0.3343 fallback is only a safety net. The column ends
    # up in to_drop_cols and is therefore not used as a model feature.
    item_id_target_mean = all_data.groupby('item_id')['target'].mean()
    all_data['item_target_enc'] = all_data['item_id'].map(item_id_target_mean).fillna(0.3343)
    print(all_data.head())

    # Downcast dtypes from 64 to 32 bit to save memory
    all_data = downcast_dtypes(all_data)
    del grid, gb
    gc.collect()

    # Columns that get lagged copies: everything except the keys and the encoding.
    cols_to_rename = list(all_data.columns.difference(index_cols))
    cols_to_rename.remove('item_target_enc')
    print(cols_to_rename)
    shift_range = list_lags

    for month_shift in tqdm_notebook(shift_range):
        train_shift = all_data[index_cols + cols_to_rename].copy()
        # Moving the month forward by `month_shift` makes the merge below attach
        # the values observed `month_shift` months earlier.
        train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift
        train_shift = train_shift.rename(
            columns={col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename}
        )
        all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)
        # Freed inside the loop so an empty `list_lags` does not hit an undefined name.
        del train_shift

    # Drop the early months (their lags are incomplete / data is too old).
    all_data = all_data[all_data['date_block_num'] >= date_block_threshold]

    # Lagged features are recognised by their full `_lag_<n>` suffix, which also
    # works for lags of 10 or more (a last-character check would not).
    lag_suffixes = tuple('_lag_{}'.format(lag) for lag in shift_range)
    fit_cols = [col for col in all_data.columns if col.endswith(lag_suffixes)]

    # Columns to drop at fitting stage, in a deterministic (column) order.
    keep_cols = set(fit_cols) | set(index_cols)
    to_drop_cols = [col for col in all_data.columns if col not in keep_cols] + ['date_block_num']

    # Category for each item (added after to_drop_cols is built, so it is kept as a feature).
    item_category_mapping = items[['item_id', 'item_category_id']].drop_duplicates()
    all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
    all_data = downcast_dtypes(all_data)
    gc.collect()

    return [all_data, to_drop_cols]

def downcast_dtypes(df):
    """Halve the width of 64-bit numeric columns to save memory.

    ``float64`` columns become ``float32`` and ``int64`` columns become
    ``int32``; all other columns are left alone. The frame is modified in
    place and also returned.
    """
    narrower = {"float64": np.float32, "int64": np.int32}
    for wide, narrow in narrower.items():
        # Columns currently stored with the wide dtype.
        cols = [name for name in df if df[name].dtype == wide]
        df[cols] = df[cols].astype(narrow)
    return df

def clip20(x):
    """Clamp ``x`` element-wise to the closed interval [0, 20]."""
    lower, upper = 0, 20
    return np.clip(x, lower, upper)

def clip40(x):
    """Clamp ``x`` element-wise to the closed interval [0, 40].

    In this notebook it is applied to the training labels passed to
    ``lgb.train``, giving them a wider range than the [0, 20] used by
    ``clip20`` for scoring.
    """
    # The upper bound must be 40: with 20 this function is just a copy of clip20.
    return np.clip(x, 0, 40)

def rmse(*args):
    """Return the root mean squared error.

    All positional arguments are forwarded unchanged to
    ``mean_squared_error``; the square root of its result is returned.
    """
    mse = mean_squared_error(*args)
    return np.sqrt(mse)

In [None]:
# Lags (in months) to generate, and the first month kept for modelling.
list_lags = [1, 2, 3]
date_block_threshold = 12
# Restrict the sales history to items that occur in the test set.
sales_for_modelling = sales.loc[sales['item_id'].isin(test['item_id'])]
all_data, to_drop_cols = get_feature_matrix(
    sales_for_modelling, test, items, list_lags, date_block_threshold
)
all_data.head()

In [None]:
# Number of rows (shop-item combinations) in the feature table for each month.
sns.set_context("talk", font_scale=1.4)
rows_per_month = all_data.groupby('date_block_num').size()
ax = rows_per_month.plot()
ax.set_title('Number of different shop-item combinations in data per month');

## Advanced Feature Engineering
Mean / variance encoding of the target for each shop and item category.

In [None]:
# Mean and variance of `target` for every (shop, item category) pair, joined
# back onto the feature table as two new columns.
# NOTE(review): the statistics are computed over every row of `all_data` at this
# point, including the months later used for validation and the month-34
# submission rows — confirm this is intended.
cat_keys = ['shop_id', 'item_category_id']
mean_enc_item_cat = (
    all_data.groupby(cat_keys)['target']
    .agg(['mean', 'var'])
    .rename(columns={'mean': 'mean_enc_cat_id', 'var': 'var_enc_cat_id'})
    .reset_index()
)
all_data = pd.merge(all_data, mean_enc_item_cat, how='left', on=cat_keys)
del mean_enc_item_cat
all_data = downcast_dtypes(all_data)

In [None]:
# Month 34 holds the submission rows; every earlier month is kept for
# training / validation. Remaining NaNs are replaced by 0 in both parts.
# NOTE: `all_data` is overwritten, so re-running this cell alone leaves
# `sub_data` empty.
is_submission_month = all_data['date_block_num'] == 34
is_history = all_data['date_block_num'] < 34
sub_data = all_data[is_submission_month].fillna(0)
all_data = all_data[is_history].fillna(0)
sub_data.head()

In [None]:
# Hold out months 22, 31, 32 and 33 for validation; train on all other months.
dates = all_data['date_block_num']
boolean_test = dates.isin([22, 31, 32, 33])
boolean_train = ~boolean_test
dates_train, dates_val = dates[boolean_train], dates[boolean_test]

train_rows = all_data.loc[boolean_train]
val_rows = all_data.loc[boolean_test]

# Features: everything except the columns listed in `to_drop_cols`.
X_train = train_rows.drop(columns=to_drop_cols)
X_val = val_rows.drop(columns=to_drop_cols)

# Labels: the monthly `target` of the same rows.
y_train = train_rows['target'].values
y_val = val_rows['target'].values
del train_rows, val_rows

In [None]:
# (item_id, shop_id) pairs present in month 33 of the feature table.
pairs_month_33 = all_data.loc[all_data['date_block_num'] == 33, ['item_id', 'shop_id']]
tuples_validation_submission = pd.Series(list(pairs_month_33.itertuples(index=False, name=None)))

In [None]:
# Report the (rows, columns) size of the training and validation feature matrices.
print(f'X_train shape is {X_train.shape}')
print(f'X_val shape is {X_val.shape}')

In [None]:
# (item_id, shop_id) pairs present in month 33 of the feature table.
pairs_month_33 = all_data.loc[all_data['date_block_num'] == 33, ['item_id', 'shop_id']]
tuples_validation_submission = pd.Series(list(pairs_month_33.itertuples(index=False, name=None)))
# Share of test pairs that also occur in month 33 (mean of a boolean mask).
share_in_cv = tuples_test.isin(tuples_validation_submission).mean()
# Scale first, then format: `round(x, 2) * 100` can print binary float noise
# such as 56.99999999999999.
print(f'The {share_in_cv * 100:.0f} % of the item_id-shop_id are in the cv set ')

## Modelling & Hyperparameter tuning
- Light Gradient Boosting
- RandomForestRegressor

In [None]:
# Try each learning rate, train on the training months and keep the model with
# the lowest validation RMSE (labels and predictions clipped to [0, 20] for scoring).
learning_rates = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]
best_rmse = 9999999999999
for lr in learning_rates:
    lgb_params = dict(
        feature_fraction=0.75,
        metric='rmse',
        nthread=1,
        min_data_in_leaf=2**7,
        bagging_fraction=0.75,
        learning_rate=lr,
        objective='mse',
        bagging_seed=2**7,
        num_leaves=2**7,
        bagging_freq=1,
        verbose=0,
    )
    # The number of boosting rounds scales with the learning rate (100 rounds at 0.03).
    num_rounds = int(100 * (lr / 0.03))
    train_set = lgb.Dataset(X_train, label=clip40(y_train))

    lgb_model = lgb.train(lgb_params, train_set, num_rounds)
    pred_lgb_val = lgb_model.predict(X_val)
    score = rmse(clip20(y_val), clip20(pred_lgb_val))

    if score < best_rmse:
        best_rmse, best_lr, best_lgb = score, lr, lgb_model

In [None]:
# Display the learning rate selected by the search loop.
best_lr
# 0.04

In [None]:
# Refit LightGBM on train + validation rows with the selected learning rate.
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.
X = pd.concat([X_train, X_val])
y = np.concatenate([y_train, y_val])

best_lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.04, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':0 
              }
# Train with `best_lgb_params` and a round count derived from *its* learning
# rate — not with the `lgb_params` / `lr` variables left over from the search
# loop, which hold the last learning rate tried rather than the best one.
best_num_rounds = int(100 * (best_lgb_params['learning_rate'] / 0.03))
best_lgb = lgb.train(best_lgb_params, lgb.Dataset(X, label=clip40(y)), best_num_rounds)

In [None]:
# Persist the fitted LightGBM model; the context manager guarantees the file
# is flushed and closed even if pickling fails.
filename = 'best_lgb_2.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(best_lgb, model_file)

In [None]:
# Random Forest fitted on train + validation rows.
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.
X = pd.concat([X_train, X_val])
Y = np.concatenate([y_train, y_val])

# `bootstrap` is a boolean switch: the former value 0.7 was simply truthy (and
# is rejected by scikit-learn releases that validate parameters).
# NOTE(review): if sampling 70% of the rows per tree was intended, that is
# `max_samples=0.7` — confirm before changing.
# `criterion` and `min_impurity_split` are left at their defaults (squared-error
# criterion, no split threshold): the spellings `criterion='mse'` and
# `min_impurity_split=None` are not accepted by newer scikit-learn releases.
regr = RandomForestRegressor(bootstrap=True, max_depth=12,
           max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=300, n_jobs=4,
           verbose=0, warm_start=False)
regr.fit(X, Y)

# train_ind=np.zeros(X.shape[0])
# for i in range(0, len(X_train)):
#     train_ind[i] = -1
# ps = PredefinedSplit(test_fold=(train_ind))

In [None]:
# Persist the fitted random forest; the context manager guarantees the file
# is flushed and closed even if pickling fails.
filename = 'best_rf_2.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(regr, model_file)

In [None]:
# param_grid={'bootstrap':[0.7, 0.8], 'max_features':[4, 6, 8], 
#             'max_depth' : [None, 4, 6, 8, 10, 12]}
# gs = GridSearchCV(cv = None, 
#                   estimator = RandomForestRegressor(n_estimators=300, n_jobs=4), 
#                   param_grid=param_grid, scoring='neg_mean_squared_error')

In [None]:
# gs.fit(X, clip40(Y))
# best_rf = gs.best_estimator_
# best_rf = gs.best_estimator_
# best_rf

## Model Prediction

In [None]:
# Load the previously trained models from the attached dataset.
# NOTE: pickle.load can execute arbitrary code contained in the file — only
# load model files from a source you trust.
# NOTE(review): `lgb` rebinds the name of the `lightgbm` module imported at the
# top, so cells calling `lgb.train` / `lgb.Dataset` fail if re-run after this
# one. The name is kept because the following cells use it.
with open('/kaggle/input/model2/best_rf_2.sav', 'rb') as model_file:
    rf = pickle.load(model_file)
with open('/kaggle/input/model2/best_lgb_2.sav', 'rb') as model_file:
    lgb = pickle.load(model_file)

In [None]:
# RMSE of the LightGBM model on training and validation rows; labels and
# predictions are clipped to [0, 20] before scoring.
pred_lgb_val = lgb.predict(X_val)
pred_lgb_train = lgb.predict(X_train)
train_rmse_lgb = rmse(clip20(y_train), clip20(pred_lgb_train))
val_rmse_lgb = rmse(clip20(y_val), clip20(pred_lgb_val))
print('Train RMSE for lgb is  %f' % train_rmse_lgb)
print('Val RMSE for lgb is %f' % val_rmse_lgb)
# Train RMSE for lgb is  0.836673
# Val RMSE for lgb is 0.803742

In [None]:
# Horizontal bar chart of the 20 largest LightGBM feature importances.
feat_importances = pd.Series(lgb.feature_importance(), index=X_val.columns).nlargest(20)
ax = feat_importances.plot(kind='barh')
ax.set_title('Feature importance LGB')
plt.show()

In [None]:
# RMSE of the random forest on training and validation rows; labels and
# predictions are clipped to [0, 20] before scoring.
pred_rf_val = clip20(rf.predict(X_val.fillna(0)))
pred_rf_train = clip20(rf.predict(X_train))
train_rmse_rf = rmse(clip20(y_train), pred_rf_train)
val_rmse_rf = rmse(clip20(y_val), pred_rf_val)
print('Train RMSE for rf is %f' % train_rmse_rf)
print('Val RMSE for rf is %f' % val_rmse_rf)

# Train RMSE for rf is 0.990761
# Val RMSE for rf is 0.936114

In [None]:
# Horizontal bar chart of the 20 largest random-forest feature importances.
feat_importances = pd.Series(rf.feature_importances_, index=X_val.columns).nlargest(20)
ax = feat_importances.plot(kind='barh')
ax.set_title('Feature importance RF')
plt.show()

In [None]:
# Compare the two base models: clipped RF predictions (x) vs clipped LGB predictions (y).
rf_clipped = clip20(pred_rf_val)
lgb_clipped = clip20(pred_lgb_val)
plt.scatter(rf_clipped, lgb_clipped)

In [None]:
# Level-2 (stacking) features: one column per base model, validation predictions.
# Both columns are clipped to [0, 20] so the stacker is fitted on the same kind
# of inputs it receives at prediction time, where both base predictions are
# clipped. `pred_rf_val` is already clipped, so clipping it again is a no-op.
X_val_level2 = np.c_[clip20(pred_rf_val), clip20(pred_lgb_val)]

## Ensembles
### Stacking with linear regression
- combine the validation prediction from random forest regressor and light gradient boost

In [None]:
# Fit the level-2 linear model on the base models' validation predictions.
# NOTE(review): the score printed below is computed on the same rows the
# regression was fitted on, so it is an in-sample figure.
stack_target = clip20(y_val)
lr = LinearRegression().fit(X_val_level2, stack_target)
pred_lr_val = clip20(lr.predict(X_val_level2))
stack_rmse = rmse(clip20(y_val), pred_lr_val)
print('Test rmse for stacking variables is %f' % stack_rmse)

In [None]:
# Re-read the raw test file for inspection (same path as the `test` frame
# loaded at the top of the notebook) and show its first rows.
test_df = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/test.csv')
test_df.head()

In [None]:
# Load the sample submission file and show its first rows.
sam_sub = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv')
sam_sub.head()

In [None]:
# Preview the submission-month feature table: columns in `to_drop_cols`
# removed and any NaN replaced by 0. The result is only displayed, not stored.
sub_data.drop(to_drop_cols, axis=1).fillna(0)

In [None]:
# Base-model predictions for the submission rows, then the stacked prediction.
X_sub = sub_data.drop(to_drop_cols, axis=1).fillna(0)
pred_test_rf = rf.predict(X_sub)
pred_test_lgb = lgb.predict(X_sub)
# One column per base model, each clipped to [0, 20].
X_test_level2 = np.column_stack([clip20(pred_test_rf), clip20(pred_test_lgb)])
test_pred = clip20(lr.predict(X_test_level2))

In [None]:
# Attach the predictions to the test pairs (assigned by position) and write
# the ID / item_cnt_month submission file.
predictions = test[['shop_id', 'item_id']].copy()
predictions['item_cnt_month'] = test_pred
submision = (
    test[['ID', 'shop_id', 'item_id']]
    .merge(predictions, how='left', on=['shop_id', 'item_id'])
    .fillna(0)
)
submision[['ID', 'item_cnt_month']].to_csv('submission.csv', index=False)