In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        file_path = os.path.join(folder, file_name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Process Involved
1. Understand our data better in Exploratory Data Analysis -- do necessary data wrangling
2. Use sales from Oct 2015 as predictions for Nov 2015 (Previous Value Benchmark)
3. Quick baseline: apply some variant of decision tree without any feature engineering, and compare it with the previous value benchmark
4. Set up cross validation to try out different feature engineering ideas
5. Tune decision tree models, try to tune and get several diverse models with similar performance
6. Use Ensemble methods to boost score


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import time

In [3]:
from math import sqrt
from numpy import loadtxt
from itertools import product
from tqdm import tqdm
from sklearn import preprocessing
from xgboost import plot_tree
from matplotlib import pyplot

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
# Switch for the expensive preprocessing cells: the raw-CSV loading cell and the
# grid/aggregation cell only run their bodies when this is True; with False they
# are skipped entirely.
# NOTE(review): presumably set to True to regenerate 'trainset_with_grid.csv'
# from the raw data -- confirm against the cells that consume that file.
kernel_with_output = False

In [6]:
# Data loading
if kernel_with_output:
    # Paths of the raw CSV files, listed in the same order as the names they are
    # unpacked into below.
    raw_csv_paths = (
        'data/sales_train.csv',
        'data/items.csv',
        'data/shops.csv',
        'data/item_categories.csv',
        'data/test.csv',
        'data/sample_submission.csv',
    )
    (sales_train,
     items,
     shops,
     item_categories,
     test,
     sample_submission) = [pd.read_csv(csv_path) for csv_path in raw_csv_paths]

In [None]:
# Insert missing rows and aggregations
#
# Builds the monthly training table: one row per (shop, item, month) combination
# that could have occurred in that month, with the summed (clipped) sales count,
# the mean price, and the item's category id. The result is written to
# 'trainset_with_grid.csv'. The whole cell is skipped unless kernel_with_output
# is True.
if kernel_with_output:
    index_cols = ['shop_id', 'item_id', 'date_block_num']

    # For every month we create a grid from all shops/items combinations from that month
    grid = []
    for block_num in sales_train['date_block_num'].unique():
        # Filter the month's rows once and reuse them for both look-ups
        # (the original evaluated the same boolean mask twice per month).
        cur_sales = sales_train[sales_train['date_block_num'] == block_num]
        cur_shops = cur_sales['shop_id'].unique()
        cur_items = cur_sales['item_id'].unique()
        grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])), dtype='int32'))
    grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

    # Aggregations
    # Clamp daily counts to [0, 20] before summing. This overwrites the column
    # on sales_train itself, so later cells see the clipped values.
    sales_train['item_cnt_day'] = sales_train['item_cnt_day'].clip(0, 20)
    groups = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'])
    trainset = groups.agg({'item_cnt_day': 'sum', 'item_price': 'mean'}).reset_index()
    trainset = trainset.rename(columns={'item_cnt_day': 'item_cnt_month'})
    # The monthly total is clamped to [0, 20] as well.
    trainset['item_cnt_month'] = trainset['item_cnt_month'].clip(0, 20)

    # Left-join onto the grid so every grid row is kept; combinations with no
    # sales rows get a count of 0. item_price is not filled and stays NaN there.
    trainset = pd.merge(grid, trainset, how='left', on=index_cols)
    trainset.item_cnt_month = trainset.item_cnt_month.fillna(0)

    # Get category id (default inner join on item_id)
    trainset = pd.merge(trainset, items[['item_id', 'item_category_id']], on='item_id')
    # NOTE: to_csv writes the row index as an unnamed first column by default;
    # kept as-is so existing readers of this file keep working.
    trainset.to_csv('trainset_with_grid.csv')

    # A bare expression inside an `if` body is never rendered by Jupyter (only
    # the cell's last top-level expression is), so show the preview explicitly.
    # `display` is provided by the IPython kernel.
    display(trainset.head())