# Project name: [Predict Future Sales](https://www.kaggle.com/c/competitive-data-science-predict-future-sales)

## Objective

Get better than 1.05 score on [Public Leaderboard](https://www.kaggle.com/c/competitive-data-science-predict-future-sales/leaderboard)

## Version

Just simple median - score 1.41241

In [1]:
__ver__ = "0.1"

## Setup

In [2]:
import numpy as np
import pandas as pd
import catboost
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline 

import itertools

# Data load

In [3]:
# path
cat_path = "./raw_data/item_categories.csv"
items_path = "./raw_data/items.csv"
shop_path = "./raw_data/shops.csv"
sales_path = "./raw_data/sales_train.csv.gz"
test_path = "./raw_data/test.csv.gz"

In [4]:
# load
cat = pd.read_csv(cat_path)
items = pd.read_csv(items_path)
shops = pd.read_csv(shop_path)
sales_params = dict(parse_dates=[0], infer_datetime_format = True, dayfirst=True)
sales = pd.read_csv(sales_path, **sales_params)
# load and save ID in index
test = pd.read_csv(test_path).set_index('ID')

## Monthly sales

In [5]:
# drop columns - not in test
sales.drop(["date", "item_price"], axis=1, inplace=True)

In [6]:
# get monthly sales
sales = sales.groupby(["date_block_num", "shop_id", "item_id"], as_index=False).sum()

## Stack train and test data

In [7]:
# add date_block_num to test
test_date_block = sales.date_block_num.max() + 1
test["date_block_num"] = test_date_block

In [8]:
# stack train and test data
data = pd.concat([sales, test], axis=0, sort=False)
data.index.name = "ID"
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1823324 entries, 0 to 214199
Data columns (total 4 columns):
date_block_num    int64
shop_id           int64
item_id           int64
item_cnt_day      float64
dtypes: float64(1), int64(3)
memory usage: 69.6 MB


In [9]:
# downcast types
down_cast = dict(
    date_block_num='int8',
    shop_id='category',
    item_id='category',
    item_cnt_day='float16' # must be float - NaNs in test part
)
data = data.astype(down_cast)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1823324 entries, 0 to 214199
Data columns (total 4 columns):
date_block_num    int8
shop_id           category
item_id           category
item_cnt_day      float16
dtypes: category(2), float16(1), int8(1)
memory usage: 25.1 MB


In [10]:
# save
processed_path = f"./processed_data/data_{__ver__}.pickle"
data.to_pickle(processed_path)

## Load processed data

In [11]:
data = pd.read_pickle(processed_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1823324 entries, 0 to 214199
Data columns (total 4 columns):
date_block_num    int8
shop_id           category
item_id           category
item_cnt_day      float16
dtypes: category(2), float16(1), int8(1)
memory usage: 24.5 MB


## Simple baseline solution

In [12]:
# test_date_block
baseline_sales = data.item_cnt_day[data.date_block_num < test_date_block].median()

In [13]:
sub_index = data.index[data.date_block_num == test_date_block]

In [14]:
sub_df = pd.DataFrame(baseline_sales, index=sub_index, columns=["item_cnt_month"])

## Submission

In [15]:
sub_path = f"./submissions/submission_{__ver__}.csv"
sub_df.to_csv(sub_path)