# Setup and data acquisition

In [1]:
# Mount Google Drive at /content/drive so files can be stored under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data Download

In [None]:
! pip install -q kaggle

In [None]:
# Upload kaggle.json (the Kaggle API token) from the local machine.
from google.colab import files

# files.upload() returns the uploaded contents keyed by file name. Leaving it
# as the cell's last expression would echo the API token into the saved
# notebook output, so capture it and show only the uploaded file names.
uploaded = files.upload()
print(sorted(uploaded))

In [None]:
# Hiding kaggle.json file and downloading 
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c competitive-data-science-predict-future-sales

mkdir: cannot create directory ‘/root/.kaggle’: File exists
Downloading shops.csv to /content
  0% 0.00/2.91k [00:00<?, ?B/s]
100% 2.91k/2.91k [00:00<00:00, 2.62MB/s]
Downloading sales_train.csv.zip to /content
 83% 11.0M/13.3M [00:00<00:00, 115MB/s]
100% 13.3M/13.3M [00:00<00:00, 120MB/s]
Downloading sample_submission.csv.zip to /content
  0% 0.00/468k [00:00<?, ?B/s]
100% 468k/468k [00:00<00:00, 150MB/s]
Downloading items.csv.zip to /content
  0% 0.00/368k [00:00<?, ?B/s]
100% 368k/368k [00:00<00:00, 123MB/s]
Downloading item_categories.csv to /content
  0% 0.00/3.49k [00:00<?, ?B/s]
100% 3.49k/3.49k [00:00<00:00, 3.63MB/s]
Downloading test.csv.zip to /content
  0% 0.00/1.02M [00:00<?, ?B/s]
100% 1.02M/1.02M [00:00<00:00, 134MB/s]


In [None]:
import os

my_dir = '/content/'

# Collect the downloaded competition files (plain and zipped CSVs).
# Selecting by extension, instead of slicing off the first two directory
# entries, does not depend on os.listdir() ordering (which is arbitrary) and
# never picks up unrelated entries such as hidden folders or kaggle.json.
filelist = sorted(
    entry for entry in os.listdir(my_dir)
    if entry.endswith(('.csv', '.csv.zip'))
    and os.path.isfile(os.path.join(my_dir, entry))
)
print(filelist)

['sales_train.csv.zip', 'items.csv.zip', 'sample_submission.csv.zip', 'item_categories.csv', 'shops.csv', 'test.csv.zip']


In [None]:
! mkdir '/content/drive/MyDrive/HSE Final Project'
! mkdir '/content/drive/MyDrive/HSE Final Project/Data'

In [None]:
import shutil

new_dir = '/content/drive/MyDrive/HSE Final Project/Data/'

# Relocate every collected file from my_dir into the Drive data folder.
for name in filelist:
    source_path = my_dir + name
    target_path = new_dir + name
    shutil.move(source_path, target_path)

In [None]:
# Extract the three zipped CSVs stored on Drive. No -d target is given, so the
# extracted files are written relative to the current working directory
# rather than next to the archives.
! unzip '/content/drive/MyDrive/HSE Final Project/Data/items.csv.zip'
! unzip '/content/drive/MyDrive/HSE Final Project/Data/sales_train.csv.zip'
! unzip '/content/drive/MyDrive/HSE Final Project/Data/test.csv.zip'

Archive:  /content/drive/MyDrive/HSE Final Project/Data/items.csv.zip
  inflating: items.csv               
Archive:  /content/drive/MyDrive/HSE Final Project/Data/sales_train.csv.zip
  inflating: sales_train.csv         
Archive:  /content/drive/MyDrive/HSE Final Project/Data/test.csv.zip
  inflating: test.csv                


In [None]:
! mkdir '/content/drive/MyDrive/HSE Final Project/Data/Zip'

In [None]:
filenames = ['items.csv.zip', 'sales_train.csv.zip', 'test.csv.zip']

src = '/content/drive/MyDrive/HSE Final Project/Data/'
dest = f'{src}Zip/'

# Move each zip archive out of the data folder into its Zip/ subfolder.
for archive_name in filenames:
    archive_from = src + archive_name
    archive_to = dest + archive_name
    shutil.move(archive_from, archive_to)

In [None]:
# Derive the extracted CSV names by cutting the last four characters ('.zip')
# off each archive name.
file_names = []
for archive_name in filenames:
    file_names.append(archive_name[:-4])
print(file_names)

['items.csv', 'sales_train.csv', 'test.csv']


In [None]:
src = '/content/'
dest = '/content/drive/MyDrive/HSE Final Project/Data/'

# Move the extracted CSVs from /content/ into the Drive data folder.
for csv_name in file_names:
    extracted_path = src + csv_name
    drive_path = dest + csv_name
    shutil.move(extracted_path, drive_path)

# Review criteria

**Clarity**

- Clear step-by-step instructions on how to produce the final submission file are provided

- Code has comments where they are needed and uses meaningful function names

**Feature preprocessing and generation with respect to models**

- Several simple features are generated

- For non-tree-based models preprocessing is used or the absence of it is explained

**Feature extraction from text and images**

- Features from text are extracted

- Special preprocessing for text is utilized (TF-IDF, stemming, Levenshtein distance...)

**EDA**

- Several interesting observations about data are discovered and explained

- Target distribution is visualized, time trend is assessed

**Validation**

- Type of train/test split is identified and used for validation

- Type of public/private split is identified

**Data leakages**

- Data is investigated for data leakages and investigation process is described

- Found data leakages are utilized

**Metrics optimization**

- Correct metric is optimized

**Advanced Features I: mean encodings**

- Mean-encoding is applied

- Mean-encoding is set up correctly, i.e. a KFold or expanding scheme is utilized correctly

**Advanced Features II**

- At least one feature from this topic is introduced

**Hyperparameter tuning**

- Parameters of models are roughly optimal

**Ensembles**

- Ensembling is utilized (linear combination counts)

- Validation with ensembling scheme is set up correctly, i.e. KFold or Holdout is utilized

- Models from different classes are utilized (at least two from the following: KNN, linear models, RF, GBDT, NN)

# Baseline

## Main Data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
dest = '/content/drive/MyDrive/HSE Final Project/Data/'

# Read the training sales records and the test set from the Drive data folder.
train_csv = dest + 'sales_train.csv'
test_csv = dest + 'test.csv'
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [4]:
# Preview the first rows of the training data.
df_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [5]:
# Preview the first rows of the test data.
df_test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


## Additional Data (not needed yet)

In [None]:
# Shop lookup table (shop_name, shop_id).
shops_csv = dest + 'shops.csv'
df_shops = pd.read_csv(shops_csv)
df_shops.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [None]:
# Item-category lookup table (item_category_name, item_category_id).
categories_csv = dest + 'item_categories.csv'
df_categories = pd.read_csv(categories_csv)
df_categories.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


## Analysis for train

In [6]:
# Inspect column dtypes and memory usage of the training data.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   date            object 
 1   date_block_num  int64  
 2   shop_id         int64  
 3   item_id         int64  
 4   item_price      float64
 5   item_cnt_day    float64
dtypes: float64(2), int64(3), object(1)
memory usage: 134.4+ MB


In [7]:
# Parse the day.month.year strings into datetimes, then re-check the dtypes.
parsed_dates = pd.to_datetime(df_train['date'], format='%d.%m.%Y')
df_train['date'] = parsed_dates
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
 #   Column          Dtype         
---  ------          -----         
 0   date            datetime64[ns]
 1   date_block_num  int64         
 2   shop_id         int64         
 3   item_id         int64         
 4   item_price      float64       
 5   item_cnt_day    float64       
dtypes: datetime64[ns](1), float64(2), int64(3)
memory usage: 134.4 MB


In [8]:
# Split the parsed date into separate year / month / day columns.
for part in ('year', 'month', 'day'):
    df_train[part] = getattr(df_train['date'].dt, part)
df_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,year,month,day
0,2013-01-02,0,59,22154,999.0,1.0,2013,1,2
1,2013-01-03,0,25,2552,899.0,1.0,2013,1,3
2,2013-01-05,0,25,2552,899.0,-1.0,2013,1,5
3,2013-01-06,0,25,2554,1709.05,1.0,2013,1,6
4,2013-01-15,0,25,2555,1099.0,1.0,2013,1,15


In [9]:
# Remove the 'date' column. Rebinding the result instead of using inplace=True,
# and ignoring an already-missing column, lets this cell be re-run without
# raising a KeyError.
df_train = df_train.drop(columns=['date'], errors='ignore')
df_train.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,year,month,day
0,0,59,22154,999.0,1.0,2013,1,2
1,0,25,2552,899.0,1.0,2013,1,3
2,0,25,2552,899.0,-1.0,2013,1,5
3,0,25,2554,1709.05,1.0,2013,1,6
4,0,25,2555,1099.0,1.0,2013,1,15


In [10]:
# Summary statistics for every numeric column of the training data.
df_train.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,year,month,day
count,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0
mean,14.56991,33.00173,10197.23,890.8532,1.242641,2013.777,6.247717,15.85267
std,9.422988,16.22697,6324.297,1729.8,2.618834,0.768479,3.536219,8.923483
min,0.0,0.0,0.0,-1.0,-22.0,2013.0,1.0,1.0
25%,7.0,22.0,4476.0,249.0,1.0,2013.0,3.0,8.0
50%,14.0,31.0,9343.0,399.0,1.0,2014.0,6.0,16.0
75%,23.0,47.0,15684.0,999.0,1.0,2014.0,9.0,24.0
max,33.0,59.0,22169.0,307980.0,2169.0,2015.0,12.0,31.0


In [15]:
# Sum the daily counts within each (date_block_num, item_id, shop_id) group.
group_keys = ['date_block_num', 'item_id', 'shop_id']
daily_counts = df_train.groupby(group_keys)['item_cnt_day']
df_train_monthly = daily_counts.sum()
df_train_monthly.head()

date_block_num  item_id  shop_id
0               19       25         1.0
                27       1          1.0
                         2          1.0
                         10         1.0
                         19         1.0
Name: item_cnt_day, dtype: float64

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split