# Environment setup

In [1]:
from google.colab import drive

# Mount Google Drive; later cells store the competition data under this path.
mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data Download

In [2]:
! pip install -q kaggle

In [None]:
# Upload kaggle.json (the Kaggle API credentials file) from the local machine.
from google.colab import files

# files.upload() returns a dict of the uploaded files (name -> contents).
# Bind it to a name so the cell does not display the credential contents
# as its output.
uploaded = files.upload()

In [10]:
# Hiding kaggle.json file and downloading 
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c competitive-data-science-predict-future-sales

mkdir: cannot create directory ‘/root/.kaggle’: File exists
Downloading shops.csv to /content
  0% 0.00/2.91k [00:00<?, ?B/s]
100% 2.91k/2.91k [00:00<00:00, 2.62MB/s]
Downloading sales_train.csv.zip to /content
 83% 11.0M/13.3M [00:00<00:00, 115MB/s]
100% 13.3M/13.3M [00:00<00:00, 120MB/s]
Downloading sample_submission.csv.zip to /content
  0% 0.00/468k [00:00<?, ?B/s]
100% 468k/468k [00:00<00:00, 150MB/s]
Downloading items.csv.zip to /content
  0% 0.00/368k [00:00<?, ?B/s]
100% 368k/368k [00:00<00:00, 123MB/s]
Downloading item_categories.csv to /content
  0% 0.00/3.49k [00:00<?, ?B/s]
100% 3.49k/3.49k [00:00<00:00, 3.63MB/s]
Downloading test.csv.zip to /content
  0% 0.00/1.02M [00:00<?, ?B/s]
100% 1.02M/1.02M [00:00<00:00, 134MB/s]


In [11]:
import os

my_dir = '/content/'

# Pick the downloaded competition files explicitly by extension.
# os.listdir() returns entries in arbitrary order, so skipping "the first two"
# entries by position is not a reliable way to exclude unrelated files.
data_suffixes = ('.csv', '.csv.zip')
filelist = sorted(f for f in os.listdir(my_dir) if f.endswith(data_suffixes))
print(filelist)

['sales_train.csv.zip', 'items.csv.zip', 'sample_submission.csv.zip', 'item_categories.csv', 'shops.csv', 'test.csv.zip']


In [8]:
! mkdir '/content/drive/MyDrive/HSE Final Project'
! mkdir '/content/drive/MyDrive/HSE Final Project/Data'

In [12]:
import shutil

# Destination folder on the mounted Drive (trailing slash is required because
# paths are built by plain string concatenation).
new_dir = '/content/drive/MyDrive/HSE Final Project/Data/'

for file_name in filelist:
    source_path = my_dir + file_name
    target_path = new_dir + file_name
    shutil.move(source_path, target_path)

In [13]:
! unzip '/content/drive/MyDrive/HSE Final Project/Data/items.csv.zip'
! unzip '/content/drive/MyDrive/HSE Final Project/Data/sales_train.csv.zip'
! unzip '/content/drive/MyDrive/HSE Final Project/Data/test.csv.zip'

Archive:  /content/drive/MyDrive/HSE Final Project/Data/items.csv.zip
  inflating: items.csv               
Archive:  /content/drive/MyDrive/HSE Final Project/Data/sales_train.csv.zip
  inflating: sales_train.csv         
Archive:  /content/drive/MyDrive/HSE Final Project/Data/test.csv.zip
  inflating: test.csv                


In [14]:
! mkdir '/content/drive/MyDrive/HSE Final Project/Data/Zip'

In [15]:
# Archives to move out of the data folder into its Zip/ subfolder.
filenames = ['items.csv.zip', 'sales_train.csv.zip', 'test.csv.zip']

src = '/content/drive/MyDrive/HSE Final Project/Data/'
dest = f'{src}Zip/'

for file_name in filenames:
    archive_from = src + file_name
    archive_to = dest + file_name
    shutil.move(archive_from, archive_to)

In [16]:
# Derive the CSV names by removing the '.zip' suffix from each archive name.
# The suffix is checked explicitly so that a name without it is left unchanged
# instead of silently losing its last four characters.
zip_suffix = '.zip'
file_names = [
    file_name[:-len(zip_suffix)] if file_name.endswith(zip_suffix) else file_name
    for file_name in filenames
]
print(file_names)

['items.csv', 'sales_train.csv', 'test.csv']


In [17]:
# Move the CSV files listed in file_names from /content/ into the Drive data folder.
src = '/content/'
dest = '/content/drive/MyDrive/HSE Final Project/Data/'

for file_name in file_names:
    shutil.move(f'{src}{file_name}', f'{dest}{file_name}')

# Review criteria

**Clarity**

- Clear step-by-step instructions on how to produce the final submission file are provided

- Code has comments where they are needed and meaningful function names

**Feature preprocessing and generation with respect to models**

- Several simple features are generated

- For non-tree-based models preprocessing is used or the absence of it is explained

**Feature extraction from text and images**

- Features from text are extracted

- Special preprocessing for text is utilized (TF-IDF, stemming, Levenshtein distance, ...)

**EDA**

- Several interesting observations about data are discovered and explained

- Target distribution is visualized, time trend is assessed

**Validation**

- Type of train/test split is identified and used for validation

- Type of public/private split is identified

**Data leakages**

- Data is investigated for data leakages and investigation process is described

- Found data leakages are utilized

**Metrics optimization**

- Correct metric is optimized

**Advanced Features I: mean encodings**

- Mean-encoding is applied

- Mean-encoding is set up correctly, i.e. a KFold or expanding scheme is utilized correctly

**Advanced Features II**

- At least one feature from this topic is introduced

**Hyperparameter tuning**

- Parameters of models are roughly optimal

**Ensembles**

- Ensembling is utilized (linear combination counts)

- Validation with ensembling scheme is set up correctly, i.e. KFold or Holdout is utilized

- Models from different classes are utilized (at least two from the following: KNN, linear models, RF, GBDT, NN)

# Baseline

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [19]:
# Folder on Drive holding the competition CSV files.
dest = '/content/drive/MyDrive/HSE Final Project/Data/'

# Load the training sales records and the test set.
df_train, df_test = (
    pd.read_csv(dest + csv_name) for csv_name in ('sales_train.csv', 'test.csv')
)

In [20]:
# Preview the first five training rows.
df_train.head(n=5)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [23]:
# Preview the first five test rows.
df_test.head(n=5)

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [21]:
# Load the shop lookup table and preview its first rows.
shops_path = dest + 'shops.csv'
df_shops = pd.read_csv(shops_path)
df_shops.head(n=5)

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [22]:
# Load the item-category lookup table and preview its first rows.
categories_path = dest + 'item_categories.csv'
df_categories = pd.read_csv(categories_path)
df_categories.head(n=5)

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4
