# Imports and config

In [1]:
import os
import pickle
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error

import config as cfg
import data_processing as dp


# Session-wide configuration for the notebook.
# NOTE(review): 'ignore' with no category/module argument filters out every
# warning for the rest of the session — consider narrowing it.
warnings.filterwarnings('ignore')
# Apply the 'fivethirtyeight' matplotlib style to subsequent figures.
plt.style.use('fivethirtyeight')

# Data preparation

## Load data

In [13]:
# Read the four raw CSV tables in one pass. File locations are looked up in
# cfg.FILENAMES; the key order below matches the names on the left-hand side.
sales, items, categories, shops = (
    pd.read_csv(cfg.FILENAMES[table_key])
    for table_key in ("TRAIN_SALES", "ITEMS", "ITEM_CATEGORIES", "SHOPS")
)

## Category features

In [3]:
# Number of rows in the category table, followed by a two-row preview.
print(categories.shape[0])
categories.head(2)

84


Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1


In [4]:
# 1. Pass the raw category names through dp.translate_string_list and store
#    the result next to the originals.
category_names = categories["item_category_name"].tolist()
categories["item_category_name_en"] = dp.translate_string_list(category_names)

# 2. Run the translated names through dp.process_categories; the result is
#    what gets split below.
categories["item_category_name_en_cur"] = dp.process_categories(
    categories["item_category_name_en"]
)

# 3. Split each processed name on " - " into a main and a sub category column.
main_sub_category_colnames = ["main_category_name", "sub_category_name"]
categories[main_sub_category_colnames] = (
    categories["item_category_name_en_cur"].str.split(" - ", expand=True)
)
categories.head(2)

Unnamed: 0,item_category_name,item_category_id,item_category_name_en,item_category_name_en_cur,main_category_name,sub_category_name
0,PC - Гарнитуры/Наушники,0,PC - Headsets / Headphones,Accessories - PC,Accessories,PC
1,Аксессуары - PS2,1,Accessories - PS2,Accessories - PS2,Accessories,PS2


In [5]:
# Keep only the id and the two derived category columns.
# Note: this rebinds `categories` without "item_category_name", so the cell
# that reads that column cannot be re-run afterwards without reloading the CSV.
categories = categories.loc[
    :, ["item_category_id", "main_category_name", "sub_category_name"]
]
categories.head(1)

Unnamed: 0,item_category_id,main_category_name,sub_category_name
0,0,Accessories,PC


## Shop features

In [14]:
# Number of rows in the shop table, followed by a two-row preview.
print(shops.shape[0])
shops.head(2)

60


Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1


In [16]:
# The first space-separated token of each shop name is stored as "city_rus".
shops["city_rus"] = shops["shop_name"].str.split(" ").str[0]

# Drop any "!" characters before handing the tokens to the translator;
# "city_rus" itself keeps the original text.
shop_names = [city.replace("!", "") for city in shops["city_rus"]]
shops["city_name"] = dp.translate_string_list(shop_names)

# Post-process the translated names with dp.process_shops.
shops["city_name"] = dp.process_shops(shops["city_name"])
shops.head(2)

Unnamed: 0,shop_name,shop_id,city_rus,city_name
0,"!Якутск Орджоникидзе, 56 фран",0,!Якутск,Yakutsk
1,"!Якутск ТЦ ""Центральный"" фран",1,!Якутск,Yakutsk


In [17]:
# Reduce the shop table to its id and the derived city name.
# Note: this rebinds `shops` without "shop_name", so the cell that derives
# the city cannot be re-run afterwards without reloading the CSV.
shops = shops.loc[:, ["shop_id", "city_name"]]
shops.head(2)

Unnamed: 0,shop_id,city_name
0,0,Yakutsk
1,1,Yakutsk


## Remove negative values and outliers from the training data

In [19]:
# Row count of the sales table at this point in the notebook.
print(sales.shape[0])

2935849
