# Table of Contents

* [Little helper](#helper)
* [Load data](#data)
* EDA: Examine DataFrames
    * [First impression .head()](#head)
    * [Analyze names via translation](#translation)
    * [Check for NaNs](#nans)
    * [Feature classification with .info()](#classification)
    * [Create train and test set](#create-ds)
    * [Some facts and dataset statistics](#statistics)
    * [Examine 'date' & 'date_block_num'](#date)
    * [Compare train and test set](#comparison)
* EDA: Visualization
    * [Overview Feature distribution](#feature-distribution)
    * [Compare train and test set distributions](#compare-feat)
    * [Analysing test-set sampling](#sampling)
    * [Analysing item price](#item-price)
    * [Boxplots](#boxplots)
    * [Time series](#time)

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline


import os

from scipy import stats
from sklearn.preprocessing import LabelEncoder
from IPython.display import Image, display

# Print the version of every imported library for reproducibility.
lib_list = [np, pd, plt, sns, os]
# Iterate the list directly: its entries are already distinct, and wrapping it
# in set() made the print order change from one kernel session to the next.
for p in lib_list:
    try:
        print(p.__name__, p.__version__)
    except AttributeError as err:
        # Modules without a __version__ attribute are reported, not fatal.
        print(err)

<a id="helper"></a>
# Little helper

In [None]:
# Directory in which the plotting cells below cache their rendered PNG figures.
OUTPUT = '/kaggle/working'

def comparison_distplot(dfs, col):
    """Plot the distribution of ``col`` for each frame on its own subplot row.

    All histograms share one bin range, spanning the overall minimum and
    maximum of ``col``, so the rows can be compared directly.

    Parameters
    ----------
    dfs : sequence of DataFrame
        Frames to compare, one row per frame (the notebook passes
        ``[X_train, X_test]``).
    col : str
        Column to plot; it must exist in every frame.
    """
    x_min = min(df[col].min() for df in dfs)
    x_max = max(df[col].max() for df in dfs)

    # squeeze=False keeps ``axes`` two-dimensional even for a single frame.
    _, axes = plt.subplots(ncols=1, nrows=len(dfs),
                           figsize=(15, 4 * len(dfs)), squeeze=False)
    for df, ax in zip(dfs, axes.flat):
        try:
            sns.histplot(df[col], binrange=[x_min, x_max], kde=True, ax=ax)
        except Exception as err:
            # Best effort: report why this frame failed and plot the others.
            print("Skip df '{}'! ({})".format(col, err))
    
def stacked_distplot(dfs, col, shrink=1):
    """Draw one stacked histogram of ``col`` for the train and the test frame.

    Parameters
    ----------
    dfs : pair of DataFrame
        ``(train, test)``; exactly two frames are expected.
    col : str
        Column whose values are plotted.
    shrink : float, default 1
        Passed to ``sns.histplot`` to scale the bar width.
    """
    train_df, test_df = dfs

    # Tag every value with its origin so seaborn can colour the bars by
    # dataset. Test rows come first, which determines the hue order.
    data = pd.concat(
        [
            test_df[col].to_frame().assign(dataset='test'),
            train_df[col].to_frame().assign(dataset='train'),
        ],
        ignore_index=True,
    )

    plt.figure(figsize=(15, 5))
    sns.histplot(data=data,
                 x=col,
                 hue='dataset',
                 multiple='stack',
                 shrink=shrink)
    
def feature_distplot(df):
    """Plot a histogram with KDE for every column of ``df`` in a two-column grid.

    Columns that cannot be plotted are reported together with the reason and
    skipped. Nothing is drawn for a frame without columns.
    """
    cols = list(df.columns)
    if not cols:
        # plt.subplots cannot build a grid with zero rows.
        print("No columns to plot!")
        return

    nrows = (len(cols) + 1) // 2  # ceil(len(cols) / 2)
    _, axes = plt.subplots(ncols=2, nrows=nrows, figsize=(15, nrows * 5))
    for col, ax in zip(cols, axes.flat):
        try:
            sns.histplot(df[col], kde=True, ax=ax)
            print("{}-plot done!".format(col))
        except Exception as err:
            print("Skip column '{}'! ({})".format(col, err))

    # Hide the spare axis that is left over for an odd number of columns.
    for ax in axes.flat[len(cols):]:
        ax.set_visible(False)
def statistics(df):
    """Return median, skewness and kurtosis for every numeric column of ``df``.

    Non-numeric columns (names, dates) are ignored explicitly instead of
    raising, so callers do not have to drop them beforehand.

    Returns
    -------
    DataFrame
        Indexed by column name, with the columns 'median', 'skew' and 'kurt'.
    """
    median = df.median(axis=0, numeric_only=True)
    skew = df.skew(axis=0, numeric_only=True)
    kurt = df.kurt(axis=0, numeric_only=True)
    return pd.concat([median, skew, kurt], axis=1, keys=['median', 'skew', 'kurt'])
    
def check_isnull(df):
    """Summarise the missing values of ``df`` per column.

    Returns a frame indexed by column name holding the number of nulls
    ('Total') and the fraction of rows that are null ('Percent'), both sorted
    in descending order.
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    total = null_counts.sort_values(ascending=False)
    # count() on the boolean mask yields the number of rows for each column.
    percent = (null_counts / null_mask.count()).sort_values(ascending=False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

def boxplot(df, category, values, ymin=None, ymax=None):
    """Draw a box plot of ``values`` grouped by ``category``.

    Parameters
    ----------
    df : DataFrame
        Source data containing both columns.
    category : str
        Column used for the x-axis groups.
    values : str
        Column whose distribution is shown per group.
    ymin, ymax : float, optional
        Limits of the y-axis. Each one is applied independently; a missing
        bound keeps its autoscaled value.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the plot was drawn on.
    """
    data = pd.concat([df[category], df[values]], axis=1)
    _, ax = plt.subplots(figsize=(15, 8))
    sns.boxplot(x=category, y=values, data=data, ax=ax)
    if ymin is not None or ymax is not None:
        ax.set_ylim(bottom=ymin, top=ymax)
    return ax
        
def downcast_dtypes(df):
    """Downcast 64-bit numeric columns of ``df`` to 32 bit, in place.

    float64 columns become float32. int64 columns become int32 only when all
    their values fit into the int32 range; otherwise they are left untouched,
    because a blind cast would silently wrap around.

    Returns
    -------
    DataFrame
        The same (mutated) frame, for convenient reassignment.
    """
    int32_limits = np.iinfo(np.int32)
    for name in list(df.columns):
        column = df[name]
        if column.dtype == "float64":
            df[name] = column.astype(np.float32)
        elif column.dtype == "int64":
            fits = column.empty or (column.min() >= int32_limits.min
                                    and column.max() <= int32_limits.max)
            if fits:
                df[name] = column.astype(np.int32)
    return df



<a id="data"></a>
# Load data

In [None]:
# Collect every CSV file below the input directory, keyed by its base name.
df_names, df_paths = [], []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        stem, ext = os.path.splitext(filename)
        # Only CSV files can be handed to pd.read_csv below.
        if ext.lower() != '.csv':
            continue
        df_names.append(stem)
        df_paths.append(os.path.join(dirname, filename))

dfs = {name: pd.read_csv(path) for (name, path) in zip(df_names, df_paths)}

print("Data frames")
display(df_names)
print()
print("Paths")
display(df_paths)

# downcast data to 32-bit dtypes to reduce the memory footprint
for name in df_names:
    dfs[name] = downcast_dtypes(dfs[name])

# EDA: Examine DataFrames

<a id="head"></a>
# First impression with .head()

What's happening?

+ **'items'** Assign item-id and item-category-id to each item-name

+ **'item_categories'** Assign item-category-id to each category-name

+ **'shops'** Assign shop-id to each shop-name

+ **'sales_train'** Train set: transactions for all shops and items with date, price and count. Further, date is aggregated to date-blocks.

+ **'test'** Test set: Assign pair-id to each shop-item pair

In [None]:
dfs['items'].head(3)

In [None]:
dfs['item_categories'].head(3)

In [None]:
dfs['shops'].head(3)

In [None]:
dfs['sales_train'].head(3)

In [None]:
dfs['test'].head(3)

<a id="translation"></a>
# Analyze names via translation

+ First word in 'shop_name' is the name of the city
+ First word in 'item_category_name' refers to a 'super' category

**Ideas: new features**

+ Define new categorical features from the first word in each row
+ Create vectorial representation for item name to measure similarity. Group by threshold

In [None]:
# item_category_name

'''
    0 PC - Headsets/headphones
    1 Accessories - PS2
    2 Accessories - PS3
    3 Accessories - PS4
    4 Accessories - PSP
    5 Accessories - PSVita
    6 Accessories - XBOX 360
    7 Accessories - XBOX ONE
    8 Tickets - Numerical
    9 Product Delivery
    10 Gaming Consoles - PS2
    11 Gaming Consoles - PS3
    12 Gaming Consoles - PS4
    13 Gaming consoles - PSP
    14 Gaming Consoles - PSVita
    15 Gaming Consoles - XBOX 360
    16 Gaming Consoles - XBOX ONE
    17 Gaming Consoles - Other
    18 Gaming - PS2
    19 Games - PS3
    20 Games - PS4
    21 Games - PSP
    22 Games - PSVita
    23 Games - XBOX 360
    24 Games - XBOX ONE
    25 Games - Game Accessories
    26 Android Games - Digital
    27 MAC Games - Digital
    28 PC Games - PC Games Extras
    29 PC Games - Collector's Edition
    30 PC Games - Standard Editions
    31 PC Games - Digital
    32 Payment Cards (Cinema, Music, Games)
    33 Payment Cards - Live!
    34 Payment cards - Live! (Digit)
    35 Payment cards - PSN
    36 Payment cards - Windows (digital)
    37 Cinema - Blu-ray
    38 Cinema - Blu-ray 3D
    39 Cinematography - Blu-ray 4K
    40 Cinema - DVD
    41 Movies - Collector's Edition
    42 Books - Art books, encyclopaedias
    43 Books - Audiobooks
    44 Books - Audiobooks (Digital)
    45 Books - Audiobooks 1C
    46 Books - Business Books
    47 Books - Comics, Manga
    48 Books - Computer Literature
    49 Books - 1C How-to Materials
    50 Books - Greeting Cards
    51 Books - General Interest Literature
    52 Books - Guidebooks
    53 Books - Fiction
    54 Books - Digital
    55 Music - Locally produced CDs
    56 Music - Branded CDs
    57 Music - MP3
    58 Music - Vinyl
    59 Music - Music Video
    60 Music - Gift editions
    61 Gifts - Paraphernalia
    62 Gifts - Gadgets, robots, sport
    63 Gifts - stuffed animals
    64 Gifts - Board games
    65 Gifts - Board games (compact)
    66 Gifts - Postcards, Stickers
    67 Gifts - Development
    68 Gifts - Certificates, Services
    69 Gifts - Souvenirs
    70 Gifts - Souvenirs (mounted)
    71 Gifts - Bags, Notebooks, Mouse pads
    72 Gifts - Figurines
    73 Software - 1C:Enterprise 8
    74 Programmes - MAC
    75 Applications - Home and Office
    76 Programmes - Home and Office (DIGITAL)
    77 Electric Office / Training
    78 Programs - Educational (Dashboard)
    79 Offices
    80 Offices - Tickets
    81 Net media (Spire)
    82 Clean Media (Spire)
    83 Power Supply Elements
'''

In [None]:
# shop_name

'''
    0 !Yakutsk Ordzhonikidze, 56 fran
    1 !Yakutsk TC "Tsentralnyi" fran
    2 Adygeya SC "Mega
    3 Balashikha SC "Oktyabr-Kinomir".
    4 Volzhskiy SC "Volga Mall
    5 Vologda Marmalade Shopping Mall
    6 Voronezh (Plekhanovskaya mall 13)
    7 Voronezh Maximir Shopping mall
    8 Voronezh City-Park "Grad" Shopping Mall
    9 Offsite Shopping Centre
    10 Zhukovsky 39m Chkalova str.
    11 Zhukovsky 39m² Chkalova str.
    12 Online shop CS
    13 Behetle Shopping Mall in Kazan
    14 Kazan "Park House" mall II
    15 Kaluga XXI Century Shopping Mall
    16 Rio Shopping Mall in Kolomna
    17 Vzletka Plaza Shopping Mall in Krasnoyarsk
    18 June Shopping Mall in Krasnoyarsk
    19 Kursk SC "Pushkinsky
    20 Moscow Rasprodazha mall
    21 Moscow MTRC "Afi Mall
    22 Moscow Shop C21
    23 Moscow Budyonovsky Shopping Mall (pavilion A2)
    24 Moscow REC "Budyonovsky" (pavilion K7)
    25 Moscow REC "Atrium"
    26 Moscow TC "AREAL" (Belyaevo)
    27 Moscow SC "MEGA Belaya Dacha II"
    28 Moscow MEGA Teply Stan Shopping Mall II
    29 Moscow Novy Century Shopping Mall (Novokosino)
    30 Moscow Perlovsky Shopping Centre
    31 Moscow RC "Semenovsky"
    32 Moscow Serebryany Dom Shopping Mall
    33 Mytishchi "XL-3" shopping mall
    34 N.Novgorod RIO Shopping mall
    35 N.Novgorod SC "Fantastika
    36 Novosibirsk Galereya Novosibirsk Shopping Mall
    37 Novosibirsk Mega Shopping Mall
    38 Omsk "Mega" Shopping Mall
    39 RostovNaDonu "Mega Centre Horizon" Shopping Mall
    40 RostovNaDonu "MegaCentre Horizon" Shopping Mall Ostrovnoy
    41 RostovNaDonu REC "Mega"
    42 Nevsky Centre Shopping Mall, St. Petersburg
    43 Sennaya Shopping Mall, St. Petersburg
    44 Melodia Shopping Mall in Samara
    45 ParkHouse Shopping Centre in Samara
    46 Sergiev Posad SC "7Ya"
    47 Surgut City Mall Shopping Centre
    48 Tomsk "Emerald City" Shopping Mall
    49 Tyumen City Mall "Crystal
    50 Tyumen Goodwin Shopping Mall
    51 Tyumen SC "Zelyony Bereg"
    52 Ufa TC "Tsentralnyi"
    53 Ufa SC "Family" 2
    54 Khimki SC "Mega"
    55 1C-Online Digital Warehouse
    56 Chekhov REC "Karnaval"
    57 Yakutsk Ordzhonikidze 56
    58 Yakutsk SC "Tsentralnyi"
    59 Altair Shopping Centre, Yaroslavl
'''

## Create new features from names

Create new col with first words and label encode the words.

1. Add cities
2. Add super categories

In [None]:
# Add cities

shops_df = dfs['shops']

# new col: the first word of the shop name is taken as the city
shops_df['city'] = shops_df['shop_name'].str.split(' ').str[0]

# label encoding
le = LabelEncoder()
le.fit(shops_df['city'])
n_cities = len(list(le.classes_))
print("Number of cities {}".format(n_cities))

city_codes = le.transform(shops_df['city'])
shops_df['city_id'] = pd.Series(city_codes)
dfs['shops'].head(3)

In [None]:
# Add super category

item_cats = dfs['item_categories']

# new col: the first word of the category name is taken as the super category
item_cats['super_cat'] = item_cats['item_category_name'].str.split(' ').str[0]

# label encoding (re-fits the encoder object created in the cities cell)
le.fit(item_cats['super_cat'])
n_super_cats = len(list(le.classes_))
print("Number of super categories {}".format(n_super_cats))

super_cat_codes = le.transform(item_cats['super_cat'])
item_cats['super_cat_id'] = pd.Series(super_cat_codes)
dfs['item_categories'].head(3)

<a id="nans"></a>
# Missing values: NaNs

There are no nans!

In [None]:
pd.concat([check_isnull(dfs[name]) for name in df_names])

<a id="classification"></a>
# Feature classification with .info()

Consider only relevant features - i.e. drop name cols.

**Categorical Features**

+ item_id
+ item_category_id
+ super_cat_id
+ shop_id
+ city_id

Via label encoding mapped to integers - no ordinal features!

**Datetime Feature**

+ date
+ date_block_num

**Numerical Features**

+ item_price
+ item_cnt_day

In [None]:
dfs['items'].info()

In [None]:
dfs['item_categories'].info()

In [None]:
dfs['shops'].info()

In [None]:
dfs['sales_train'].info()

In [None]:
dfs['test'].info()

<a id="create-ds"></a>
# Create train and test set

**target**: 'item_cnt_day' for shop-item pairs

1. Change 'date' col to datetime format
2. Some mergings
    + Add item categories and super categories to both sets
    + Add cities to both sets
    + Add date-block column with nans to the test set
    + Add item_price column with nans to the test set

In [None]:
# change 'date' format
dfs['sales_train']['date'] = pd.to_datetime(dfs['sales_train']['date'], format="%d.%m.%Y")

# Reset the index after sorting: the merges below give X_train a fresh
# RangeIndex, so y_train needs the same one to stay index-aligned with it.
sales = (dfs['sales_train']
         .drop_duplicates()
         .sort_values(by="date", ascending=True)
         .reset_index(drop=True))

X_train = sales.drop('item_cnt_day', axis=1)
y_train = sales['item_cnt_day'].to_frame()

# train set manipulations: left merges keep the row order of the transactions
X_train = X_train.merge(dfs['items'][['item_id', 'item_category_id']], how='left', on='item_id')
X_train = X_train.merge(dfs['shops'][['shop_id', 'city_id']], how='left', on='shop_id')
X_train = X_train.merge(dfs['item_categories'][['item_category_id', 'super_cat_id']], how='left', on='item_category_id')

X_train.head(3)

In [None]:
# test set manipulations: attach category, city and super-category ids
lookups = [
    (dfs['items'][['item_id', 'item_category_id']], 'item_id'),
    (dfs['shops'][['shop_id', 'city_id']], 'shop_id'),
    (dfs['item_categories'][['item_category_id', 'super_cat_id']], 'item_category_id'),
]
X_test = dfs['test']
for lookup, key in lookups:
    X_test = X_test.merge(lookup, how='left', on=key)

# placeholder columns so that the test set has the same features as the train set
for placeholder in ('date_block_num', 'item_price'):
    X_test[placeholder] = np.nan

X_test.head(3)

<a id="statistics"></a>
# Some facts and dataset statistics

+ **items**
    + 22169 items
    + 84 item-categories
    + 15 super categories
+ **shops**
    + 60 shops
    + 32 cities
+ **datetime**
    + 34 date-blocks from 0 to 33 in train set
+ **train**
    + 2935849 transactions
+ **test**
    + 214200 transactions

## 'items'

In [None]:
dfs['items'].describe().T

In [None]:
statistics(dfs['items'].drop('item_name',axis=1))

In [None]:
dfs['items'].nunique()

## 'item_categories'

In [None]:
dfs['item_categories'].describe().T

In [None]:
statistics(dfs['item_categories'].drop(['item_category_name','super_cat'],axis=1))

In [None]:
dfs['item_categories'].nunique()

## 'shops'

In [None]:
dfs['shops'].describe().T

In [None]:
statistics(dfs['shops'].drop(['shop_name','city'],axis=1))

In [None]:
dfs['shops'].nunique()

## Training set

### Features

In [None]:
X_train.describe().T

In [None]:
statistics(X_train.drop('date',axis=1))

In [None]:
X_train.nunique()

### Targets

In [None]:
y_train.describe().T

In [None]:
statistics(y_train)

In [None]:
y_train.nunique()

## Test set

In [None]:
X_test.describe().T

In [None]:
statistics(X_test)

In [None]:
X_test.nunique()

<a id="date"></a>
# Examine 'date' & 'date_block_num'

The 'date_block_num' values correspond to months. Add 'date_block_num' value '34' to test_set for the following month.

In [None]:
print('Datetime period \t %s / %s' % (X_train.date.min().date(), X_train.date.max().date()))
print('Days in train \t\t %d' % ((X_train.date.max() - X_train.date.min()).days + 1))
print()
print("Min/Max date_block_num \t {}/{}".format(X_train.date_block_num.min(),X_train.date_block_num.max()))
print("Unique values \t\t {}".format(X_train.date_block_num.nunique()))
print()
# A date block is a calendar month only if, for EVERY row, it equals the number
# of months elapsed since the first month in the data (offset by the first
# block number). The former check used .any() and a modulo that maps December
# to 0, so it could not confirm the claim.
month_index = X_train.date.dt.year * 12 + X_train.date.dt.month
expected_block = month_index - month_index.min() + X_train['date_block_num'].min()
print("Dateblock == month\t{}".format((X_train['date_block_num'] == expected_block).all()))
print()

In [None]:
# Add date block value
X_test['date_block_num']=34

<a id="comparison"></a>
# Compare train set with test set

+ 102796 unseen shop-item pairs - almost half of test-pairs are new
+ 363 unseen items in test set - so their prices are missing
+ 6 duplicates in train set

**Ideas**
+ infer from categories, super categories, cities


In [None]:
pair_cols = ['shop_id', 'item_id']

total_shops = dfs['shops']['shop_id'].nunique()
train_shops = dfs['sales_train']['shop_id'].nunique()
test_shops = dfs['test']['shop_id'].nunique()

total_items = dfs['items']['item_id'].nunique()
train_items = dfs['sales_train']['item_id'].nunique()
test_items = dfs['test']['item_id'].nunique()

duplicates_train = dfs['sales_train'].duplicated().sum()
duplicates_test = X_test[pair_cols].duplicated().sum()

# Count the test items that never occur in the train transactions. Negating the
# mask and summing is unambiguous, unlike value_counts()[0] on a boolean index,
# and also works when every item has been seen.
test_item_ids = dfs['test']['item_id'].drop_duplicates()
num_unseen_items = (~test_item_ids.isin(dfs['sales_train']['item_id'].unique())).sum()

# A left merge with indicator marks test pairs without a match in the train
# set; no temporary tuple column has to be added to X_train / X_test.
train_pairs = X_train[pair_cols].drop_duplicates()
pair_origin = X_test[pair_cols].merge(train_pairs, how='left', on=pair_cols, indicator=True)
num_unseen_pairs = (pair_origin['_merge'] == 'left_only').sum()

print("Shops in train-ds:\t\t{}/{}".format(train_shops,total_shops))
print("Shops in test-ds:\t\t{}/{}".format(test_shops,total_shops))
print()
print("Missing items in train-ds:\t{}/{}".format(total_items-train_items,total_items))
print("Missing items in test-ds:\t{}/{}".format(total_items-test_items,total_items))
print()
print('Duplicates in "sales_train"\t{}'.format(duplicates_train))
print('Duplicates in "test"\t\t{}'.format(duplicates_test))
print()
print("Unseen test-items:\t\t{}".format(num_unseen_items))
print("Unseen shop-item-pairs:\t\t{}".format(num_unseen_pairs))

# EDA: Visualizations

<a id="feature-distribution"></a>
# Overview feature distributions


## Train set
+ Transactions increase before Christmas
+ No information gained from item_price plot
+ Many transactions in a single town
+ Most transactions from few item-categories and super-categories

In [None]:
filename = 'train_feat_dist.png'
cached_png = os.path.join(OUTPUT, filename)

# Render and cache the figure on the first run; show the cached image afterwards.
if not os.path.isfile(cached_png):
    print('Replot...')
    feature_distplot(X_train)
    plt.savefig(cached_png)
else:
    print("Load {}".format(cached_png))
    display(Image(cached_png))

### Test set
+ uniform shop-distribution - check sampling!

In [None]:
X_test['shop_id'].value_counts().describe().T

In [None]:
filename = 'test_feat_dist.png'
cached_png = os.path.join(OUTPUT, filename)

# Render and cache the figure on the first run; show the cached image afterwards.
if not os.path.isfile(cached_png):
    print('Replot...')
    # 'ID' and 'date_block_num' are left out of the test-set overview
    feature_distplot(X_test.drop(['ID', 'date_block_num'], axis=1))
    plt.savefig(cached_png)
else:
    print("Load {}".format(cached_png))
    display(Image(cached_png))

<a id="compare-feat"></a>
## Compare train and test set distributions

In [None]:
filename = 'stacked_shop.png'
cached_png = os.path.join(OUTPUT, filename)

# Render and cache the figure on the first run; show the cached image afterwards.
if not os.path.isfile(cached_png):
    print('Replot...')
    stacked_distplot([X_train, X_test], 'shop_id')
    plt.savefig(cached_png)
else:
    print("Load {}".format(cached_png))
    display(Image(cached_png))

In [None]:
filename = 'comparison_shop.png'
cached_png = os.path.join(OUTPUT, filename)

# Render and cache the figure on the first run; show the cached image afterwards.
if not os.path.isfile(cached_png):
    print('Replot...')
    comparison_distplot([X_train, X_test], 'shop_id')
    plt.savefig(cached_png)
else:
    print("Load {}".format(cached_png))
    display(Image(cached_png))

In [None]:
filename = 'stacked_item_cat.png'
cached_png = os.path.join(OUTPUT, filename)

# Render and cache the figure on the first run; show the cached image afterwards.
if not os.path.isfile(cached_png):
    print('Replot...')
    stacked_distplot([X_train, X_test], 'item_category_id', shrink=2)
    plt.savefig(cached_png)
else:
    print("Load {}".format(cached_png))
    display(Image(cached_png))

In [None]:
filename = 'comparison_item_cat.png'
cached_png = os.path.join(OUTPUT, filename)

# Render and cache the figure on the first run; show the cached image afterwards.
if not os.path.isfile(cached_png):
    print('Replot...')
    comparison_distplot([X_train, X_test], 'item_category_id')
    plt.savefig(cached_png)
else:
    print("Load {}".format(cached_png))
    display(Image(cached_png))

In [None]:
filename = 'stacked_super_cat.png'
cached_png = os.path.join(OUTPUT, filename)

# Render and cache the figure on the first run; show the cached image afterwards.
if not os.path.isfile(cached_png):
    print('Replot...')
    stacked_distplot([X_train, X_test], 'super_cat_id', shrink=2)
    plt.savefig(cached_png)
else:
    print("Load {}".format(cached_png))
    display(Image(cached_png))

In [None]:
filename = 'comparison_super_cat.png'
cached_png = os.path.join(OUTPUT, filename)

# Render and cache the figure on the first run; show the cached image afterwards.
if not os.path.isfile(cached_png):
    print('Replot...')
    comparison_distplot([X_train, X_test], 'super_cat_id')
    plt.savefig(cached_png)
else:
    print("Load {}".format(cached_png))
    display(Image(cached_png))

In [None]:
filename = 'stacked_city.png'
cached_png = os.path.join(OUTPUT, filename)

# Render and cache the figure on the first run; show the cached image afterwards.
if not os.path.isfile(cached_png):
    print('Replot...')
    stacked_distplot([X_train, X_test], 'city_id', shrink=2)
    plt.savefig(cached_png)
else:
    print("Load {}".format(cached_png))
    display(Image(cached_png))

In [None]:
filename = 'comparison_city.png'
cached_png = os.path.join(OUTPUT, filename)

# Render and cache the figure on the first run; show the cached image afterwards.
if not os.path.isfile(cached_png):
    print('Replot...')
    comparison_distplot([X_train, X_test], 'city_id')
    plt.savefig(cached_png)
else:
    print("Load {}".format(cached_png))
    display(Image(cached_png))

In [None]:
filename = 'stacked_item.png'
cached_png = os.path.join(OUTPUT, filename)

# Render and cache the figure on the first run; show the cached image afterwards.
if not os.path.isfile(cached_png):
    print('Replot...')
    stacked_distplot([X_train, X_test], 'item_id')
    plt.savefig(cached_png)
else:
    print("Load {}".format(cached_png))
    display(Image(cached_png))

In [None]:
filename = 'comparison_item.png'
cached_png = os.path.join(OUTPUT, filename)

# Render and cache the figure on the first run; show the cached image afterwards.
if not os.path.isfile(cached_png):
    print('Replot...')
    comparison_distplot([X_train, X_test], 'item_id')
    plt.savefig(cached_png)
else:
    print("Load {}".format(cached_png))
    display(Image(cached_png))

<a id="sampling"></a>
# Analysing test-set sampling

For each shop there are exactly 5100 items.

In [None]:
print(X_test.groupby('shop_id')['item_id'].nunique())

## Analyse item distribution per shop

For each shop, the same subset was selected to sample the test-set.

In [None]:
shops = [2, 3, 4, 5, 6, 7, 10]

# one histogram row per selected shop
fig, axes = plt.subplots(ncols=1, nrows=len(shops), figsize=(15, 7))

for shop_id, ax in zip(shops, axes):
    shop_items = X_test.loc[X_test['shop_id'] == shop_id, 'item_id'].to_frame()
    sns.histplot(shop_items, kde=False, ax=ax, bins=200)

In [None]:
# Check equality of item distribution for each shop
shops = X_test['shop_id'].drop_duplicates().to_list()

# The item counts of shop 2 serve as the reference; compute them once and sort
# by item id so that the comparison does not depend on the row order.
reference = X_test.loc[X_test['shop_id'] == 2, 'item_id'].value_counts().sort_index()

for shop in shops:
    counts = X_test.loc[X_test['shop_id'] == shop, 'item_id'].value_counts().sort_index()
    # equals() reports a differing item set as "not equal", whereas an
    # element-wise == raises on Series with different labels.
    if not counts.equals(reference):
        print('Distribution of shop {} differs from shop 2!'.format(shop))

In [None]:
# Group the shop ids by item directly. The former aliases (X_tr = X_train,
# X_te = X_test) were not copies, so the helper column 'shop_diff' ended up in
# X_train and X_test themselves.
g_tr = X_train.groupby('item_id')['shop_id']
g_te = X_test.groupby('item_id')['shop_id']

fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))
ax0.scatter(g_tr.mean(), g_tr.size(), edgecolor='none', alpha=0.5, s=10)
ax1.scatter(g_te.mean(), g_te.size(), edgecolor='none', alpha=0.5, s=10)
ax0.set_xlabel('Group mean relative shop-id')
ax1.set_xlabel('Group mean relative shop-id')
# ax0 holds the train groups and ax1 the test groups (titles were swapped).
ax0.set_title('Train');
ax1.set_title('Test');
ax0.set_ylabel('Group size');

## Extend train-set

There are no transactions with count zero, but we expect predictions of count zero in the test-set.

* Artificially extend train-set by transactions of count zero to increase similarity between train and test-set for a single month
    - Test-set: shops x items = 42*5100 = 214200
    - Train-set: shops x items = 60*22169 = 1,330,140

Finally we have a dataset of size 34*1,330,140 = 45,224,760 transactions

## Missing pairs in train

Shops x unseen items = 42*363 = 15246

In [None]:
print("Number of pairs with unseen items {}".format(len(X_test)-len(X_test[X_test['item_id'].isin(X_train['item_id'].drop_duplicates())])))

<a id="item-price"></a>
# Analyse item price

## Filled NaNs

There are no NaNs. Probably many missing prices were filled with very small values.

1 rouble = 0.012 euro
+ 1 transaction below 0
+ 2932 + 1226 + 493 transactions < 1.2 cent

In [None]:
plt.title('Item price in euro')
# only prices below 2500 are shown; the factor 0.012 converts them to euro
prices_below_cap = X_train.loc[X_train['item_price'] < 2500, 'item_price']
prices_below_cap.transform(lambda x: 0.012*x).hist(bins=50, figsize=(12, 6));

In [None]:
item_price = dfs['sales_train']['item_price']

print('Item price min/max:\t{}/{}'.format(item_price.min(), item_price.max()))

print('Lowest item prices')
threshold = 1
# frequency of every price below the threshold, rarest first
low_prices = X_train.loc[X_train['item_price'] < threshold, 'item_price']
print(low_prices.value_counts().sort_values())

In [None]:
# distribution of all prices below the threshold set in the previous cell
cheapest = X_train.loc[X_train['item_price'] < threshold, 'item_price']
sns.displot(cheapest, height=5, aspect=1.5);
plt.title('Item price distribution for the lowest prices');

In [None]:
threshold = 0.11
num_prints = 3

# Unique ids (in order of first appearance) of items that were sold below the
# threshold at least once. Without de-duplication the first few entries can
# all refer to the same item.
ids = X_train.loc[X_train['item_price'] < threshold, 'item_id'].unique().tolist()
for item_id in ids[:num_prints]:  # 'item_id' avoids shadowing the builtin id()
    item_rows = X_train[X_train['item_id'] == item_id]
    print(50*'#')
    print(dfs['items'][dfs['items']['item_id'] == item_id]['item_name'])
    print()
    print('Print counts per category with transactions < threshold')
    print(item_rows[item_rows['item_price'] < threshold]['item_category_id'].value_counts())
    print()
    print('Drop values below threshold and calc mean and std')
    # >= keeps prices equal to the threshold, which are not "below" it
    prices = item_rows[item_rows['item_price'] >= threshold]['item_price']
    print("mean\t{:.2f}".format(prices.mean()))
    print("std\t{:.2f}".format(prices.std()))

## Explore transaction-count distribution

For many items, there are only a few transactions! Maybe check the item's category trend instead of assuming a constant price.

For unique item prices, the item-price needs to be the same over all shops. Therefore, unique prices may result from few transactions, while the true value varies.

In [None]:
filename = 'trans_count_dist.png'
if os.path.isfile(os.path.join(OUTPUT,filename)):
    print("Load {}".format(os.path.join(OUTPUT,filename)))
    display(Image(os.path.join(OUTPUT,filename)))
else:
    print('Replot...')
    # Count the transactions of every item that occurs in the train set in one
    # pass. Looping over range(nunique()) is wrong because item ids are not
    # the consecutive integers 0..nunique-1, and it scans the whole frame once
    # per id.
    num_transactions = X_train['item_id'].value_counts().to_numpy()

    print('Transactions per item:')
    print('Mean\t{:.2f}'.format(num_transactions.mean()))
    print('Std\t{:.2f}'.format(num_transactions.std()))

    plt.figure(figsize=(15,6))
    n=50
    # only items with at most n transactions are shown
    plt.hist(num_transactions, bins=n, range=[0,n]);
    plt.savefig(os.path.join(OUTPUT,filename))

## Get ratio of items with constant price

Plot percentage of items with varying price along the recorded period.

In [None]:
# Number of distinct prices per (shop, item); more than one means the price
# varied. Grouping covers every shop that has transactions, including the one
# with the highest id, which range(max) used to skip.
prices_per_item = X_train.groupby(['shop_id', 'item_id'])['item_price'].nunique()
by_shop = (prices_per_item > 1).groupby(level='shop_id')

percentage = pd.DataFrame({
    'total': by_shop.size(),     # items sold in the shop
    'non-const': by_shop.sum(),  # items with more than one price
}).reset_index()
percentage['percentage'] = 100 * percentage['non-const'] / percentage['total']

plt.figure(figsize=(15,6))
plt.ylabel('items with non-constant price [%]')
plt.xlabel('shops')
plt.xticks(rotation=45)
sns.barplot(data=percentage, x='shop_id', y='percentage', color='salmon');

<a id='boxplots'></a>
# Boxplots

In [None]:
# assign() returns a new frame, so the derived columns do not leak into the
# raw dfs['sales_train'] (a plain alias would mutate it).
sales = dfs['sales_train'].assign(
    year=lambda d: d['date'].dt.year,
    month=lambda d: d['date'].dt.month,
    revenue=lambda d: d['item_price'] * d['item_cnt_day'],
)

In [None]:
category, values = 'shop_id', 'item_price'

# item price per shop, y-axis limited to [0, 5000]
fig = boxplot(sales, category, values, ymin=0, ymax=5000)

In [None]:
category, values = 'shop_id', 'revenue'

# revenue per shop, y-axis limited to [0, 5000]
fig = boxplot(sales, category, values, ymin=0, ymax=5000)

In [None]:
category, values = 'item_category_id', 'item_price'

# item price per item category, y-axis limited to [0, 7500]
fig = boxplot(X_train, category, values, ymin=0, ymax=7500)

<a id='time'></a>
# Time series



In [None]:
filename = 'date_count.png'
cached_png = os.path.join(OUTPUT, filename)

# Render and cache the figure on the first run; show the cached image afterwards.
if not os.path.isfile(cached_png):
    print('Replot...')
    plt.xlabel('date')
    plt.ylabel('counts')
    # number of transactions per date
    plot = X_train.date.value_counts().plot()
    plot.get_figure().savefig(cached_png)
else:
    print("Load {}".format(cached_png))
    display(Image(cached_png))

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))
plt.title('Total revenue per month')
# rows: month, one line per year
monthly_revenue = sales.groupby(['month', 'year'])['revenue'].sum().unstack()
monthly_revenue.plot(ax=ax);

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))
plt.title('Mean item price per month')
# rows: month, one line per year
monthly_price = sales.groupby(['month', 'year'])['item_price'].mean().unstack()
monthly_price.plot(ax=ax);

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))
plt.title('Total item counts per month')
# rows: month, one line per year
monthly_counts = sales.groupby(['month', 'year'])['item_cnt_day'].sum().unstack()
monthly_counts.plot(ax=ax);