In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.preprocessing import LabelEncoder

import warnings
# Silence every warning category for the rest of the session (presumably to
# keep cell output clean).
# NOTE(review): this also hides deprecation and chained-assignment warnings
# raised by the cells below; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Baseline modeling pt.1

First, let's look at our data.

In [3]:
# Directory holding the prepared sales CSVs. Change this one constant to point
# the notebook at another location instead of editing every path.
DATA_DIR = 'C:/datasets'

train = pd.read_csv(f'{DATA_DIR}/train_sales.csv')
test = pd.read_csv(f'{DATA_DIR}/test_sales.csv')

In [4]:
train.head()

Unnamed: 0.1,Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,shop_name,item_name,item_category_id,item_category_name,year,month,day,year-month,location,shop_type,item_category,subcat
0,0,2013-01-02,0,59,22154,999.0,1.0,"Ярославль ТЦ ""Альтаир""",ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray,2013,1,2,2013-01,Ярославль,ТЦ,Кино,Blu-Ray
1,1,2013-01-03,0,25,2552,899.0,1.0,"Москва ТРК ""Атриум""",DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил,2013,1,3,2013-01,Москва,ТРК,Музыка,Винил
2,3,2013-01-06,0,25,2554,1709.05,1.0,"Москва ТРК ""Атриум""",DEEP PURPLE Who Do You Think We Are LP,58,Музыка - Винил,2013,1,6,2013-01,Москва,ТРК,Музыка,Винил
3,4,2013-01-15,0,25,2555,1099.0,1.0,"Москва ТРК ""Атриум""",DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,Музыка - CD фирменного производства,2013,1,15,2013-01,Москва,ТРК,Музыка,CD фирменного производства
4,5,2013-01-10,0,25,2564,349.0,1.0,"Москва ТРК ""Атриум""",DEEP PURPLE Perihelion: Live In Concert DVD (К...,59,Музыка - Музыкальное видео,2013,1,10,2013-01,Москва,ТРК,Музыка,Музыкальное видео


We need to aggregate the daily data by month, shop, and item.

In [5]:
# Collapse the daily rows into one row per combination of the grouping keys:
# total units sold in the month and the average price over the grouped rows.
# The aggregated columns are named directly in `agg` instead of overwriting
# `columns` positionally afterwards.
train_ds = (
    train
    .groupby([
        'date_block_num', 'location', 'shop_type', 'shop_id',
        'item_category_id', 'item_category', 'subcat', 'item_id',
    ])
    .agg(
        item_cnt_monthly=('item_cnt_day', 'sum'),
        mean_item_price=('item_price', 'mean'),
    )
    .reset_index()
)

In [6]:
train_ds

Unnamed: 0,date_block_num,location,shop_type,shop_id,item_category_id,item_category,subcat,item_id,item_cnt_monthly,mean_item_price
0,0,Адыгея,ТЦ,2,2,Аксессуары,PS3,5572,9.0,1532.857143
1,0,Адыгея,ТЦ,2,2,Аксессуары,PS3,5573,2.0,924.000000
2,0,Адыгея,ТЦ,2,2,Аксессуары,PS3,5575,4.0,955.005000
3,0,Адыгея,ТЦ,2,2,Аксессуары,PS3,5576,3.0,2490.000000
4,0,Адыгея,ТЦ,2,2,Аксессуары,PS3,5632,1.0,2390.000000
...,...,...,...,...,...,...,...,...,...,...
1608219,33,Ярославль,ТЦ,59,75,Программы,Для дома и офиса,5383,1.0,4390.000000
1608220,33,Ярославль,ТЦ,59,79,Служебные,Служебные,17717,13.0,802.550000
1608221,33,Ярославль,ТЦ,59,83,Элементы питания,Элементы питания,22087,6.0,119.000000
1608222,33,Ярославль,ТЦ,59,83,Элементы питания,Элементы питания,22088,2.0,119.000000


### Feature generation

**This part will be moved into an independent module later.**

We have a lot of categorical information. Since label encoding assigns arbitrary integer codes that impose a meaningless order on the categories, we can also try mean encoding.

In [7]:
def label_encode_data(data):
    """Replace the four text descriptor columns with integer label codes.

    ``location``, ``shop_type``, ``item_category`` and ``subcat`` are each
    encoded with their own freshly fitted ``LabelEncoder`` and stored as
    ``loc_cd``, ``shop_type_cd``, ``item_cat_cd`` and ``subcat_cd``; the source
    columns are then dropped.

    ``data`` is modified in place and the same object is returned.
    """
    shop_columns = {"location": "loc_cd", "shop_type": "shop_type_cd"}
    item_columns = {"item_category": "item_cat_cd", "subcat": "subcat_cd"}

    # Shop descriptors are handled before item descriptors; within each group
    # the encoded columns are added first and the text columns removed after.
    for group in (shop_columns, item_columns):
        for source, encoded in group.items():
            data[encoded] = LabelEncoder().fit_transform(data[source])
        for source in group:
            data.drop([source], axis=1, inplace=True)

    return data

def one_hot_encode_data(data):
    """Return a new frame with the text descriptor columns one-hot encoded.

    ``location``, ``shop_type``, ``item_category`` and ``subcat`` are expanded
    into indicator columns by ``pandas.get_dummies``; every other column is
    passed through unchanged.
    """
    categorical_columns = ['location', 'shop_type', 'item_category', 'subcat']
    return pd.get_dummies(data, columns=categorical_columns)

def mean_encode_data(data, target="item_cnt_day"):
    """Mean-encode the four text descriptor columns.

    ``location``, ``shop_type``, ``item_category`` and ``subcat`` are each
    replaced by the mean of ``target`` over all rows of ``data`` sharing that
    category value. The results are stored as ``loc_cd``, ``shop_type_cd``,
    ``item_cat_cd`` and ``subcat_cd`` and the text columns are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the four descriptor columns and ``target``. It is
        modified in place.
    target : str, default "item_cnt_day"
        Column whose per-category mean becomes the encoding. Pass
        ``"item_cnt_monthly"`` for the monthly aggregated frame, which no
        longer has an ``item_cnt_day`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after modification.

    Raises
    ------
    KeyError
        If ``target`` or any descriptor column is missing. The check runs
        before anything is changed, so a failure never leaves ``data``
        half-encoded.
    """
    encodings = {
        "location": "loc_cd",
        "shop_type": "shop_type_cd",
        "item_category": "item_cat_cd",
        "subcat": "subcat_cd",
    }

    missing = [col for col in [*encodings, target] if col not in data.columns]
    if missing:
        raise KeyError(f"mean_encode_data: missing column(s) {missing}")

    for source, encoded in encodings.items():
        category_means = data.groupby(source)[target].mean()
        data[encoded] = data[source].map(category_means)

    # Drop the text columns only after all encodings exist; the final column
    # order is the remaining original columns followed by the four *_cd ones.
    data.drop(columns=list(encodings), inplace=True)

    return data
    

def novelty_feature(data):
    """Attach to every row the first month in which its item appears.

    Adds ``first_sales_date_block``: the smallest ``date_block_num`` found in
    ``data`` for the row's ``item_id``. The result is a new frame produced by
    a left merge; ``data`` itself is not modified.
    """
    first_seen = (
        data.groupby("item_id")["date_block_num"]
        .min()
        .rename("first_sales_date_block")
        .reset_index()
    )
    return data.merge(first_seen, on="item_id", how="left")

def lag_features(df, lags, col_list):
    """Append lagged copies of the given columns, matched per shop and item.

    For every column in ``col_list`` and every lag ``k`` in ``lags`` a column
    named ``<column>_lag_<k>`` is added. It holds the value that the same
    (``shop_id``, ``item_id``) pair had ``k`` ``date_block_num`` units earlier,
    or NaN when no such earlier row exists. The input frame is not modified.
    """
    keys = ["date_block_num", "shop_id", "item_id"]

    for col_name in col_list:
        # Snapshot of keys + value, taken once per source column.
        history = df[keys + [col_name]]
        for lag in lags:
            lagged = history.copy()
            lagged.columns = keys + [col_name + "_lag_" + str(lag)]
            # Moving each record `lag` blocks forward lines it up with the
            # row that should see it as "`lag` blocks ago".
            lagged["date_block_num"] = lagged["date_block_num"] + lag
            df = df.merge(lagged, on=keys, how="left")

    return df

def last_halfyear_feathure(train_ds):
    """Add ``last_6month_cnt``: the row-wise mean of the six latest sales lags.

    Averages ``item_cnt_monthly_lag_1`` through ``item_cnt_monthly_lag_6``,
    ignoring NaNs. The column is written onto ``train_ds`` itself, which is
    also returned.
    """
    lag_columns = ["item_cnt_monthly_lag_" + str(k) for k in range(1, 7)]
    recent_lags = train_ds[lag_columns]
    train_ds["last_6month_cnt"] = recent_lags.mean(axis=1, skipna=True)
    return train_ds

In [8]:
def primary_data_for_modeling(data, first_encounter=False):
    """Run the feature pipeline and return a model-ready frame.

    Steps, in order: first-sale month per item, one-hot encoding of the text
    descriptors, ``item_cnt_monthly`` lags of 1-6 and 12 months, the six-month
    lag average, and finally replacement of every remaining NaN with 0.

    ``first_encounter`` is accepted but not used by this function.
    """
    sales_lags = [1, 2, 3, 4, 5, 6, 12]
    features = (
        novelty_feature(data)
        .pipe(one_hot_encode_data)
        .pipe(lag_features, sales_lags, ["item_cnt_monthly"])
        .pipe(last_halfyear_feathure)
    )
    return features.fillna(0)

## Data splits

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import math
from sklearn.metrics import mean_squared_error as mse

def LR_model(x_train, y_train, x_val, y_val):
    """Fit a linear regression on the training split and print both RMSEs.

    Nothing is returned; the train and validation RMSE are written to stdout.
    """
    regressor = LinearRegression()
    regressor.fit(x_train, y_train)

    train_rmse = math.sqrt(mse(y_train, regressor.predict(x_train)))
    print("RMSE on train: ", train_rmse)

    val_rmse = math.sqrt(mse(y_val, regressor.predict(x_val)))
    print("RMSE on validation: ", val_rmse)

Let's create a function for time-series cross-validation. Its parameters are a model, which will be trained on each created split, and the dataframe from which the splits themselves are built.

Lags and the other features take information only from previous periods of time, so I hope there will be no data leakage if we extract them before the train_ds/test_ds split. Encoding, especially mean encoding, takes information from the whole dataset, and because of that I implemented this step after splitting.

In [10]:
def cross_validation_for_ts(model, data, month):
    """Expanding-window cross-validation over monthly blocks.

    The first fold trains on every row with ``date_block_num <= month`` and
    validates on ``month + 1``. Each following fold moves that boundary one
    month forward, until the last month present in ``data`` has been used for
    validation.

    Parameters
    ----------
    model : callable
        Called once per fold as ``model(x_train, y_train, x_val, y_val)``.
    data : pandas.DataFrame
        Monthly frame with a ``date_block_num`` column, in the shape expected
        by ``primary_data_for_modeling``.
    month : int
        Last training month of the first fold. It must lie strictly between
        the first and last month in ``data``; otherwise a message is printed
        and nothing is evaluated.

    Returns
    -------
    None
        Fold descriptions are printed; metrics are whatever ``model`` prints.
    """
    first_block = data.date_block_num.min()
    last_block = data.date_block_num.max()

    if month >= last_block or month <= first_block:
        print("Cannot be splited")
        return

    for train_end in range(month, int(last_block)):
        test_month = train_end + 1

        ##feature gen.
        # Rows are selected by month value, not by position, so the split does
        # not depend on `data` being sorted or having a 0..n-1 index.
        # Features are built on the window ending at the validation month so
        # that validation rows receive their lag values.
        window = data[data.date_block_num <= test_month]
        window = primary_data_for_modeling(window)

        train_ds = window[window.date_block_num <= train_end]
        test_ds = window[window.date_block_num == test_month]
        if test_ds.empty:
            # A month with no rows cannot be validated on; move to the next.
            print("No rows for month", test_month, "- fold skipped")
            continue

        # Every column except the target is used as a feature.
        x_train = train_ds.drop(['item_cnt_monthly'], axis=1)
        y_train = train_ds["item_cnt_monthly"]
        x_val = test_ds.drop(['item_cnt_monthly'], axis=1)
        y_val = test_ds["item_cnt_monthly"]

        print("Months of train:", (train_ds.date_block_num.unique()))
        print("Months of test:", (test_ds.date_block_num.unique()))
        model(x_train, y_train, x_val, y_val)
        print('-'*100)

In [11]:
# 28 is the last training month of the first fold; each later fold extends the
# training window by one month and validates on the month that follows.
cross_validation_for_ts(LR_model, train_ds, 28)

Months of train: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28]
Months of test: [29]
RMSE on train:  5.5599011106659795
RMSE on validation:  4.109259646732683
----------------------------------------------------------------------------------------------------
Months of train: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]
Months of test: [30]
RMSE on train:  5.531475986563945
RMSE on validation:  2.6782884958605457
----------------------------------------------------------------------------------------------------
Months of train: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
Months of test: [31]
RMSE on train:  5.48409147740065
RMSE on validation:  2.885941677712586
----------------------------------------------------------------------------------------------------
Months of train: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 