In [1]:
import json
import os

import numpy as np
import pandas as pd
import sklearn.compose as skc
import sklearn.preprocessing as skp
import xgboost as xgb

In [2]:
# Load the machine-specific locations from SETTINGS.json in the current working directory.
with open('SETTINGS.json') as settings_file:
    settings = json.load(settings_file)

# The rest of the notebook builds file names as `<dir> + 'name.csv'`, so both
# directory settings must end with a path separator. os.path.join(path, '')
# appends one only when it is missing and leaves well-formed values untouched.
comp_data = os.path.join(settings['COMPETITION_DATA_DIR'], '')
# This one is a file path (passed to Booster.load_model), so it is used as-is.
pre_trained_model = settings['PRE_TRAINED_MODEL_PATH']
submissions = os.path.join(settings['SUBMISSION_DIR'], '')

In [3]:
# Here we define functions with preprocessing routines. Not running anything yet.
# Data cleaning, fixing, feature generation.
# We do everything in functional style:
# main function here is get_train_data() - it builds the final matrix right from the raw data.
# We then use it for training and validation.

# Manually found shop duplicates, inferred by their names.
def fix_shop_duplicates(df) -> None:
    """Remap duplicate shop ids to their canonical id, modifying `df` in place.

    `df` must have a `shop_id` column. Nothing is returned.
    """
    canonical_shop_ids = {
        0: 57,   # Yakutsk, Ordzhonikidze 56
        1: 58,   # Yakutsk, "Tsentralny" shopping centre
        10: 11,  # Zhukovsky, Chkalova st. 39 m2
    }
    for duplicate_id, canonical_id in canonical_shop_ids.items():
        df.loc[df['shop_id'] == duplicate_id, 'shop_id'] = canonical_id

def fixed_transactions() -> pd.DataFrame:
    """Read sales_train.csv.gz from the competition data dir and merge duplicate shop ids."""
    raw_sales = pd.read_csv(comp_data + 'sales_train.csv.gz')
    # fix_shop_duplicates() edits the frame in place and returns nothing.
    fix_shop_duplicates(raw_sales)
    return raw_sales

# Group transactions by shop, item and month
def grouped_sales() -> pd.DataFrame:
    """Sum the transactions per (shop_id, item_id, date_block_num).

    Returns a frame with the three key columns plus `item_cnt_month`, the sum
    of `item_cnt_day` over each group. Only groups that have at least one
    transaction row appear in the result.
    """
    keys = ['shop_id', 'item_id', 'date_block_num']
    return (
        fixed_transactions()
        .groupby(keys, as_index=False)
        # The string name selects the built-in groupby sum; passing the
        # np.sum callable here is deprecated in recent pandas.
        .agg({'item_cnt_day': 'sum'})
        .rename(columns={'item_cnt_day': 'item_cnt_month'})
    )

# Here is the most important part - make monthly sales for each item-shop,
# this is the main thing we need to train on.
def make_monthly_sales() -> pd.DataFrame:
    """Build the dense month x shop x item grid with monthly sales counts.

    Every combination of date_block_num 0..33, shop returned by get_shops()
    and item listed in items.csv gets one row. Combinations with no recorded
    sales end up with item_cnt_month == 0.
    """
    items = pd.read_csv(comp_data + 'items.csv')
    shops = get_shops()
    dates = pd.DataFrame({'date_block_num': range(0, 33 + 1)})

    # Joining on a constant helper column is the pandas way to build a
    # Cartesian product without for-loops. So we first make a matrix with all
    # possible shop-item-months...
    shop_item_months = (
        dates.assign(foo=1)
        .merge(shops[['shop_id']].assign(foo=1), on='foo')
        .merge(items[['item_id']].assign(foo=1), on='foo')
        # `columns=` rather than a positional axis: DataFrame.drop('foo', 1)
        # is rejected by pandas >= 2.0.
        .drop(columns='foo')
    )
    # ...and then merge the actual sales into it. A missing value means that
    # nothing was sold, hence fillna(0).
    return shop_item_months.merge(
        grouped_sales(), how='left', on=['shop_id', 'item_id', 'date_block_num']
    ).fillna(0)

####################################################
# Here we add a "city" feature to the dataset, extracting it from shop names.
# We append the label-encoded version to the resulting data.
def add_city_feature(shops) -> None:
    """Add `city` and `city_code` columns to `shops`, modifying it in place.

    The city is the first space-separated token of `shop_name`; `city_code`
    is its label-encoded integer. Nothing is returned.
    """
    # This city name contains a space; glue it together so that the
    # first-token rule below keeps the whole name.
    shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
    shops['city'] = shops['shop_name'].str.split(' ').str[0]
    # One shop name carries a stray leading '!'.
    shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
    shops['city_code'] = skp.LabelEncoder().fit_transform(shops['city'])
    # The former trailing `shops = shops[[...]]` only rebound the local name and
    # never reached the caller, so it was removed; callers pick the columns they need.

def get_shops() -> pd.DataFrame:
    """Read shops.csv, drop the duplicate shop ids and add the city columns.

    Returns the remaining shops.csv rows with `city` and `city_code` added by
    add_city_feature().
    """
    shops = pd.read_csv(comp_data + 'shops.csv')
    # Shops 0, 1 and 10 are the duplicates that fix_shop_duplicates() remaps.
    # .copy() detaches the filtered frame so that the in-place column writes in
    # add_city_feature() operate on an independent frame (no SettingWithCopyWarning).
    shops = shops.query('shop_id not in [0, 1, 10]').copy()
    add_city_feature(shops)
    return shops
    
def append_city_code(df) -> pd.DataFrame:
    """Return `df` left-joined on shop_id with the `city_code` column from get_shops()."""
    city_codes = get_shops()[['shop_id', 'city_code']]
    return pd.merge(df, city_codes, on=['shop_id'], how='left')
####################################################

####################################################
# lag_feature() allows to add a lag of an arbitrary feature and time period
def lag_feature(df, lags, col) -> pd.DataFrame:
    """Append lagged copies of column `col` to `df`.

    For every `i` in `lags` a column named `<col>_lag_<i>` is added. It holds
    the value `col` had for the same shop_id/item_id at date_block_num - i,
    and is NaN where no such row exists. The input frame is not modified.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    source = df[keys + [col]]
    for lag in lags:
        lagged = source.rename(columns={col: col + '_lag_' + str(lag)})
        # Moving the month forward by `lag` makes the join below pair each row
        # with the value observed `lag` months earlier.
        lagged = lagged.assign(date_block_num=lagged['date_block_num'] + lag)
        df = df.merge(lagged, how='left', on=keys)
    return df

# But we use only the simplest case - sales from the previous month.
# We also have to remove the first month from the data set, since we can't provide this feature for it.
def append_last_month_sales(df) -> pd.DataFrame:
    """Add `item_cnt_month_lag_1` (previous month's sales) and drop month 0.

    Month 0 has no previous month, so its rows are removed.
    """
    with_lag = lag_feature(df, lags=[1], col='item_cnt_month')
    # lag_feature() returns a new merged frame, so filtering it in place
    # does not touch the caller's `df`.
    with_lag.query('date_block_num != 0', inplace=True)
    return with_lag
####################################################

####################################################
# Main functions here - build train and test matrices, that we can feed to the model.
# Simple caching is used here, so that we can loosely call these functions later,
# not worrying for performance. Only the first call will be expensive.

# We will split this later into train and validation sets
def get_train_data() -> pd.DataFrame:
    """Return the full training matrix, building it on the first call.

    The result has one row per month (1..33), shop and item, with the city
    code, month of year, item category, last-month sales and the target
    `item_cnt_month` clipped to [0, 20]. Every call returns a fresh copy of
    the cached frame, so callers may modify it freely.
    """
    if get_train_data.cache is None:
        print('making train data...')
        # Build in a local variable and publish it to the cache only once it is
        # complete: if any step below raises, the cache stays None and the next
        # call retries instead of silently returning a half-built matrix.
        matrix = make_monthly_sales()

        # Experiments showed that training only on the item-shops that appear
        # in the test set actually _improves_ the validation score slightly,
        # and it is obviously much faster.
        #
        # Validating only on test item-shops also gives a score much closer to
        # the LB score, i.e. item-shops from the test set are harder to predict
        # than the average item-shop from the train set.
        #
        # So we keep only rows whose item and shop both occur in the test set.
        test_set = pd.read_csv(comp_data + 'test.csv.gz')
        fix_shop_duplicates(test_set)
        matrix = matrix.query('item_id in(@test_set.item_id) and shop_id in(@test_set.shop_id)')

        # city code feature
        matrix = append_city_code(matrix)
        # month feature (from 1 to 12)
        matrix['month_of_year'] = matrix.date_block_num % 12 + 1
        # item category feature
        items = pd.read_csv(comp_data + 'items.csv')
        matrix = matrix.merge(items[['item_id', 'item_category_id']], how='left', on='item_id')
        # Last-month sales feature. It is computed before the clipping below,
        # so the lag column holds unclipped counts.
        matrix = append_last_month_sales(matrix)
        # Clipped sales are used as the target for training and validation.
        matrix['item_cnt_month'] = matrix['item_cnt_month'].clip(0, 20)

        get_train_data.cache = matrix
    return get_train_data.cache.copy()
get_train_data.cache = None

def get_test_data() -> pd.DataFrame:
    """Return the test matrix with the same feature columns as the train matrix.

    Built on the first call and cached; every call returns a fresh copy.
    `item_cnt_month_lag_1` is NaN for shop-item pairs that have no month-33
    row in the train matrix.
    """
    if get_test_data.cache is None:
        print('making test data...')
        # Build locally and publish to the cache only when complete, so a
        # failure part-way through cannot leave a half-built frame cached.
        test_block_num = 34  # the month right after the last train month (33)
        test = pd.read_csv(comp_data + 'test.csv.gz')
        fix_shop_duplicates(test)
        test = append_city_code(test)

        items = pd.read_csv(comp_data + 'items.csv')[['item_id', 'item_category_id']]
        test = test.merge(items, how='left', on=['item_id'])
        test['date_block_num'] = test_block_num
        # Same formula as in the train matrix; evaluates to 11.
        test['month_of_year'] = test_block_num % 12 + 1

        # Last-month sales for the test rows are the month-33 train targets.
        # NOTE(review): get_train_data() returns item_cnt_month already clipped
        # to [0, 20], while the train-time lag column is built before clipping -
        # confirm that this train/test difference is intended.
        last_month = (
            get_train_data()
            .query('date_block_num == 33')[['shop_id', 'item_id', 'item_cnt_month']]
            .rename(columns={'item_cnt_month': 'item_cnt_month_lag_1'})
        )
        test = test.merge(last_month, on=['shop_id', 'item_id'], how='left')

        get_test_data.cache = test
    return get_test_data.cache.copy()
get_test_data.cache = None
####################################################

In [4]:
# We use XGBoost for our model.
# Functions here prepare data specifically in the XGBoost format.
# make_predictions() does all the job from scratch: training, validation, predicting.

# Here are all the features we use. Not many at all.
# We use one-hot encoding for categorical features with low cardinality.
# For high-cardinality features ('shop_id', 'item_id', 'item_category_id'),
# simple label encoding turned out to give even better score, and certainly works faster,
# using a lot less memory.
# Label encoding yields pretty satisfying score, so we won't use mean encodings.
# Low-cardinality categoricals: expanded by the OneHotEncoder.
one_hot_features = [
    'month_of_year',
    'city_code',
]
# Passed through to the model unchanged.
normal_features = [
    'date_block_num',
    'item_cnt_month_lag_1',
    'shop_id',
    'item_id',
    'item_category_id',
]
# One-hot columns first, then the pass-through columns.
all_features = np.concatenate([one_hot_features, normal_features])

# Makes a matrix used for training.
# One can provide a list of date_block_num-s for use in validation, so they will be excluded from this train set.
# Also returns ColumnTransformer for later use in validation and test.
def make_train_matrix(validation_months = []) ->(xgb.DMatrix, skc.ColumnTransformer):
    """Build the XGBoost training matrix and the fitted column transformer.

    validation_months: date_block_num values to hold out of the train set
        (they are meant to be used for validation); empty means train on all.

    Returns (DMatrix with labels and feature names, fitted ColumnTransformer).
    The transformer must be reused to encode validation and test data.
    """
    sales_train = get_train_data()
    print('making xgb train matrix...')
    if len(validation_months) > 0:
        print("Excluding validation months from train")
        sales_train = sales_train[~(sales_train.date_block_num.isin(validation_months))]
    print("Train rows: ", sales_train.shape[0])
    labels = sales_train['item_cnt_month'].tolist()
    print('Used features: ' + str(all_features))
    sales_train = sales_train[all_features]

    # using n_jobs more than 1 here may cause an internal sklearn error due to large object size
    column_transformer = skc.make_column_transformer(
        (skp.OneHotEncoder(categories='auto'), one_hot_features),
        n_jobs=1, remainder='passthrough')
    # one-hot encoding yields a sparse matrix
    sales_train_sparse = column_transformer.fit_transform(sales_train)
    del sales_train  # to save memory

    encoder = column_transformer.named_transformers_['onehotencoder']
    # OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2 in
    # favour of get_feature_names_out(); use whichever this version provides.
    get_names = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
    # A plain list of str is accepted as feature_names by every XGBoost version.
    feature_names = [str(name) for name in get_names(one_hot_features)] + list(normal_features)
    return (xgb.DMatrix(sales_train_sparse, label=labels, feature_names=feature_names, nthread=4), column_transformer)

# Makes a validation matrix for a single month.
def make_eval_matrix(fit_column_transformer, date_block_num = 33) -> xgb.DMatrix:
    """Build a labelled validation DMatrix for a single month.

    fit_column_transformer: the transformer returned by make_train_matrix().
    date_block_num: the month whose rows form the validation set.
    """
    sales_eval = get_train_data()
    sales_eval = sales_eval[sales_eval.date_block_num == date_block_num]
    print("Adding eval matrix with row count: ", sales_eval.shape[0])
    labels = sales_eval['item_cnt_month'].tolist()
    sales_eval_sparse = fit_column_transformer.transform(sales_eval[all_features])
    del sales_eval  # to save memory

    encoder = fit_column_transformer.named_transformers_['onehotencoder']
    # OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2 in
    # favour of get_feature_names_out(); use whichever this version provides.
    get_names = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
    # A plain list of str is accepted as feature_names by every XGBoost version.
    feature_names = [str(name) for name in get_names(one_hot_features)] + list(normal_features)
    return xgb.DMatrix(sales_eval_sparse, label=labels, feature_names=feature_names, nthread=4)

# Returns trained booster (model).
def train_model(param, num_boost_round, early_stopping_rounds, dtrain, deval = []) -> xgb.Booster:
    """Train a booster on `dtrain` and return it.

    param: XGBoost parameter dict.
    num_boost_round, early_stopping_rounds: passed straight to xgb.train().
    deval: optional validation matrices; they are reported during training as
        'eval1', 'eval2', ... after the 'train' entry.
    """
    watchlist = [(dtrain, 'train')]
    watchlist.extend(
        (eval_matrix, 'eval' + str(number))
        for number, eval_matrix in enumerate(deval, start=1)
    )
    # The per-round metric history was previously collected into a dict that
    # was never read, so it is no longer requested.
    return xgb.train(param, dtrain, num_boost_round, evals=watchlist,
                     verbose_eval=True, early_stopping_rounds=early_stopping_rounds)

# Makes a test matrix ready for making predictions.
def make_dtest(fit_column_transformer) -> xgb.DMatrix:
    """Build the unlabelled test DMatrix, encoded with the fitted transformer.

    fit_column_transformer: the transformer returned by make_train_matrix().
    """
    test_set = get_test_data()[all_features]
    ts_sparse = fit_column_transformer.transform(test_set)

    encoder = fit_column_transformer.named_transformers_['onehotencoder']
    # OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2 in
    # favour of get_feature_names_out(); use whichever this version provides.
    get_names = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
    # A plain list of str is accepted as feature_names by every XGBoost version.
    feature_names = [str(name) for name in get_names(one_hot_features)] + list(normal_features)
    return xgb.DMatrix(ts_sparse, feature_names=feature_names, nthread=4)

# Returns a trained model and a DataFrame with predictions.
# Prepares all data internally, we only specify parameters for training and validation.
def make_predictions(param, num_boost_round, early_stopping_rounds, validation_months = [])\
-> (xgb.Booster, pd.DataFrame):
    """Train from scratch and predict the test set.

    validation_months are held out of training and each becomes one
    evaluation matrix. Returns (booster, test frame with an added
    `item_cnt_month` column clipped to [0, 20]).
    """
    dtrain, column_transformer = make_train_matrix(validation_months)
    deval = [make_eval_matrix(column_transformer, month) for month in validation_months]
    booster = train_model(param, num_boost_round, early_stopping_rounds, dtrain, deval)
    # Release the training matrices before the test matrix is built.
    del dtrain, deval

    raw_predictions = booster.predict(make_dtest(column_transformer))
    test_set = get_test_data()
    test_set['item_cnt_month'] = np.clip(raw_predictions, 0, 20)
    return booster, test_set

# Makes predictions using pre-trained model
def predict_from_saved_model() -> (xgb.Booster, pd.DataFrame):
    """Predict the test set with the booster stored at `pre_trained_model`.

    Returns (booster, test frame with an added `item_cnt_month` column
    clipped to [0, 20]).
    """
    # Only the fitted column transformer is needed from the train matrix step.
    _, column_transformer = make_train_matrix([])

    booster = xgb.Booster({'nthread': 4})
    booster.load_model(pre_trained_model)

    raw_predictions = booster.predict(make_dtest(column_transformer))
    test_set = get_test_data()
    test_set['item_cnt_month'] = np.clip(raw_predictions, 0, 20)
    return booster, test_set

In [12]:
# Now it is time to run it.
# By default, a pre-trained model is used. If you want, set "use_pre_trained_model" to False.
use_pre_trained_model = True
if use_pre_trained_model:
    booster, preds = predict_from_saved_model()
else:
    # max_depth, eta and num_boost_round are set to suboptimal values after some tuning.
    # This model was already validated and now we aim to make best possible predictions for test.
    # So, we use all given data for training and do no validation.
    # num_boost_round = 20 is also tuned, so no need for early stopping here.
    param = {
        'max_depth': 8,
        'eta': 0.2,
        'verbosity': 1,
        # 'reg:squarederror' is the current name of the squared-error
        # objective; the old 'reg:linear' spelling is deprecated in XGBoost.
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
    }
    booster, preds = make_predictions(param, num_boost_round=20, early_stopping_rounds=None,
                                      validation_months=[])

making xgb train matrix...
Train rows:  7068600
Used features: ['month_of_year' 'city_code' 'date_block_num' 'item_cnt_month_lag_1'
 'shop_id' 'item_id' 'item_category_id']


The above model gives ~1.01 LB score.<br>
We are going to improve it to ~0.96 using manually examined test data.<br>
Many items in the test set (~360) have no sales in train.<br>
By manually examining these items by their name and searching them on the internet,<br>
one can find a lot of new releases (that appeared around November 2015).<br>
(Though, it may be harder if you don't know Russian).<br>
Our model poorly handles such items, so we will manually fix predictions for them.<br>
I examined all of the new items and chose the subset that offers the most potential for improvement.

In [13]:
# The item_id-s here are manually set after analysis of the new items.

# These items were definitely on the market before November 2015.
# We can infer that nobody buys them.
_OLD_RELEASE_ITEM_IDS = (
    6439, 15183, 639, 640, 762, 22137, 18627, 14739, 10054, 10310,
    8850, 8957, 83, 12441, 19651, 22035, 12574, 9030, 1193,
)

def is_definitely_old_release(item) -> bool:
    """True when `item.item_id` is one of the hand-picked old releases."""
    return item.item_id in _OLD_RELEASE_ITEM_IDS

# New albums by Bryan Adams, Rod Stewart and other very popular artists.
_HOT_MUSIC_ITEM_IDS = (
    8890, 5320, 3604, 3984, 6335, 2569, 1732, 1246, 1252, 3271,
    15553, 5025, 7669, 19219, 3908, 4642, 5064, 1683, 12920,
)

def new_hot_musical_release(item) -> bool:
    """True when `item.item_id` is one of the hand-picked new music releases."""
    return item.item_id in _HOT_MUSIC_ITEM_IDS

# These give the most effect.
# Id groups follow the order of the author's title list.
_HOT_GAME_ITEM_IDS = (
    3407, 3408, 3405,                     # 1. Fallout 4
    3538,                                 # 2. Football Manager 2016
    6152, 6153,                           # 3. Rise of the Tomb Raider
    2327, 2322, 2323, 2326, 2328, 2325,   # 4. Call of Duty: Black Ops III
    6729, 6731, 6732, 6730, 6733, 6734,   # 5. Star Wars: Battlefront
    6742, 6743,                           # 6. Starcraft II: Legacy Of The Void
    1580, 1585, 1577, 1574, 1575,         # 7. Assassin's Creed new release
    5268,                                 # 8. Need for Speed - 2015
)

def new_hot_computer_game(item) -> bool:
    """True when `item.item_id` belongs to one of the hand-picked hit games."""
    return item.item_id in _HOT_GAME_ITEM_IDS

# This seems to be not so popular as games above.
_NORMAL_PLUS_GAME_ITEM_IDS = (
    1437,  # Anno 2205
)

def new_normal_plus_computer_game(item) -> bool:
    """True when `item.item_id` is the hand-picked moderately popular game."""
    return item.item_id in _NORMAL_PLUS_GAME_ITEM_IDS

# Even less popular but of some interest.
# Ids follow the order of the author's title list.
_NORMAL_GAME_ITEM_IDS = (
    7782,  # Wasteland 2: Director's Cut
    2427,  # Crew. Wild Run Edition
    2966,  # Divinity. Original Sin: Enhanced Edition
)

def new_normal_computer_game(item) -> bool:
    """True when `item.item_id` is one of the hand-picked lesser-interest games."""
    return item.item_id in _NORMAL_GAME_ITEM_IDS

# Items that are exclusive to certain region.
_LOCAL_RELEASE_ITEM_IDS = (21467, 19155, 8993, 10483, 10372)

def new_local_release(item) -> bool:
    """True when `item.item_id` is one of the hand-picked region-exclusive items."""
    return item.item_id in _LOCAL_RELEASE_ITEM_IDS

# These are very popular.
_HOT_MOVIE_ITEM_IDS = (14647, 14648)

def new_hot_movie(item) -> bool:
    """True when `item.item_id` is one of the hand-picked very popular movies."""
    return item.item_id in _HOT_MOVIE_ITEM_IDS

# Less popular but of some interest.
_NORMAL_PLUS_MOVIE_ITEM_IDS = (13804, 13805, 18174)

def new_normal_plus_movie(item) -> bool:
    """True when `item.item_id` is one of the hand-picked moderately popular movies."""
    return item.item_id in _NORMAL_PLUS_MOVIE_ITEM_IDS

# So, what are we going to do with all these items?
# Consider Fallout 4 as an example.
# It is reasonable to expect such a game to have nearly the best sales among all PC games in the current shop.
# Also, recall that our predictions are clipped to 20 - it greatly helps not to make a big miss here.
# So, we are going to count maximum sales for the previous month among each shop for current item's category.
# And we then use it as a baseline.
# For hot computer games, for example, we can just set our predictions to the max.
# For less popular items we may multiply max by some number, e.g. 0.5.
def get_max_shop_category_sales(predictions) -> pd.DataFrame:
    """Attach a `max_sales` column to `predictions`.

    `max_sales` is the largest month-33 `item_cnt_month` (as returned by
    get_train_data()) among all items of the row's shop and item category.
    It is NaN where the train matrix has no month-33 row for that
    shop/category pair.

    predictions: frame with `shop_id` and `item_id`; `item_category_id` is
        looked up from items.csv when the column is absent.
    Returns a new merged frame; the input is not modified.
    """
    print('making max shop-category sales...')
    last_month_sales = get_train_data().query('date_block_num == 33')[['shop_id', 'item_category_id', 'item_cnt_month']]

    max_sales = (
        last_month_sales
        .groupby(['shop_id', 'item_category_id'], as_index=False)
        # The string name selects the built-in groupby max; passing the
        # np.max callable here is deprecated in recent pandas.
        .agg({'item_cnt_month': 'max'})
        .rename(columns={'item_cnt_month': 'max_sales'})
    )

    # just to make this function more generic - item_category_id may or may not be in the passed dataframe
    if 'item_category_id' not in predictions.columns:
        items = pd.read_csv(comp_data + 'items.csv')[['item_id', 'item_category_id']]
        predictions = predictions.merge(items, how='left', on=['item_id'])
    return predictions.merge(max_sales, how='left', on=['shop_id', 'item_category_id'])

# Finds new items in the dataframe and adjusts predictions for them.
def fix_unknown_samples(predictions) -> pd.DataFrame:
    """Return a copy of `predictions` with hand-tuned values for the new items.

    Items picked by the new_* predicates get `item_cnt_month` set to a
    fraction of the best month-33 sale in their shop/category (`max_sales`).
    Local and definitely-old releases are set to 0. The result is clipped
    to [0, 20]. The input frame is not modified.
    """
    result = get_max_shop_category_sales(predictions.copy())

    print('adjusting predictions...')
    # Suboptimal coefficients were found with a little bit of LB probing.
    # Rules are applied in this order, so a later rule overrides an earlier one.
    scaled_rules = [
        (new_hot_computer_game, 1.0),
        (new_normal_computer_game, 0.12),
        (new_normal_plus_computer_game, 0.3),
        (new_hot_movie, 0.8),
        (new_normal_plus_movie, 0.3),
        (new_hot_musical_release, 0.22),
    ]
    # Since local items belong to some region and we don't know the region
    # exactly, it seems better to just set them to 0. Same for old releases.
    zeroed_rules = [new_local_release, is_definitely_old_release]

    # apply(axis=1) on an empty frame does not yield a boolean row mask,
    # so there is nothing to adjust (and nothing to break) in that case.
    if not result.empty:
        # The scaled values are float64. Widen the target column up front rather
        # than relying on pandas to upcast during assignment, which is deprecated.
        result['item_cnt_month'] = result['item_cnt_month'].astype('float64')
        # Rows without a baseline keep the model's prediction instead of
        # receiving NaN, which would end up in the submission file.
        has_baseline = result['max_sales'].notna()

        for predicate, factor in scaled_rules:
            selected = result.apply(predicate, axis=1).astype(bool) & has_baseline
            result.loc[selected, 'item_cnt_month'] = result.loc[selected, 'max_sales'] * factor
        for predicate in zeroed_rules:
            selected = result.apply(predicate, axis=1).astype(bool)
            result.loc[selected, 'item_cnt_month'] = 0

    result = result.drop(['max_sales'], axis=1)
    result['item_cnt_month'] = result['item_cnt_month'].clip(0, 20)  # just for clarity
    return result

In [14]:
# Apply the manual new-item adjustments to the model's predictions.
# fix_unknown_samples() works on a copy, so `preds` itself is left unchanged.
preds_fixed = fix_unknown_samples(preds)

making max shop-category sales...
adjusting predictions...


In [15]:
# That's it: write the ID and adjusted item_cnt_month columns to
# predictions.csv in the submissions directory, without the index column.
preds_fixed[['ID', 'item_cnt_month']].to_csv(submissions + 'predictions.csv', index=False)