In [None]:
! pip install deepctr
! pip install 'h5py==2.10.0'



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import pickle

In [None]:
from typing import Tuple, List
import json

In [None]:
%tensorflow_version 1.x
import tensorflow as tf
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

TensorFlow 1.x selected.


In [None]:
from deepctr.feature_column import SparseFeat, DenseFeat
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model, load_model

from deepctr.models import DeepFM
import numpy as np

In [None]:
import gc

# Generate Data

In [None]:
class DataProcessor:
    def __init__(self, data_dir:str):
        self.base = data_dir # data diectory
    
    def _load_raw_data(self) -> dict:
        """Load original raw data

        Returns:
            dict: raw data dictionary
        """
        articles  = pd.read_csv(self.base+'articles.csv')
        customers = pd.read_csv(self.base+'customers.csv')
        trans     = pd.read_csv(self.base+'transactions_train.csv')

        return {'item':articles, 'user':customers, 'trans':trans}
    
    def _encode_id(self, data:dict, map_dir:str) -> dict:
        """Encode user and item id as integers

        Args:
            data (dict): raw data dictionary, keys: 'item', 'user', 'trans'
            map_dir (str): relative directory to store index-id-maps

        Returns:
            dict: data dictionary
        """
        if not os.path.isdir(self.base+map_dir):
            os.mkdir(self.base+map_dir)

        user_id2index_path = self.base + map_dir + 'user_id2index.pkl'
        user_index2id_path = self.base + map_dir + 'user_index2id.pkl'
        item_id2index_path = self.base + map_dir + 'item_id2index.pkl'
        item_index2id_path = self.base + map_dir + 'item_index2id.pkl'

        user_id2index_dict = dict(zip(data['user']['customer_id'], data['user'].index+1))
        user_index2id_dict = dict(zip(data['user'].index+1, data['user']['customer_id']))
        item_id2index_dict = dict(zip(data['item']['article_id'], data['item'].index+1))
        item_index2id_dict = dict(zip(data['item'].index+1, data['item']['article_id']))
        pickle.dump(user_id2index_dict, open(user_id2index_path, 'wb'))
        pickle.dump(user_index2id_dict, open(user_index2id_path, 'wb'))
        pickle.dump(item_id2index_dict, open(item_id2index_path, 'wb'))
        pickle.dump(item_index2id_dict, open(item_index2id_path, 'wb'))
        
        data['trans']['customer_id'] = data['trans']['customer_id'].map(user_id2index_dict)
        data['trans']['article_id']  = data['trans']['article_id'].map(item_id2index_dict)
        data['user']['customer_id']  = data['user']['customer_id'].map(user_id2index_dict)
        data['item']['article_id']   = data['item']['article_id'].map(item_id2index_dict)

        return data
    
    def _transform_feats(self, data:dict) -> dict:
        """Transform features (label encode and change dtypes)

        Args:
            data (dict): data dictionary, keys: 'item', 'user', 'trans'

        Returns:
            dict: data dictionary
        """
        trans = data['trans']
        user = data['user'].fillna(-1)
        item = data['item']

        # Transactions
        trans['price'] = trans['price'].astype('float32')
        trans['sales_channel_id'] = trans['sales_channel_id'].astype('int8')

        # Customers
        user_sparse_feats = [x for x in user.columns if x not in ['age']]
        for feat in tqdm([x for x in user_sparse_feats if x!='customer_id'], 'Encode User Sparse Feats'):
            lbe = LabelEncoder()
            user[feat] = lbe.fit_transform(user[feat].astype(str)) + 1
            user[feat] = user[feat].astype('int32')
        
        # Articles
        item_sparse_feats = ['article_id', 'product_code', 'product_type_no', 'product_group_name', 
                             'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id', 
                             'perceived_colour_master_id', 'department_no', 'index_code', 'index_group_no', 
                             'section_no', 'garment_group_no']
        for feat in tqdm([x for x in item_sparse_feats if x!='article_id'], 'Encode Item Sparse Feats'):
            lbe = LabelEncoder()
            item[feat] = lbe.fit_transform(item[feat].astype(str)) + 1
            item[feat] = item[feat].astype('int32')
        
        data['trans'] = trans
        data['user'] = user
        data['item'] = item[item_sparse_feats]

        return data
    

    def save_data(self, data:dict, name:str):
        """Save data dictionary as parquet

        Args:
            data (dict): data dictionary, keys: 'item', 'user', 'trans'
            name (str): name of the data dict (data versioning)
        """
        path = self.base+name+'/'
        if not os.path.exists(path):
            os.mkdir(path)
        data['user'].to_parquet(path+'user.pqt')
        data['item'].to_parquet(path+'item.pqt')
        data['trans'].to_parquet(path+'trans.pqt')
    
    def load_data(self, name:str) -> dict:
        """Load data dictionary

        Args:
            name (str): name of data dict

        Raises:
            OSError: invalid data version

        Returns:
            dict: loaded data dictionary
        """
        path = self.base+name+'/'
        if not os.path.exists(path):
            raise OSError
        data = {}
        data['user'] = pd.read_parquet(path+'user.pqt')
        data['item'] = pd.read_parquet(path+'item.pqt')
        data['trans'] = pd.read_parquet(path+'trans.pqt')

        return data
    
    def preprocess_data(self, save:bool=True, name:str='encoded_full') -> dict:
        """Preprocess raw data

        Args:
            save (bool, optional): whether to save the preprocessed data. Defaults to True.
            name (str, optional): version name of the data to be saved

        Returns:
            dict: preprocessed data
        """
        data = self._load_raw_data()
        data = self._encode_id(data, 'index_id_map/')
        data = self._transform_feats(data)
        if save:
            self.save_data(data, name)
        return data
    

    def gen_data_set(self, data_name:str, dataset_name:str, features:List[str], 
                           data:dict, train_end_date:str, val_end_date:str, seq_max_len:int, negsample:int=0):
        """Generate train set and valid set

        Args:
            data_name (str): version name of data dictionary
            dataset_name (str): version name of dataset
            features (List[str]): feature list
            data (dict): data dictionary, keys: 'user', 'item', 'trans'
            train_end_date (str): end date of train set
            val_end_date (str): end date of valid set
            seq_max_len (int): maximum history sequence length
            negsample (int, optional): number of negative samples. Defaults to 0.

        """
        np.random.seed(2022)
        args = {
            'features':features,
            'train_end_date':train_end_date,
            'val_end_date':val_end_date,
            'seq_max_len':seq_max_len,
            'negsample':negsample,
            'seed':2022
        }

        data['trans'].sort_values("t_dat",inplace=True)
        trans = data['trans']
        # Split train set and valid set
        train_data = trans.loc[trans['t_dat']<=train_end_date]
        val_data = trans.loc[(train_end_date<trans['t_dat']) & (trans['t_dat']<=val_end_date)]

        item_ids = set(data['item']['article_id'].values)
        
        # Calculate number of rows of train set and valid set to fasten the dataset generating process
        counter = train_data[['customer_id','article_id']].groupby('customer_id',as_index=False).count()
        train_rows = (counter['article_id'] * (negsample+1)).sum()

        # Generate rows
        # train_set format: [custID, articleID, label, history_seq_len]
        # valid_set format: [custID, history_seq_len]
        train_set = np.zeros((train_rows, 4))
        train_customers = list(train_data['customer_id'].unique())
        val_customers = list(val_data['customer_id'].unique())
        val_set = np.zeros((len(train_customers), 2))
        val_label = val_data.groupby('customer_id')['article_id'].apply(list).reset_index()
        val_label['article_id'] = val_label['article_id'].apply(lambda x:' '.join([str(i) for i in x]))

        p,q = 0,0
        for custID, hist in tqdm(train_data.groupby('customer_id'), 'Generate train set'):
            pos_list = hist['article_id'].tolist()
            if negsample > 0:
                candidate_set = list(item_ids - set(pos_list)) # Negative samples
                neg_list = np.random.choice(candidate_set, size=len(pos_list)*negsample, replace=True)
            for i in range(len(pos_list)):
                # Positive sample
                train_set[p] = [custID, pos_list[i], 1, i+1]
                p += 1
            #Negative smaples
            tmp = np.array([[custID, 0, 0, len(pos_list)]])
            tmp = np.repeat(tmp, repeats=negsample*len(pos_list), axis=0)
            tmp[:,1] = neg_list
            train_set[p:p+negsample*len(pos_list)] = tmp
            p += negsample*len(pos_list)

            val_set[q] = [custID, len(pos_list)]
            q += 1
        # val_set = val_set[np.isin(val_set[:,0], val_customers)]

        np.random.shuffle(train_set)
        np.random.shuffle(val_set)

        # Generate other features and save
        path = self.base + data_name + '/' + dataset_name + '/'
        if not os.path.exists(path):
            os.mkdir(path)
        
        json.dump(args, open(path+'args.json','w')) # save args
        
        user = data['user']
        item = data['item']
        user = user.set_index('customer_id')
        item = item.set_index('article_id')

        train_uid = train_set[:,0]
        train_iid = train_set[:,1]
        np.save(open(path+'train_customer_id.npy','wb'), train_uid)
        np.save(open(path+'train_article_id.npy','wb'), train_iid)
        np.save(open(path+'train_label.npy','wb'), train_set[:,2])
        np.save(open(path+'train_hist_len.npy','wb'), train_set[:,3])

        val_uid = val_set[:,0]
        np.save(open(path+'valid_customer_id.npy','wb'), val_uid)
        np.save(open(path+'valid_hist_len.npy','wb'), val_set[:,1])
        val_label.to_csv(path+'valid_label.csv', index=False)

        del train_set, val_set
        gc.collect()

        for key in tqdm([x for x in user.columns if x in features and x!='customer_id']):
            train_tmp_array = user[key].loc[train_uid].values
            val_tmp_array = user[key].loc[val_uid].values
            np.save(open(path+'train_'+key+'.npy','wb'), train_tmp_array)
            np.save(open(path+'valid_'+key+'.npy','wb'), val_tmp_array)
            del train_tmp_array, val_tmp_array
            gc.collect()
        
        del train_uid, user
        gc.collect()
        
        for key in tqdm([x for x in item.columns if x in features and x!='article_id']):
            train_tmp_array = item[key].loc[train_iid].values
            np.save(open(path+'train_'+key+'.npy','wb'), train_tmp_array)
            del train_tmp_array
            gc.collect()
    
    def load_dataset(self, data_name:str, dataset_name:str, customer_feats:List[str], article_feats:List[str]) -> Tuple:
        """Load saved dataset

        Args:
            data_name (str): version name of data used to generate dataset
            dataset_name (str): version name of dataset
            customer_feats (List[str]): list of customer features to be loaded
            article_feats (List[str]): list of article features to be loaded

        Returns:
            Tuple: [train set, valid set]
        """
        path = self.base + data_name + '/' + dataset_name + '/'
        if not os.path.exists(path):
            raise OSError

        train_set = {}
        val_set = {}

        for feat in tqdm(customer_feats + ['hist_len'], 'Load Customer Features'):
            train_set[feat] = np.load(open(path+'train_'+feat+'.npy','rb'), allow_pickle=True)
            val_set[feat] = np.load(open(path+'valid_'+feat+'.npy','rb'), allow_pickle=True)

        for feat in tqdm(article_feats, 'Load Article Features'):
            train_set[feat] = np.load(open(path+'train_'+feat+'.npy','rb'), allow_pickle=True)

        train_label = np.load(open(path+'train_label.npy','rb'), allow_pickle=True)
        val_label = pd.read_csv(path+'valid_label.csv')

        return train_set, train_label, val_set, val_label

In [None]:
dp = DataProcessor('/content/drive/MyDrive/HM-RecSys/data/')

In [None]:
# data = dp.preprocess_data(save=True) # run in the first run

In [None]:
data = dp.load_data(name='encoded_full')

In [None]:
features = list(data['user'].columns) + list(data['item'].columns)
args = {
    'data_name':'encoded_full', 
    'dataset_name':'200915_neg3_deepfm',
    'features':features,
    'data':data,
    'train_end_date':'2020-09-15',
    'val_end_date':'2020-09-22',
    'seq_max_len':20,
    'negsample':3
}
# dp.gen_data_set(**args)

In [None]:
params = {
    'run_name':'deepfm',
    # data params
    'user_sparse_feats':['customer_id','FN','Active','club_member_status','fashion_news_frequency','postal_code'],
    'user_dense_feats':['age'],
    'item_sparse_feats':['article_id'], # ,'product_code','product_type_no'
    'item_dense_feats':[],
    # model params
    'embedding_dim':64,
    'dnn_hidden_units':(256,128,64),
    'dnn_dropout':0,
    'dnn_activation':'relu',
    'dnn_use_bn':False,
    'task':'binary',

    # training params
    'batch_size':2**16,
    'epoch':20,
    'verbose':1,
    'validation_split':0.15
}

In [None]:
# train_features = list(data['user'].columns)

In [None]:
customer_feats = params['user_sparse_feats'] + params['user_dense_feats']
article_feats = params['item_sparse_feats'] + params['item_dense_feats']

In [None]:
train_set, train_label, val_set, val_label = dp.load_dataset(args['data_name'], args['dataset_name'], customer_feats, article_feats)

Load Customer Features: 100%|██████████| 8/8 [00:21<00:00,  2.70s/it]
Load Article Features: 100%|██████████| 1/1 [00:03<00:00,  3.88s/it]


# Load Data and Train Model

In [None]:
# ! pip install numba 
# from numba import cuda 
# device = cuda.get_current_device()
# device.reset()

In [None]:
feature_dim = {}
for feat in data['user'].columns:
    feature_dim[feat] = data['user'][feat].max()+1
for feat in data['item'].columns:
    feature_dim[feat] = data['item'][feat].max()+1

In [None]:
# 3.Define Model and train
path = dp.base+args['data_name']+'/'+args['dataset_name']+'/'
if not os.path.exists(path+params['run_name']):
    os.mkdir(path+params['run_name'])
path += params['run_name']+'/'

In [None]:
user_feature_columns = [SparseFeat(x, feature_dim[x], params['embedding_dim']) for x in params['user_sparse_feats']] +\
                       [DenseFeat(x) for x in params['user_dense_feats']]
item_feature_columns = [SparseFeat(x, feature_dim[x], params['embedding_dim']) for x in params['item_sparse_feats']] +\
                       [DenseFeat(x) for x in params['item_dense_feats']]

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [None]:
full_features = user_feature_columns + item_feature_columns

In [None]:
# 3.Define Model,train,predict and evaluate
K.set_learning_phase(True)
json.dump(params, open(path+'model_params.json','w'))

model = DeepFM(full_features, full_features, dnn_hidden_units=params['dnn_hidden_units'],
               dnn_dropout=params['dnn_dropout'], dnn_activation=params['dnn_activation'],
               dnn_use_bn=params['dnn_use_bn'], task=params['task'])
# model.summary()
params['model_name'] = 'DeepFM'
model.compile("adam", "binary_crossentropy",
                metrics=['binary_crossentropy'], )

history = model.fit(train_set, train_label,
                    batch_size=params['batch_size'], 
                    epochs=params['epoch'],
                    verbose=params['verbose'],
                    validation_split=params['validation_split'])
model.save(path+params['model_name']+'.model')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 107263244 samples, validate on 18928808 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
params['model_name'] = 'DeepFM'
model = DeepFM(full_features, full_features, dnn_hidden_units=params['dnn_hidden_units'],
               dnn_dropout=params['dnn_dropout'], dnn_activation=params['dnn_activation'],
               dnn_use_bn=params['dnn_use_bn'], task=params['task'])
model.load_weights(path+params['model_name']+'.model')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [None]:
all_items = np.array(list(data['item']['article_id'].unique()))

In [None]:
val_pred = pd.DataFrame(columns=['customer_id','prediction'])
val_pred['customer_id'] = val_set['customer_id']
val_pred['customer_id'] = val_pred['customer_id'].astype('int32')

In [26]:
batch_size = 300
num_item = all_items.shape[0]

def init_batch_input(batch_size, num_item):
    val_input = {}
    for f in params['user_sparse_feats']+params['user_dense_feats']+['article_id']:
        val_input[f] = np.zeros((batch_size*num_item))
    return val_input

val_input = init_batch_input(batch_size, num_item)
for i in tqdm(range(val_set['customer_id'].shape[0])):
    for f in params['user_sparse_feats']+params['user_dense_feats']:
        tmp = np.repeat(val_set[f][i:i+1], num_item)
        val_input[f][i%batch_size*num_item:(i%batch_size+1)*num_item] = tmp
    val_input['article_id'][i%batch_size*num_item:(i%batch_size+1)*num_item] = all_items

    if (i+1)%batch_size == 0:
        pred = model.predict(val_input, batch_size=params['batch_size'])
        for j in range(batch_size):
            sub_pred = pred[j*num_item:(j+1)*num_item]
            sub_pred = sub_pred.argsort(axis=0)[-12:][::-1][:,0] + 1
            val_pred['prediction'].loc[i-batch_size+1+j] = sub_pred
        val_input = init_batch_input(batch_size, num_item)
        gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_

KeyboardInterrupt: ignored

In [30]:
val_pred = val_pred[~val_pred['prediction'].isna()]

In [31]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=12):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted) if a]) # CHANGES: ignore null actual (variable=a)

In [32]:
# del val_label['prediction']

In [33]:
val_label = pd.merge(val_label, val_pred, on='customer_id', how='left')

In [34]:
# use top12 articles to impute cold start users
top_k = 12
trans_week = data['trans'].loc[('2020-09-15' >= data['trans'].t_dat) & (data['trans'].t_dat >= '2020-09-09')]
top12_products = data['trans'].article_id.value_counts().index[:top_k].tolist()
top12_products = ' '.join([str(x) for x in top12_products])

In [39]:
mask = ~val_label['prediction'].isna()
val_label['prediction'][mask] = val_label['prediction'][mask].apply(lambda x:' '.join([str(i) for i in x]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [40]:
val_label['prediction'] = val_label['prediction'].fillna(top12_products)

In [41]:
val_label['prediction'] = val_label['prediction'].apply(lambda x:[int(i) for i in x.split()])
val_label['article_id'] = val_label['article_id'].apply(lambda x:[int(i) for i in x.split()])

In [43]:
mapk(val_label['article_id'][mask], val_label['prediction'][mask], k=top_k)

0.003213836262887262

### Compare with rule method

In [None]:
train = data['trans'].loc['2020-09-15' >= data['trans'].t_dat]
train['t_dat'] = pd.to_datetime(train['t_dat'])

tmp = train.groupby('customer_id').t_dat.max().reset_index()
tmp.columns = ['customer_id','max_dat']
train = train.merge(tmp,on=['customer_id'],how='left')
train['diff_dat'] = (train.max_dat - train.t_dat).dt.days
train = train.loc[train['diff_dat']<=6]
print('Train shape:',train.shape)

tmp = train.groupby(['customer_id','article_id'])['t_dat'].agg('count').reset_index()
tmp.columns = ['customer_id','article_id','ct']
train = train.merge(tmp,on=['customer_id','article_id'],how='left')
train = train.sort_values(['ct','t_dat'],ascending=False)
train = train.drop_duplicates(['customer_id','article_id'])
train = train.sort_values(['ct','t_dat'],ascending=False)
train.head()

In [None]:
# vc = train.article_id.value_counts()
# pairs = {}
# for j,i in enumerate(vc.index.values[1000:1032]):
#     #if j%10==0: print(j,', ',end='')
#     USERS = train.loc[train.article_id==i.item(),'customer_id'].unique()
#     vc2 = train.loc[(train.customer_id.isin(USERS))&(train.article_id!=i.item()),'article_id'].value_counts()
#     pairs[i.item()] = [vc2.index[0], vc2.index[1], vc2.index[2]]
pairs = np.load('/content/drive/MyDrive/HM-RecSys/data/pairs_cudf.npy',allow_pickle=True).item()

In [None]:
train['article_id2'] = train.article_id.map(pairs)

In [None]:
train2 = train[['customer_id','article_id2']].copy()
train2 = train2.loc[train2.article_id2.notnull()]
train2 = train2.drop_duplicates(['customer_id','article_id2'])
train2 = train2.rename({'article_id2':'article_id'},axis=1)

In [None]:
train = train[['customer_id','article_id']]
train = pd.concat([train,train2],axis=0,ignore_index=True)
train.article_id = train.article_id.astype('int32')
train = train.drop_duplicates(['customer_id','article_id'])

In [None]:
train['article_id'] = ' '+train['article_id'].astype(str)

In [None]:
pred = train.groupby('customer_id').article_id.sum().reset_index()
pred.rename(columns={'article_id':'prediction2'},inplace=True)

In [None]:
val_label = pd.merge(val_label, pred, on=['customer_id'], how='left')

In [None]:
val_label['prediction2'] = val_label['prediction2'].fillna(top12_products)
val_label['prediction2'] = val_label['prediction2'].apply(lambda x:[int(i) for i in x.split()])

In [None]:
mapk(val_label['article_id'], val_label['prediction2'], k=12)

# Submit

In [None]:
pred_df = val_pred

In [None]:
map_path = dp.base + 'index_id_map/'

user_index2id_dict = pickle.load(open(map_path+'/user_index2id.pkl','rb'))
item_index2id_dict = pickle.load(open(map_path+'/item_index2id.pkl','rb'))

pred_df['customer_id'] = pred_df['customer_id'].map(user_index2id_dict)

In [None]:
pred_df['prediction'] = pred_df['prediction'].apply(lambda x:' '.join([str(item_index2id_dict[s]) for s in x]))

In [None]:
trans_week = data['trans'].loc[data['trans'].t_dat >= '2020-09-09']
top12_products = data['trans'].article_id.value_counts().index[:12].tolist()
top12_products = [item_index2id_dict[i] for i in top12_products]
top12_products = ' '.join([str(x) for x in top12_products])

In [None]:
sub = pd.read_csv('/content/drive/MyDrive/HM-RecSys/data/sample_submission.csv')
del sub['prediction']

In [None]:
sub = pd.merge(sub, pred_df, on=['customer_id'], how='left')
sub['prediction'][sub['prediction'].isna()] = top12_products

In [None]:
sub[['customer_id','prediction']].to_csv('/content/drive/MyDrive/HM-RecSys/submit/baseline.csv', index=None)

In [None]:
! mkdir ~/.kaggle
! cp /content/drive/MyDrive/HM-RecSys/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f /content/drive/MyDrive/HM-RecSys/submit/baseline.csv -m "baseline"