In [1]:
import pandas as pd
import numpy as np
import gc
from preprocess import DataProcessor

In [2]:
from typing import Tuple, List

In [3]:
from tqdm import tqdm

In [4]:
class RuleDataProcessor(DataProcessor):
    """Data processor for rule predictor"""

    def __init__(self, data_dir:str):
        super(RuleDataProcessor, self).__init__(data_dir)
    
    def split_data(self, trans_data:pd.DataFrame, train_end_date:str, valid_end_date:str) -> Tuple[pd.DataFrame]:
        """Split transaction data into train set and valid set

        Args:
            trans_data (pd.DataFrame): transaction dataframe
            train_end_date (str): end date of train set, max(train_set.date) <= train_end_date
            valid_end_date (str): end date of valid set, max(valid_set.date) <= valid_end_date

        Returns:
            Tuple[pd.DataFrame]: [train set, valid_set]
        """
        train_set = trans_data.loc[trans_data['t_dat']<=train_end_date]
        valid_set = trans_data.loc[(train_end_date<trans_data['t_dat'])&(trans_data['t_dat']<=valid_end_date)]
        valid_set = valid_set.groupby(['customer_id'])['article_id'].apply(list).reset_index()

        return train_set, valid_set
    
    def history_purchase(self, train_set:pd.DataFrame, days:int=7) -> pd.DataFrame:
        """Filter history purchase items in the last n days

        Args:
            train_set (pd.DataFrame): transaction dataframe
            days (int, optional): length of history

        Returns:
            pd.DataFrame: filtered dataframe
        """
        train_set['t_dat'] = pd.to_datetime(train_set['t_dat'])
        tmp = train_set.groupby('customer_id').t_dat.max().reset_index()
        tmp.columns = ['customer_id','max_dat']
        res = train_set[['customer_id','article_id','t_dat']].merge(tmp,on=['customer_id'],how='left')
        res['diff_dat'] = (res.max_dat - res.t_dat).dt.days
        res = res.loc[res['diff_dat']<days].reset_index(drop=True)

        return res
    
    def get_freq_pair(self, train_set:pd.DataFrame, n:int=3) -> dict:
        """Generate dict of frequent item pairs in target time window

        Args:
            train_set (pd.DataFrame): transaction dataframe in target time window
            n (int, optional): search top *n* pairs for each article. Defaults to 3.

        Returns:
            dict: frequent item pairs
        """
        tmp = train_set.drop_duplicates(['customer_id','article_id'])
        s = tmp[['customer_id','article_id']].merge(tmp[['customer_id','article_id']],on='customer_id')
        s = s[s['article_id_x']!=s['article_id_y']]
        s['count'] = 1
        s = s.groupby(['article_id_x','article_id_y'],as_index=False)['count'].sum()
        s = s.sort_values('count', ascending=False).reset_index(drop=True)
        s = s.groupby('article_id_x')['article_id_y'].apply(lambda x:list(x)[:n])
        s = s.to_dict()

        res = {}
        for k in s.keys():
            for i in s[k]:
                res[k] = i
        return res

    def popular_item(self, train_set:pd.DataFrame, n:int=20) -> List[int]:
        """Generate list of popular items in target time windows

        Args:
            train_set (pd.DataFrame): transaction dataframe in target time window
            n (int, optional): return top *n* popular items. Defaults to 20.

        Returns:
            List[int]: popular items
        """
        tmp = train_set.drop_duplicates(['customer_id','article_id'])
        tmp['count'] = 1
        tmp = tmp.groupby(['article_id'],as_index=False)['count'].sum()
        tmp = tmp.sort_values(by='count',ascending=False)
        
        return tmp['article_id'].values[:n]
    
    def off_item(self, trans_data:pd.DataFrame) -> List[int]:
        """Find items that seem to be off stock

        Args:
            trans_data (pd.DataFrame): transaction_data

        Returns:
            List[int]: off stock items
        """
        item_sale = trans_data.copy()
        item_sale['t_dat'] = pd.to_datetime(item_sale['t_dat'])
        item_sale['year_month'] = (item_sale['t_dat'].dt.year).astype(str) + '_' + (item_sale['t_dat'].dt.month).astype(str)
        item_sale = item_sale.groupby(['article_id','year_month'])['customer_id'].agg('count').reset_index()
        item_sale.rename(columns={'customer_id':'count'}, inplace=True)

        item_sale = pd.pivot_table(item_sale, values='count', index='article_id', columns='year_month')
        item_sale = item_sale.fillna(0)
        mask = ((item_sale['2020_9'] - item_sale['2020_8']) / item_sale['2020_8']) < -0.8
        mask2 = item_sale['2020_9'] == 0

        return list(item_sale[mask | mask2].index)

In [5]:
dp = RuleDataProcessor('./data/')

In [6]:
# data = dp.preprocess_data(save=True, name='encoded_full')
data = dp.load_data('encoded_full')

In [7]:
trans = data['trans']
user = data['user']
item = data['item']

In [8]:
data = {}
data['trans'] = trans
data['item'] = item
data['user'] = user

# Validate

In [8]:
train, valid = dp.split_data(trans, '2020-09-15', '2020-09-22')

In [9]:
last_week = dp.history_purchase(train, days=7)
sept = train[pd.to_datetime(train['t_dat']).dt.month==9]

pairs1 = dp.get_freq_pair(last_week, n=1)
pairs2 = dp.get_freq_pair(sept, n=1)

last_week['article_id2'] = last_week['article_id'].map(pairs1)
sept['article_id2'] = sept['article_id'].map(pairs1)

train2 = last_week[['customer_id','article_id2']].copy()
train2 = train2.loc[train2.article_id2.notnull()]
train2 = train2.drop_duplicates(['customer_id','article_id2'])
train2 = train2.rename({'article_id2':'article_id'},axis=1)
train2['article_id'] = train2['article_id'].astype('int32')

train3 = sept[['customer_id','article_id2']].copy()
train3 = train3.loc[train3.article_id2.notnull()]
train3 = train3.drop_duplicates(['customer_id','article_id2'])
train3 = train3.rename({'article_id2':'article_id'},axis=1)
train3['article_id'] = train3['article_id'].astype('int32')

pred = pd.concat([last_week[['customer_id','article_id']], train2, sept[['customer_id','article_id']], train3])
pred = pred.drop_duplicates(['customer_id','article_id'])

off_stock_items = dp.off_item(trans)
pred = pred[~pred['article_id'].isin(off_stock_items)]
pred['article_id'] = ' '+ pred['article_id'].astype(str) + ' '
pred = pred.groupby('customer_id')['article_id'].sum().reset_index()
pred.columns = ['customer_id','prediction']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set['t_dat'] = pd.to_datetime(train_set['t_dat'])


In [None]:
top12 = dp.popular_item(trans[trans['t_dat']>'2020-09-15'], n=12)
valid = pd.merge(valid, pred, on=['customer_id'], how='left')
valid['prediction'] = valid['prediction'].fillna('')
valid['prediction'] += ' '.join([str(x) for x in top12])
valid['prediction'] = valid['prediction'].apply(lambda x:[int(i) for i in x.split()[:12]])

In [None]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=12):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted) if a]) # CHANGES: ignore null actual (variable=a)

In [None]:
mapk(valid['article_id'], valid['prediction'], k=12)

0.023569954298159303

In [None]:
mapk(valid['article_id'], valid['prediction'], k=12)

0.02245225856110233

In [None]:
mapk(valid['article_id'], valid['prediction'], k=12)

0.021503000716863493

# Predict

In [9]:
last_week = dp.history_purchase(trans, days=7)
sept = trans[pd.to_datetime(trans['t_dat']).dt.month==9]

pairs1 = dp.get_freq_pair(last_week, n=1)
pairs2 = dp.get_freq_pair(sept, n=1)

last_week['article_id2'] = last_week['article_id'].map(pairs1)
sept['article_id2'] = sept['article_id'].map(pairs1)

train2 = last_week[['customer_id','article_id2']].copy()
train2 = train2.loc[train2.article_id2.notnull()]
train2 = train2.drop_duplicates(['customer_id','article_id2'])
train2 = train2.rename({'article_id2':'article_id'},axis=1)
train2['article_id'] = train2['article_id'].astype('int32')

train3 = sept[['customer_id','article_id2']].copy()
train3 = train3.loc[train3.article_id2.notnull()]
train3 = train3.drop_duplicates(['customer_id','article_id2'])
train3 = train3.rename({'article_id2':'article_id'},axis=1)
train3['article_id'] = train3['article_id'].astype('int32')

pred = pd.concat([last_week[['customer_id','article_id']], train2, sept[['customer_id','article_id']], train3])
pred = pred.drop_duplicates(['customer_id','article_id'])

off_stock_items = dp.off_item(trans)
pred = pred[~pred['article_id'].isin(off_stock_items)]
pred['article_id'] = ' '+ pred['article_id'].astype(str) + ' '
pred = pred.groupby('customer_id')['article_id'].sum().reset_index()
pred.columns = ['customer_id','prediction']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sept['article_id2'] = sept['article_id'].map(pairs1)


In [22]:
sub = pd.read_csv('./data/sample_submission.csv')

In [23]:
import pickle

In [24]:
map_path = dp.base + 'index_id_map/'

user_index2id_dict = pickle.load(open(map_path+'/user_index2id.pkl','rb'))
item_index2id_dict = pickle.load(open(map_path+'/item_index2id.pkl','rb'))

In [25]:
pred['customer_id'] = pred['customer_id'].map(user_index2id_dict)

In [26]:
del sub['prediction']
top12 = dp.popular_item(trans[trans['t_dat']>'2020-09-15'], n=12)
sub = pd.merge(sub, pred, on=['customer_id'], how='left')
sub['prediction'] = sub['prediction'].fillna('')
sub['prediction'] += ' '.join([str(x) for x in top12])
sub['prediction'] = sub['prediction'].apply(lambda x:[int(i) for i in x.split()[:12]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['count'] = 1


In [None]:
sub['prediction'] = sub['prediction'].apply(lambda x:' '.join(['0'+str(item_index2id_dict[i]) for i in x]))

In [None]:
sub.to_csv('submission.csv',index=False)