In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from datetime import datetime, timedelta

In [2]:
PATH_ILIAS = '../01_ilias'
ORDERS_BEFORE_DEC = PATH_ILIAS + '/orders_before_dec.csv'
TARGET_DEC = PATH_ILIAS + '/gold_dec.csv'
SUBMISSION_DEC = PATH_ILIAS + '/submission_dec.csv'

ORDERS_BEFORE_JAN = PATH_ILIAS + '/orders_before_jan.csv'
TARGET_JAN = PATH_ILIAS + '/gold_jan.csv'
SUBMISSION_JAN = PATH_ILIAS + '/submission_jan.csv'

In [3]:
string_cols = ['userID', 'itemID']

dict_dtypes = {x : 'str'  for x in string_cols}

before_dec = pd.read_csv(ORDERS_BEFORE_DEC, sep='|',  parse_dates=["date"])
target_dec = pd.read_csv(TARGET_DEC, sep='|')
submission_dec = pd.read_csv(SUBMISSION_DEC, sep='|')

before_jan = pd.read_csv(ORDERS_BEFORE_JAN, sep='|',  parse_dates=["date"])
target_jan = pd.read_csv(TARGET_JAN, sep='|')
submission_jan = pd.read_csv(SUBMISSION_JAN, sep='|')

In [4]:
def generate_days_since_last_purchase_feature(X_train):
    data =[]
    X_train = X_train.sort_values(by=['itemID', 'userID'])
    itemIDs = X_train["itemID"].unique()
    for itemID in tqdm(itemIDs):
        idx_x = 0
        idx_y = 0
        # user_partition of all items with this ID 
        user_partition = X_train[X_train["itemID"] == itemID]
        userIDs = user_partition["userID"].unique()
        life_cycles_per_item = []
        for userID in userIDs:
            # count occurences of this combination for partitioning
            occurences_of_combi = len(user_partition[user_partition["userID"] == userID])
            # calculate index end of partition
            idx_y += occurences_of_combi
            item_user_partition = user_partition[idx_x:idx_y]
            cycle_per_user = calculate_average_life_cycle_per_replenishment(item_user_partition)
            idx_x += occurences_of_combi
            data.append(cycle_per_user)
    return data

In [5]:
def collect_life_cycle_over_target_items(X_train, y_test):
    data =[]
    X_train = X_train.sort_values(by=['itemID', 'userID'])
    y_test = y_test.sort_values(by=['itemID', 'userID'])
    itemIDs = X_train["itemID"].unique()
    itemIDs_target = y_test["itemID"].unique()
    for itemID in tqdm(itemIDs):
        if not itemID in itemIDs_target:
            continue
        user_partition = X_train[X_train["itemID"] == itemID]
        intervals = user_partition['days_since_last_purchase'].dropna()
        mean_life_cycle_per_item = intervals.mean()
        results = np.hstack([itemID, mean_life_cycle_per_item])
        data.append(results)
    return data

In [6]:
def calculate_average_life_cycle_per_replenishment(partition):
    N = partition.shape[0]
    result = np.zeros((N , 5))
    diff = partition["date"].diff().dt.days.to_numpy()[1:]
    amount = partition["order"][:-1].to_numpy()
    temp = diff / amount
    temp = np.r_[np.nan, temp]
    temp = temp.reshape((N,1))
    result = np.hstack((partition, temp))
    return result

In [7]:
def train(X_train, y_test):
    jan_av = generate_days_since_last_purchase_feature(X_train)
    temp = np.concatenate(jan_av).ravel()
    df_av = pd.DataFrame(data=temp.reshape(-1, 5), columns=['date', 'userID', 'itemID', 'order', 'days_since_last_purchase'])
    # save df if necessary
    # df_av.to_csv('before_dec_with_intervals.csv', sep='|')
    item_life_cycle_mapping = collect_life_cycle_over_target_items(df_av, y_test)
    mapp = np.concatenate(item_life_cycle_mapping).ravel()
    mapp=mapp.reshape(-1, 2)
    df_mapping = pd.DataFrame(data=mapp, columns= ['itemID', 'days_average'])
    return df_mapping

In [8]:
START_PRED_DATE = datetime(2020,12,1)

def predict(X, y, averages):
    N = y.shape[0]
    y_np = y.to_numpy()
    for i in tqdm(range(N)):
        userID = y_np[i][0]
        itemID = y_np[i][1]
        user_partition = X[X['userID']== userID]
        items = user_partition[user_partition['itemID']== itemID]
        it = items.sort_values(by=['date'], ascending=False).iloc[0]
        average_to_add = averages[averages['itemID']== itemID]['days_average']
        if not np.isnan(average_to_add.iloc[0]):
            delta = average_to_add.iloc[0] * it['order']
        else:
            delta = 0
        new_purchase_date = it['date'] + timedelta(days=delta)
        daysDif = (new_purchase_date - START_PRED_DATE).days
        pred_week = daysDif // 7 + 1
        if pred_week > 4:
            pred_week = 0
        elif pred_week < 0:
            pred_week = 1
        y_np[i][2] = pred_week
    columns = y.columns
    pred = pd.DataFrame(data=y_np, columns=columns)
    return pred

In [9]:
def count_points(pred, gold):
    df = pd.merge(pred, gold, on=["userID", "itemID"], suffixes=("_pred", "_gold"))
    df["points"] = df.apply(_compute_points_for_row, axis=1)
    return df["points"].sum()

def _compute_points_for_row(row):
    y_pred, y_gold = row.prediction_pred, row.prediction_gold
    if y_pred == y_gold:
        # one point if "no order" (0) is predicted correctly; three points if order week is predicted correctly
        return 1 if y_pred == 0 else 3
    # one point if order is predicted correctly (but not the correct week), otherwise zero points
    return 1 if (y_pred > 0 and y_gold > 0) else 0

def get_score(pred, gold):
    points = count_points(pred, gold)
    max_points = count_points(gold, gold)
    score = points / max_points
    return score

In [10]:
def _run_(X_train, y_target, y_submission):
    model = train(X_train, y_submission)
    prediction = predict(X_train, y_submission, model)
    # save prediction if necessary
    #prediction.to_csv('prediction_jan.csv', sep='|')
    print(get_score(prediction, y_target))

In [11]:
_run_(before_dec, target_dec, submission_dec)

100%|██████████████████████████████████████████████████████████████████████████| 29043/29043 [00:08<00:00, 3321.44it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 9745/9745 [00:41<00:00, 233.77it/s]


0.4143324412726732
