In [1]:
import gc
import glob
import pandas
import numpy as np # linear algebra

import os
# Walks every directory below /kaggle/input. The print is commented out, so
# the loop body is a no-op and nothing is listed or collected.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        #print(os.path.join(dirname, filename))
        pass

In [2]:
# ==================================================================
# ----- Session configuration for experiments -----------------------
# ==================================================================
# The flags below are read by later cells: LOCAL gates the localMetrics call,
# SAVE gates the parquet/CSV writes, SHOW gates display(), GPU switches `pd`
# between cudf and pandas.
LOCAL = True  # Metric type. True - local, False - Kaggle
SAVE  = True # Whether to save files. Needed when saving the dataset
SHOW  = True # Whether to display tables along the way
GPU   = False # Use GPU
# -----------------------------------------------------------------
params = {
    # 'path' is appended to '../input' when the parquet file is read.
    #'path': '/otto-full-optimized-memory-footprint/train.parquet',
    'path': '/otto-analyse-data/train_no_free_orders.parquet',
    # Width of the held-out window passed to trainTestSplit (subtracted from max 'ts').
    # NOTE(review): 7*24*60*60 is one week only if 'ts' is in seconds — confirm.
    'time_elapsed': 7*24*60*60,
    # Optional: when present and truthy, the loaded frame is down-sampled to this many rows.
    #'sample': 100000
}

In [3]:
# Bind `pd` to the dataframe backend chosen by the GPU flag and report it.
# cudf is imported only on the GPU path so CPU-only environments do not need it.
if GPU:
    import cudf
    pd = cudf
    print('We will use RAPIDS version', cudf.__version__)
else:
    pd = pandas
    print('We will use CPU...')

We will use CPU...


In [4]:
# -----------------------------------------------------------
def dropCopies(df, column='labels'):
    """Return a copy of ``df`` whose ``column`` lists have duplicates removed.

    The first occurrence of every item is kept and the original item order is
    preserved. The input frame is not modified.
    """
    def unique_in_order(values):
        # dict keys keep insertion order, so this is an order-preserving dedup.
        return list(dict.fromkeys(values))

    deduped = df.copy()
    deduped[column] = deduped[column].apply(unique_in_order)
    return deduped
# -----------------------------------------------------------
def cloneEvents(df):
    """Replicate ``df`` once per event type (0, 1, 2).

    Each replica is a copy of ``df`` with its ``type`` column set to the event
    type (an existing ``type`` column is overwritten). The replicas are stacked
    in type order; the original index is kept, so index values repeat three
    times. The input frame is not modified.
    """
    # Collect the replicas and concatenate once, instead of repeatedly
    # concatenating onto a growing (initially empty) frame.
    clones = []
    for event_type in (0, 1, 2):
        clone = df.copy()
        clone['type'] = event_type
        clones.append(clone)
    return pd.concat(clones)
# -----------------------------------------------------------
def getLastNoTypeOnPercent(df, count=0.5, order=[1, 0 ,2], fill=True):
    """Build one candidate list per session from a share of its own aids.

    Steps, as implemented:
      1. Group ``df`` by (session, type) and collapse every other column into
         an ordered list of unique values.
      2. Truncate each ``aid`` list to its first ``round(len * count)`` items
         (at least one item is always kept).
      3. If ``order`` is a list, sort the rows by the position of ``type`` in
         ``order``, so the aids of the earlier-listed types come first when
         the lists are joined.
      4. Group by session and concatenate the ``aid`` lists into ``labels``.
      5. Remove duplicate aids with ``dropCopies``.
      6. If ``fill`` is true, replicate every row for types 0, 1 and 2 with
         ``cloneEvents``.

    Parameters:
        df:    frame with at least ``session``, ``type`` and ``aid`` columns.
        count: share of each per-(session, type) aid list to keep.
        order: type priority used for sorting; a type missing from this list
               makes ``order.index`` raise ValueError. Pass a non-list
               (e.g. None) to skip sorting.
        fill:  whether to return one row per (session, type) or per session.

    Returns:
        Frame with ``session`` and ``labels`` (plus ``type`` when ``fill``).

    NOTE(review): the slice ``x[:rlen]`` keeps the FIRST items of each list in
    row order, while the function name says "Last" — confirm which is intended.
    """
    def sort(x):
        # Sort key: rank of each type value inside `order`.
        return x.map(lambda x: order.index(x))
    def apply(x):
        # Keep the leading share of the list, never fewer than one item.
        rlen = round(len(x) * count)
        return x[:rlen if rlen > 0 else 1]
    _ = df.copy().groupby(['session', 'type']).agg(lambda x: list(dict.fromkeys(x))).reset_index()
    _['aid'] = _['aid'].apply(apply)
    if order is not None and type(order) is list:
        _.sort_values(by='type', inplace=True, key=sort)
    # `sum` over a column of lists concatenates them in the current row order.
    _ = _.groupby('session').agg(labels = ('aid', sum)).reset_index()
    _ = dropCopies(_)
    return cloneEvents(_) if fill else _

In [5]:
# ----- Загрузка данных -----

In [6]:
%%time
# ~ 20s CPU
# ~ 21s GPU
# Read the parquet file at '../input' + params["path"] with whichever backend `pd` is bound to.
data = pd.read_parquet('../input' + params["path"])
# Optional down-sampling; only runs when params has a truthy 'sample' entry.
# NOTE(review): no random_state is passed, so the sampled rows differ between runs.
if params.get('sample'):
    data = data.sample(params['sample'])
if SHOW: display(data)

Unnamed: 0,session,aid,ts,type
6,0,1649869,1659369893,1
7,0,461689,1659369898,1
8,0,305831,1659370027,2
9,0,461689,1659370027,2
43,0,789245,1659710157,1
...,...,...,...,...
216715973,12899732,1126169,1661723928,1
216715991,12899739,1379999,1661723968,1
216715993,12899739,301163,1661723986,1
216716044,12899757,1677695,1661723953,1


CPU times: user 1.53 s, sys: 2.14 s, total: 3.68 s
Wall time: 4.73 s


In [7]:
# ----- Разделение датасета -----

In [8]:
def trainTestSplit(df, time_elapsed=7*24*60*60, log=False, seed=None):
    """Hold out the last ``time_elapsed`` of ``df`` and cut every held-out session in two.

    Rows with ``ts`` greater than ``df['ts'].max() - time_elapsed`` form the
    held-out part. Sessions that also have rows at or before that cutoff are
    removed from it. Every remaining session with more than one row is cut at
    a random position: the leading rows become the visible part and the
    trailing rows the hidden part. Sessions with a single row stay in the
    first returned frame only.

    Parameters:
        df:           frame with ``session`` and ``ts`` columns; rows of a
                      session are split in the order they appear in ``df``.
        time_elapsed: width of the held-out window, in the units of ``ts``.
        log:          print progress while iterating over sessions.
        seed:         optional seed for the cut positions. ``None`` keeps using
                      NumPy's global random state.

    Returns:
        (test, ntest, valid)
          test  - all held-out rows of non-overlapping sessions (original index),
          ntest - the visible leading rows of each split session (index reset),
          valid - the hidden trailing rows of each split session (index reset),
                  with the same raw columns as ``df``.
        The rows at or before the cutoff are used only for the overlap check
        and are not returned.
    """
    train_cutoff = df['ts'].max() - time_elapsed
    train = df[df['ts'] <= train_cutoff]
    test = df[df['ts'] > train_cutoff]

    # Drop held-out sessions that already have rows before the cutoff. A plain
    # membership test gives the same result as an inner merge on 'session'
    # without building the many-to-many join.
    test = test[~test['session'].isin(train['session'].unique())]

    # Seeded generator when requested, NumPy's global state otherwise.
    rng = np.random if seed is None else np.random.RandomState(seed)

    new_test = []
    data_to_calculate_validation_score = []
    chunks = test['session'].nunique()
    percent = -1

    for cnt, (_, events) in enumerate(test.groupby('session')):
        if log:
            done = round(cnt * 100 / chunks)
            if done != percent:
                percent = done
                print('\r', f'{percent}% chunk: {cnt+1}/{chunks+1}', end='   ')
        n_events = events.shape[0]
        if n_events > 1:
            # Cut position in [1, n_events - 1]: both halves are non-empty.
            cutoff = rng.randint(1, n_events)
            new_test.append(events.iloc[:cutoff])
            data_to_calculate_validation_score.append(events.iloc[cutoff:])
    if log:
        print('\r', f'100% chunk: {chunks+1}/{chunks+1}')

    if new_test:
        ntest = pd.concat(new_test).reset_index(drop=True)
        valid = pd.concat(data_to_calculate_validation_score).reset_index(drop=True)
    else:
        # No session had more than one row: pd.concat([]) would raise, so
        # return empty frames that still carry the input columns.
        ntest = test.iloc[0:0].reset_index(drop=True)
        valid = test.iloc[0:0].reset_index(drop=True)
    return test, ntest, valid

In [9]:
%%time
# ~ 10s CPU (100000)
# ~ 1m 20s GPU (100000)
# NOTE(review): trainTestSplit returns (test, ntest, valid), so after this
# unpacking `train` holds the full held-out sessions (not the training period),
# `test` holds their visible leading part and `labels` the hidden trailing rows
# with raw 'aid'/'ts' columns. 'local_train.parquet' therefore does not contain
# the training period — confirm this is intended.
train, test, labels = trainTestSplit(data, params['time_elapsed'], log=True)
if SAVE:
    train.to_parquet('local_train.parquet', index=False)
    test.to_parquet('local_test.parquet', index=False)
    labels.to_parquet('local_labels.parquet', index=False)
if SHOW: display(train, test, labels)

 100% chunk: 723933/723933


Unnamed: 0,session,aid,ts,type
328,2,161269,1661688306,1
9125,84,1236775,1661718709,1
12524,126,923529,1661197237,1
13513,136,87442,1661676627,1
23670,260,1648747,1661241743,1
...,...,...,...,...
216715973,12899732,1126169,1661723928,1
216715991,12899739,1379999,1661723968,1
216715993,12899739,301163,1661723986,1
216716044,12899757,1677695,1661723953,1


Unnamed: 0,session,aid,ts,type
0,382,1375190,1661200758,1
1,486,500819,1661245453,1
2,486,21885,1661245839,1
3,486,1229471,1661245989,1
4,486,1801381,1661246269,1
...,...,...,...,...
1177163,12899595,1449873,1661723801,1
1177164,12899608,327026,1661723930,1
1177165,12899631,1055835,1661723830,1
1177166,12899644,436912,1661723924,1


Unnamed: 0,session,aid,ts,type
0,382,130264,1661200766,1
1,486,1688710,1661246349,1
2,486,319580,1661247784,1
3,486,1633488,1661247959,1
4,486,1229471,1661248339,2
...,...,...,...,...
1178677,12899595,1170894,1661723925,1
1178678,12899608,327026,1661723944,1
1178679,12899631,1807283,1661723972,1
1178680,12899644,822934,1661723940,1


CPU times: user 2min 34s, sys: 7.28 s, total: 2min 41s
Wall time: 2min 41s


In [10]:
# ----- Проверка на наличие неизвестных товаров -----

In [11]:
def getIntersection(r, l):
    """Return the set of items common to ``l`` and ``r`` and print the overlap.

    The printed percentage is the number of shared items relative to
    ``len(l)``. An empty ``l`` reports 0.0 % instead of dividing by zero.

    Parameters:
        r: iterable of reference items.
        l: sized iterable of items to look up in ``r``.

    Returns:
        set of items present in both ``l`` and ``r``.
    """
    inter = set(l).intersection(set(r))
    i = len(inter)
    m = len(l)
    # Guard against ZeroDivisionError when `l` is empty.
    share = round(i * 100 / m, 2) if m else 0.0
    print('Пересекается', i, 'из', m, 'или', share, '% товаров')
    return inter

In [12]:
# Unique aids in the visible test rows (raids) and in the hidden label rows (laids).
raids = test['aid'].unique()
laids = labels['aid'].unique()
# Prints the share of label aids that also occur in test and keeps the shared aids.
iaids = getIntersection(raids, laids)

Пересекается 213854 из 325941 или 65.61 % товаров


In [13]:
res = {}  # not read anywhere in the visible part of the notebook
l = len(raids)
ll = labels.copy()
# Every label aid that is not in `iaids` is replaced by raids[aid % l], i.e.
# mapped onto some aid that does occur in the test rows.
# NOTE(review): this rewrites the ground-truth labels, so any metric computed
# from `labels` afterwards is measured against altered targets — confirm intended.
ll['aid'] = ll['aid'].apply(lambda x: raids[x % l] if x not in iaids else x)
_ = getIntersection(raids, ll['aid'].unique())
# NOTE(review): rebinding `labels` makes this cell non-idempotent: `iaids` is not
# recomputed here, so re-running the cell remaps the already substituted aids again.
labels = ll

Пересекается 245963 из 245963 или 100.0 % товаров


In [14]:
# ----- Вычисление последних 50% -----

In [15]:
%%time
# ~s
# Candidates per session: count=0.5 keeps the leading half of each
# per-(session, type) aid list from `test`; default order and fill are used.
result = getLastNoTypeOnPercent(test, 0.5)
if SHOW: display(result)

Unnamed: 0,session,labels,type
0,382,[1375190],0
1,486,"[500819, 21885]",0
2,524,[1552245],0
3,639,[1321398],0
4,692,[1662401],0
...,...,...,...
465350,12899595,[1449873],2
465351,12899608,[327026],2
465352,12899631,[1055835],2
465353,12899644,[436912],2


CPU times: user 10.9 s, sys: 222 ms, total: 11.2 s
Wall time: 11.2 s


In [16]:
def localMetrics(preds, df_true):
    """Compute and print a weighted recall@20 of ``preds`` against ``df_true``.

    Parameters:
        preds:   frame with ``session``, ``type`` and ``labels`` (list of
                 predicted aids); only the first 20 labels are scored.
        df_true: frame with ``session``, ``type``, ``aid`` and ``ts`` rows of
                 ground-truth events.

    Ground truth is collapsed to at most 20 unique aids per (session, type).
    Per-type recall is ``sum(hits) / sum(gt_count)``. The score is the sum of
    the recalls weighted 0.10 / 0.30 / 0.60 for types 0 / 1 / 2; a type with
    no ground-truth rows contributes nothing. Ground-truth rows without a
    matching prediction count as zero hits.

    Returns:
        The weighted score. Also prints the score and the per-type recalls.
    """
    lsub = preds.copy()
    lsub['labels'] = lsub['labels'].apply(lambda x: x[:20])

    test_labels = df_true.copy().drop('ts', axis=1)\
            .groupby(['session', 'type']).agg(lambda x: list(dict.fromkeys(x))[:20]).reset_index()
    test_labels = test_labels.merge(lsub, how='left', on=['session', 'type'])

    # The left merge leaves NaN in 'labels' for (session, type) pairs that have
    # no prediction; set(NaN) would raise TypeError, so treat them as empty.
    test_labels['labels'] = test_labels['labels'].apply(
        lambda x: x if hasattr(x, '__iter__') else [])

    test_labels['hits']     = test_labels.apply(lambda df: len(set(df['aid']).intersection(set(df['labels']))), axis=1)
    test_labels['gt_count'] = test_labels['aid'].str.len().clip(0,20)

    recall_per_type = test_labels.groupby(['type'])['hits'].sum() / test_labels.groupby(['type'])['gt_count'].sum()
    # Types absent from recall_per_type align to NaN and are skipped by sum().
    score = (recall_per_type * pd.Series({0: 0.10, 1: 0.30, 2: 0.60})).sum()

    # Per-type recalls for the report line; types without ground truth print as 0.
    r = {0:0, 1:0, 2:0}
    for key in r.keys():
        if test_labels[test_labels['type'] == key].shape[0] > 0:
            r[key] = recall_per_type[key]
    print(f"{score:.3f} = {r[0]:.3f} + {r[1]:.3f} + {r[2]:.3f}")
    print('score:', score)
    return score

In [17]:
%%time
# ~s
# Score the predictions against the held-out labels only when LOCAL is set.
# NOTE(review): the trailing numbers look like scores recorded for count = 0.25 / 0.5 / 1.0 — confirm.
if LOCAL:
    localMetrics(result, labels) # 0.25 - 0.277, 0.5 - 0.323, 1.0 - 0.460

0.323 = 0.000 + 0.113 + 0.481
score: 0.32257092518623864
CPU times: user 17.3 s, sys: 153 ms, total: 17.4 s
Wall time: 17.5 s


In [18]:
def getSubmission(df):
    """Convert a predictions frame into the two-column submission layout.

    Rows containing missing values are dropped. ``labels`` becomes a
    space-separated string of unique aids in their original order, and
    ``session_type`` is ``"<session>_<clicks|carts|orders>"`` derived from the
    numeric ``type``. The ``session`` and ``type`` columns are removed.
    The input frame is not modified.
    """
    type_names = {0: 'clicks', 1: 'carts', 2: 'orders'}

    def join_unique(aids):
        # Order-preserving dedup, then render as "aid aid aid".
        unique_aids = list(dict.fromkeys(aids))
        return ' '.join(str(aid) for aid in unique_aids)

    frame = df.copy().dropna()
    frame['labels'] = frame['labels'].map(join_unique)
    frame['type'] = frame['type'].map(type_names)
    frame['session_type'] = frame['session'].astype('str') + '_' + frame['type']
    return frame.drop(['session', 'type'], axis=1)

In [19]:
# ----- PREDICT -----

In [20]:
%%time
# ~s
# Format the predictions for submission; the CSV is written only when SAVE is set.
submission = getSubmission(result)
if SAVE: submission.to_csv('submission.csv', index=False)
if SHOW: display(submission)

Unnamed: 0,labels,session_type
0,1375190,382_clicks
1,500819 21885,486_clicks
2,1552245,524_clicks
3,1321398,639_clicks
4,1662401,692_clicks
...,...,...
465350,1449873,12899595_orders
465351,327026,12899608_orders
465352,1055835,12899631_orders
465353,436912,12899644_orders


CPU times: user 6.03 s, sys: 330 ms, total: 6.35 s
Wall time: 6.38 s
