In [1]:
from datetime import datetime

import gc
import glob
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from collections import defaultdict, Counter

#import cudf as cd
#print('We will use RAPIDS version',cudf.__version__)

import os
# Print the full path of every file found under the Kaggle input directory.
# NOTE: the loop variables (dirname, _, filenames, filename) stay defined at
# notebook level after this cell has run.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/otto-train-and-test-data-for-local-validation/test_labels.parquet
/kaggle/input/otto-train-and-test-data-for-local-validation/id2type.pkl
/kaggle/input/otto-train-and-test-data-for-local-validation/train.parquet
/kaggle/input/otto-train-and-test-data-for-local-validation/type2id.pkl
/kaggle/input/otto-train-and-test-data-for-local-validation/test.parquet
/kaggle/input/prepaired-files/trim_0.parquet
/kaggle/input/prepaired-files/trim_6.parquet
/kaggle/input/prepaired-files/trim_8.parquet
/kaggle/input/prepaired-files/trim_15.parquet
/kaggle/input/prepaired-files/trim_19.parquet
/kaggle/input/prepaired-files/trim_22.parquet
/kaggle/input/prepaired-files/trim_13.parquet
/kaggle/input/prepaired-files/trim_14.parquet
/kaggle/input/prepaired-files/trim_4.parquet
/kaggle/input/prepaired-files/trim_11.parquet
/kaggle/input/prepaired-files/trim_17.parquet
/kaggle/input/prepaired-files/trim_3.parquet
/kaggle/input/prepaired-files/trim_20.parquet
/kaggle/input/prepaired-files/trim_2

In [2]:
def scoreTime(name):
    '''Decorator factory that prints a timestamp before and after a call.

    Args:
        name: label printed in front of the ``start`` / ``end`` messages.

    Returns:
        A decorator. The decorated function prints ``<name> : start`` with
        the current time, runs the wrapped function, prints ``<name> : end``
        with the current time and then returns whatever the wrapped
        function returned.
    '''
    # Local import so the block does not depend on a new notebook-level import.
    from functools import wraps

    def wrapper(func):
        @wraps(func)  # keep the wrapped function's __name__ / __doc__
        def function(*args, **kwargs):
            print('\r', name, ': start\t', datetime.now())
            result = func(*args, **kwargs)
            print('\r', name, ': end\t\t', datetime.now())
            # Hand the result back to the caller instead of discarding it.
            return result
        return function
    return wrapper
# ---------------------------------------------------------------------
def compressMatrix():
    '''Collapse duplicate (aid_x, aid_y) rows of the global ``resMatrix``.

    Rows of the module-level DataFrame ``resMatrix`` are grouped by the
    ``aid_x`` / ``aid_y`` pair, their ``wgt`` values are summed, and the
    global is rebound to the aggregated frame (columns aid_x, aid_y, wgt).
    '''
    global resMatrix
    by_pair = resMatrix.groupby(['aid_x', 'aid_y'])
    summed_weights = by_pair['wgt'].sum()
    resMatrix = summed_weights.reset_index()

In [3]:
@scoreTime('coMatrix')
def coMatrix(df, time_elapsed, func, log=False):
    '''Split the data into consecutive time windows and process each one.

    Args:
        df: DataFrame with a ``ts`` column. It no longer has to be sorted by
            ``ts``: the time range is taken from the column's min / max.
        time_elapsed: window length, in the same unit as ``ts``.
        func: callable invoked as ``func(window_df)`` for every window that
            contains more than one row.
        log: when true, print a progress line each time the rounded
            percentage changes.

    Windows are half-open, ``[start, start + time_elapsed)``, so a row whose
    timestamp falls exactly on a window boundary is processed once, not in
    two neighbouring windows. The last window still contains ``max_ts``
    because ``chunks * time_elapsed`` is always greater than
    ``max_ts - min_ts``.
    '''
    if df.shape[0] == 0:
        # Nothing to split; an empty frame has no first / last timestamp.
        return
    min_ts, max_ts = df['ts'].min(), df['ts'].max()
    chunks = round((max_ts - min_ts) / time_elapsed) + 1
    percent = -1
    for i in range(0, chunks):
        if (log) and (round(i * 100 / chunks) != percent):
            percent = round(i * 100 / chunks)
            print('\r', f'{percent}% chunk: {i}/{chunks}', end='   ')
        start = min_ts + i * time_elapsed
        in_window = (df['ts'] >= start) & (df['ts'] < start + time_elapsed)
        df_chank = df[in_window]
        # A single row cannot form a pair, so such windows are skipped.
        if df_chank.shape[0] > 1:
            func(df_chank)
    print('\r', f'100% chunk: {chunks}/{chunks}')

In [4]:
@scoreTime('coMatrixWithFile')
def coMatrixWithFile(func, log=False):
    '''Run ``func`` on every pre-split parquet chunk, one file at a time.

    Args:
        func: callable invoked as ``func(chunk_df)`` for every file that
            contains more than one row.
        log: when true, print a progress line each time the rounded
            percentage changes.

    Files are processed in sorted path order so that repeated runs visit
    the chunks in the same sequence.
    '''
    # NOTE(review): the pattern matches every .parquet file in the directory;
    # confirm that only chunk files live there.
    files = sorted(glob.glob('../input/prepaired-files/*.parquet'))
    chunks = len(files)
    percent = -1
    for i, chunk in enumerate(files):
        if (log) and (round(i * 100 / chunks) != percent):
            percent = round(i * 100 / chunks)
            # 1-based position out of the real number of files.
            print('\r', f'{percent}% chunk: {i + 1}/{chunks}', end='   ')
        df_chank = pd.read_parquet(chunk)
        # A single row cannot form a pair, so such files are skipped.
        if df_chank.shape[0] > 1:
            func(df_chank)
        #compressMatrix()
    print('\r', f'100% chunk: {chunks}/{chunks}')

In [5]:
def getPreds(preds):
    '''Flatten a prediction dict into a long DataFrame ready for saving.

    Args:
        preds: dict whose values are either
            * flat:   ``{aid: labels}``, or
            * nested: ``{for_key: {aid: labels}}``.

    Returns:
        DataFrame with columns ``aid``, ``type`` (the outer key of ``preds``)
        and ``labels``; a ``for`` column (the middle key) is added when a
        nested entry is present.

    An entry is treated as nested when it is non-empty and all of its values
    are dicts. The previous check (``len(...) == 3``) mistook a flat entry
    holding exactly three aids for a nested one and failed on it.
    '''
    preds2 = {'aid': [], 'type': [], 'labels': []}
    for i, per_type in preds.items():
        nested = bool(per_type) and all(
            isinstance(value, dict) for value in per_type.values()
        )
        if not nested:
            for aid, labels in per_type.items():
                preds2['aid'].append(aid)
                preds2['type'].append(i)
                preds2['labels'].append(labels)
        else:
            # Create the extra column lazily, the first time it is needed.
            preds2.setdefault('for', [])
            for j, per_source in per_type.items():
                for aid, labels in per_source.items():
                    preds2['aid'].append(aid)
                    preds2['type'].append(i)
                    preds2['for'].append(j)
                    preds2['labels'].append(labels)
    return pd.DataFrame(preds2)

In [6]:
# ====================================================================
# ----- Matrix computation ignoring the event type -------------------
# ====================================================================
def noType(x):
    '''Feed every session's ``aid`` values to ``aggMatrixNoType``.

    ``x`` must have ``session`` and ``aid`` columns. The groupby/agg call is
    used only for the side effect of ``aggMatrixNoType``; its result is
    discarded and nothing is returned.
    '''
    # NOTE(review): this relies on agg calling the function once per group - confirm for the pandas version in use.
    x[['session', 'aid']].groupby('session').agg(aggMatrixNoType)
# -------------------------------------------------
def aggMatrixNoType(x):
    '''Count every ordered pair of different aids found in ``x``.

    ``x`` is an iterable of aids. For each ordered pair (left, right) with
    ``left != right`` the counter ``resMatrix[left][right]`` (a module-level
    object) is increased by 1; repeated values in ``x`` are counted each
    time they occur.
    '''
    for left in x:
        for right in x:
            if left == right:
                # An item is never paired with itself.
                continue
            resMatrix[left][right] += 1
# ====================================================================
# ----- Matrix computation taking the event type into account --------
# ====================================================================
def withType(x):
    '''Run ``aggMatrixWithType`` per session, separately for types 0, 1, 2.

    ``x`` must have ``session``, ``aid`` and ``type`` columns. For each type
    the matching rows are grouped by session and passed to
    ``aggMatrixWithType``; the groupby/agg result is discarded and nothing is
    returned.
    '''
    for i in [0, 1, 2]:
        # aggMatrixWithType reads the type being processed from this
        # module-level variable, so it is set before every pass.
        global activeType
        activeType = i
        x[x['type'] == i][['session', 'aid']].groupby('session').agg(aggMatrixWithType)
# -------------------------------------------------
def aggMatrixWithType(x):
    '''Count ordered pairs of different aids under the active type.

    ``x`` is an iterable of aids. For each ordered pair (left, right) with
    ``left != right`` the counter ``resMatrix[activeType][left][right]`` is
    increased by 1; both ``resMatrix`` and ``activeType`` are module-level
    names.
    '''
    for left in x:
        for right in x:
            if left == right:
                # An item is never paired with itself.
                continue
            resMatrix[activeType][left][right] += 1
# ====================================================================
# ----- Matrix computation ignoring the event type (self-merge) ------
# ====================================================================
def mergeSimple(df):
    '''Add 1 to ``resMatrix`` for every distinct aid pair within a session.

    ``df`` (columns ``session`` and ``aid``) is joined with itself on
    ``session``. Pairs of equal aids are dropped and each
    (session, aid_x, aid_y) combination is kept once, so a pair is counted
    at most once per session. The counts go into the module-level
    ``resMatrix[aid_x][aid_y]``.
    '''
    pairs = df.merge(df, on='session').loc[:, ['session', 'aid_x', 'aid_y']]
    different = pairs['aid_x'] != pairs['aid_y']
    unique_pairs = pairs.loc[different].drop_duplicates(
        subset=['session', 'aid_x', 'aid_y']
    )
    for left, right in zip(unique_pairs['aid_x'], unique_pairs['aid_y']):
        resMatrix[left][right] += 1
# ====================================================================
def mergeSimpleDF(df):
    '''Count distinct aid pairs per session and save them to a parquet file.

    ``df`` (columns ``session`` and ``aid``) is joined with itself on
    ``session``. Pairs of equal aids are dropped and each
    (session, aid_x, aid_y) combination is kept once. The number of sessions
    per (aid_x, aid_y) pair is stored in ``wgt`` and the frame is written to
    ``preds_<i>.parquet``, where ``i`` is a module-level counter that is
    increased by 1 after each call.

    The counter starts at 0 when the notebook has not defined ``i`` yet.
    '''
    global i
    # Fall back to 0 instead of failing with NameError on a fresh kernel.
    i = globals().get('i', 0)
    pairs = df.merge(df, on='session')[['session', 'aid_x', 'aid_y']]
    pairs = pairs[pairs['aid_x'] != pairs['aid_y']]
    pairs = pairs.drop_duplicates(['session', 'aid_x', 'aid_y'])
    # size() counts rows per pair directly, so no helper column has to be
    # assigned on a slice (which triggered SettingWithCopyWarning).
    counts = (
        pairs.groupby(['aid_x', 'aid_y'])
        .size()
        .reset_index(name='wgt')
        # NOTE: uint16 holds values up to 65535.
        .astype({'aid_x': 'uint32', 'aid_y': 'uint32', 'wgt': 'uint16'})
    )
    counts.to_parquet(f'preds_{i}.parquet')
    i += 1
# --------------------------------------------------------------------
def mergeSimpleWithType(df):
    '''Count distinct aid pairs per session, bucketed by the second item's type.

    ``df`` (columns ``session``, ``aid``, ``type``) is joined with itself on
    ``session``. Pairs of equal aids are dropped and each
    (session, aid_x, aid_y) combination is kept once (the first row, with
    its ``type_y``). Every remaining row adds 1 to the module-level
    ``resMatrix[type_y][aid_x][aid_y]``.
    '''
    joined = df.merge(df, on='session')
    joined = joined[['session', 'aid_x', 'aid_y', 'type_y']]
    different = joined['aid_x'] != joined['aid_y']
    unique_pairs = joined[different].drop_duplicates(['session', 'aid_x', 'aid_y'])
    rows = zip(unique_pairs['aid_x'], unique_pairs['aid_y'], unique_pairs['type_y'])
    for left, right, right_type in rows:
        resMatrix[right_type][left][right] += 1
# --------------------------------------------------------------------
def mergeWithHardType(df):
    '''Add type-dependent weights for distinct aid pairs within a session.

    ``df`` (columns ``session``, ``aid``, ``type``) is joined with itself on
    ``session``. Pairs of equal aids are dropped and each
    (session, aid_x, aid_y) combination is kept once (the first row, with
    its types). Every remaining row adds ``pair_weights[type_x][type_y]`` to
    the module-level ``resMatrix[type_y][aid_x][aid_y]``.
    '''
    # Outer position = type of the first item, inner key = type of the second.
    pair_weights = [
        {0: 9, 1: 3, 2: 1},
        {0: 5, 1: 3, 2: 1},
        {0: 1, 1: 5, 2: 3},
    ]
    joined = df.merge(df, on='session')
    joined = joined[['session', 'aid_x', 'aid_y', 'type_x', 'type_y']]
    different = joined['aid_x'] != joined['aid_y']
    unique_pairs = joined[different].drop_duplicates(['session', 'aid_x', 'aid_y'])
    rows = zip(
        unique_pairs['aid_x'],
        unique_pairs['aid_y'],
        unique_pairs['type_x'],
        unique_pairs['type_y'],
    )
    for left, right, left_type, right_type in rows:
        resMatrix[right_type][left][right] += pair_weights[left_type][right_type]
# ====================================================================
# --- Weighted matrix computation ------------------------------------
# ====================================================================
def mergeWeighted(df):
    '''Add a weight based on the second item's type for each distinct pair.

    ``df`` (columns ``session``, ``aid``, ``type``) is joined with itself on
    ``session``. ``type_y`` is mapped to a weight (0 -> 1, 1 -> 3, 2 -> 6).
    Pairs of equal aids are dropped and each (session, aid_x, aid_y)
    combination is kept once (the first row). The weight of every remaining
    row is added to the module-level ``resMatrix[aid_x][aid_y]``.
    '''
    type_weights = {0: 1, 1: 3, 2: 6}
    joined = df.merge(df, on='session')
    joined = joined[['session', 'aid_x', 'aid_y', 'type_y']]
    joined = joined.assign(wgt=joined['type_y'].map(type_weights).astype('uint8'))
    different = joined['aid_x'] != joined['aid_y']
    unique_pairs = joined[different].drop_duplicates(['session', 'aid_x', 'aid_y'])
    rows = zip(unique_pairs['aid_x'], unique_pairs['aid_y'], unique_pairs['wgt'])
    for left, right, weight in rows:
        resMatrix[left][right] += weight
# ====================================================================
# ----- Matrix computation for the "added to cart - bought" logic ----
# ====================================================================
def mergeCartBuy(df):
    '''Count distinct aid pairs per session using only rows of type 1 or 2.

    Rows of ``df`` whose ``type`` is 1 or 2 are joined with themselves on
    ``session``. Pairs of equal aids are dropped and each
    (session, aid_x, aid_y) combination is kept once. Every remaining row
    adds 1 to the module-level ``resMatrix[aid_x][aid_y]``.
    '''
    selected = df.loc[df['type'].isin([1, 2])]
    joined = selected.merge(selected, on='session')
    joined = joined[['session', 'aid_x', 'aid_y']]
    different = joined['aid_x'] != joined['aid_y']
    unique_pairs = joined[different].drop_duplicates(['session', 'aid_x', 'aid_y'])
    for left, right in zip(unique_pairs['aid_x'], unique_pairs['aid_y']):
        resMatrix[left][right] += 1
# ====================================================================
# ----- Click matrix computation with time-based weights -------------
# ====================================================================
def mergeTimeWeighted(df, ts_min=1659304800, ts_max=1662328791, boost=3):
    '''Add a time-dependent weight for each distinct pair of type-0 rows.

    Args:
        df: DataFrame with ``session``, ``aid``, ``type`` and ``ts`` columns.
        ts_min: timestamp that maps to weight 1.
        ts_max: timestamp that maps to weight ``1 + boost``.
        boost: how much the weight grows between ``ts_min`` and ``ts_max``.

    Only rows with ``type == 0`` are used. They are joined with themselves
    on ``session`` and each row gets the weight
    ``1 + boost * (ts_x - ts_min) / (ts_max - ts_min)``. Pairs of equal aids
    are dropped and each (session, aid_x, aid_y) combination is kept once
    (the first row). The weight of every remaining row is added to the
    module-level ``resMatrix[aid_x][aid_y]``.

    The defaults reproduce the constants that used to be hard-coded.
    NOTE(review): presumably the first and last timestamps of the training
    data - confirm.
    '''
    clicks = df.loc[df['type'] == 0]
    pairs = clicks.merge(clicks, on='session')[['session', 'aid_x', 'aid_y', 'ts_x']]
    # assign() returns a new frame, so no column is set on a slice.
    pairs = pairs.assign(
        wgt=1 + boost * (pairs['ts_x'] - ts_min) / (ts_max - ts_min)
    )
    pairs = pairs[pairs['aid_x'] != pairs['aid_y']].drop_duplicates(
        ['session', 'aid_x', 'aid_y']
    )
    for aid_x, aid_y, wgt in zip(pairs['aid_x'], pairs['aid_y'], pairs['wgt']):
        resMatrix[aid_x][aid_y] += wgt

In [7]:
# ====================================================================
# ----- Prediction ignoring the event type ---------------------------
# ====================================================================
def predsNoType(matrix, top_n=20):
    '''Build the same top-N list for each of the three types.

    Args:
        matrix: mapping ``key -> Counter``.
        top_n: how many of the most common entries to keep per key
            (default 20, the value that used to be hard-coded).

    Returns:
        ``{0: {...}, 1: {...}, 2: {...}}`` where every inner dict maps each
        key of ``matrix`` to the list of its ``top_n`` most common entries.
    '''
    preds = {0: {}, 1: {}, 2: {}}
    for key in matrix:
        # Rank once per key instead of once per type.
        top = [aid for aid, _ in matrix[key].most_common(top_n)]
        for i in preds:
            # Each type gets its own list object, as before.
            preds[i][key] = list(top)
    return preds
# ====================================================================
# ----- Prediction taking the event type into account ----------------
# ====================================================================
def predsWithType(matrix, top_n=20):
    '''Build a top-N list per key, separately for types 0, 1 and 2.

    Args:
        matrix: object indexable by 0, 1 and 2, each entry being a mapping
            ``key -> Counter``.
        top_n: how many of the most common entries to keep per key
            (default 20, the value that used to be hard-coded).

    Returns:
        ``{0: {...}, 1: {...}, 2: {...}}`` where ``result[i][key]`` is the
        list of the ``top_n`` most common entries of ``matrix[i][key]``.
    '''
    preds = {0: {}, 1: {}, 2: {}}
    for i in preds:
        per_type = matrix[i]
        for key in per_type:
            preds[i][key] = [aid for aid, _ in per_type[key].most_common(top_n)]
    return preds
# ====================================================================
# ----- Prediction for a matrix nested by two type levels ------------
# ====================================================================
def predsWithHardType(matrix, top_n=20):
    '''Build a top-N list per key for every (i, j) combination of 0, 1, 2.

    Args:
        matrix: object where ``matrix[i][j]`` (i, j in 0..2) is a mapping
            ``key -> Counter``.
        top_n: how many of the most common entries to keep per key
            (default 20, the value that used to be hard-coded).

    Returns:
        Nested dict where ``result[i][j][key]`` is the list of the ``top_n``
        most common entries of ``matrix[i][j][key]``.
    '''
    preds = {0: {}, 1: {}, 2: {}}
    for i in preds:
        preds[i] = {0: {}, 1: {}, 2: {}}
        for j in preds[i]:
            per_pair = matrix[i][j]
            for key in per_pair:
                preds[i][j][key] = [aid for aid, _ in per_pair[key].most_common(top_n)]
    return preds

In [8]:
def createCoMatrix(df, mFunc, pFunc, time_elapsed, path, matrix):
    '''Compute a matrix over time windows and save the predictions.

    Args:
        df: input DataFrame, passed to ``coMatrix``.
        mFunc: window-processing function passed to ``coMatrix``; it is
            expected to update the module-level ``resMatrix``.
        pFunc: function that turns ``resMatrix`` into the prediction dict
            consumed by ``getPreds``.
        time_elapsed: window length passed to ``coMatrix``.
        path: output file name without the ``.parquet`` extension.
        matrix: initial (empty) container bound to the global ``resMatrix``.

    The resulting frame is displayed and written to ``<path>.parquet``; the
    global ``resMatrix`` is deleted afterwards to free memory.
    '''
    global resMatrix
    resMatrix = matrix
    coMatrix(df, time_elapsed, mFunc, log=True)
    result = getPreds(pFunc(resMatrix))
    # Assign the filled column back: an in-place fillna on a selected column
    # does not update the frame when pandas Copy-on-Write is active.
    result['labels'] = result['labels'].fillna('')
    display(result)
    result.to_parquet(f'{path}.parquet', index=False)
    del resMatrix, result
    gc.collect()

In [9]:
def createCoMatrixWithFile(mFunc, pFunc, path, matrix):
    '''Compute a matrix from the pre-split files and save the predictions.

    Args:
        mFunc: chunk-processing function passed to ``coMatrixWithFile``; it
            is expected to update the module-level ``resMatrix``.
        pFunc: function that turns ``resMatrix`` into the prediction dict
            consumed by ``getPreds``.
        path: output file name without the ``.parquet`` extension.
        matrix: initial (empty) container bound to the global ``resMatrix``.

    The resulting frame is displayed and written to ``<path>.parquet``; the
    global ``resMatrix`` is deleted afterwards to free memory.
    '''
    global resMatrix
    resMatrix = matrix
    coMatrixWithFile(mFunc, log=True)
    result = getPreds(pFunc(resMatrix))
    # Assign the filled column back: an in-place fillna on a selected column
    # does not update the frame when pandas Copy-on-Write is active.
    result['labels'] = result['labels'].fillna('')
    display(result)
    result.to_parquet(f'{path}.parquet', index=False)
    del resMatrix, result
    gc.collect()

In [10]:
#resMatrix = cd.DataFrame(columns=['aid_x', 'aid_y', 'wgt'], dtype='uint32')
#resMatrix['wgt'] = resMatrix['wgt'].astype('uint16')
#resMatrix.info()

In [11]:
#resMatrix.rename(columns={'wgt_x':'wgt'}, inplace=True)
#resMatrix.info()

In [12]:
#result = getPreds(predsNoType(resMatrix))
#result['labels'].fillna('', inplace=True)
#result

In [13]:
%%time
# ~ 13s
# Re-save the prepared hourly matrix with its index turned into a regular
# column and the two columns named 'aid' and 'labels'.
result = (
    pd.read_parquet('../input/prepaired-files/coMatrixHour.parquet')
    .reset_index()
)
result.columns = ['aid', 'labels']
result.to_parquet('coMatrixHour.parquet', index=False)

CPU times: user 5.43 s, sys: 5.23 s, total: 10.7 s
Wall time: 13.6 s


In [14]:
%%time
# ~ 10s (no_free_orders)
# ~ 3m  (no_duplicates)
#train = pd.read_parquet('../input/otto-analyse-data/train_no_free_orders.parquet')\
#        .sort_values(by=['ts'])
#time = (24 * 60 * 60)

CPU times: user 3 µs, sys: 2 µs, total: 5 µs
Wall time: 8.34 µs


In [15]:
# ----- WEEK -----

In [16]:
#resMatrix = pd.DataFrame(columns=['aid_x', 'aid_y', 'wgt'], dtype='uint32')
#resMatrix['wgt'] = resMatrix['wgt'].astype('uint16')
#resMatrix.info()

In [17]:
#i = 0
#createCoMatrixWithFile(mergeSimpleDF, predsNoType, 'coMatrixMergeDay', resMatrix)

In [18]:
#resMatrix = pd.DataFrame(columns=['aid_x', 'aid_y', 'wgt'], dtype='uint32')
#resMatrix['wgt'] = resMatrix['wgt'].astype('uint16')
#resMatrix.info()

In [19]:
#df_chank = pd.read_parquet(f'../input/prepaired-files/train_no_duplicates_{1}.parquet')
#df_chank = df_chank.merge(df_chank, on='session')
#df_chank = df_chank[['session', 'aid_x', 'aid_y']]
#df_chank = df_chank[df_chank['aid_x'] != df_chank['aid_y']]
#df_chank = df_chank.drop_duplicates(['session', 'aid_x', 'aid_y'])
#df_chank = df_chank[['aid_x', 'aid_y']]
#df_chank.info()

In [20]:
#df_chank['wgt'] = 1
#df_chank = df_chank.groupby(['aid_x', 'aid_y']).wgt.sum().reset_index()
#df_chank[['aid_x', 'aid_y']] = df_chank[['aid_x', 'aid_y']].astype('uint32')
#df_chank['wgt'] = df_chank['wgt'].astype('uint16')

In [21]:
#resMatrix = df_chank.set_index(['aid_x', 'aid_y'])
#df_chank = df_chank.set_index(['aid_x', 'aid_y'])

In [22]:
#resMatrix['wgt'] += df_chank['wgt']

In [23]:
#resMatrix

In [24]:
#res.values

In [25]:
#resMatrix.info()

In [26]:
#sumX = sumX.reset_index()

In [27]:
#sumX.info()

In [28]:
#resMatrix = pd.concat([resMatrix, df])

In [29]:
#for aid_x, aid_y in zip(df_chank['aid_x'], df_chank['aid_y']):
#    resMatrix[aid_x][aid_y] += 1

In [30]:
#mergeSimple(df_chank)

In [31]:
#del df_chank
#gc.collect()

In [32]:
#del resMatrix
#gc.collect()

In [33]:
%%time
# ~ 6m 10s
# noType Week matrix
#createCoMatrix(train, mergeSimple, predsNoType, time, \
#               'coMatrixMergeDay', defaultdict(Counter))

CPU times: user 3 µs, sys: 2 µs, total: 5 µs
Wall time: 7.63 µs


In [34]:
%%time
# ~ 4m 15s
# withType matrix (mergeSimpleWithType + predsWithType)
#createCoMatrix(train, mergeSimpleWithType, predsWithType, time, \
#               'coMatrixMergeWithTypeDay', [defaultdict(Counter) for i in range(0, 3)])

CPU times: user 2 µs, sys: 2 µs, total: 4 µs
Wall time: 7.63 µs


In [35]:
%%time
# ~ 5m
# withType matrix with type-pair weights (mergeWithHardType + predsWithType)
#createCoMatrix(train, mergeWithHardType, predsWithType, time, \
#               'coMatrixMergeWithHardTypeDay', [defaultdict(Counter) for i in range(0, 3)])

CPU times: user 3 µs, sys: 2 µs, total: 5 µs
Wall time: 7.87 µs


In [36]:
%%time
# ~ 6m 40s
# noType Week matrix
#createCoMatrix(train, mergeWeighted, predsNoType, time, \
#               'coMatrixMergeWeightedDay', defaultdict(Counter))

CPU times: user 3 µs, sys: 2 µs, total: 5 µs
Wall time: 7.63 µs


In [37]:
%%time
# ~ 6m
# noType Week matrix
#createCoMatrix(train, mergeCartBuy, predsNoType, time, \
#               'coMatrixMergeCartBuyDay', defaultdict(Counter))

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.58 µs


In [38]:
%%time
# ~ 7m
# noType Week matrix
#createCoMatrix(train, mergeTimeWeighted, predsNoType, time, \
#               'coMatrixMergeTimeWeightedDay', defaultdict(Counter))

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.87 µs


In [39]:
#train = pd.read_parquet('../input/otto-analyse-data/train_no_free_orders.parquet')\
#        .sort_values(by=['session', 'ts'])
#time_range = (7 * 24 * 60 * 60)
#min_ts = train.ts.min()
#max_ts = train.ts.max()

In [40]:
#def getNextTS(df):
#    df = df.copy().reset_index(drop=True).reset_index()
#    tmp = df[['index', 'ts']].copy()
#    tmp['index'] = tmp['index'] - 1
#    tmp.columns = ['index', 'next_ts']
#    df = df.merge(tmp, how='left', on='index').drop('index', axis=1)
#    df.iloc[-1, -1] = df.iloc[-2, -1]
#    df['next_ts'] = df['next_ts'] - df['ts']
#    df.loc[df['next_ts'] < 0, 'next_ts'] = 0
#    return df

In [41]:
#@scoreTime('coMatrix2')
#def coMatrix2(df, time_elapsed, func, log=False):
#    df = getNextTS(df)
#    chunks = df[df['next_ts'] > time_range].index
#    percent = -1
#    for i in range(0, chunks.shape[0], 2):
#        if (log) and (round(i * 100 / chunks.shape[0]) != percent):
#            percent = round(i * 100 / chunks.shape[0])
#            print('\r', f'{percent}% chunk: {i}/{chunks.shape[0]}', end='   ')
#        if (chunks[i + 1] - chunks[i] > 1):
#            df_chank = df[(df.index > chunks[i])&(df.index <= chunks[i + 1])]
#            if df_chank.shape[0] > 1:
#                func(df_chank)
#        gc.collect()

In [42]:
#%%time
# ~ 3m 50s - 10 000
#resMatrix = defaultdict(Counter)
#coMatrix2(train, time_range, noTypes, log=True)

In [43]:
#result = getPreds(predsNoTypes(resMatrix))
#result['labels'] = result['labels'].map(lambda x: ' '.join(str(i) for i in x))
#result.to_csv('coMatrixNoType2Day_2.csv', index=False)
#result

In [44]:
#df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
#                    'rkey': ['bar', 'bar', 'baz', 'foo'],
#                    'value': [1, 2, 3, 5]})
#df2 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
#                    'rkey': ['baz', 'bar', 'foo', 'boo'],
#                    'value': [5, 6, 7, 8]})
#display(df1)
#display(df2)

In [45]:
#pd.merge(df1, df2, how='outer', on=['lkey', 'rkey'])