In [1]:
import pandas as pd
import numpy as np

import gc

from collections import Counter 

In [2]:
# Load the purchase history; later cells read its buyer_id, pav_order_id and item_id columns.
hist_data = pd.read_csv('hist_data.csv')

In [3]:
# Preview the first rows of the loaded history.
hist_data.head()

Unnamed: 0,buyer_id,pav_order_id,created,item_id,count,price_sold,flag_weight_goods,weight
0,95203091,98506637863,2021-07-01 00:03:44,202808329,1.0,79.99,False,11.14
1,95203091,98506637863,2021-07-01 00:03:44,202953905,1.072,44.945,True,11.14
2,95203091,98506637863,2021-07-01 00:03:44,203566452,1.0,69.99,False,11.14
3,95203091,98506637863,2021-07-01 00:03:44,202820143,1.972,41.295,True,11.14
4,95203091,98506637863,2021-07-01 00:03:44,204400422,1.0,269.99,False,11.14


### Подготовка данных
Мы хотим выделить "постоянных" покупателей, которые сделали не менее _n_orders_ заказов. Для каждого "постоянного" покупателя мы рассчитаем рейтинг товаров, которые он покупал: 10 - самый часто покупаемый товар, 1 - товар, который покупался реже всего, 0 - товар, который совсем не покупался.

In [4]:
# Minimum number of orders a buyer must have placed to be treated as a "regular" buyer.
n_orders = 3

In [5]:
# build a list of (buyer_id, number of orders placed) pairs
uniq_buyers = hist_data['buyer_id'].unique()

# nunique() counts the distinct order ids of every buyer in a single vectorised
# pass instead of one MultiIndex lookup per buyer inside a Python loop;
# sort=False keeps the buyers in order of first appearance, like unique() does
orders_per_buyer = hist_data.groupby('buyer_id', sort=False)['pav_order_id'].nunique()

# most active buyers first; sorted() is stable, so buyers with the same number
# of orders stay in first-appearance order
buyers_order_count = sorted(orders_per_buyer.items(), key=lambda pair: pair[1], reverse=True)

In [6]:
# build the set of regular buyers: everyone with at least n_orders orders
regular_buyers = {
    buyer_id
    for buyer_id, order_count in buyers_order_count
    if order_count >= n_orders
}

In [33]:
# rating assignment function
def get_rate(buyer_id):
    """Rate every item found in the buyer's purchase history.

    For a buyer in ``regular_buyers`` the rating of an item is
    ``10 * rows_with_item / distinct_orders_of_buyer``; for any other buyer
    every rating is 0.0.

    Note that history rows are counted, so an item listed several times inside
    one order contributes several times to its rating.

    Args:
        buyer_id: value to match against ``hist_data['buyer_id']``.

    Returns:
        pandas Series indexed by item_id, in ``value_counts`` order
        (most frequently bought item first).
    """
    # filter the history once and reuse the slice for both statistics
    buyer_rows = hist_data[hist_data['buyer_id'] == buyer_id]
    buys = buyer_rows['item_id'].value_counts()
    if buyer_id not in regular_buyers:
        return buys * 0.0
    num_of_cart = len(buyer_rows['pav_order_id'].unique())
    return buys * 10 / num_of_cart


In [56]:
# drop repeated item_ids from the co-occurrence candidates and merge them
# with the buyer's personal recommendations
def get_unique_recs(recs: list, top_n: int, our_recommendation: list) -> list:
    """Merge co-occurrence candidates with personal recommendations.

    Args:
        recs: (item_id, frequency) pairs, expected to be sorted so that the
            most relevant candidates come first; only the first occurrence of
            every item_id is kept.
        top_n: maximum length of the returned list; also the maximum number
            of distinct co-occurrence candidates that are considered.
        our_recommendation: item_ids recommended from the buyer's own history.

    Returns:
        Up to ``top_n`` distinct item_ids: the first ``top_n // 4`` personal
        recommendations, then the co-occurrence candidates, then the remaining
        personal recommendations.
    """
    cooccur_items = []
    seen = set()
    for item_id, _freq in recs:
        if item_id in seen:
            continue
        seen.add(item_id)
        cooccur_items.append(item_id)
        if len(cooccur_items) >= top_n:
            break

    head = top_n // 4
    merged = our_recommendation[:head] + cooccur_items + our_recommendation[head:]
    # an item can be both a personal and a co-occurrence recommendation;
    # dict.fromkeys keeps only its first position and preserves the order
    unique_items = list(dict.fromkeys(merged))
    # cut to the requested size rather than to a hard-coded 20
    return unique_items[:top_n]

def rec_by_item(item_id: int, most_freq_dict: dict) -> list:
    """Return the entry stored for ``item_id`` in ``most_freq_dict``, or None if the key is absent."""
    if item_id in most_freq_dict:
        return most_freq_dict[item_id]
    return None

# for every item_id in the basket collect its most frequent co-purchased item_ids,
# sort them by frequency and keep the unique ones
def rec_by_basket(buyer_id : int, basket: list, most_freq_dict: dict, top_n: int = 20,
                  min_rate: float = 5) -> list:
    """Recommend items for a basket.

    Two sources are combined by ``get_unique_recs``:
    personal recommendations (only for buyers in ``regular_buyers``) - items
    from the buyer's history rated at least ``min_rate`` by ``get_rate`` and
    not already in the basket; and co-occurrence recommendations - the
    entries of ``most_freq_dict`` for every basket item, sorted by frequency.

    Args:
        buyer_id: buyer the basket belongs to.
        basket: item_ids currently in the basket.
        most_freq_dict: item_id -> list of (item_id, frequency) pairs.
        top_n: maximum number of personal recommendations; also passed on to
            ``get_unique_recs``.
        min_rate: lowest rating for which a previously bought item is still
            recommended.

    Returns:
        The list produced by ``get_unique_recs``.
    """
    basket_items = set(basket)  # constant-time membership checks
    our_recommendation = []
    if buyer_id in regular_buyers:
        # get_rate lists the items from the most to the least frequently bought,
        # so the scan can stop at the first rating below the threshold
        for item, rate in get_rate(buyer_id).items():
            if rate < min_rate:
                break
            if item not in basket_items:
                our_recommendation.append(item)
            if len(our_recommendation) == top_n:  # TODO: change to top_n // 2
                break

    res = []
    for item in basket:
        recs = rec_by_item(item, most_freq_dict)
        if recs is not None:
            res += recs

    # most frequent co-purchases first
    res = sorted(res, key=lambda x: x[1], reverse=True)

    return get_unique_recs(res, top_n, our_recommendation)

In [57]:
def apply_relevance(x):
    """Return a 0/1 flag per entry of ``x['preds']``: 1 when the entry is in ``x['basket']``."""
    basket = x['basket']
    return [1 if item in basket else 0 for item in x['preds']]

def create_relevance(pred):
    """Apply ``apply_relevance`` to every row of ``pred``.

    Works on a copy of ``pred`` whose 'basket' column has been converted to
    sets, so the caller's frame is left untouched.
    """
    with_sets = pred.assign(basket=pred['basket'].apply(set))
    return with_sets.apply(apply_relevance, axis=1)
def make_coocurs_dict(train_data, n_most_common: int = 10):
    """Build the item co-occurrence dictionary.

    Args:
        train_data: DataFrame with 'item_id' and 'pav_order_id' columns.
        n_most_common: how many of the most frequent partner items to keep
            for every item.

    Returns:
        dict mapping item_id -> list of (partner_item_id, count) pairs, most
        frequent partner first. Items that never share an order with a
        different item have no entry.
    """
    order_items = train_data[['item_id', 'pav_order_id']]
    # self-join on the order id: one row per ordered pair of rows of the same order
    pairs = (
        order_items
        .sort_values(['item_id', 'pav_order_id'])
        .merge(order_items, how='left', on=['pav_order_id'], suffixes=('', '_left'))
    )
    # an item is not a partner of itself
    pairs = pairs[pairs['item_id'] != pairs['item_id_left']]
    top_partners = pairs.groupby('item_id')['item_id_left'].agg(
        lambda partners: Counter(partners).most_common(n_most_common)
    )

    # Series.iteritems() was removed in pandas 2.0; items() is the supported spelling
    most_freq_dict = dict(top_partners.items())

    # the pair table can be large - release it before returning
    del top_partners, pairs
    gc.collect()
    return most_freq_dict

def create_basket(test_data):
    """Collect the item_ids of every (pav_order_id, buyer_id) pair into a list.

    Returns a DataFrame with columns pav_order_id, buyer_id and basket.
    """
    return (
        test_data
        .groupby(['pav_order_id', 'buyer_id'])['item_id']
        .agg([('basket', list)])
        .reset_index()
    )

def make_predictions(test_data, most_freq_dict):
    """Produce recommendations for every order in ``test_data``.

    The orders are grouped into baskets with ``create_basket`` and every
    basket is passed to ``rec_by_basket``.

    Returns a DataFrame with columns pav_order_id and preds (one list of
    recommendations per order).
    """
    baskets = create_basket(test_data)
    recommendations = [
        rec_by_basket(buyer_id, basket, most_freq_dict=most_freq_dict)
        for buyer_id, basket in zip(baskets['buyer_id'], baskets['basket'])
    ]
    return pd.DataFrame({
        'pav_order_id': baskets['pav_order_id'].tolist(),
        'preds': recommendations,
    })

In [10]:
# build the co-occurrence dictionary: which item_ids were bought most often together with each item_id
most_freq_dict = make_coocurs_dict(hist_data)
# prediction step: load the test orders
test_data = pd.read_csv('test.csv')

In [None]:
# Build recommendations for every test order (one row per pav_order_id).
pred = make_predictions(test_data, most_freq_dict)

In [None]:
# Save the predictions without the DataFrame index.
pred.to_csv("final_preds.csv", index=False)

In [50]:
# sanity check: longest and shortest recommendation list
preds_lengths = pred['preds'].apply(len)
max(preds_lengths), min(preds_lengths)

(20, 11)