In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# необходимые библиотеки

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

import warnings

warnings.filterwarnings("ignore")

In [3]:
# Необходимые функции из src:
# metrics:

def precision(recommended_list, bought_list):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    return precision


def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision


def money_precision_at_k(recommended_list, bought_list, prices_recommended, k=5):
        
    # your_code
    # Лучше считать через матричное произведение, а не цикл
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    prices_recommended = np.array(prices_recommended)
    
    recommended_list = recommended_list[:k]
    prices_recommended = prices_recommended[:k]
    
    # Почему здесь изменен порядок в функции isin?
    flags = np.isin(recommended_list, bought_list)
    
    relevant_revenue = (flags * prices_recommended).sum()
    recommended_revenue = prices_recommended.sum()
    
    precision = relevant_revenue / recommended_revenue
    
    return precision

def recall(recommended_list, bought_list):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    flags = np.isin(bought_list, recommended_list)
    
    recall = flags.sum() / len(bought_list)
    
    return recall


def recall_at_k(recommended_list, bought_list, k=5):
    
    # your code
    recall_at_k = recall(recommended_list[:k], bought_list)
    
    return recall_at_k


def money_recall_at_k(recommended_list, bought_list, prices_recommended, prices_bought, k=5):
    
    # your_code
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    prices_recommended = np.array(prices_recommended)
    prices_bought = np.array(prices_bought)

    
    assert recommended_list.shape == prices_recommended.shape
    assert bought_list.shape == prices_bought.shape
    
    
    recommended_list = recommended_list[:k]
    prices_recommended = prices_recommended[:k]
    
    
    flags = np.isin(bought_list, recommended_list)
    
    money_recall = (flags * prices_bought).sum() / (prices_bought).sum()
    
    return money_recall

In [4]:
# recommenders:

from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization

from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight, tfidf_weight


class MainRecommender:
    """Рекоммендации, которые можно получить из ALS

    Input
    -----
    user_item_matrix: pd.DataFrame
        Матрица взаимодействий user-item
    """

    def __init__(self, data, weighting=True):

        # Топ покупок каждого юзера
        self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != 999999]

        # Топ покупок по всему датасету
        self.overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases['item_id'] != 999999]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

        self.user_item_matrix = self._prepare_matrix(data)  # pd.DataFrame
        self.id_to_itemid, self.id_to_userid, \
            self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)

        if weighting:
            # self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T
            self.user_item_matrix = tfidf_weight(self.user_item_matrix.T).T

        self.model = self.fit(self.user_item_matrix)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

    @staticmethod
    def _prepare_matrix(data):
        """Готовит user-item матрицу"""
        user_item_matrix = pd.pivot_table(data,
                                          index='user_id', columns='item_id',
                                          values='quantity',  # Можно пробовать другие варианты
                                          aggfunc='count',
                                          fill_value=0
                                          )

        user_item_matrix = user_item_matrix.astype(float)  # необходимый тип матрицы для implicit

        return user_item_matrix

    @staticmethod
    def _prepare_dicts(user_item_matrix):
        """Подготавливает вспомогательные словари"""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""

        # own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender = ItemItemRecommender(K=3, num_threads=1)
        own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())

        return own_recommender

    @staticmethod
    def fit(user_item_matrix, n_factors=10, regularization=0.001, iterations=15, num_threads=1):
#         """Обучает LMF"""

#         model = LogisticMatrixFactorization(factors=n_factors,
#                                             regularization=regularization,
#                                             iterations=iterations,
#                                             num_threads=num_threads)
#         model.fit(csr_matrix(user_item_matrix).T.tocsr())

#         return model

        
#         """Обучает BPR"""

#         model = BayesianPersonalizedRanking(factors=n_factors,
#                                             regularization=regularization,
#                                             iterations=iterations,
#                                             num_threads=num_threads)
#         model.fit(csr_matrix(user_item_matrix).T.tocsr())

#         return model

        """Обучает ALS"""

        model = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads)
        model.fit(csr_matrix(user_item_matrix).T.tocsr())

        return model


    def _update_dict(self, user_id):
        """Если появился новыю user / item, то нужно обновить словари"""

        if user_id not in self.userid_to_id.keys():

            max_id = max(list(self.userid_to_id.values()))
            max_id += 1

            self.userid_to_id.update({user_id: max_id})
            self.id_to_userid.update({max_id: user_id})

    def _get_similar_item(self, item_id):
        """Находит товар, похожий на item_id"""
        recs = self.model.similar_items(self.itemid_to_id[item_id], N=2)  # Товар похож на себя -> рекомендуем 2 товара
        top_rec = recs[1][0]  # И берем второй (не товар из аргумента метода)
        return self.id_to_itemid[top_rec]

    def _extend_with_top_popular(self, recommendations, N=5):
        """Если кол-во рекоммендаций < N, то дополняем их топ-популярными"""

        if len(recommendations) < N:
            recommendations.extend(self.overall_top_purchases[:N])
            recommendations = recommendations[:N]

        return recommendations

    def _get_recommendations(self, user, model, N=5):
        """Рекомендации через стардартные библиотеки implicit"""

        self._update_dict(user_id=user)
        res = [self.id_to_itemid[rec[0]] for rec in model.recommend(userid=self.userid_to_id[user],
                                        user_items=csr_matrix(self.user_item_matrix).tocsr(),
                                        N=N,
                                        filter_already_liked_items=False,
                                        filter_items=[self.itemid_to_id[999999]],
                                        recalculate_user=False)]

#         res = [self.id_to_itemid[rec[0]] for rec in model.recommend(userid=self.userid_to_id[user],
#                                         user_items=csr_matrix(self.user_item_matrix).tocsr(),
#                                         N=N,
#                                         filter_already_liked_items=False,
#                                         filter_items=[self.itemid_to_id[999999]],
#                                         recalculate_user=True)]

        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_als_recommendations(self, user, N=5):
        """Рекомендации через стардартные библиотеки implicit"""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.model, N=N)

    def get_own_recommendations(self, user, N=5):
        """Рекомендуем товары среди тех, которые юзер уже купил"""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.own_recommender, N=N)

    def get_similar_items_recommendation(self, user, N=5):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""

        top_users_purchases = self.top_purchases[self.top_purchases['user_id'] == user].head(N)

        res = top_users_purchases['item_id'].apply(lambda x: self._get_similar_item(x)).tolist()
        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_similar_users_recommendation(self, user, N=5):
        """Рекомендуем топ-N товаров, среди купленных похожими юзерами"""

        res = []

        # Находим топ-N похожих пользователей
        similar_users = self.model.similar_users(self.userid_to_id[user], N=N+1)
        similar_users = [rec[0] for rec in similar_users]
        similar_users = similar_users[1:]   # удалим юзера из запроса

        for user in similar_users:
            res.extend(self.get_own_recommendations(user, N=1))

        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

In [5]:
# utils:

def prefilter_items(data, take_n_popular=5000, item_features=None):
    # Уберем самые популярные товары (их и так купят)
    popularity = data.groupby('item_id')['user_id'].nunique().reset_index() / data['user_id'].nunique()
    popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)
    
    top_popular = popularity[popularity['share_unique_users'] > 0.5].item_id.tolist()
    data = data[~data['item_id'].isin(top_popular)]
    
    # Уберем самые НЕ популярные товары (их и так НЕ купят)
    top_notpopular = popularity[popularity['share_unique_users'] < 0.01].item_id.tolist()
    data = data[~data['item_id'].isin(top_notpopular)]
    
    # Уберем товары, которые не продавались за последние 12 месяцев
    data_lost_months = data[data['week_no'] >= data['week_no'].max() - 48]
    not_zero_array = data_lost_months[data_lost_months['quantity'] > 0]['item_id'].unique()
    zero_purchases = data_lost_months[data_lost_months["item_id"].isin(not_zero_array) == False]['item_id'].unique()
    data = data[data["item_id"].isin(zero_purchases) == False]
    
    # Уберем не интересные для рекоммендаций категории (department)
    
    not_interesting = data[data['retail_disc'] < 0][data['quantity'] == 0]['item_id'].unique()
    # те, что не продаются даже со скидками
    data = data[data.isin(not_interesting) == False]
    
    # Уберем слишком дешевые товары (на них не заработаем). 1 покупка из рассылок стоит 60 руб.
    
    too_chip = data[data['sales_value'] < data['sales_value'].mean()][data['quantity'] > data['quantity'].mean()]['item_id']
    # те, что дешевле среднего и при этом достаточно популярные
    data = data[data.isin(too_chip) == False]
    
    # Уберем слишком дорогие товары
    
    too_expansive = data[data['sales_value'] > data['sales_value'].mean()][data['quantity'] < 1]['item_id']
    # те, что дороже большинства и не покупаются (=> недостаточно интересные, чтобы быть настолько дорогими)
    data = data[~data['item_id'].isin(too_expansive)]
    
    # Те товары, что не бывают без скидок:
    
    not_every_day_disc = data[data['retail_disc'] >= 0]['item_id'].unique()
    data = data[data['item_id'].isin(not_every_day_disc)]
    
    # Возьмем топ по популярности
    popularity = data.groupby('item_id')['quantity'].sum().reset_index()
    popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

    top = popularity.sort_values('n_sold', ascending=False).head(take_n_popular).item_id.tolist()
    
    # Заведем фиктивный item_id (если юзер покупал товары из топ-5000, то он "купил" такой товар)
    data.loc[~data['item_id'].isin(top), 'item_id'] = 999999
    
    return data

### Подготовка данных:

In [7]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


In [8]:
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [9]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, 5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [10]:
recommender = MainRecommender(data_train_lvl_1)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [11]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

train_users = data_train_lvl_1['user_id'].unique()

result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1 = result_lvl_1.query('user_id in @train_users')
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


#### Результат работы модели первого уровня (лучший из полученых мною результатов на Kaggle, в функции fit_own_recommender использовалось значение K=3, вес в MainRecommender - tfidf_weight):

In [12]:
result_lvl_1['own_rec'] = [recommender.get_own_recommendations(i, N=5) for i in list(result_lvl_1['user_id'])]

result_lvl_1.apply(lambda x: recall_at_k(x['own_rec'], x['actual'], 5), axis=1).mean() * 100

3.699310170377275

In [14]:
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [15]:
data_train_lvl_2.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2104867,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0
2107468,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,0.0,0.0


In [16]:
n_items_before = data_train_lvl_2['item_id'].nunique()

data_train_lvl_2 = prefilter_items(data_train_lvl_2)

n_items_after = data_train_lvl_2['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 27649 to 5001


In [17]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=5))

In [18]:
users_lvl_2.head(2)

Unnamed: 0,user_id,candidates
0,2021,"[1082185.0, 6534178.0, 981760.0, 951590.0, 102..."
1,1753,"[1082185.0, 6534178.0, 1029743.0, 1106523.0, 1..."


In [19]:
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['flag'] = 1

users_lvl_2.head(4)

Unnamed: 0,user_id,item_id,flag
0,2021,1082185.0,1
0,2021,6534178.0,1
0,2021,981760.0,1
0,2021,951590.0,1


## Генерация признаков:

In [20]:
# генерация признаков:
# для пользователя:

# общая сумма покупок за период покупателя:
for i in user_features["user_id"]:
    receipts = list()
    max_quantity = 0
    most_popular = 0
    
    user_features.loc[user_features['user_id'] == i, 'total_sum'] = data_train_lvl_2[data_train_lvl_2['user_id'] == i]['sales_value'].sum()
    
    # средний чек:
    for j in data_train_lvl_2[data_train_lvl_2['user_id'] == i]['basket_id'].unique():
        receipts.append(data_train_lvl_2[data_train_lvl_2['user_id'] == i][data_train_lvl_2['basket_id'] == j]['sales_value'].sum())
    if len(receipts) != 0:
        user_features.loc[user_features['user_id'] == i, 'average_receipt'] = sum(receipts)/len(receipts)
    else: user_features.loc[user_features['user_id'] == i, 'average_receipt'] = 0
    
    # самый популярный магазин:
    for j in data_train_lvl_2[data_train_lvl_2['user_id'] == i]['store_id'].unique():
        new_m = len(data_train_lvl_2[(data_train_lvl_2['user_id'] == i)&(data_train_lvl_2['store_id'] == j)]['basket_id'].unique())
        if max_quantity < new_m:
            max_max_quantity = new_m
            most_popular = j
    user_features.loc[user_features['user_id'] == i, 'most_popular_store'] = j 

In [21]:
# генерация признаков:
# для товаров:

for i in item_features["commodity_desc"].unique():
    
    # средняя цена в категории:
    items_list = np.array(item_features[item_features["commodity_desc"] == i]["item_id"])
    item_features.loc[item_features["commodity_desc"] == i, 'average_cat_value'] = data_train_lvl_2[(data_train_lvl_2['item_id'].isin(items_list))]['sales_value'].sum()/len(items_list)
    
for j in item_features["item_id"].unique():
    
    # покупок в неделю:
    weeks = np.array(data_train_lvl_2[(data_train_lvl_2['item_id'] == j)]['week_no'].unique())
    item_features.loc[item_features['item_id'] == j, 'average_weekly_quantity'] = data_train_lvl_2[(data_train_lvl_2['item_id'] == j)]['quantity'].sum()/len(weeks)
        
    # соотношение цены товара к средней в категории:
    item_features.loc[item_features['item_id'] == j, 'worth'] = data_train_lvl_2[data_train_lvl_2['item_id'] == j]['sales_value'].mean()/item_features[item_features['item_id'] == j]['average_cat_value']

In [22]:
user_items_features = data_train_lvl_2[['user_id', 'item_id', 'quantity']].copy()
user_items_features = user_items_features.merge(item_features[['item_id', 'commodity_desc', 'average_weekly_quantity']], on=['item_id'], how='right')

user_items_features.head(2)

Unnamed: 0,user_id,item_id,quantity,commodity_desc,average_weekly_quantity
0,2021.0,840361.0,1.0,EGGS,78.666667
1,950.0,840361.0,1.0,EGGS,78.666667


In [23]:
# признаки для отношений пользователь-товар слишком долго считались, поэтому решила ввести коэффициенты популярности товаров

# коэффициент популярности среди покупателей для каждой категории
for j in list(user_items_features['commodity_desc'].unique()):
    popularity = len(user_items_features[user_items_features['commodity_desc'] == j]['user_id'].unique())
    if popularity < 100:
        user_items_features.loc[user_items_features['commodity_desc'] == j, 'is_popular'] = 0.8
    elif popularity < 600:
        user_items_features.loc[user_items_features['commodity_desc'] == j, 'is_popular'] = 0.6
    elif popularity < 1000:
        user_items_features.loc[user_items_features['commodity_desc'] == j, 'is_popular'] = 0.4
    else:
        user_items_features.loc[user_items_features['commodity_desc'] == j, 'is_popular'] = 0.2
        
# насколько популярен товар для пользователя, учитывая его популярность среди всех пользователей
for i in list(user_items_features['user_id'].unique()):
    for j in list(user_items_features['is_popular'].unique()):
        user_items_features.loc[(user_items_features['user_id'] == i) & (user_items_features['is_popular'] == j), 'is_favorite_cat'] = user_items_features[(user_items_features['user_id'] == i) & (user_items_features['is_popular'] == j)]['quantity'].sum() * user_items_features['is_popular']

In [24]:
user_items_features = user_items_features.drop(['quantity', 'commodity_desc', 'average_weekly_quantity'], axis=1)

In [25]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)

t_lvl2_users = targets_lvl_2['user_id']
t_lvl2_items = targets_lvl_2['item_id']
t_lvl2_targets = targets_lvl_2['target']

In [26]:
targets_lvl_2 = targets_lvl_2[['target', 'user_id','item_id']]

In [27]:
targets_lvl_2

Unnamed: 0,target,user_id,item_id
0,0.0,2021,1082185.0
1,0.0,2021,6534178.0
2,0.0,2021,981760.0
3,1.0,2021,951590.0
4,1.0,2021,951590.0
...,...,...,...
13805,0.0,1697,1082185.0
13806,0.0,1697,6534178.0
13807,0.0,1697,1029743.0
13808,0.0,1697,995242.0


In [28]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on=['item_id'], how='left')

In [29]:
targets_lvl_2.head(2)

Unnamed: 0,target,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,average_cat_value,average_weekly_quantity,worth
0,0.0,2021,1082185.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,30.856627,313.166667,0.03467
1,0.0,2021,6534178.0,69,KIOSK-GAS,Private,COUPON/MISC ITEMS,GASOLINE-REG UNLEADED,,16.178906,,


In [30]:
targets_lvl_2 = targets_lvl_2.merge(user_features, on=['user_id'], how='left')

In [31]:
targets_lvl_2 = targets_lvl_2.merge(user_items_features, on=['user_id', 'item_id'], how='left')

### Обучение модели второго уровня:

In [32]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

In [33]:
X_train.head(2)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,average_cat_value,average_weekly_quantity,...,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,total_sum,average_receipt,most_popular_store,is_popular,is_favorite_cat
0,2021,1082185.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,30.856627,313.166667,...,,,,,,,,,,
1,2021,6534178.0,69,KIOSK-GAS,Private,COUPON/MISC ITEMS,GASOLINE-REG UNLEADED,,16.178906,,...,,,,,,,,,,


In [34]:
# категоризация признаков (для работы модели)

digital_features = ['manufacturer', 'average_cat_value', 'average_weekly_quantity', 'worth', 'total_sum', 'average_receipt', 'most_popular_store', 'is_popular','is_favorite_cat']
for i in digital_features:
    a = X_train[i].min()
    b = X_train[i].mean()
    c = X_train[i].max()
    # print(a, (b-a)/2, b, (c-b)/2, c)
    X_train[i] = pd.cut(X_train[i], bins=[a, a+(b-a)/2, b, b+(c-b)/2, c], labels=False)
    
not_digital_features = ['department', 'brand', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product', 'average_weekly_quantity', 'age_desc',
                        'marital_status_code', 'income_desc', 'homeowner_desc', 'hh_comp_desc', 'household_size_desc', 'kid_category_desc']

for i in not_digital_features:
    cat_numbers = list(c for c in range(len(list(X_train[i].unique()))))
    for j,n in zip(list(X_train[i].unique()), cat_numbers):
        X_train.loc[X_train[i] == j, i] = n
        
X_train.fillna(99999, inplace= True)

In [35]:
X_train.head(2)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,average_cat_value,average_weekly_quantity,...,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,total_sum,average_receipt,most_popular_store,is_popular,is_favorite_cat
0,2021,1082185.0,99999.0,0,0,0,0,0,2.0,3.0,...,99999,99999,99999,99999,99999,99999.0,99999.0,99999.0,99999.0,99999.0
1,2021,6534178.0,0.0,1,1,1,1,1,1.0,99999.0,...,99999,99999,99999,99999,99999,99999.0,99999.0,99999.0,99999.0,99999.0


In [36]:
# модель - MLPClassifier из sklearn.neural_network

mlp = MLPClassifier(random_state=1, solver='sgd', learning_rate_init=0.000001, max_iter=500).fit(X_train, y_train)
train_preds = mlp.predict(X_train)

In [37]:
user_val_lvl_2 = pd.DataFrame(data_val_lvl_2['user_id'].unique())
user_val_lvl_2.columns = ['user_id']

train_users = data_train_lvl_1['user_id'].unique()
user_val_lvl_2 = user_val_lvl_2[user_val_lvl_2['user_id'].isin(train_users)]

user_val_lvl_2['candidates'] = user_val_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=5))

In [38]:
s = user_val_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

user_val_lvl_2 = user_val_lvl_2.drop('candidates', axis=1).join(s)

user_val_lvl_2.head(4)

Unnamed: 0,user_id,item_id
0,338,1082185.0
0,338,6534178.0
0,338,1029743.0
0,338,995242.0


### Обучение модели на валидационной выборке:

In [39]:
preds_2 = user_val_lvl_2.merge(item_features, on='item_id', how='left')
preds_2 = preds_2.merge(user_features, on='user_id', how='left')
preds_2 = preds_2.merge(user_items_features, on=['user_id', 'item_id'], how='left')

preds_2.head(2)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,average_cat_value,average_weekly_quantity,...,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,total_sum,average_receipt,most_popular_store,is_popular,is_favorite_cat
0,338,1082185.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,30.856627,313.166667,...,,,,,,,,,,
1,338,6534178.0,69,KIOSK-GAS,Private,COUPON/MISC ITEMS,GASOLINE-REG UNLEADED,,16.178906,,...,,,,,,,,,,


In [40]:
# категоризация валидационной выборки:

digital_features = ['manufacturer', 'average_cat_value', 'average_weekly_quantity', 'worth', 'total_sum', 'average_receipt', 'most_popular_store', 'is_popular','is_favorite_cat']

for i in digital_features:
    a = preds_2[i].min()
    b = preds_2[i].mean()
    c = preds_2[i].max()
    # print(a, (b-a)/2, b, (c-b)/2, c)
    preds_2[i] = pd.cut(X_train[i], bins=[a, a+(b-a)/2, b, b+(c-b)/2, c], labels=False)
    
not_digital_features = ['department', 'brand', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product', 'average_weekly_quantity', 'age_desc',
                        'marital_status_code', 'income_desc', 'homeowner_desc', 'hh_comp_desc', 'household_size_desc', 'kid_category_desc']

for i in not_digital_features:
    cat_numbers = list(c for c in range(len(list(preds_2[i].unique()))))
    for j,n in zip(list(preds_2[i].unique()), cat_numbers):
        preds_2.loc[preds_2[i] == j, i] = n
        
preds_2.fillna(99999, inplace= True)

In [41]:
features = digital_features + not_digital_features

In [46]:
X_val = preds_2

val_preds = mlp.predict_proba(X_val[features])[:,1]

In [47]:
preds_2['proba'] = val_preds

recomendations = preds_2[['user_id', 'item_id', 'proba']]

In [48]:
mlp_recs = pd.DataFrame(recomendations.sort_values(['user_id', 'proba'], ascending=False).groupby('user_id')\
             .apply(lambda x: x['item_id'].iloc[:5].values)).rename(columns={0: 'mlp_recs'})

mlp_recs_test = mlp_recs.merge(
    pd.DataFrame(
        data_val_lvl_2.groupby('user_id')['item_id'].unique()).rename(columns={'item_id': 'actual'}).reset_index(), how='left',on='user_id')

mlp_recs_test.head(5)

Unnamed: 0,user_id,mlp_recs,actual
0,1,"[995242.0, 995242.0, 1029743.0, 840361.0, 1082...","[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[1082185.0, 6534178.0, 1029743.0, 1106523.0, 9...","[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[1082185.0, 1082185.0, 6534178.0, 1029743.0, 9...","[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[1082185.0, 1082185.0, 6534178.0, 1029743.0, 1...","[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[1082185.0, 1082185.0, 6534178.0, 1029743.0, 1...","[835098, 872137, 910439, 924610, 992977, 10412..."


### Результаты работы модели второго уровня:

In [49]:
# приблизительная оценка работы модели (на неё ориентировалась, подбирая параметры):

mlp_recs_test.apply(lambda x: recall_at_k(x['mlp_recs'], x['actual']), axis=1).mean() * 100

3.6566656979547476

In [50]:
mlp_recs.rename(columns={'user_id': 'user_id'})
mlp_recs = mlp_recs.reset_index()

mlp_recs

Unnamed: 0,user_id,mlp_recs
0,1,"[995242.0, 995242.0, 1029743.0, 840361.0, 1082..."
1,3,"[1082185.0, 6534178.0, 1029743.0, 1106523.0, 9..."
2,6,"[1082185.0, 1082185.0, 6534178.0, 1029743.0, 9..."
3,7,"[1082185.0, 1082185.0, 6534178.0, 1029743.0, 1..."
4,8,"[1082185.0, 1082185.0, 6534178.0, 1029743.0, 1..."
...,...,...
2036,2496,"[1082185.0, 6534178.0, 981760.0, 981760.0, 108..."
2037,2497,"[1082185.0, 6534178.0, 1029743.0, 995242.0, 86..."
2038,2498,"[1082185.0, 6534178.0, 1029743.0, 1106523.0, 1..."
2039,2499,"[1070820.0, 1070820.0, 1082185.0, 6534178.0, 1..."


In [53]:
data_test = pd.read_csv('test_users.csv')

# data_test

In [54]:
for i in list(data_test['user_id'].unique()):
    if bool(len(mlp_recs[mlp_recs['user_id'] == i])):
        pred_value = str(tuple(mlp_recs[mlp_recs['user_id'] == i]['mlp_recs'])[0])
        data_test.loc[data_test['user_id'] == i, 'preds'] = pred_value[1:len(pred_value)-1].replace('.','')
    else: 
        data_test.loc[data_test['user_id'] == i, 'preds'] = '1082185 981760 995242 1029743 840361'

data_test = data_test.rename(columns={'user_id': 'UserId', 'preds': 'Predicted'})
data_test

Unnamed: 0,UserId,Predicted
0,1,995242 995242 1029743 840361 1082185
1,2,1082185 981760 995242 1029743 840361
2,3,1082185 6534178 1029743 1106523 951590
3,6,1082185 1082185 6534178 1029743 995242
4,7,1082185 1082185 6534178 1029743 1106523
...,...,...
1703,2494,1082185 6534178 1029743 840361 1127831
1704,2496,1082185 6534178 981760 981760 1082185
1705,2498,1082185 6534178 1029743 1106523 1106523
1706,2499,1070820 1070820 1082185 6534178 1029743


In [55]:
data_test.to_csv('submission_2StepModel.csv', index=False)

##### Подбор наиболее значимых признаков (не улучшил результат)

In [57]:
X_val = preds_2

for feature in features:
    new_features = features.copy()
    new_features.remove(feature)
    mlp = MLPClassifier(random_state=1, solver='sgd', learning_rate_init=0.000001, max_iter=50).fit(X_train[new_features], y_train)
    val_preds = mlp.predict_proba(X_val[new_features])[:,1]
    preds_2['proba_' + feature] = val_preds

In [66]:
best_features = []
for feature in features:
    recomendations = preds_2[['user_id', 'item_id', 'proba_'+feature]]
    mlp_recs = pd.DataFrame(recomendations.sort_values(['user_id', 'proba_'+feature], ascending=False).groupby('user_id')\
             .apply(lambda x: x['item_id'].iloc[:5].values)).rename(columns={0: 'mlp_recs'})
    mlp_recs_test = mlp_recs.merge(
    pd.DataFrame(
        data_val_lvl_2.groupby('user_id')['item_id'].unique()).rename(columns={'item_id': 'actual'}).reset_index(), how='left',on='user_id')
    # print('excluding', feature, mlp_recs_test.apply(lambda x: recall_at_k(x['mlp_recs'], x['actual']), axis=1).mean() * 100 - 3.6445371300449025)
    c = mlp_recs_test.apply(lambda x: recall_at_k(x['mlp_recs'], x['actual']), axis=1).mean() * 100 - 3.6445371300449025
    if (c < 0):
        print(feature, 'полезный признак')
        best_features.append(feature)
    else:
        print(feature, 'не очень полезный признак', c)

manufacturer не очень полезный признак 0.00801087455717564
average_cat_value полезный признак
average_weekly_quantity полезный признак
worth не очень полезный признак 0.00013265071824708485
total_sum полезный признак
average_receipt полезный признак
most_popular_store полезный признак
is_popular полезный признак
is_favorite_cat не очень полезный признак 0.012533794700746892
department полезный признак
brand не очень полезный признак 0.006581973733361668
commodity_desc полезный признак
sub_commodity_desc не очень полезный признак 0.006074636197885397
curr_size_of_product не очень полезный признак 0.014859589044072141
average_weekly_quantity полезный признак
age_desc полезный признак
marital_status_code не очень полезный признак 0.0008776390398539391
income_desc полезный признак
homeowner_desc не очень полезный признак 0.009033497158346115
hh_comp_desc не очень полезный признак 0.0006508735966161971
household_size_desc полезный признак
kid_category_desc полезный признак


In [67]:
best_features = best_features + ['commodity_desc', 'sub_commodity_desc']

mlp = MLPClassifier(random_state=1, solver='sgd', learning_rate_init=0.000001, max_iter=50).fit(X_train[best_features], y_train)
val_preds = mlp.predict_proba(X_val[best_features])[:,1]
preds_2['proba'] = val_preds

In [68]:
recomendations = preds_2[['user_id', 'item_id', 'proba']]
mlp_recs = pd.DataFrame(recomendations.sort_values(['user_id', 'proba'], ascending=False).groupby('user_id')\
                        .apply(lambda x: x['item_id'].iloc[:5].values)).rename(columns={0: 'mlp_recs'})
mlp_recs_test = mlp_recs.merge(
    pd.DataFrame(
        data_val_lvl_2.groupby('user_id')['item_id'].unique()).rename(columns={'item_id': 'actual'}).reset_index(), how='left',on='user_id')

mlp_recs_test.apply(lambda x: recall_at_k(x['mlp_recs'], x['actual']), axis=1).mean() * 100

3.6447933206133016