In [2]:
pip install -U lightautoml




[notice] A new release of pip is available: 23.1.1 -> 23.2
[notice] To update, run: python.exe -m pip install --upgrade pip





In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als
from sklearn.model_selection import train_test_split
# Модель второго уровня
from lightgbm import LGBMClassifier
import lightgbm  as lgb 
import catboost as catb
from tqdm import tqdm

In [4]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.tasks.common_metric import mean_quantile_error

In [5]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    # print(len(bought_list))
    # print(len(recommended_list))
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    # print(len(flags))
    precision = flags.sum() / len(recommended_list)
    
    
    return precision

In [6]:
def recall_at_k(recommended_list, bought_list, k=5):
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    recall = flags.sum() / len(bought_list)
    
    return recall

In [7]:
def prefilter_items(data, item_features, drop_categories=[],take_n_popular=5000):
    # 1. Уберем товары, которые не продавались за последние 12 месяцев
    data = data.loc[~(data['week_no'] < data['week_no'].max() - 12)]

    # 2. Уберем не интересные для рекоммендаций категории (department)
    not_important_goods = item_features.loc[(item_features['department'].isin(drop_categories)), 'item_id'].tolist()
    data = data.loc[(~data['item_id'].isin(not_important_goods))]

    # 3. Уберем слишком дешевые товары (на них не заработаем). Товары, со средней ценой < 1$
    data.drop(data[data['sales_value'] < 1].index, axis=0, inplace=True)

    # 4. Уберем слишком дорогие товары. Товары со средней ценой > 30$
    data.drop(data[data['sales_value'] > 30].index, axis=0, inplace=True)

    # 5. Уберем самые популярные товары (их и так купят)
    popularity = data.groupby('item_id')['user_id'].nunique().reset_index() / data['user_id'].nunique()
    popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)

    top_popular = popularity[popularity['share_unique_users'] > 0.8].item_id.tolist()
    data = data.loc[(~data['item_id'].isin(top_popular))]
    # data = data.loc[(~data['item_id'].isin(not_important_goods))]
    #
    # # 6. Уберем самые НЕ популярные товары (их и так НЕ купят)
    top_notpopular = popularity[popularity['share_unique_users'] < 0.01].item_id.tolist()
    data = data.loc[(~data['item_id'].isin(top_notpopular))]
    # result = data
	
    # Возьмем топ по популярности
    popularity = data.groupby('item_id')['quantity'].sum().reset_index()
    popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

    top = popularity.sort_values('n_sold', ascending=False).head(take_n_popular).item_id.tolist()	
    
    # Заведем фиктивный item_id (если юзер покупал товары из топ-5000, то он "купил" такой товар)
    data.loc[~data['item_id'].isin(top), 'item_id'] = 999999
    
    # ...
    return data

In [8]:
def postfilter_items(recommendations, item_features, N=5):
    """Пост-фильтрация товаров
    
    Input
    -----
    recommendations: list
        Ранжированный список item_id для рекомендаций
    item_info: pd.DataFrame
        Датафрейм с информацией о товарах
    """
    
    # Уникальность
#     recommendations = list(set(recommendations)) - неверно! так теряется порядок
    unique_recommendations = []
    [unique_recommendations.append(item) for item in recommendations if item not in unique_recommendations]
    
    # Разные категории
    categories_used = []
    final_recommendations = []
    
    CATEGORY_NAME = 'sub_commodity_desc'
    for item in unique_recommendations:
        category = item_features.loc[item_features['item_id'] == item, CATEGORY_NAME].values[0]
        
        if category not in categories_used:
            final_recommendations.append(item)
            
        unique_recommendations.remove(item)
        categories_used.append(category)
    
    # Для каждого юзера 5 рекомендаций (иногда модели могут возвращать < 5)
    n_rec = len(final_recommendations)
    if n_rec < N:
        final_recommendations.extend(unique_recommendations[:N - n_rec])  # (!) это не совсем верно
    else:
        final_recommendations = final_recommendations[:N]
        
    # 2 новых товара (юзер никогда не покупал)
    # your_code
    
    # 1 дорогой товар, > 7 долларов
    # your_code
    
    assert len(final_recommendations) == N, 'Количество рекомендаций != {}'.format(N)
    return final_recommendations
#------------------------------------------------------------------------------------------------

def get_similar_item(model,  itemid_to_id, id_to_itemid, x):
    id = itemid_to_id[x]
    recs = model.similar_items(id, N=2)
    top_rec = recs[1][0]
    return id_to_itemid[top_rec]


def get_similar_items_recommendation(user, data, itemid_to_id, id_to_itemid,  model, N=5):
    """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""

    top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
    top_purchases.sort_values('quantity', ascending=False, inplace=True)
    top_purchases = top_purchases[top_purchases['item_id'] != 999999]

    top_users_purchases = top_purchases[top_purchases['user_id'] == user].head(N)
    res = top_users_purchases['item_id'].apply(lambda x: get_similar_item(model, itemid_to_id=itemid_to_id, id_to_itemid=id_to_itemid, x=x)).tolist()
    return res

def fit_own_recomender(user_item_matrix):
    own = ItemItemRecommender(K=1, num_threads=4) # K - кол-во билжайших соседей
    own.fit(csr_matrix(user_item_matrix).T.tocsr(), show_progress=False)
    return own

def get_own_recommendations(own, userid, user_item_matrix, N):
    recs = own.recommend(userid=userid, 
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=N, 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=False)
    return recs

def get_similar_users_recommendation(userid, userid_to_id, id_to_userid, user_item_matrix, model, N=5):
    """Рекомендуем топ-N товаров, среди купленных похожими юзерами"""
    res = []

    # Находим топ-N похожих пользователей
    similar_users = model.similar_users(userid_to_id[userid], N=N+1) # user + N его друзей.
    similar_users = [rec[0] for rec in similar_users]
    similar_users = similar_users[1:]   # удалим юзера из запроса
 
    own = fit_own_recomender(user_item_matrix)
 
    for user in similar_users:
        userid = id_to_userid[user] #own recommender works with user_ids
        res.extend(get_own_recommendations(own, userid, user_item_matrix, N=1))

    return res

In [9]:
def popularity_recommendation(data, n=5):
    """Топ-n популярных товаров"""
    popular = data.groupby('item_id')['sales_value'].sum().reset_index()
    popular.sort_values('sales_value', ascending=False, inplace=True)
    recs = popular.head(n).item_id
    return recs.tolist()

In [10]:
def get_freq_encoder(data,feature_names):
    for feature_name in feature_names:
        freq_encoder = data[feature_name].value_counts(normalize=True)
        data[feature_name] = data[feature_name].map(freq_encoder)
    return data

In [11]:
def category_to_digit(df, features):
    df = df.copy(deep=True)
    for i, feature in enumerate(features):
        # feature = str.replace(feature,' ','_')
        values_list = df[feature].value_counts()
        names = sorted(values_list.index)
        # names = sorted(names)
        for name in names:
            name = str.replace(name,' ','_')
            df.insert(3, f'{feature}_{name}', np.where((df[feature]==name),1,0), True)
    df.drop(features, axis=1, inplace=True)
    return df

In [12]:
def perpare_lvl2_1(val_data, train_data, recommender, item_features, user_features, N=50):
    # val_data = data_train_lvl_2.copy()
    # train_data = data_train_lvl_1.copy()

    users_warm = pd.DataFrame(val_data['user_id'].unique()) # Добавим туда еще фитчи user-ов и item-ов.
    users_warm.columns = ['user_id']
    # Пока только warm start
    users_warm = users_warm[users_warm['user_id'].isin(train_data['user_id'].unique())]

    users_cold = pd.DataFrame(val_data['user_id'].unique()) # Добавим туда еще фитчи user-ов и item-ов.
    users_cold.columns = ['user_id']
    # cold_start
    users_cold = users_cold[~users_cold['user_id'].isin(users_warm['user_id'].unique())]

    # Заполняем кандидатов, на основе предсказания модели 1-го уровня.
    users_cold['candidates'] = users_cold['user_id'].apply(lambda x: recommender.get_top_popular(N=N))
    s = users_cold.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
    s.name = 'item_id'

    # Это кандидаты. (т.е. предпологаемые покупки совершенные на основе предсказаний.)
    users_cold = users_cold.drop('candidates', axis=1).join(s)
    users_cold['drop'] = 1  # фиктивная переменная
    # Заполняем кандидатов, на основе предсказания модели 1-го уровня.
    users_warm['candidates'] = users_warm['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=N))
    # test_users = data
    s = users_warm.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
    s.name = 'item_id'

    # Это кандидаты. (т.е. предпологаемые покупки совершенные на основе предсказаний.)
    users_warm = users_warm.drop('candidates', axis=1).join(s)
    users_warm['drop'] = 1  # фиктивная переменная

    # Создадим таблицу с реальными покупками user-ов. 
    targets = val_data[['user_id', 'item_id']].copy() # свойства 
    targets['target'] = 1  # тут только покупки

    # Объединим предпологаемые покупки с реальными, совершенными user-ами.
    targets_cold = users_cold.merge(targets, on=['user_id', 'item_id'], how='left')

    # В результате, напротив товаров, в редсказании которых мы ошиблись, 
    # будет стоять Nan. Заполним их  нулями.  
    targets_cold['target'].fillna(0, inplace= True)
    targets_cold.drop('drop', axis=1, inplace=True)
    # Добавим к нашему датасету фичи user-ов и item-ов.
    targets_cold = targets_cold.merge(item_features, on='item_id', how='left')
    targets_cold = targets_cold.merge(user_features, on='user_id', how='left')

    # Объединим предпологаемые покупки с реальными, совершенными user-ами.
    targets_warm = users_warm.merge(targets, on=['user_id', 'item_id'], how='left')

    # В результате, напротив товаров, в редсказании которых мы ошиблись, 
    # будет стоять Nan. Заполним их  нулями.  
    targets_warm['target'].fillna(0, inplace= True)
    targets_warm.drop('drop', axis=1, inplace=True)
    # targets_warm['target'].mean() #Угадали примерно 17% покупок.

    # Добавим к нашему датасету фичи user-ов и item-ов.
    targets_warm = targets_warm.merge(item_features, on='item_id', how='left')
    targets_warm = targets_warm.merge(user_features, on='user_id', how='left')

    targets_lvl_2 = pd.concat([targets_warm, targets_cold], ignore_index=True)

    # X_ = targets_lvl_2.drop('target', axis=1)
    # y_ = targets_lvl_2[['target']]

    return  targets_lvl_2 #X_, y_,

In [13]:
def perpare_lvl2(val_data, train_data, recommender, item_features, user_features, N=50):
    users_lvl_2 = pd.DataFrame(val_data['user_id'].unique()) # Добавим туда еще фитчи user-ов и item-ов.
    users_lvl_2.columns = ['user_id']

    # Пока только warm start
    users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_data['user_id'].unique())]

    # Заполняем кандидатов, на основе предсказания модели 1-го уровня.
    users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=N))
    # test_users = data
    s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
    s.name = 'item_id'

    # Это кандидаты. (т.е. предпологаемые покупки совершенные на основе предсказаний.)
    users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
    users_lvl_2['drop'] = 1  # фиктивная переменная

    # Создадим таблицу с реальными покупками user-ов. 
    targets_lvl_2 = val_data[['user_id', 'item_id']].copy() # свойства 
    targets_lvl_2['target'] = 1  # тут только покупки 

    # Объединим предпологаемые покупки с реальными, совершенными user-ами.
    targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

    # В результате, напротив товаров, в редсказании которых мы ошиблись, 
    # будет стоять Nan. Заполним их  нулями.  
    targets_lvl_2['target'].fillna(0, inplace= True)
    targets_lvl_2.drop('drop', axis=1, inplace=True)
    targets_lvl_2['target'].mean() #Угадали примерно 17% покупок.

    # Добавим к нашему датасету фичи user-ов и item-ов.
    targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
    targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

    X_ = targets_lvl_2.drop('target', axis=1)
    y_ = targets_lvl_2[['target']]
    return X_, y_

In [14]:
class MainRecommender:
    """Рекоммендации, которые можно получить из ALS

    Input
    -----
    user_item_matrix: pd.DataFrame
        Матрица взаимодействий user-item
    """

    def __init__(self, data, weighting=True):
        
        # Топ покупок каждого юзера
        self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity', ascending=False, inplace=True) # Это к-во покупок.
        self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != 999999] # исключим из ТОП-а покупок item_id = 999999

        # Топ покупок по всему датасету
        self.overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[self.overall_top_purchases['item_id'] != 999999] # исключим из ТОП-а покупок item_id = 999999
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

        self.user_item_matrix = self._prepare_matrix(data)  # pd.DataFrame
        self.id_to_itemid, self.id_to_userid, \
            self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)

        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T # настроить параметры взвешивния. 

        self.als_model = self.fit(self.user_item_matrix)
        self.own_recommender_model = self.fit_own_recommender(self.user_item_matrix)

    @staticmethod
    def _prepare_matrix(data):
        """Готовит user-item матрицу"""
        user_item_matrix = pd.pivot_table(data,
                                          index='user_id', 
                                          columns='item_id',
                                          values='quantity',  # Можно пробовать другие варианты
                                          aggfunc='mean',
                                          fill_value=0
                                          )

        user_item_matrix = user_item_matrix.astype(float)  # необходимый тип матрицы для implicit

        return user_item_matrix

    @staticmethod
    def _prepare_dicts(user_item_matrix):
        """Подготавливает вспомогательные словари"""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""

        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())

        return own_recommender

    @staticmethod
    def fit(user_item_matrix, n_factors=30, regularization=0.001, iterations=15, num_threads=4):
        """Обучает ALS"""

        model = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads, 
                                        random_state=0)

        model.fit(csr_matrix(user_item_matrix).T.tocsr())

        return model

    def _update_dict(self, user_id):
        """Если появился новыю user / item, то нужно обновить словари"""

        if user_id not in self.userid_to_id.keys():

            max_id = max(list(self.userid_to_id.values()))
            max_id += 1

            self.userid_to_id.update({user_id: max_id})
            self.id_to_userid.update({max_id: user_id})

    def _get_similar_item(self, item_id):
        """Находит товар, похожий на item_id"""
        recs = self.als_model.similar_items(self.itemid_to_id[item_id], N=2)  # Товар похож на себя -> рекомендуем 2 товара
        top_rec = recs[1][0]  # И берем второй (не товар из аргумента метода)
        return self.id_to_itemid[top_rec]

    def _extend_with_als(self, user, recommendations, N=5):
        """Если кол-во рекоммендаций < N, то дополняем их топ-популярными"""

        if len(recommendations) < N:
            recommendations.extend(self.get_als_recommendations(user, N))
            recommendations = recommendations[:N]

        return recommendations

    def get_top_popular(self, N=5):
        """Если кол-во рекоммендаций < N, то дополняем их топ-популярными"""
        recommendations = self.overall_top_purchases[:N]

        return recommendations

    def _extend_with_top_popular(self, recommendations, N=5):
        """Если кол-во рекоммендаций < N, то дополняем их топ-популярными"""

        if len(recommendations) < N:
            recommendations.extend(self.overall_top_purchases[:N])
            recommendations = recommendations[:N]

        return recommendations

    def _get_recommendations(self, user, model, N=5):
        """Рекомендации через стардартные библиотеки implicit"""
        
        self._update_dict(user_id=user)
        
        recs = model.recommend(userid=self.userid_to_id[user],
                                        user_items=csr_matrix(self.user_item_matrix).tocsr(),
                                        N=N,
                                        filter_already_liked_items=False,
                                        filter_items=[self.itemid_to_id[999999]],
                                        recalculate_user=True)
        
        res = [self.id_to_itemid[rec[0]] for rec in recs]

        # res = self._extend_with_top_popular(res, N=N)

        #assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_als_recommendations(self, user, N=5):
        """Рекомендации через стардартные библиотеки implicit"""

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.als_model, N=N)

    def get_own_recommendations(self, user, N=5, extend_with_top_popular=False):
        """Рекомендуем товары среди тех, которые юзер уже купил"""

        self._update_dict(user_id=user)
        recs = self._get_recommendations(user, model=self.own_recommender_model, N=N)
        if extend_with_top_popular:
                # res = [self.id_to_itemid[rec[0]] for rec in recs]
                recs = self._extend_with_top_popular(recs, N=N)
        # assert len(recs) == N, 'Количество рекомендаций != {}'.format(N)
        return recs

    def get_similar_items_recommendation(self, user, N=5):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""

        top_users_purchases = self.top_purchases[self.top_purchases['user_id'] == user].head(N)

        res = top_users_purchases['item_id'].apply(lambda x: self._get_similar_item(x)).tolist()
        res = self._extend_with_top_popular(res, N=N)

        # assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

    def get_similar_users_recommendation(self, user, N=5):
        """Рекомендуем топ-N товаров, среди купленных похожими юзерами"""
        res = []
        
        # Находим топ-N похожих пользователей
        similar_users = self.als_model.similar_users(self.userid_to_id[user], N=N+1)
        similar_users = [rec[0] for rec in similar_users]
        similar_users = similar_users[1:]   # удалим юзера из запроса

        for user in similar_users:
            userid = self.id_to_userid[user] #own recommender works with user_ids
            res.extend(self.get_own_recommendations(userid, N=1))

        res = self._extend_with_top_popular(res, N=N)

        # assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

In [15]:
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

In [16]:
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender 

In [17]:
data = pd.read_csv('C:/Users/1/Downloads/retail_train.csv')
item_features = pd.read_csv('C:/Users/1/Downloads/product.csv')
user_features = pd.read_csv('C:/Users/1/Downloads/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [18]:
N = 150 # к-во товаров получаемых из модели 1-го уровня.
final_predict_count = 30 # К-во рекомендаций выдаваемых
val_count = 5 # финальное к-во репомендаций товаров. На них будет осуществляться подсчет к-ва.
top_items_count = 5000 #

In [19]:
data.head(3)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0


In [20]:
item_features.head(3)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,


In [21]:
user_features.head(3)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8


In [22]:
week_day = {'week_day': []}
# считаем номер недели. поле чего вычисляем записи с номеро дня. 
# определяем номер дня соответствуующий номеру последнему дню недели и после этого начинаем вычетать из него.
# номера номера дней. 
max_week_no = data['week_no'].max()
min_week_no = data['week_no'].min()

week_days=[]
for week_no in range(min_week_no,max_week_no + 1):
    max_day_in_week = data.loc[(data['week_no']==week_no),'day'].max()
    days = data.loc[(data['week_no']==week_no),'day']
    for day in days:
        week_days.append(day-max_day_in_week+7)

data['week_day'] = week_days

In [23]:
# Cхема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)

val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,week_day
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,3
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,3


In [24]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=top_items_count)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [25]:
def get_new_values(old_df, new_df,feature):
    old_values = old_df[feature].unique()
    new_values = new_df[feature].unique()
    appended_values = []

    for value  in new_values: 
        if value not in old_values:
            appended_values.append(value)
        
    appended_values = np.unique(appended_values)
    return appended_values

In [26]:
first_users_count = len(data_train_lvl_1['user_id'].unique()) 
first_items_count = len(data_train_lvl_1['item_id'].unique()) 

new_user_lvl_1 = get_new_values(data_train_lvl_1, data_train_lvl_2 ,'user_id')
new_items_lvl_1 = get_new_values(data_train_lvl_1, data_train_lvl_2 ,'item_id')

new_user_lvl_2 = get_new_values(data_train_lvl_1, data_val_lvl_2 ,'user_id')
new_items_lvl_2 = get_new_values(data_train_lvl_1, data_val_lvl_2 ,'item_id')

print(f'Изначальное к-во: users: {first_users_count}, items: {first_items_count}')
print(f'1-й уровень  users: +{len(new_user_lvl_1)}, items: +{len(new_items_lvl_1)}')
print(f'2-й уровень  users: +{len(new_user_lvl_2)}, items: +{len(new_items_lvl_2)}')

Изначальное к-во: users: 2299, items: 5001
1-й уровень  users: +70, items: +22772
2-й уровень  users: +74, items: +19567


In [27]:
data_gr = data.groupby('basket_id').mean()

In [28]:
# Среднее к-во покупаемых товаров.
user_features['median_quantity'] = user_features['user_id'].apply(lambda x: 
                                data_gr.loc[(data_gr['user_id']==x),'quantity'].median())

# Средний чек.                                
user_features['mean_sales_value'] = user_features['user_id'].apply(lambda x: 
                                data_gr.loc[(data_gr['user_id']==x),'sales_value'].mean())

In [29]:
#  Среднее к-во раз в неделю, которое user ходит в магазин.

for i in [1,2,3,4,5,6,7]:
    data_gr[f"day_{i}"] = np.where((data_gr['week_day'] == i),1,0)
    
week_count = data['week_no'].max()
#-----------------------------------------------
def mean_quantity_in_week(user_id):
    days = 0
    for i in [1,2,3,4,5,6,7]:
       days += data_gr.loc[(data_gr['user_id']==user_id),f'day_{i}'].sum()
    days /=week_count
    return int(np.round(days))

#-----------------------------------------------    
user_features['mean_quantity_in_week'] = user_features['user_id'].apply(lambda x: mean_quantity_in_week(x))

In [30]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id,median_quantity,mean_sales_value,mean_quantity_in_week
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1,1.1,2.726818,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7,1.181818,2.989986,1


In [31]:
### Список категориальных фитчей ктороые мы будем разбирать.
features=['income_desc','age_desc','homeowner_desc','kid_category_desc','household_size_desc','hh_comp_desc']
for feature_name in features:
    print(feature_name)
    print(user_features[feature_name].unique())
    print('-'*20)

income_desc
['35-49K' '50-74K' '25-34K' '75-99K' 'Under 15K' '100-124K' '15-24K'
 '125-149K' '150-174K' '250K+' '175-199K' '200-249K']
--------------------
age_desc
['65+' '45-54' '25-34' '35-44' '19-24' '55-64']
--------------------
homeowner_desc
['Homeowner' 'Unknown' 'Renter' 'Probable Renter' 'Probable Owner']
--------------------
kid_category_desc
['None/Unknown' '1' '2' '3+']
--------------------
household_size_desc
['2' '3' '4' '1' '5+']
--------------------
hh_comp_desc
['2 Adults No Kids' '2 Adults Kids' 'Single Female' 'Unknown'
 'Single Male' '1 Adult Kids']
--------------------


In [32]:
income_desc = {'35-49K':42, '50-74K':62, '25-34K':30, '75-99K':87, 'Under 15K':15, '100-124K':112,
       '15-24K':20, '125-149K':137, '150-174K':162, '250K+':250, '175-199K':187, '200-249K':225}
       
user_features['income_desc'] = user_features['income_desc'].apply(lambda x: income_desc[x]) 

age_desc = {'65+':65, '45-54':50, '25-34':30, '35-44':40, '19-24':21, '55-64':60}	

user_features['age_desc'] = user_features['age_desc'].apply(lambda x: age_desc[x])

In [33]:
household_size_desc = {np.nan: 0, '1':1, '2':2, '3':3, '4':4, '5+':5 }

user_features['household_size_desc'] = user_features['household_size_desc'].apply(lambda x: household_size_desc[x])

In [34]:
kid_category_desc = {'None/Unknown':0, np.nan: 0, '1':1, '2':2, '3+':3 }

user_features['kid_category_desc'] = user_features['kid_category_desc'].apply(lambda x: kid_category_desc[x])

In [35]:
user_features[['hh_comp_desc_female', 'hh_comp_desc_male', 'hh_comp_desc_Adults_Kids']] = 0
user_features['hh_comp_desc_female'] = np.where((user_features['hh_comp_desc'] !='Single Male'), 1, 0)
user_features['hh_comp_desc_male'] = np.where((user_features['hh_comp_desc'] !='Single Female'), 1, 0)
user_features.loc[(user_features['hh_comp_desc']=='2 Adults Kids'), 'hh_comp_desc_Adults_Kids'] = 2
user_features.loc[(user_features['hh_comp_desc']=='1 Adult Kids'), 'hh_comp_desc_Adults_Kids'] = 1
user_features.loc[(user_features['hh_comp_desc'].isna()), ['hh_comp_desc_female','hh_comp_desc_male']] = 0
user_features.loc[(user_features['hh_comp_desc']=='Unknown'), ['hh_comp_desc_female','hh_comp_desc_male']] = 0
user_features.drop('hh_comp_desc', axis=1, inplace=True)

In [36]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,household_size_desc,kid_category_desc,user_id,median_quantity,mean_sales_value,mean_quantity_in_week,hh_comp_desc_female,hh_comp_desc_male,hh_comp_desc_Adults_Kids
0,65,A,42,Homeowner,2,0,1,1.1,2.726818,1,1,1,0
1,50,A,62,Homeowner,2,0,7,1.181818,2.989986,1,1,1,0


In [37]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [38]:
item_features_temp = item_features.merge(data, on='item_id', how='left')

In [39]:
# Средняя стоимость товара в категории.

item_price = item_features_temp.groupby(['item_id','commodity_desc'])['sales_value'].mean().reset_index()
item_price.columns= ['item_id','commodity_desc','sales_value']
commoditys_desc = item_price['commodity_desc'].unique()

item_price['commodity_desc_mean_sale']=np.NaN

for commodity_desc in commoditys_desc:
    mean_value = item_price.loc[(item_price['commodity_desc']==commodity_desc),'sales_value'].mean()
    item_price.loc[(item_price['commodity_desc']==commodity_desc),'commodity_desc_mean_sale'] = mean_value

item_price.loc[(item_price['commodity_desc']=='NO COMMODITY DESCRIPTION'),'sales_value']

item_features = item_features.merge(item_price[['item_id','commodity_desc_mean_sale']], on='item_id',how='left')

In [40]:
# К-во покупок в неделю.
quantity_count = item_features_temp.groupby(['item_id'])['quantity'].sum().reset_index()

quantity_count.columns = ['item_id','quantity']

quantity_in_week = item_features_temp.groupby(['item_id'])['week_no'].unique().reset_index()

quantity_in_week.columns = ['item_id','weeks']

quantity_in_week['weeks_count'] = quantity_in_week['weeks'].apply(lambda x: len(x))

quantity_in_week['sale_in_week'] = quantity_count['quantity']/quantity_in_week['weeks_count']  

item_features = item_features.merge(quantity_in_week[['item_id','sale_in_week']], on='item_id',how='left')

In [41]:
recommender = MainRecommender(data_train_lvl_1)



  0%|          | 0/15 [00:01<?, ?it/s]

  0%|          | 0/2299 [00:00<?, ?it/s]

In [None]:
train_data = perpare_lvl2_1(data_train_lvl_2, data_train_lvl_1, recommender,item_features, user_features, N=N)

In [None]:
test_data = perpare_lvl2_1(data_val_lvl_2, data_train_lvl_1, recommender, item_features, user_features, N=N)

In [None]:
train_data.head(2)

In [None]:
# Получим список катероиальных и числовых признаков.
categorical = []
numerical = []
for col, value in train_data.iteritems():
    if value.dtype == 'object':
        categorical.append(col)
    else:
        numerical.append(col)

In [None]:
print(categorical)

In [None]:
print(numerical)

In [None]:
for feature in categorical:
print(f'{feature}: {len(train_data[feature].unique())}')

In [None]:
features = ['commodity_desc', 'sub_commodity_desc', 'curr_size_of_product']
train_data = train_data.drop(features, axis=1)
test_data = test_data.drop(features, axis=1)

In [None]:
print(train_data['department'].unique())

In [None]:
features = [ 'department',
            'brand',
            #'commodity_desc',
            #'sub_commodity_desc',
            #'curr_size_of_product',
            'marital_status_code',
            'homeowner_desc',
            # 'hh_comp_desc',
            # 'household_size_desc',
            # 'kid_category_desc'
           ]

In [None]:
train_data = category_to_digit(train_data, features)

In [None]:
test_data = category_to_digit(test_data, features)

### Обучение модели

In [None]:
TASK = Task('binary', loss='logloss', metric='auc', greater_is_better=False)
TIMEOUT = 300000
N_THREADS = 4
N_FOLDS = 5
RANDOM_STATE = 27
TARGET_NAME = 'target'
TEST_SIZE=0.2

In [None]:
roles = {'target': TARGET_NAME, 'drop': ['user_id', 'item_id']}

In [None]:
automl_model = TabularAutoML(task=TASK,
                            timeout=TIMEOUT,
                            cpu_limit = N_THREADS,
                            # gpu_ids='all',
                            reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                             
                            general_params={'use_algos': [ ['lgb_tuned', 'cb_tuned'] ]},
                             
                            tuning_params={'max_tuning_iter': 10},
                      )

In [None]:
train_preds = automl_model.fit_predict(train_data, roles = roles)

In [None]:
train_preds = train_preds.data[:, 0]

### Предсказания

In [None]:
def get_items(x_data, items, user_id, item_name, N=5, overall_top_purchases=None):
    items_list = []
 
    for item in items:
        flag = (x_data.loc[((x_data['user_id']==user_id) & (x_data['item_id']==item)),item_name].mean())
        
        if (flag > 0.3):
            items_list.append(item)

    if not(overall_top_purchases is None):

        if len(items_list) < N:
            items_list.extend(overall_top_purchases[:N])
        items_list = items_list[:N]
    return items_list

In [None]:
def get_final_recomendations(x_data, y_data, preds):
    x_data = x_data.copy()
    x_data['predict'] = preds
    x_data['actual'] = y_data['target'].values

    result = x_data.sort_values('predict', ascending=False).groupby('user_id')['item_id'].unique().reset_index()

    overall_top_purchases = x_data.groupby('item_id')['item_id'].count()
    overall_top_purchases = overall_top_purchases.sort_values(ascending=False).index.values

    result_df= {'user_id':[], 'actual':[], 'predict':[]}

    for res in tqdm(result.iterrows()):
        user_id = res[1]['user_id']
        item_ids = res[1]['item_id']
        actual = get_items(x_data, item_ids, user_id, 'actual', N=final_predict_count)
        if len(actual)>0:
            result_df['user_id'].append(user_id)
            predict_items= get_items(x_data, item_ids, user_id, 'predict', N=final_predict_count, overall_top_purchases = overall_top_purchases)
            result_df['predict'].append(postfilter_items(predict_items, item_features, N=val_count)) # Бизнес-ограничения. ^_^
            result_df['actual'].append(actual)
    return pd.DataFrame(result_df) 

In [None]:
X_train = train_data.drop('target', axis=1)
y_train = train_data[['target']]

In [None]:
result_train = get_final_recomendations(X_train, y_train, train_preds)

result_train.head(3)

In [None]:
precision_train = result_train.apply(lambda row: precision_at_k(row['predict'], row['actual']), axis=1).mean()
print(f'Train precision: {precision_train:.03}')

In [None]:
test_preds = automl_model.predict(test_data).data[:,0]

In [None]:
X_test = test_data.drop('target', axis=1)
y_test = test_data[['target']]

In [None]:
result_test = get_final_recomendations(X_test, y_test, test_preds)

result_test.head(3)

In [None]:
precision_test = result_test.apply(lambda row: precision_at_k(row['predict'], row['actual']), axis=1).mean()
print(f'Test precision: {precision_test:.03}')

In [None]:
result_test.to_csv('finally_prediction_lama_classifaer.csv', index=False)

In [None]:
import pickle

with open('automl_model_classifaer.pickle', 'wb') as f:
    pickle.dump(automl_model, f, protocol=pickle.HIGHEST_PROTOCOL)