In [1]:
import numpy as np
import pandas as pd

In [2]:
users = pd.read_csv('data/users.csv', sep=';', index_col=None, 
                    dtype={'age': str, 'chb': str, 'chit_type': str, 'gender': str})
items = pd.read_csv('data/items.csv', sep=';', index_col=None, 
                    dtype={'author': str, 'bbk': str, 'izd': str, 'sys_numb': str, 'title': str, 'year_izd': str})
transactions = pd.read_csv('data/train_transactions_extended.csv', sep=';', index_col=None,
                           dtype={'chb': str, 'date_1': str, 'is_printed': str, 'is_real': str, 'source': str,
                                  'sys_numb': str, 'type': str})

# Топ 20 книг не являющихся открыткой к рождеству

In [3]:
christ_books = items[['С Рождеством Христовым'.lower() in str(x).lower()
                      for x in items['title'].values]]['sys_numb'].unique()

mask = ~transactions['sys_numb'].isin(christ_books)
top_books = transactions.loc[mask, 'sys_numb'].value_counts().head(20).index.to_list()
print(top_books)

['RSL07000449201', 'RSL01004357029', 'RSL01006724299', 'RSL01010175347', 'RSL01007886433', 'RSL01004400764', 'RSL01002745675', 'RSL07000372451', 'RSL01004915983', 'RSL01004246481', 'RSL01006754569', 'RSL01002496664', 'RSL01002884697', 'RSL01001842474', 'RSL01000193246', 'RSL01009438671', 'RSL01003955862', 'RSL01002463969', 'RSL01000741775', 'RSL01009740053']


# Удаление повторного прочтения книг

In [4]:
print(transactions.shape)
transactions = transactions.groupby(['chb', 'sys_numb']).first().reset_index()
print(transactions.shape)

(259566, 7)
(229002, 7)


In [5]:
# Функция прогнозирования на основе похожести пользователей

def predict_user(user_data, transactions, top_books):
    user_number = user_data['chb'].values[0]
    user_books = set(user_data['sys_numb'].unique())
    
    # Транзакции по пользователям, которые прочитали как минимум одну книгу из списка для целевого пользователя
    cross_users_data = transactions[(transactions['chb'] != user_number) & 
                                    (transactions['sys_numb'].isin(user_books))][['chb', 'sys_numb']]    
    
    if cross_users_data.shape[0] > 0:
        cross_users_rates = cross_users_data['chb'].value_counts()
        cross_users_rates.name = 'score'
        
        rate = transactions['chb'].value_counts().apply(np.log1p)
        cross_users_rates = cross_users_rates.apply(np.log1p)  
        cross_users_rates /= rate.loc[cross_users_rates.index]   

        cross_users = cross_users_rates.index.to_list()   
        cross_users_books =  transactions[(transactions['chb'].isin(cross_users)) & 
                                          (~transactions['sys_numb'].isin(user_books))]        

        score_books = pd.merge(cross_users_books, cross_users_rates, left_on='chb', right_index=True, how='left')
        top20 = score_books.groupby('sys_numb').sum().sort_values('score', ascending=False).head(20).index.to_list()
        top20 = (top20 + top_books)[:20]        
    else:
        return top_books
    
    return top20

In [6]:
%%time
solution = transactions.groupby('chb').apply(predict_user, transactions, top_books)

Wall time: 11min 43s


In [7]:
solution.name = 'sys_numb'
solution = solution.reset_index()
solution['sys_numb'] = solution['sys_numb'].replace({' ': top_books})
solution = solution.explode('sys_numb')
solution

Unnamed: 0,chb,sys_numb
0,100000641403,RSL01002393437
0,100000641403,RSL01003441482
0,100000641403,RSL02000000555
0,100000641403,RSL01008740130
0,100000641403,RSL01008477654
...,...,...
16752,400001035059,RSL01005094668
16752,400001035059,RSL01006576592
16752,400001035059,RSL01006672291
16752,400001035059,RSL01008771476


In [8]:
solution.to_csv('recomendations.csv', index=False, sep=';')