In [None]:
import pandas as pd
import numpy as np
from implicit.nearest_neighbours import TFIDFRecommender
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import top_k_accuracy_score, accuracy_score

Task URL:
    https://cups.online/ru/training/8/tasks/1808

In [None]:
# Read the three CSV inputs from the working directory in a single pass;
# the unpacking order matches the order of the paths below.
train, test, user_features = map(
    pd.read_csv,
    ('./train.csv', './test.csv', './user_features.csv'),
)

In [None]:
# Preview the table loaded from user_features.csv (rich display of the last expression).
user_features

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,...,22,23,24,25,26,27,28,29,30,31
0,0,0.000695,-0.001573,-0.001470,0.002085,-0.000498,0.000685,0.000811,0.000666,-0.003031,...,-0.004196,-0.000698,0.001121,-0.001079,-0.001993,-0.001993,0.000422,-0.001168,-0.001168,0.000297
1,1,0.001204,-0.002725,-0.002546,0.003612,-0.000862,0.001187,0.001404,0.001154,-0.005251,...,-0.007268,-0.001209,0.001942,-0.001870,-0.003451,-0.003451,0.000732,-0.002023,-0.002023,0.000515
2,2,0.000491,-0.001112,-0.001039,0.001475,-0.000352,0.000484,0.000573,0.000471,-0.002144,...,-0.002967,-0.000494,0.000793,-0.000763,-0.001409,-0.001409,0.000299,-0.000826,-0.000826,0.000210
3,3,0.000777,-0.001759,-0.001643,0.002332,-0.000557,0.000766,0.000906,0.000745,-0.003389,...,-0.004691,-0.000781,0.001254,-0.001207,-0.002228,-0.002228,0.000472,-0.001306,-0.001306,0.000332
4,4,0.000695,-0.001573,-0.001470,0.002085,-0.000498,0.000685,0.000811,0.000666,-0.003031,...,-0.004196,-0.000698,0.001121,-0.001079,-0.001993,-0.001993,0.000422,-0.001168,-0.001168,0.000297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,492,0.000983,-0.002225,-0.002079,0.002949,-0.000704,0.000969,0.001147,0.000942,-0.004287,...,-0.005934,-0.000988,0.001586,-0.001527,-0.002818,-0.002818,0.000597,-0.001652,-0.001652,0.000421
493,493,0.001300,-0.002943,-0.002750,0.003901,-0.000931,0.001282,0.001517,0.001246,-0.005671,...,-0.007850,-0.001306,0.002098,-0.002020,-0.003728,-0.003728,0.000790,-0.002185,-0.002185,0.000556
494,494,0.000491,-0.001112,-0.001039,0.001475,-0.000352,0.000484,0.000573,0.000471,-0.002144,...,-0.002967,-0.000494,0.000793,-0.000763,-0.001409,-0.001409,0.000299,-0.000826,-0.000826,0.000210
495,495,0.000983,-0.002225,-0.002079,0.002949,-0.000704,0.000969,0.001147,0.000942,-0.004287,...,-0.005934,-0.000988,0.001586,-0.001527,-0.002818,-0.002818,0.000597,-0.001652,-0.001652,0.000421


In [8]:
# Preview the training interactions loaded from train.csv.
train

Unnamed: 0,user_id,friend_id,friendship,timestamp
0,140,342,0,1490936622
1,378,172,1,1490936628
2,150,182,0,1490936650
3,455,17,0,1490936704
4,350,409,0,1490936735
...,...,...,...,...
8669,161,312,0,1491215519
8670,406,208,0,1491215543
8671,196,43,0,1491215576
8672,84,100,0,1491215579


In [9]:
# Preview the test rows loaded from test.csv.
test

Unnamed: 0,user_id,timestamp
0,166,1490944431
1,26,1490957371
2,41,1490958147
3,286,1490971255
4,108,1490976836
...,...,...
492,190,1491214814
493,181,1491214829
494,448,1491214928
495,124,1491215197


### TFIDFRecommender 

Подготовка данных для implicit.

Implicit ожидает sparse matrix в формате (items x users), где items = friends

In [None]:
# Build the sparse friend x user interaction matrix.
# All derived columns are written to a copy, so the raw `train` frame stays
# exactly as loaded and this cell can be re-run safely.
train_weighted = train.copy()

# Stable, sorted id lists define the row (friend) and column (user) order.
unique_friends = sorted(train_weighted.friend_id.unique())
unique_users = sorted(train_weighted.user_id.unique())

friend_to_idx = {friend_id: idx for idx, friend_id in enumerate(unique_friends)}
user_to_idx = {user_id: idx for idx, user_id in enumerate(unique_users)}

# Confirmed friendships (friendship == 1) count three times as much as the rest.
train_weighted['weight'] = np.where(train_weighted.friendship == 1, 3.0, 1.0)

# Vectorised id -> matrix position lookup (every id is present in the dicts).
rows = train_weighted.friend_id.map(friend_to_idx).to_numpy()
cols = train_weighted.user_id.map(user_to_idx).to_numpy()
data = train_weighted.weight.to_numpy()

# Repeated (friend, user) pairs are summed by scipy when the CSR matrix is built.
friend_user_matrix = csr_matrix(
    (data, (rows, cols)),
    shape=(len(friend_to_idx), len(user_to_idx)),
)

print(f'friend x user matrix: {friend_user_matrix.shape}') 
print(f'unique users: {len(unique_users)}') 
print(f'unique friends: {len(unique_friends)}')

friend x user matrix: (444, 497)
unique users: 497
unique friends: 444


In [None]:
# Dense, labelled view of the sparse matrix: one row per friend id,
# one column per user id, in the same order used to build the matrix.
pd.DataFrame(
    data=friend_user_matrix.toarray(),
    index=unique_friends,
    columns=unique_users,
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,487,488,489,490,491,492,493,494,495,496
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
440,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Fit a TF-IDF nearest-neighbour recommender on the friend x user matrix.
# K=50 is presumably the number of neighbours kept per entry — confirm in the implicit docs.
# NOTE(review): the recorded progress bar counts 497 steps, i.e. the number of
# user columns, so the neighbours appear to be computed between users — confirm
# against the installed implicit version.
model = TFIDFRecommender(K=50)
model.fit(friend_user_matrix)

print('Recommender has been fitted!')

100%|██████████| 497/497 [00:00<00:00, 92775.34it/s]

Recommender has been fitted!





In [None]:
def _popular_friends(interactions):
    """Return the friend ids in `interactions`, most frequent first (ties by ascending id)."""
    counts = interactions.groupby('friend_id').size()
    # mergesort is stable, so equally popular friends keep groupby's ascending-id order.
    return counts.sort_values(ascending=False, kind='mergesort').index.tolist()


def recommend(user_id, top_n=20, n_neighbors=30):
    """Recommend up to `top_n` friend ids for `user_id`.

    Relies on the notebook-level globals `train`, `model`, `user_to_idx`
    and `unique_users`.

    Known users: the `n_neighbors` most similar users are fetched from the
    fitted model, and every friend of a neighbour that the target user has
    not interacted with yet is scored with
    ``similarity * (3.0 if confirmed friendship else 1.0)``, summed over
    neighbours. Candidates are returned best-first; if fewer than `top_n`
    are found, the list is topped up with the most popular confirmed friends.

    Unknown users (not in `user_to_idx`): the most popular confirmed friends
    are returned, extended with globally popular friends when needed.

    Parameters
    ----------
    user_id : id of the user to recommend for.
    top_n : maximum number of recommendations to return.
    n_neighbors : number of similar users requested from the model.

    Returns
    -------
    list of friend ids, best first, at most `top_n` long (it may be shorter
    when the data does not contain enough candidates).
    """
    confirmed = train[train.friendship == 1]

    # Cold start: no interaction history, so fall back to popularity.
    if user_id not in user_to_idx:
        popular = _popular_friends(confirmed)
        if len(popular) < top_n:
            already = set(popular)
            popular.extend(fid for fid in _popular_friends(train) if fid not in already)
        return popular[:top_n]

    user_history = set(train.loc[train.user_id == user_id, 'friend_id'])

    # implicit names the neighbour-count argument `N`; the call is expected to
    # return a pair (indices, scores), as the unpacking below requires.
    neighbour_idxs, neighbour_scores = model.similar_items(user_to_idx[user_id], N=n_neighbors)

    friend_scores = {}
    for neighbour_idx, score in zip(neighbour_idxs, neighbour_scores):
        neighbour_id = unique_users[neighbour_idx]
        if neighbour_id == user_id:
            # The user is usually their own nearest neighbour; all of their
            # friends are already in the history, so there is nothing to add.
            continue

        neighbour_rows = train[train.user_id == neighbour_id]
        for fid, friendship in zip(neighbour_rows.friend_id, neighbour_rows.friendship):
            if fid in user_history:
                continue
            weight = score * (3.0 if friendship == 1 else 1.0)
            friend_scores[fid] = friend_scores.get(fid, 0.0) + weight

    # Highest aggregated score first; keep only the ids.
    ranked = sorted(friend_scores.items(), key=lambda item: item[1], reverse=True)
    recommendations = [fid for fid, _ in ranked[:top_n]]

    # Top up with popular confirmed friends the user has not seen yet.
    if len(recommendations) < top_n:
        taken = user_history.union(recommendations)
        for fid in _popular_friends(confirmed):
            if len(recommendations) >= top_n:
                break
            if fid not in taken:
                recommendations.append(fid)
                taken.add(fid)

    return recommendations[:top_n]


In [None]:
# Build the submission: one row per test user, holding the user id followed
# by exactly TOP_N recommended friend ids.
TOP_N = 20
PAD_FRIEND_ID = 0  # filler used when fewer than TOP_N recommendations are available

result_implicit = []

# Predictions are made for the users listed in `test`, in file order.
for uid in test.user_id:
    pred = list(recommend(uid, top_n=TOP_N))
    # Force a fixed row width: pad short lists, truncate long ones.
    pred = (pred + [PAD_FRIEND_ID] * TOP_N)[:TOP_N]
    # Each row starts with the user id so it lines up with the 'user_id' column.
    result_implicit.append([uid] + pred)

submission_implicit = pd.DataFrame(
    result_implicit,
    columns=['user_id'] + [str(i) for i in range(TOP_N)],
)
submission_implicit.to_csv('submission_implicit.csv', index=False)
print("submission_implicit.csv создан!")
submission_implicit