In [1]:
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import SVDpp
from surprise import KNNBasic
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
from tqdm.auto import tqdm
tqdm.pandas()

import pickle
import matplotlib.pyplot as plt

In [87]:
# Restore the rating model used by every prediction cell below
# (presumably an SVD model fitted elsewhere, judging by the file name).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("research/models/SVD.pkl", "rb") as model_file:
    algo = pickle.load(model_file)

In [3]:
# Candidate group ids come from a headerless file; it is flattened to a
# plain list in the next cell.
group_to_recommend = pd.read_csv(
    'research/raw_data/group_list_to_recommend.csv',
    header=None,
)

# The three modelling tables are ';'-separated.
pca_groups = pd.read_csv('research/model_data/pca_groups.csv', sep=';')
pca_users = pd.read_csv('research/model_data/pca_users.csv', sep=';')
train = pd.read_csv('research/model_data/merged_historical_data.csv', sep=';')

In [4]:
# Flatten the loaded table into a plain Python list of group ids.
# np.asarray accepts both the DataFrame and an already-flattened list, so
# re-running this cell no longer fails with "'list' object has no attribute
# 'values'".
group_to_recommend = np.asarray(group_to_recommend).ravel().tolist()

In [30]:
# Keep only the candidate groups that actually occur in the training data.
# unique() already yields a flat array, so it converts straight to a list.
group_to_recommend = (
    train.loc[train['group_id'].isin(group_to_recommend), 'group_id']
    .unique()
    .tolist()
)

In [91]:
# Two nearest-neighbour indexes with identical settings.
neigh_groups = NearestNeighbors(n_neighbors=30, radius=50)
neigh_users = NearestNeighbors(n_neighbors=30, radius=50)

# The group index only sees rows of recommendable groups, so the positions it
# returns refer to this filtered view rather than to the whole pca_groups frame.
neigh_groups.fit(
    pca_groups.loc[pca_groups['group_id'].isin(group_to_recommend)].drop(columns=['group_id'])
)

# The user index is fitted on every row of pca_users (id column removed).
neigh_users.fit(
    pca_users.drop(columns=['user_id'])
)

In [92]:
# Persist both fitted neighbour indexes next to the other model artefacts.
with open("research/models/neigh_users.pkl", "wb") as users_file:
    pickle.dump(neigh_users, users_file)

with open("research/models/neigh_groups.pkl", "wb") as groups_file:
    pickle.dump(neigh_groups, groups_file)

In [93]:
def get_similar_items(user_items):
    """Return ids of recommendable groups closest to the given groups.

    Parameters
    ----------
    user_items : iterable
        Raw group ids (values of ``pca_groups.group_id``) to find neighbours for.

    Returns
    -------
    numpy.ndarray
        Unique group ids of the neighbours, in the order ``neigh_groups``
        returned them. Empty when none of ``user_items`` is present in
        ``pca_groups``.

    Notes
    -----
    ``neigh_groups`` is fitted on ``pca_groups`` filtered by
    ``group_to_recommend``; this function assumes ``group_to_recommend`` has
    not changed since that fit.
    """
    # Positions returned by kneighbors index into the frame the model was
    # fitted on (the filtered one), not into the full pca_groups frame.
    candidates = pca_groups[pca_groups.group_id.isin(group_to_recommend)]
    x = pca_groups[pca_groups.group_id.isin(user_items)].drop(columns=['group_id'])
    if x.empty or candidates.empty:
        # kneighbors rejects an empty query; there is nothing to recommend from.
        return candidates['group_id'].values[:0]
    # Asking for more neighbours than fitted rows raises in scikit-learn.
    n_neighbors = min(30, len(candidates))
    indexes = neigh_groups.kneighbors(x, n_neighbors=n_neighbors, return_distance=False).ravel()
    similar_items = candidates.iloc[indexes]['group_id'].drop_duplicates().values
    return similar_items

def get_similar_users(user):
    """Return the ids of the users nearest to ``user`` in ``pca_users`` space.

    Parameters
    ----------
    user
        A single raw user id (a value of ``pca_users.user_id``).

    Returns
    -------
    numpy.ndarray
        Unique user ids of up to 30 neighbours, in the order ``neigh_users``
        returned them.
    """
    # Feature row(s) of the requested user, without the id column.
    query_rows = pca_users.loc[pca_users['user_id'].isin([user])]
    query_features = query_rows.drop(columns=['user_id'])

    neighbour_positions = neigh_users.kneighbors(
        query_features,
        n_neighbors=30,
        return_distance=False,
    )
    flat_positions = neighbour_positions.reshape(1, -1).tolist()[0]

    # neigh_users is fitted on the full pca_users frame, so positions map directly.
    neighbour_ids = pca_users.iloc[flat_positions]['user_id']
    return neighbour_ids.drop_duplicates().values

In [94]:
# The rating is the number_of_lessons column; its scale is declared as 1..94.
reader = Reader(rating_scale=(1, 94))

data = Dataset.load_from_df(
    train.loc[:, ['user_id', 'group_id', 'number_of_lessons']],
    reader,
)

# Build the trainset from every available rating (no hold-out split).
Set = data.build_full_trainset()

In [95]:
# Ten most attended recommendable groups, used as the fallback for users the
# model has never seen. The per-group counts must be ranked before taking ten:
# slicing the groupby result directly would return the ten smallest group ids.
popular_groups = list(
    train[train['group_id'].isin(group_to_recommend)]
    .groupby('group_id')['user_id']
    .count()
    .nlargest(10)
    .index
)

In [96]:
def predict_for_users(users, recommend_groups, k=10, exclude_user_items=False, use_similar_items=False):
    """Rank recommendable groups for each user with the fitted ``algo``.

    Parameters
    ----------
    users : iterable
        Raw user ids to score.
    recommend_groups : iterable
        Raw group ids that are allowed to appear in the result. Only groups
        that are also present in the trainset ``Set`` are scored.
    k : int or None, default 10
        Maximum number of groups per user. ``None`` or a negative value
        returns every ranked candidate (previously ``k=-1`` silently dropped
        the lowest-ranked one).
    exclude_user_items : bool, default False
        Skip groups the user already has a rating for in ``Set``.
    use_similar_items : bool, default False
        Rank only the groups returned by ``get_similar_items`` for the user's
        own groups. If that yields fewer than ``k`` groups, the list is topped
        up with the best remaining allowed groups.

    Returns
    -------
    list of list of tuple
        One list per entry of ``users``, in input order. Each element is
        ``(raw_user_id, raw_group_id, score)``, best score first. Users that
        are not part of ``Set`` get ``popular_groups`` with a score of 1.
    """
    unlimited = k is None or k < 0
    fill_value = Set.global_mean
    allowed_groups = set(recommend_groups)

    # Raw id -> inner id of every trainset item that may be recommended.
    # Built once because it does not depend on the user being scored.
    candidate_pool = {}
    for inner_iid in Set.all_items():
        raw_iid = Set.to_raw_iid(inner_iid)
        if raw_iid in allowed_groups:
            candidate_pool[raw_iid] = inner_iid

    def rank(raw_uid, inner_iids):
        """Score the given items for one user; return (uid, iid, est) best-first."""
        testset = [(raw_uid, Set.to_raw_iid(iid), fill_value) for iid in inner_iids]
        scored = sorted(algo.test(testset), key=lambda pred: pred.est, reverse=True)
        return [(pred.uid, pred.iid, pred.est) for pred in scored]

    predicted_groups = []
    for uid in tqdm(users):
        try:
            inner_uid = Set.to_inner_uid(uid)
        except ValueError:
            # Unknown user: the trainset lookup raises, so fall back to the
            # popular groups instead of rebuilding the full user list per user.
            fallback = [(uid, group, 1) for group in popular_groups]
            predicted_groups.append(fallback if unlimited else fallback[:k])
            continue

        raw_uid = Set.to_raw_uid(inner_uid)
        user_items = [item for (item, _) in Set.ur[inner_uid]]
        seen = set(user_items)

        if use_similar_items:
            similar_items = get_similar_items([Set.to_raw_iid(iid) for iid in user_items])
            candidates = [candidate_pool[raw] for raw in similar_items if raw in candidate_pool]
        else:
            candidates = list(candidate_pool.values())
        if exclude_user_items:
            candidates = [iid for iid in candidates if iid not in seen]

        predictions = rank(raw_uid, candidates)
        if not unlimited:
            predictions = predictions[:k]
            if len(predictions) < k:
                # Top up from the remaining allowed groups, never repeating a
                # group that is already in the list.
                taken = {iid for _, iid, _ in predictions}
                filler = [
                    inner
                    for raw, inner in candidate_pool.items()
                    if raw not in taken and not (exclude_user_items and inner in seen)
                ]
                predictions += rank(raw_uid, filler)[:k - len(predictions)]
        predicted_groups.append(predictions)
    return predicted_groups

In [97]:
# Users to score; the participant-id column is exposed as user_id for lookup.
test = pd.read_csv('research/raw_data/test.csv')
users_to_score = (
    test
    .rename(columns={'уникальный номер участника': 'user_id'})
    .loc[:, 'user_id']
    .values
)

In [98]:
# Top-10 groups per test user, restricted to similar groups the user has not
# attended yet.
predicted_groups = predict_for_users(
    users=users_to_score,
    recommend_groups=group_to_recommend,
    k=10,
    exclude_user_items=True,
    use_similar_items=True,
)

  0%|          | 0/200 [00:00<?, ?it/s]

In [99]:
# Keep only the group id of every (user, group, score) triple, as text.
predicted_groups = [
    [str(group_id) for _user, group_id, _score in user_predictions]
    for user_predictions in predicted_groups
]

In [100]:
# One comma-separated string of group ids per user.
predicted_groups = list(map(', '.join, predicted_groups))

In [101]:
# Attach the recommendations to the test frame and write the submission file.
test['уникальный номер группы'] = predicted_groups
test.to_csv(
    'submission.csv',
    sep=';',
    index=False,
)

In [5]:
# Tables prepared for the backend; both are ';'-separated.
raw_users = pd.read_csv('research/processed_data_2_stage/users.csv', sep=';')
groups = pd.read_csv('research/backend_data/groups.csv', sep=';')

In [39]:
# Add a popularity column: the number of training rows (non-null user_id) per
# group. The left join keeps every group; fillna(0) then replaces every NaN in
# the merged frame, including the popularity of groups absent from train.
# NOTE(review): re-running this cell merges a second popularity column.
groups = (
    groups
    .merge(
        train
        .groupby('group_id', as_index=False)['user_id']
        .count()
        .rename(columns={'user_id': 'popularity'}),
        how='left',
        on='group_id',
    )
    .fillna(0)
)

In [103]:
# Copy the processed user table, unchanged, into the backend data folder.
raw_users.to_csv(
    'research/backend_data/users.csv',
    index=False,
    sep=';',
)

## Get each user's recommendation score for every group

In [104]:
# NOTE(review): demo_user_data is not created in any visible cell - it must
# exist in the kernel before this cell runs; confirm where it comes from.
# Similar-item filtering is off here, so candidates are not narrowed to
# neighbours. k=-1 is forwarded as-is; how a negative k is handled is decided
# inside predict_for_users.
predicted_groups = predict_for_users(
    users=demo_user_data.user_id.values,
    recommend_groups=group_to_recommend,
    k=-1,
    exclude_user_items=True,
    use_similar_items=False,
)

# Flatten the per-user lists into one list of (user, group, score) rows.
predicted_groups = [
    (user_id, group_id, group_score)
    for user_predictions in predicted_groups
    for (user_id, group_id, group_score) in user_predictions
]

  0%|          | 0/3 [00:00<?, ?it/s]

In [105]:
# Store the per-group scores for the backend ranking step.
final = pd.DataFrame(
    data=predicted_groups,
    columns=['user_id', 'group_id', 'score'],
)
final.to_csv(
    'research/backend_data/group_scores_to_rank.csv',
    index=False,
    sep=';',
)

## Get recommendations

In [106]:
# Top-10 recommendations per demo user, drawn from groups similar to the ones
# the user already attends.
# NOTE(review): demo_user_data is not created in any visible cell - confirm
# where it comes from.
predicted_groups = predict_for_users(
    users=demo_user_data.user_id.values,
    recommend_groups=group_to_recommend,
    k=10,
    exclude_user_items=True,
    use_similar_items=True,
)

# Flatten the per-user lists into one list of (user, group, score) rows.
predicted_groups = [
    (user_id, group_id, group_score)
    for user_predictions in predicted_groups
    for (user_id, group_id, group_score) in user_predictions
]

  0%|          | 0/3 [00:00<?, ?it/s]

In [107]:
# Store the best recommendations for the backend.
final = pd.DataFrame(
    data=predicted_groups,
    columns=['user_id', 'group_id', 'score'],
)
final.to_csv(
    'research/backend_data/predict_best.csv',
    index=False,
    sep=';',
)

### Predictions to expand interests

In [108]:
# Request a long ranked list (up to 300 groups) per demo user.
# NOTE(review): demo_user_data is not created in any visible cell - confirm
# where it comes from.
predicted_groups = predict_for_users(
    users=demo_user_data.user_id.values,
    recommend_groups=group_to_recommend,
    k=300,
    exclude_user_items=True,
    use_similar_items=True,
)

# Keep only the last ten entries of each user's list and flatten them into
# (user, group, score) rows.
predicted_groups = [
    (user_id, group_id, group_score)
    for user_predictions in predicted_groups
    for (user_id, group_id, group_score) in user_predictions[-10:]
]

  0%|          | 0/3 [00:00<?, ?it/s]

In [109]:
# Store the interest-expanding recommendations for the backend.
final = pd.DataFrame(
    data=predicted_groups,
    columns=['user_id', 'group_id', 'score'],
)
final.to_csv(
    'research/backend_data/predict_to_expand.csv',
    index=False,
    sep=';',
)