In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score, average_precision_score

import clearml

from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the ClearML credentials used below — confirm.
load_dotenv()

In [None]:
# Read the train/test rating splits and the three user->group tables
# (one table per group size) from parquet files under `data_path`.
data_path = 'data/'
train_data, test_data = (
    pd.read_parquet(data_path + file_name)
    for file_name in ('ratings_train.pq', 'ratings_test.pq')
)
groups5, groups6, groups7 = (
    pd.read_parquet(data_path + file_name)
    for file_name in ('groups5.pq', 'groups6.pq', 'groups7.pq')
)

In [None]:
# Attach each user's group id for group sizes 5, 6 and 7 to the test ratings.
# Every groups frame is joined on `userId` and its `group` column is renamed to
# `group5` / `group6` / `group7`. The merge uses pandas' default inner join, so
# only users present in the groups frame are kept.
for i, group in enumerate([groups5, groups6, groups7]):
    group_column = f'group{i+5}'
    # `test_data` is overwritten in place, so re-running this cell would merge the
    # same table again and produce a duplicate `group{N}` column, which breaks the
    # per-user aggregation further down. Skip tables that are already attached.
    if group_column in test_data.columns:
        continue
    test_data = test_data.merge(group, on='userId').rename(columns={'group': group_column})
test_data

In [None]:
# Create the ClearML task object for this baseline run; it is used later to
# upload the results table and is closed in the last cell.
task = clearml.Task.init(
    project_name='MoviesGRS_MFDP',
    task_name='PopularMoviesRecommender',
    tags=['Baseline', 'PopularMoviesRecommender'],
)

In [None]:
# Popularity table: one row per movie with the number of distinct users that
# rated it in the train split, most popular first.
top_popular_films = (
    train_data
    .groupby('movieId')
    .agg(userCount=('userId', 'nunique'))
    .sort_values('userCount', ascending=False)
    .reset_index()
)
top_popular_films

In [None]:
# Movie ids in descending popularity order, as a plain array for fast masking.
top_popular_films_ids = top_popular_films['movieId'].values

In [None]:
def _first_unwatched(watched_ids):
    """Return the 10 most popular movie ids that are absent from `watched_ids`."""
    not_seen = ~np.isin(top_popular_films_ids, watched_ids)
    return top_popular_films_ids[not_seen][:10]


# One row per train user with the list of movie ids that user rated.
users_watch_history_train = (
    train_data
    .groupby('userId')['movieId']
    .agg(list)
    .reset_index()
)
# Candidate recommendations: popular movies the user has not rated in train.
users_watch_history_train['unwatched'] = (
    users_watch_history_train['movieId'].apply(_first_unwatched)
)
users_watch_history_train

In [None]:
# Keep only the user id and its recommended movie ids, under the column name
# `recommendation` that the evaluation cells expect.
recommendation = (
    users_watch_history_train
    .rename(columns={'unwatched': 'recommendation'})
    [['userId', 'recommendation']]
)
recommendation

In [None]:
# Display the test ratings (with the group columns merged in above) for inspection.
test_data

In [None]:
# Per-user view of the test split: the user's group id for each group size
# (first value seen per user) plus the list of movie ids the user rated,
# collected after ordering the ratings from highest to lowest.
test_agg_spec = {f'group{i}': 'first' for i in range(5, 8)}
test_agg_spec['movieId'] = list

test_ratings_desc = test_data.sort_values(by='rating', ascending=False)
users_watch_history_test: pd.DataFrame = (
    test_ratings_desc
    .groupby(by='userId')
    .agg(test_agg_spec)
    .reset_index()
)
users_watch_history_test

In [None]:
# Scratch check of np.isin semantics: yields a boolean mask over the first
# argument marking which of its elements occur in the second.
# NOTE(review): no state is created here; this looks like a leftover experiment
# and can likely be deleted.
np.isin([1, 2, 3], [2])

In [None]:
# Join each test user's history with the recommendations built from train.
recom_and_history_test = pd.merge(users_watch_history_test, recommendation, on='userId')

In [None]:
# NOTE(review): `rec_array` is not referenced again in the visible cells.
rec_array = recommendation.values


def _hit_flags(row):
    """Return a 0/1 array marking which recommended ids occur in the row's test movie list."""
    return np.isin(row['recommendation'], row['movieId']).astype(int)


# Binary relevance per recommended position, in recommendation order.
recom_and_history_test['relevance'] = recom_and_history_test.apply(_hit_flags, axis=1)
recom_and_history_test

In [None]:
def _average_precision_at_k(relevance, n_test_items):
    """Average precision of one ranked recommendation list.

    relevance    -- 0/1 array, one flag per recommended position (rank order).
    n_test_items -- number of movies the user has in the test split.

    Precision is accumulated only at the positions that are hits and the sum is
    normalised by min(n_test_items, number of recommended items).
    """
    hits = np.asarray(relevance)
    precision_at_hits = np.cumsum(hits) * hits / np.arange(1, len(hits) + 1)
    return precision_at_hits.sum() / min(n_test_items, len(hits))


# Per-user AP@k, rounded to 2 decimals as before.
# The original call passed a stray positional `2` to Series.apply, which landed in
# the deprecated `convert_dtype` slot; it had no effect on the result and would be
# misinterpreted as `args` once that parameter is removed, so it is dropped here.
# The column is now written once instead of being overwritten three times with
# values of different meaning.
# NOTE(review): rounding per-user values before they are averaged slightly
# distorts the aggregated MAP — consider rounding only when reporting.
recom_and_history_test['P_k'] = recom_and_history_test.apply(
    lambda row: np.around(
        _average_precision_at_k(row['relevance'], len(row['movieId'])),
        2
    ),
    axis=1
)
recom_and_history_test

In [None]:
# ClearML logger used below to report the scalar metric values.
logger = clearml.Logger.current_logger()

In [None]:
# Accumulates every reported metric as {"<METRIC>_<group column>": value}.
metrics_results = {}

In [None]:
# MAP per group size: average the per-user P_k inside every group, then average
# those group means. The value is stored locally and reported to ClearML.
for group_size in range(5, 8):
    group_column = f'group{group_size}'
    map_per_group = recom_and_history_test.groupby(by=group_column)['P_k'].mean()

    map_name = f"MAP_{group_column}"
    map_value = map_per_group.mean()

    metrics_results[map_name] = map_value
    logger.report_single_value(name=map_name, value=map_value)

In [None]:
# Display the test ratings again before joining them onto the recommendations.
test_data

In [None]:
# Long format: one row per (user, recommended movie) pair.
recommend_df = (
    recom_and_history_test
    .loc[:, ['userId', 'recommendation']]
    .explode(column='recommendation')
)
recommend_df

In [None]:
# Look up the test rating of every recommended movie; pairs the user did not
# rate in the test split get 0.
# The long (user, recommended movie) frame is rebuilt from
# `recom_and_history_test` here rather than from the previous value of
# `recommend_df`, so the cell can be re-run: the original re-merged its own
# output, which on a second run produced `rating_x` / `rating_y` columns and
# failed on the `rating` selection.
recommend_df = (
    recom_and_history_test[['userId', 'recommendation']]
    .explode('recommendation')
    .merge(
        test_data,
        left_on=['userId', 'recommendation'],
        right_on=['userId', 'movieId'],
        how='left'
    )
    [['userId', 'recommendation', 'rating']]
    .fillna(0)
)
# Ratings per user, in recommendation order (the left merge keeps row order).
rec_ratings_by_user = recommend_df.groupby(by='userId')['rating'].agg(list)
# Align on `userId` explicitly. The original assigned a freshly re-indexed Series
# and therefore relied on both frames happening to share the same row order.
recom_and_history_test['rec_ratings'] = (
    recom_and_history_test['userId'].map(rec_ratings_by_user)
)
recom_and_history_test


In [None]:
# The popularity baseline has no real scores, so give every recommended position
# a descending pseudo-score (k, k-1, ..., 1) that encodes the ranking order.
recom_and_history_test['pseudo_model_output'] = (
    recom_and_history_test['recommendation']
    .apply(lambda recs: list(range(len(recs), 0, -1)))
)
recom_and_history_test

In [None]:
def _row_ndcg(row):
    """NDCG of one user's list: test ratings as true relevance, pseudo-scores as ranking."""
    return ndcg_score(
        y_true=[row['rec_ratings']],
        y_score=[row['pseudo_model_output']]
    )


# NOTE(review): only the ratings of the recommended movies are passed in, so the
# ideal ordering is taken over that list rather than the user's full test history.
recom_and_history_test['NDCG_k'] = recom_and_history_test.apply(_row_ndcg, axis=1)


In [None]:
# NDCG per group size: average the per-user NDCG_k inside every group, then
# average those group means. The value is stored locally and reported to ClearML.
for group_size in range(5, 8):
    group_column = f'group{group_size}'
    ndcg_per_group = recom_and_history_test.groupby(by=group_column)['NDCG_k'].mean()

    ndcg_name = f"NDCG_{group_column}"
    ndcg_value = ndcg_per_group.mean()

    metrics_results[ndcg_name] = ndcg_value
    logger.report_single_value(name=ndcg_name, value=ndcg_value)

In [None]:
# Show the flat {"<METRIC>_<group column>": value} mapping collected above.
metrics_results

In [None]:
# Reshape the flat "<METRIC>_<group>" keys into a nested
# {metric: {group: value}} mapping, ready to become a table.
metrics_results_2d = {}
for full_name, value in metrics_results.items():
    metric, group_key = full_name.split('_')
    metrics_results_2d.setdefault(metric, {})[group_key] = value
metrics_results_2d

In [None]:
# Render the nested metrics mapping as a table: outer keys (metrics) become
# columns, inner keys (group columns) become the row index.
pd.DataFrame(metrics_results_2d)

In [None]:
# Attach the metrics table to the ClearML task under the name 'results_df'.
task.upload_artifact('results_df', pd.DataFrame(metrics_results_2d))

In [None]:
# Close the ClearML task opened at the top of the notebook.
task.close()