# Laboratorium 5 - rekomendacje grupowe

## Przygotowanie

 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab5`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas scipy matplotlib`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

import math
from statistics import mean, median, stdev
from collections import defaultdict
from random import choice, sample
from typing import Sequence

from prettytable import PrettyTable
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds

In [2]:
PATH = './lab-files/ml-latest-small'

In [3]:
# wczytujemy oceny uzytkownikow i obliczamy (za pomoc dekompozycji macierzy) wszystkie przewidywane oceny filmow

def read_ratings(path: str, k=600, scale_factor=2.0, print_stats=True) -> pd.DataFrame:
    # idea: https://www.kaggle.com/code/indralin/movielens-project-1-2-collaborative-filtering
    reviews = pd.read_csv(f'{path}/ratings.csv', names=['userId', 'movieId', 'rating', 'time'], delimiter=',', engine='python', skiprows=1)

    reviews.drop(['time'], axis=1, inplace=True)
    reviews_no, _ = reviews.shape
    reviews_matrix = reviews.pivot(index='userId', columns='movieId', values='rating')
    movies = reviews_matrix.columns
    users = reviews_matrix.index
    users_no, movies_no = reviews_matrix.shape
    print(f'Got {reviews_no} reviews for {movies_no} movies and {users_no} users.')

    user_ratings_mean = np.nanmean(reviews_matrix.values, axis=1)
    normalized_reviews_matrix = np.nan_to_num(reviews_matrix.values - user_ratings_mean.reshape(-1, 1), 0.0)

    U, sigma, Vt = svds(normalized_reviews_matrix, k=k)
    sigma = np.diag(sigma)
    predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1).clip(0.5, 5.0)
    mean_square_error = np.nanmean(np.square(predicted_ratings - reviews_matrix.values))
    std_square_error = np.nanstd(np.square(predicted_ratings - reviews_matrix.values))
    print(f'Reviews prediction mean square error = {mean_square_error}')
    print(f'Reviews prediction standatd deviation of square error = {std_square_error}')

    if print_stats:
        stats = [
            ('metric', 'dataset', 'prediction'),
            ('avg', np.nanmean(reviews_matrix), np.mean(predicted_ratings)),
            ('st_dev', np.nanstd(reviews_matrix), np.std(predicted_ratings)),
            ('median', np.nanmedian(reviews_matrix), np.median(predicted_ratings)),
            ('p25', np.nanquantile(reviews_matrix, 0.25), np.quantile(predicted_ratings, 0.25)),
            ('p75', np.nanquantile(reviews_matrix, 0.75), np.quantile(predicted_ratings, 0.75))
        ]
        print('Stats (for raings in original range [0.5, 5.0]):')
        print('\n'.join([str(s) for s in stats]))

    rounded_predictions = np.rint(scale_factor * predicted_ratings) # cast values to {1, 2, ..., 10}
    return pd.DataFrame(data=rounded_predictions, index=list(users), columns=list(movies))

ratings = read_ratings(PATH)
# dostep do danych:
# ratings[movieId][userId] pobiera 1 wartosc
# ratings.loc[:, movieId] pobiera wektor dla danego filmu
# ratings.loc[userId, :] pobiera wektor dla danego uzytkownika
ratings

Got 100836 reviews for 9724 movies and 610 users.
Reviews prediction mean square error = 1.657778784292466e-05
Reviews prediction standatd deviation of square error = 0.0007928950536518413
Stats (for raings in original range [0.5, 5.0]):
('metric', 'dataset', 'prediction')
('avg', np.float64(3.501556983616962), np.float64(3.657222337747399))
('st_dev', np.float64(1.042524069618056), np.float64(0.49546237560971024))
('median', np.float64(3.5), np.float64(3.7052240088117694))
('p25', np.float64(3.0), np.float64(3.3574517183164025))
('p75', np.float64(4.0), np.float64(3.999981626883001))


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,8.0,9.0,8.0,9.0,9.0,8.0,9.0,9.0,9.0,9.0,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
2,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
3,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
4,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
5,8.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,5.0,7.0,7.0,7.0,7.0,7.0,5.0,7.0,7.0,7.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
607,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
608,5.0,4.0,4.0,6.0,6.0,6.0,6.0,6.0,6.0,8.0,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
609,6.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,8.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0


In [4]:
# wczytujemy nazwy filmow i kategorie

movies_metadata: pd.DataFrame = pd.read_csv(f'{PATH}/movies.csv').set_index('movieId')
movies_metadata

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
# wczytujemy przykladowe grupy uzytkownikow
groups: list[list[int]] = pd.read_csv(f'{PATH}/../groups.csv') \
    .to_numpy() \
    .tolist()
groups

[[111, 307, 474, 599, 414],
 [469, 182, 232, 448, 600],
 [508, 581, 497, 402, 566],
 [300, 515, 245, 568, 507],
 [2, 371, 252, 518, 37],
 [269, 360, 469, 287, 308],
 [243, 527, 418, 118, 370],
 [186, 559, 327, 553, 314]]

In [6]:
# przygotowujemy funkcje pomocnicza

def describe_group(group: list[int], N=10) -> None:
    print(f'\n\nUser ids: {group}')
    group_size = len(group)

    mean_stdev = ratings.loc[group].std(axis=0).mean()
    median_stdev = ratings.loc[group].std(axis=0).median()
    std_stdev = ratings.loc[group].std(axis=0).std()
    print(f'\nMean ratings deviation: {mean_stdev}')
    print(f'Median ratings deviation: {median_stdev}')
    print(f'Standard deviation of ratings deviation: {std_stdev}')

    average_scores = ratings.iloc[group].mean(axis=0)
    average_scores = average_scores.sort_values()
    best_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in list(average_scores[-N:].index)]
    worst_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in list(average_scores[:N].index)]

    print('\nBest movies:')
    for movie, score in best_movies[::-1]:
        print(f'{movie}, {score}*')
    print('\nWorst movies:')
    for movie, score in worst_movies:
        print(f'{movie}, {score}*')

describe_group(groups[5])



User ids: [269, 360, 469, 287, 308]

Mean ratings deviation: 1.1259149574579788
Median ratings deviation: 1.0954451150103321
Standard deviation of ratings deviation: 0.17836724055768716

Best movies:
Toy Story (1995), 8.2*
Forrest Gump (1994), 8.2*
Willy Wonka & the Chocolate Factory (1971), 8.0*
Braveheart (1995), 8.0*
Terminator 2: Judgment Day (1991), 7.8*
Schindler's List (1993), 7.8*
Dances with Wolves (1990), 7.6*
James and the Giant Peach (1996), 7.6*
Dead Man Walking (1995), 7.6*
Nixon (1995), 7.6*

Worst movies:
Broken Arrow (1996), 5.2*
Sleepy Hollow (1999), 5.4*
The Devil's Advocate (1997), 5.4*
Cable Guy, The (1996), 5.4*
Mission: Impossible (1996), 5.6*
Nutty Professor, The (1996), 5.6*
Down Periscope (1996), 5.8*
Cheech and Chong's Up in Smoke (1978), 5.8*
Wrong Man, The (1956), 5.8*
Fog, The (2005), 5.8*


## Część 2. - algorytmy proste

In [7]:
# zdefiniujmy interfejs dla wszystkich algorytmow rekomendacyjnych

class Recommender:
    def recommend(
        self,
        movies: pd.Index,
        ratings: pd.DataFrame,
        group: list[int],
        size: int
    ) -> list[str]:
        pass


# jako pierwszy zaimplementujemy algorytm losowy - dla porownania

class RandomRecommender(Recommender):
    def __init__(self):
        self.name = 'random'

    def recommend(
        self,
        movies: pd.Index,
        ratings: pd.DataFrame,
        group: list[int],
        size: int
    ) -> list[str]:
        return sample(list(movies), size)

In [8]:
# algorytm rekomendujacy filmy o najwyzszej sredniej ocen

class AverageRecommender(Recommender):
    def __init__(self):
        self.name = 'average'

    def recommend(
        self,
        movies: pd.Index,
        ratings: pd.DataFrame,
        group: list[int],
        size: int
    ) -> list[str]:
        return list(
            ratings.loc[group] \
                .mean() \
                .sort_values(ascending=False) \
                .index[:size]
        )

In [9]:
# algorytm rekomendujacy filmy o najwyzszej sredniej ocen,
#   ale rownoczesnie wykluczajacy te filmy, ktore otrzymaly choc jedna ocene ponizej thresholdu

class AverageWithoutMiseryRecommender(Recommender):
    def __init__(self, score_threshold: float):
        self.name = 'average_without_misery'
        self.score_threshold: float = score_threshold

    def recommend(
        self,
        movies: pd.Index,
        ratings: pd.DataFrame,
        group: list[int],
        size: int
    ) -> list[str]:
        group_ratings: pd.DataFrame = ratings.loc[group]
        return list(
            group_ratings \
                .mean(axis=0) \
                .sort_values(ascending=False)[
                    group_ratings.min(axis=0) >= self.score_threshold
                ] \
                .index[:size]
        )

In [10]:
# algorytm uwzgledniajacy preferencje tylko jednego uzytkownika w kazdej iteracji

class FairnessRecommender(Recommender):
    def __init__(self):
        self.name = 'fairness'

    def recommend(
        self,
        movies: pd.Index,
        ratings: pd.DataFrame,
        group: list[int],
        size: int
    ) -> list[str]:
        recommended_movies: list[int] = []
        user_movies: dict[int, pd.Series] = {
            user: ratings.loc[user].sort_values(ascending=False)
            for user in group
        }
        user_movie_indicies: dict[int, int] = {user: 0 for user in group}

        i: int = 0
        while len(recommended_movies) < size:
            user: int = group[i % len(group)]
            top_user_movie: int = user_movies[user] \
                .index[user_movie_indicies[user]] \
                .item()

            if top_user_movie not in recommended_movies:
                recommended_movies.append(top_user_movie)

            i += 1
            user_movie_indicies[user] += 1

        return recommended_movies

In [11]:
# wybrany algorytm wyborczy (dyktatura, Borda, Copeland)

class VotingRecommender(Recommender):
    def __init__(self):
        self.name = 'borda_count'

    def recommend(
        self,
        movies: pd.Index,
        ratings: pd.DataFrame,
        group: list[int],
        size: int
    ) -> list[str]:
        movie_scores: dict[int, int] = {movie: 0 for movie in movies}

        for user in group:
            rating_enumeration = enumerate(
                ratings.loc[user].sort_values().index,
                start=1
            )
            for i, movie in rating_enumeration:
                movie_scores[movie] += i

        return list(
            sorted(movie_scores, key=movie_scores.get, reverse=True)
        )[:size]

In [12]:
# algorytm zachlanny, aproksymujacy metode Proportional Approval Voting
#   w kazdej iteracji wybieramy ten film, ktory najbardziej zwieksza zadowolenie zgodnie z punktacja PAV

class ProportionalApprovalVotingRecommender(Recommender):
    def __init__(self, threshold: float):
        self.name = 'PAV'
        self.threshold: float = threshold

    def recommend(
        self,
        movies: pd.Index,
        ratings: pd.DataFrame,
        group: list[int],
        size: int
    ) -> list[str]:
        recommendation: list[int] = []
        user_sat: dict[int, int] = {user: 1 for user in group}

        for k in range(size):
            best_movie: int = 0
            best_score: float = 0.0

            for movie in movies:
                score: float = 0.0

                for user in group:
                    if ratings.loc[user][movie] >= self.threshold:
                        score += 1.0 / user_sat[user]

                if score > best_score and movie not in recommendation:
                    best_movie = movie
                    best_score = score

            for user in group:
                if ratings.loc[user][best_movie] >= self.threshold:
                    user_sat[user] += 1

            recommendation.append(best_movie)

        return recommendation

## Część 3. - funkcje celu

In [13]:
# dwie funkcje pomocnicze:
#  - znajdujaca ulubione filmy danego uzytkownika
#  - obliczajaca sume ocen wystawionych przez uzytkownika wszystkim filmom w rekomendacji

def top_n_movies_for_user(
    ratings: pd.DataFrame,
    movies: pd.Index,
    user_id: int,
    n: int
) -> list[int]:
    return list(ratings.loc[user_id].sort_values(ascending=False).index[:n])

def total_score(
    recommendation: list[int],
    user_id: int,
    ratings: pd.DataFrame
) -> float:
    user_ratings = ratings.loc[user_id]
    return user_ratings.loc[user_ratings.index.isin(recommendation)] \
        .sum() \
        .item()

In [14]:
# funkcja obliczajaca zadowolenie pojedynczego uzytkownika
#  - iloraz zadowolenia z wygenerowanej rekomendacji oraz zadowolenia z hipotetycznej rekomendacji idealnej
def overall_user_satisfaction(
    recommendation: list[int],
    user_id: int,
    movies: pd.Index,
    ratings: pd.DataFrame
) -> float:
    n: int = len(recommendation)
    top_user_movies: list[int] = top_n_movies_for_user(
        ratings, movies, user_id, n
    )

    recommendation_score: float = total_score(recommendation, user_id, ratings)
    top_user_movie_score: float = total_score(top_user_movies, user_id, ratings)

    return recommendation_score / top_user_movie_score


# funkcja celu - srednia z zadowolenia wszystkich uzytkownikow w grupie
def overall_group_satisfaction(
    recommendation: list[int],
    group: list[int],
    movies: pd.Index,
    ratings: pd.DataFrame
) -> float:
    return mean([
        overall_user_satisfaction(recommendation, user, movies, ratings)
        for user in group
    ])

# funkcja celu - roznica miedzy maksymalnym i minimalnym zadowolenie w grupie
def group_disagreement(
    recommendation: list[int],
    group: list[int],
    movies: pd.Index,
    ratings: pd.DataFrame
) -> float:
    overall_user_satisfactions: list[float] = [
        overall_user_satisfaction(recommendation, user_id, movies, ratings)
        for user_id in group
    ]
    return max(overall_user_satisfactions) - min(overall_user_satisfactions)

## Część 4. - Sequential Hybrid Aggregation

In [15]:
# algorytm balansujacy pomiedzy wyborem elementow o najwyzszej sredniej ocen
#   i o najwyzszej minimalnej ocenie
#   wyliczajacy w kazdej iteracji parametr alfa - jak na wykladzie
class SequentialHybridAggregationRecommender(Recommender):
    def __init__(self):
        self.name = 'sequential_hybrid_aggregation'

    def recommend(
        self,
        movies: pd.Index,
        ratings: pd.DataFrame,
        group: list[int],
        size: int
    ) -> list[str]:
        avg_score: pd.Series = ratings.loc[group].mean()
        least_score: pd.Series = ratings.loc[group].min()
        alpha: float = 1.0
        score: dict[int, float] = {}
        recommendation: list[int] = []

        for k in range(size):
            score = {
                movie: (1.0 - alpha) * avg_score.loc[movie]
                    + alpha * least_score.loc[movie]
                for movie in movies
            }
            score.update({movie: -21.37 for movie in recommendation})

            max_score_movie: int = max(score.items(), key=lambda k: k[1])[0]
            recommendation.append(max_score_movie)

            alpha = group_disagreement(recommendation, group, movies, ratings)

        return recommendation

## Część 5. - porównanie algorytmów

In [16]:
def round_values(values: Sequence[float]) -> list[str]:
    return [f'{value:.3f}' for value in values]


recommenders: list[Recommender] = [
    RandomRecommender(),
    AverageRecommender(),
    AverageWithoutMiseryRecommender(5),
    FairnessRecommender(),
    VotingRecommender(),
    ProportionalApprovalVotingRecommender(5),
    SequentialHybridAggregationRecommender()
]

recommendation_size = 10

# dla kazdego algorytmu:
#  - wygenerujmy jedna rekomendacje dla kazdej grupy
#  - obliczmy wartosci obu funkcji celu dla kazdej rekomendacji
#  - wypiszmy wyniki na konsole

group_table: PrettyTable = PrettyTable(
    field_names=['Recommender Name', 'Metric']
        + [', '.join(map(str, group)) for group in groups]
)
summary_table: PrettyTable = PrettyTable(
    field_names=[
        'Recommender Name',
        'Metric',
        'Mean',
        'Median',
        'Minimum',
        'Maximum',
        'Standard Deviation',
    ]
)

for recommender in recommenders:
    group_sats: list[float] = []
    group_disags: list[float] = []

    for group in groups:
        recommendation: list[int] = recommender.recommend(
            ratings.columns, ratings, group, recommendation_size
        )
        group_sat: float = overall_group_satisfaction(
            recommendation, group, movies_metadata, ratings
        )
        group_dis: float = group_disagreement(
            recommendation, group, movies_metadata, ratings
        )

        group_sats.append(group_sat)
        group_disags.append(group_dis)

    group_table.add_row(
        [recommender.name, 'Satisfaction'] + round_values(group_sats)
    )
    group_table.add_row(
        ['', 'Disagreement'] + round_values(group_disags),
        divider=True,
    )

    summary_table.add_row(
        [recommender.name, 'Satisfaction']
        + round_values([
            mean(group_sats),
            median(group_sats),
            min(group_sats),
            max(group_sats),
            stdev(group_sats),
        ])
    )
    summary_table.add_row(
        ['', 'Disagreement']
            + round_values([
                mean(group_disags),
                median(group_disags),
                min(group_disags),
                max(group_disags),
                stdev(group_disags),
            ]),
        divider=True
    )

print(group_table)
print()
print(summary_table)

+-------------------------------+--------------+-------------------------+-------------------------+-------------------------+-------------------------+----------------------+-------------------------+-------------------------+-------------------------+
|        Recommender Name       |    Metric    | 111, 307, 474, 599, 414 | 469, 182, 232, 448, 600 | 508, 581, 497, 402, 566 | 300, 515, 245, 568, 507 | 2, 371, 252, 518, 37 | 269, 360, 469, 287, 308 | 243, 527, 418, 118, 370 | 186, 559, 327, 553, 314 |
+-------------------------------+--------------+-------------------------+-------------------------+-------------------------+-------------------------+----------------------+-------------------------+-------------------------+-------------------------+
|             random            | Satisfaction |          0.600          |          0.656          |          0.761          |          0.879          |        0.829         |          0.634          |          0.796          |          0