# Aula 14 - Vieses em Sistemas de Recomendação - Exercícios

In [1]:
import pandas as pd
import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
from matplotlib import pyplot as plt
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import accuracy, Dataset, SVD
from surprise.model_selection import KFold
from surprise.prediction_algorithms.knns import KNNBasic

## Vídeos de Apoio

https://www.youtube.com/watch?v=OSv5J1EVEqA

https://www.youtube.com/watch?v=abGCaK86tY4


## Exercícios de Vieses

### Download do Dataset

In [None]:
# !wget https://raw.githubusercontent.com/Andre-Sacilotti/recsys_lectures/main/datasets/steam-200k.csv  -O ./steam-200k.csv

--2025-10-31 14:21:45--  https://raw.githubusercontent.com/Andre-Sacilotti/recsys_lectures/main/datasets/steam-200k.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2268468 (2.2M) [text/plain]
Saving to: ‘./steam-200k.csv’


2025-10-31 14:21:47 (1.60 MB/s) - ‘./steam-200k.csv’ saved [2268468/2268468]



### Funções comuns

In [3]:
# Load the Steam interactions and rescale hours_played to log10(value) / max(log10(value)).
# NOTE(review): log10 is -inf for 0 and negative for values below 1 — confirm the column's range.
df = pd.read_csv("./steam-200k.csv")
df['hours_played'] = np.log10(df['hours_played'])/np.log10(df['hours_played']).max()
# User ids are handled as strings from here on.
df['user_id'] = df['user_id'].astype(str)
# NOTE(review): not the last expression of the cell, so this preview is presumably never displayed.
df.head()

# 70/30 random split with a fixed seed.
train, test = train_test_split(df, test_size=.3, random_state=42)

# Popularity of a game = number of training rows for it (count of non-null hours_played).
soma_interacoes_por_jogo = train.groupby('game_title').count()[['hours_played']]
soma_interacoes_por_jogo.columns = ['interactions']

# game_title -> interaction count; this is the lookup the GAP helpers and the popularity
# correction in get_recommendation read.
map_soma = soma_interacoes_por_jogo.to_dict()['interactions']

soma_interacoes_por_jogo.head(5)

# Games ordered from most to least interacted.
soma_interacoes_por_jogo_ordenado = soma_interacoes_por_jogo.sort_values('interactions', ascending=False)

# game_title -> popularity rank (0 = most interacted game).
map_id = dict(list(enumerate(soma_interacoes_por_jogo_ordenado.index)))
map_id = {v:k for k,v in map_id.items()}

# Head = the 11 most interacted games, tail = the 2680 least interacted ones.
# NOTE(review): the names suggest these cut-offs mark 20% buckets, but 11 and 2680 are
# hard-coded — confirm how they were derived and whether they still fit this split.
jogos_maior_20porcento_nome = soma_interacoes_por_jogo_ordenado.reset_index().values[:11, 0]
jogos_menor_20porcento_nome = soma_interacoes_por_jogo_ordenado.reset_index().values[-2680:, 0]

# Everything that is neither head nor tail. Built from the full `df`, so it also contains
# games that only appear in the test split.
jogos_entre_os_20porcento = set(df['game_title']) - set(jogos_maior_20porcento_nome) - set(jogos_menor_20porcento_nome)

In [4]:
# Split the training users into three groups according to what they mostly interact with:
#   block_buster_group: more than half of their interactions are on head (most popular) games
#   niche_group:        otherwise, more than half are on tail (least popular) games
#   diverse_group:      everyone else
block_buster_group = []
niche_group = []
diverse_group = []

# Per-user share of head / tail interactions, computed in one vectorised pass instead of
# re-filtering the whole `train` frame once per user.
# sort=False keeps users in order of first appearance, i.e. the order of train['user_id'].unique().
user_keys = train['user_id']
head_share = train['game_title'].isin(jogos_maior_20porcento_nome).groupby(user_keys, sort=False).mean()
tail_share = train['game_title'].isin(jogos_menor_20porcento_nome).groupby(user_keys, sort=False).mean()

for user_id, head, tail in zip(head_share.index, head_share.values, tail_share.values):
    if head > 0.5:
        block_buster_group.append(user_id)
    elif tail > 0.5:
        niche_group.append(user_id)
    else:
        diverse_group.append(user_id)


In [5]:
# Index 0 selects the uncorrected recommendation map (get_recommendation returns a pair).
# NOTE(review): this cell raises NameError on a fresh kernel — `get_recommendation` is only
# defined in a later cell, and `model` is not defined anywhere in the visible notebook.
# Confirm where the fitted model comes from and run this after the helper definitions.
preds = get_recommendation(model)[0]

NameError: name 'get_recommendation' is not defined

In [35]:
# Mean profile GAP (average popularity of the games a user interacted with in train)
# for the blockbuster, niche and diverse user groups, in that order.
gap_bb_profile, gap_n_profile, gap_d_profile = (
    np.mean([calculate_gap_profile(train, member) for member in group])
    for group in (block_buster_group, niche_group, diverse_group)
)
gap_bb_profile, gap_n_profile, gap_d_profile

(2446.012039943106, 24.63106397647737, 275.0086012684163)

In [57]:
def item_is_relevant(user_id, item_id):
    """Return True when `user_id` has an interaction with `item_id` in the global `df`.

    NOTE(review): relevance is looked up in the full `df`, not only in `test`, so items
    seen during training also count as hits — confirm this is intended.
    """
    user_titles = df.loc[df["user_id"] == user_id, "game_title"].tolist()
    return item_id in user_titles

def calculate_MRR(map_recommendations):
    """Mean Reciprocal Rank of the recommendation lists.

    Parameters
    ----------
    map_recommendations : dict
        Maps each user id to its ranked list of ``(item, score)`` pairs, best first.

    Returns
    -------
    float
        Average over all users of ``1 / rank`` of the first relevant item
        (a user with no relevant item contributes 0). Returns 0.0 for an empty map.
    """
    if not map_recommendations:
        return 0.0

    reciprocal_rank_sum = 0.0
    for user_id, recommendations in map_recommendations.items():
        for rank, (item, _score) in enumerate(recommendations, start=1):
            if item_is_relevant(user_id, item):
                reciprocal_rank_sum += 1 / rank
                # Only the first hit counts; stop so the remaining items do not
                # trigger further (expensive) relevance lookups.
                break

    return reciprocal_rank_sum / len(map_recommendations)


def calculate_gap_profile(train, user_id):
    """Average popularity (`map_soma` count) of the games `user_id` interacted with in `train`."""
    user_games = train.loc[train['user_id'] == user_id, 'game_title'].values
    popularity_total = 0
    for game in user_games:
        popularity_total += map_soma[game]
    return popularity_total / len(user_games)
    

def calculate_gap(list_items):
    """Average popularity of `list_items`; items missing from `map_soma` count as 0."""
    popularity_total = 0
    for item in list_items:
        popularity_total += map_soma.get(item, 0)
    return popularity_total / len(list_items)

def calculate_gap_groups(predictions):
    """Relative change in average popularity (delta GAP) per user group.

    For each group the mean GAP of the recommended lists is compared with the group's
    mean profile GAP: ``(gap_rec - gap_profile) / gap_profile``.

    Parameters
    ----------
    predictions : dict
        Maps each user id to its list of ``(item, score)`` recommendations.

    Returns
    -------
    tuple of float
        ``(delta_blockbuster, delta_diverse, delta_niche)``. A group with no user in
        `predictions` yields ``nan`` instead of raising ZeroDivisionError.

    Notes
    -----
    The user of each list is taken from the dictionary key itself, so the result does not
    depend on `predictions` being ordered like the global `test` frame. Users that are in
    neither `block_buster_group` nor `niche_group` (including users unseen in training)
    are counted as diverse.
    """
    # Sets make the per-user group lookup O(1) instead of scanning the lists.
    blockbuster_users = set(block_buster_group)
    niche_users = set(niche_group)

    gap_sums = {"bb": 0.0, "d": 0.0, "n": 0.0}
    user_counts = {"bb": 0, "d": 0, "n": 0}

    for user, reclist in predictions.items():
        if user in blockbuster_users:
            group = "bb"
        elif user in niche_users:
            group = "n"
        else:
            group = "d"
        user_counts[group] += 1
        gap_sums[group] += calculate_gap([rec[0] for rec in reclist])

    def _relative_delta(group, profile_gap):
        # Relative difference between the group's recommended GAP and its profile GAP.
        if user_counts[group] == 0:
            return float("nan")
        recommended_gap = gap_sums[group] / user_counts[group]
        return (recommended_gap - profile_gap) / profile_gap

    delta_gab_bb = _relative_delta("bb", gap_bb_profile)
    delta_gab_d = _relative_delta("d", gap_d_profile)
    delta_gab_n = _relative_delta("n", gap_n_profile)

    return delta_gab_bb, delta_gab_d, delta_gab_n

def get_recommendation(model, alpha=0.5, n_users=200, top_n=10):
    """Build top-N lists for the first test users, with and without popularity correction.

    Every game in the global `df` is scored for each user. The corrected score is
    ``(1 - alpha) * est + alpha / log(popularity + 1)``, where popularity is the game's
    count in `map_soma` (1 when the game is missing from it).

    Parameters
    ----------
    model :
        Object exposing ``test(testset)`` that returns predictions with ``uid``, ``iid``
        and ``est`` attributes (e.g. a fitted Surprise algorithm).
    alpha : float, default 0.5
        Weight of the popularity correction term.
    n_users : int, default 200
        Number of distinct users, taken from the start of ``test['user_id'].unique()``.
    top_n : int, default 10
        Length of each recommendation list.

    Returns
    -------
    tuple of dict
        ``(prediction_user_map, prediction_user_map_corrected)``; both map a user id to a
        list of ``(item, score)`` pairs sorted by decreasing score.
    """
    # Loop-invariant: the candidate catalogue is the same for every user.
    all_games = df["game_title"].unique().tolist()

    prediction_user_map = {}
    prediction_user_map_corrected = {}

    for user in test['user_id'].unique()[:n_users]:
        # A testset is just a list of (raw user id, raw item id, rating) triples, so it is
        # built directly instead of round-tripping through a DataFrame/Dataset per user.
        # The rating is a placeholder; only the estimate is used.
        testset = [(user, game, 0.0) for game in all_games]
        user_preds = [pred for pred in model.test(testset) if pred.uid == user]

        predictions = sorted(
            ((pred.iid, pred.est) for pred in user_preds),
            key=lambda pair: pair[1],
            reverse=True,
        )
        predictions_corrected = sorted(
            (
                (
                    pred.iid,
                    (1 - alpha) * pred.est
                    + alpha / np.log(map_soma.get(pred.iid, 1) + 1),
                )
                for pred in user_preds
            ),
            key=lambda pair: pair[1],
            reverse=True,
        )

        prediction_user_map[user] = predictions[:top_n]
        prediction_user_map_corrected[user] = predictions_corrected[:top_n]

    return prediction_user_map, prediction_user_map_corrected

***Exercício 01:*** Explore como outros modelos se comportam em relação ao viés de popularidade. Tente visualizar a distribuição das recomendações e as métricas estudadas.

In [None]:
# TODO

***Exercício 02:*** Qual o efeito provocado pela variação do parâmetro $\alpha$? Demonstre como variam as métricas de acurácia e viés de popularidade com a variação do parâmetro.

In [None]:
# TODO

## Exercícios de Fairness

In [72]:
# Download the ratings and movie-metadata files read by the next cell.
!wget https://raw.githubusercontent.com/Andre-Sacilotti/recsys_lectures/main/datasets/ratings.dat  -O ./ratings.dat
!wget https://raw.githubusercontent.com/Andre-Sacilotti/recsys_lectures/main/datasets/movies.dat  -O ./movies.dat

--2022-11-13 10:26:34--  https://raw.githubusercontent.com/Andre-Sacilotti/recsys_lectures/main/datasets/ratings.dat
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8002::154, 2606:50c0:8001::154, 2606:50c0:8003::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8002::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4318513 (4.1M) [text/plain]
Saving to: ‘./ratings.dat’


2022-11-13 10:26:36 (15.4 MB/s) - ‘./ratings.dat’ saved [4318513/4318513]

--2022-11-13 10:26:36--  https://raw.githubusercontent.com/Andre-Sacilotti/recsys_lectures/main/datasets/movies.dat
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8000::154, 2606:50c0:8003::154, 2606:50c0:8001::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8000::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 171308 (167K) [text/plain]
Saving to: ‘./mo

In [91]:
# Load the ratings and the movie catalogue for the fairness exercises.
# NOTE: this rebinds `df`, `train` and `test`, which held the Steam data in the cells above;
# the popularity-bias helpers that read those globals must not be called after this point.
df = pd.read_csv("./ratings.dat", sep=",")
# A multi-character separator is only supported by the Python parsing engine; requesting it
# explicitly avoids the ParserWarning pandas emits when it has to fall back on its own.
df_genres = pd.read_csv("./movies.dat", sep="::", names=['item', 'title', 'genres'], engine="python")
train, test = train_test_split(df, test_size=.3, random_state=42)
# item id -> list of genre names (genres are "|"-separated in the file).
genre_map = {
    item: genres.split("|")
    for item, genres in zip(df_genres['item'], df_genres['genres'])
}

  df_genres = pd.read_csv("./movies.dat", sep="::", names=['item', 'title', 'genres'])


### Funções Comuns

In [102]:
def get_user_recommendation_distribution(prediction_user_map):
    """Genre distribution of one recommendation list.

    `prediction_user_map` is a sequence of ``(item, score)`` pairs. Each genre of each item
    (looked up in `genre_map`) counts once; the result maps genre -> share of all genre
    occurrences, with keys ordered by increasing count.
    """
    genre_counts = {}
    for item, _score in prediction_user_map:
        for genre in genre_map[item]:
            genre_counts[genre] = genre_counts.get(genre, 0) + 1

    total_occurrences = sum(genre_counts.values())
    ranked = sorted(genre_counts.items(), key=lambda pair: pair[1])
    return {genre: count / total_occurrences for genre, count in ranked}


def user_rank_miscalibration(user_profile_dist, rec_profile_dist, alpha=0.001):
    """KL divergence (base 2) between a user's profile and recommendation genre distributions.

    The recommendation probability of each genre is smoothed as
    ``(1 - alpha) * q + alpha * p``. Genres whose profile probability or smoothed
    recommendation probability is zero contribute nothing.
    """
    divergence = 0
    for genre, p in user_profile_dist.items():
        smoothed_q = (1 - alpha) * rec_profile_dist.get(genre, 0.0) + alpha * p
        if smoothed_q == 0 or p == 0:
            continue
        divergence += p * np.log2(p / smoothed_q)
    return divergence

def get_mean_rank_miscalibration(predictions):
    """Mean Rank MisCalibration (MRMC) over the users in `predictions`.

    For one user, the rank miscalibration is the average, over every prefix length
    ``1..N`` of the recommendation list, of the prefix's miscalibration divided by the
    miscalibration of an empty list (the worst case).

    Parameters
    ----------
    predictions : dict
        Maps each user id to its ranked list of ``(item, score)`` pairs.

    Returns
    -------
    float
        Average rank miscalibration over the users that could be evaluated. Users with an
        empty training profile or an empty list are left out of both the sum and the
        denominator. Returns 0.0 when no user could be evaluated.
    """
    rmc_total = 0.0
    evaluated_users = 0

    for user, rec_list in predictions.items():
        user_profile_dist = get_user_profile_distribution(train, user)
        list_size = len(rec_list)
        if not user_profile_dist or list_size == 0:
            continue

        # Miscalibration of an empty recommendation list, used as the normaliser.
        void = user_rank_miscalibration(user_profile_dist, {})
        if void == 0:
            continue

        rmc = 0.0
        # Prefix lengths run from 1 to N inclusive so the complete list is evaluated too.
        for cutoff in range(1, list_size + 1):
            user_rec_dist = get_user_recommendation_distribution(rec_list[:cutoff])
            rmc += user_rank_miscalibration(user_profile_dist, user_rec_dist) / void

        rmc_total += rmc / list_size
        evaluated_users += 1

    if evaluated_users == 0:
        return 0.0
    return rmc_total / evaluated_users

def get_recommendation_raw(model, n_users=200, top_n=10):
    """Build plain top-N lists (no re-ranking) for the first test users.

    Every item in the global `df` is scored for each user and the `top_n` highest
    estimates are kept.

    Parameters
    ----------
    model :
        Object exposing ``test(testset)`` that returns predictions with ``uid``, ``iid``
        and ``est`` attributes (e.g. a fitted Surprise algorithm).
    n_users : int, default 200
        Number of distinct users, taken from the start of ``test['user'].unique()``.
    top_n : int, default 10
        Length of each recommendation list.

    Returns
    -------
    dict
        Maps a user id to a list of ``(item, estimate)`` pairs sorted by decreasing estimate.
    """
    # Loop-invariant: the candidate catalogue is the same for every user.
    # tolist() yields native Python values for the item ids.
    all_items = df["item"].unique().tolist()

    prediction_user_map = {}

    for user in test['user'].unique()[:n_users]:
        # A testset is just a list of (raw user id, raw item id, rating) triples, so it is
        # built directly instead of round-tripping through a DataFrame/Dataset per user.
        # The rating is a placeholder; only the estimate is used.
        testset = [(user, item, 0.0) for item in all_items]

        predictions = sorted(
            ((pred.iid, pred.est) for pred in model.test(testset) if pred.uid == user),
            key=lambda pair: pair[1],
            reverse=True,
        )

        prediction_user_map[user] = predictions[:top_n]

    return prediction_user_map

def rerank_recommendation(profile_dist, list_recomended_items, user, N, tradeoff):
    """Greedily re-rank candidates, trading predicted score against genre calibration.

    At each of up to `N` steps, the not-yet-selected candidate that maximises
    ``(1 - tradeoff) * sum_of_scores - tradeoff * calibration`` for the list extended
    with that candidate is appended; `calibration` comes from `calculate_calibration_sum`.

    Parameters
    ----------
    profile_dist : dict
        Genre distribution of the user's profile.
    list_recomended_items : sequence of (item, rating)
        Candidate pool.
    user :
        User id, forwarded to `calculate_calibration_sum`.
    N : int
        Maximum length of the re-ranked list.
    tradeoff : float
        Weight of the calibration term.

    Returns
    -------
    tuple
        ``(items, items_with_score)``: the selected item ids and the matching
        ``(item, rating)`` pairs, both in selection order.
    """
    re_ranked_list = []
    re_ranked_with_score = []

    for _step in range(N):
        best_value = -np.inf
        best_pair = None

        for candidate, candidate_rating in list_recomended_items:
            if candidate in re_ranked_list:
                continue

            extended = re_ranked_with_score + [(candidate, candidate_rating)]
            score_sum = sum(pair[1] for pair in extended)
            calibration = calculate_calibration_sum(profile_dist, extended, user)
            objective = (1 - tradeoff) * score_sum - tradeoff * calibration

            if objective > best_value:
                best_value = objective
                best_pair = (candidate, candidate_rating)

        if best_pair is None:
            # No candidate could be selected; later steps would find none either.
            break

        re_ranked_list.append(best_pair[0])
        re_ranked_with_score.append(best_pair)

    return re_ranked_list, re_ranked_with_score

def calculate_calibration_sum(profile_dist, temporary_list_with_score, user, alpha=0.001):
    """KL divergence (base 2) between `profile_dist` and the genre distribution of a list.

    The list's genre distribution comes from `get_user_recommendation_distribution` and is
    smoothed as ``(1 - alpha) * q + alpha * p``. Genres with zero profile probability or
    zero smoothed probability are skipped. `user` is accepted but not used.
    """
    rec_dist = get_user_recommendation_distribution(temporary_list_with_score)

    divergence = 0.0
    for genre, p in profile_dist.items():
        smoothed_q = (1 - alpha) * rec_dist.get(genre, 0.0) + alpha * p
        if p != 0.0 and smoothed_q != 0.0:
            divergence = divergence + (p * np.log2(p / smoothed_q))
    return divergence

def get_user_profile_distribution(df, user):
    """Genre distribution of the items `user` has in the interaction frame `df`.

    Each genre of each of the user's items (looked up in `genre_map`) counts once; the
    result maps genre -> share of all genre occurrences, with keys ordered by increasing
    count. A user with no rows yields an empty dict.
    """
    genre_counts = {}
    user_items = df.loc[df['user'] == user, 'item'].values
    for item in user_items:
        for genre in genre_map[item]:
            genre_counts[genre] = genre_counts.get(genre, 0) + 1

    total_occurrences = sum(genre_counts.values())
    ranked = sorted(genre_counts.items(), key=lambda pair: pair[1])
    return {genre: count / total_occurrences for genre, count in ranked}


def get_recommendation_fairness(model, lambda_=0.5, n_users=200, top_n=10, n_candidates=100):
    """Build genre-calibrated top-N lists for the first test users.

    Every item in the global `df` is scored for each user, the `n_candidates` best
    estimates form the candidate pool, and `rerank_recommendation` selects `top_n` of
    them against the user's training genre profile.

    Parameters
    ----------
    model :
        Object exposing ``test(testset)`` that returns predictions with ``uid``, ``iid``
        and ``est`` attributes (e.g. a fitted Surprise algorithm).
    lambda_ : float, default 0.5
        Trade-off passed to `rerank_recommendation` (weight of the calibration term).
    n_users : int, default 200
        Number of distinct users, taken from the start of ``test['user'].unique()``.
    top_n : int, default 10
        Length of each re-ranked list.
    n_candidates : int, default 100
        Size of the candidate pool handed to the re-ranker.

    Returns
    -------
    dict
        Maps a user id to the re-ranked list of ``(item, estimate)`` pairs.
    """
    # Loop-invariant: the candidate catalogue is the same for every user.
    # tolist() yields native Python values for the item ids.
    all_items = df["item"].unique().tolist()

    prediction_user_map = {}

    for user in test['user'].unique()[:n_users]:
        user_profile_distribution = get_user_profile_distribution(train, user)

        # A testset is just a list of (raw user id, raw item id, rating) triples, so it is
        # built directly instead of round-tripping through a DataFrame/Dataset per user.
        # The rating is a placeholder; only the estimate is used.
        testset = [(user, item, 0.0) for item in all_items]

        predictions = sorted(
            ((pred.iid, pred.est) for pred in model.test(testset) if pred.uid == user),
            key=lambda pair: pair[1],
            reverse=True,
        )

        _items, reranked_with_score = rerank_recommendation(
            user_profile_distribution,
            predictions[:n_candidates],
            user,
            top_n,
            lambda_,
        )

        prediction_user_map[user] = reranked_with_score

    return prediction_user_map

***Exercício 03:*** Explore como outros modelos se comportam com as injustiças relacionadas ao gênero do filme. Visualize as métricas para diferentes modelos.

In [None]:
# TODO

***Exercício 04:*** Qual o efeito provocado pela variação do parâmetro $\lambda$ da calibração? Tente visualizar o efeito provocado nas métricas de fairness e acurácia se variarmos o parâmetro lambda.

In [None]:
# TODO