In [1]:
import datetime
import warnings
import pickle
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

from fairness_methods.methods import FairnessMethods
from models.basic_mf_model import BasicMatrixFactorization
from utils.data_generator import *
from utils.metrics import RecommendationSystemMetrics
from utils.util import *
from utils.tuning import *

from models.nn_fairness_model import run_nn_fairness_model

warnings.filterwarnings('ignore')

In [2]:
def get_item_id_to_group(df):
    """
    Assign every movie to a binary group based on its genre flags.

    Movie ids are first remapped to dense 0-based indices in order of first appearance
    in ``df['movie_id']``; the returned dict is keyed by those indices.

    :param df: input data containing a ``movie_id`` column and one column per entry of GENRES
    :return: dict mapping movie index -> 1 if any WOMEN_GENRES flag sums to more than 0, else 0
    """
    index_of_movie = {}
    for position, movie_id in enumerate(df['movie_id'].unique()):
        index_of_movie[movie_id] = position

    group_of_movie = {}
    records = df[["movie_id"] + GENRES].to_dict('records')
    for record in records:
        # Genre groups follow the split by women's vs. men's rating behaviour described in the paper:
        # "Women and men both score action, crime, and sci-fi films about equally,
        #  but men rate these film much more frequently"
        women_genre_total = sum(record[genre] for genre in WOMEN_GENRES)
        movie_index = index_of_movie[record['movie_id']]
        if women_genre_total > 0:
            group_of_movie[movie_index] = 1
        else:
            group_of_movie[movie_index] = 0

    return group_of_movie

In [3]:
def get_user_ids_by_gender(df):
    """
    Split the input users into groups by gender.

    User ids are remapped to dense 0-based indices in order of first appearance in
    ``df['user_id']``; both returned structures use those indices, not the raw ids.

    :param df: input data with ``user_id`` and ``gender`` columns (a user may appear on many rows)
    :return: user_gender: dict gender -> sorted list of unique user indices; the keys "F" and "M"
             are always present (empty list when the data holds no such users),
             user_id_to_group: dict user index -> group id (1 if gender == "F", otherwise 0)
    """
    user_id_to_index = {user_id: index for (index, user_id) in enumerate(df['user_id'].unique())}
    # Pre-seed both genders so callers indexing ["F"] / ["M"] never hit a KeyError on one-sided data.
    members_by_gender = {"F": set(), "M": set()}
    user_id_to_group = {}
    for row in df[["user_id", "gender"]].to_dict('records'):
        user_index = user_id_to_index[row['user_id']]
        # Sets de-duplicate users that appear on several rows, for every gender value.
        members_by_gender.setdefault(row['gender'], set()).add(user_index)
        user_id_to_group[user_index] = 1 if row['gender'] == "F" else 0
    # Sorted lists give a deterministic order independent of set iteration order.
    user_gender = {gender: sorted(indices) for gender, indices in members_by_gender.items()}
    return user_gender, user_id_to_group

In [4]:
def preprocessing(tmp_df, zero_matrix, columns="movie_id", test_ratio=0.2, movie_len=True, random_state=None):
    """
    Preprocess before training: split the ratings into train and test, pivot each split into a
    user x item matrix aligned to the columns of ``zero_matrix`` and min-max normalise it.

    :param tmp_df: input ratings data; read columns are ``user_id``, ``gender``, ``rating`` and the
                   column named by ``columns`` (plus ``movie_id`` and the genre columns when ``movie_len``)
    :param zero_matrix: DataFrame whose columns define the item columns of the output matrices
    :param columns: name of the item-id column
    :param test_ratio: fraction of the rows placed in the test split
    :param movie_len: True for the MovieLens dataset (item groups are derived from genres),
                      False for synthetic data (an empty item-group dict is returned)
    :param random_state: seed forwarded to ``train_test_split``; the default None keeps the
                         previous unseeded behaviour, pass an int for a reproducible split
    :return: tuple of
             train_set (DataFrame, missing entries filled with 0),
             test_set (numpy array, entries equal to 0 replaced by NaN),
             user_gender (dict gender -> list of user indices),
             user_id_to_group (dict user index -> group id),
             item_to_group (dict item index -> group id, empty when ``movie_len`` is False)
    """
    # The split is stratified on user_id.
    tmp_train, tmp_test = train_test_split(tmp_df, test_size=test_ratio, shuffle=True,
                                           stratify=tmp_df[["user_id"]], random_state=random_state)

    # Keep only training rows whose item also occurs in the test split.
    tmp_train = tmp_train[tmp_train[columns].isin(tmp_test[columns].unique())]
    tmp_user_gender, tmp_user_id_to_group = get_user_ids_by_gender(tmp_df)
    item_to_group = get_item_id_to_group(tmp_df) if movie_len else {}
    tmp_train_set = normalize_train_test_matrix(columns, tmp_train, zero_matrix)
    tmp_test_set = normalize_train_test_matrix(columns, tmp_test, zero_matrix)
    # NOTE(review): this marks unobserved test entries as NaN, but ratings that normalise to
    # exactly 0 (the per-item minimum) are turned into NaN as well - confirm this is intended.
    tmp_test_set = tmp_test_set.replace(0, np.nan)
    print("train_set shape: ", tmp_train_set.shape, " test_set shape: ", tmp_test_set.shape)
    return tmp_train_set, tmp_test_set.to_numpy(), tmp_user_gender, tmp_user_id_to_group, item_to_group


def normalize_train_test_matrix(columns, tmp_split_df, zero_matrix):
    """
    Pivot one data split into a user x item rating matrix, min-max scale every item column
    and align the columns with those of ``zero_matrix``.

    :param columns: name of the item-id column used as pivot columns
    :param tmp_split_df: ratings of one split, with ``user_id``, ``rating`` and the ``columns`` column
    :param zero_matrix: DataFrame whose columns define which item columns the result has, and in what order
    :return: DataFrame indexed by user_id with columns renumbered 0..n-1 and NaN replaced by 0
    """
    print("start normalization")
    ratings_matrix = pd.pivot_table(tmp_split_df, values='rating', index='user_id', columns=columns)

    # Per-column min-max scaling; a column with max == min yields NaN here and is zero-filled below.
    column_min = ratings_matrix.min()
    column_span = ratings_matrix.max() - column_min
    ratings_matrix = (ratings_matrix - column_min) / column_span

    # Items absent from this split get an all-zero column.
    absent_items = [item for item in zero_matrix.columns if item not in ratings_matrix.columns]
    for item in absent_items:
        ratings_matrix[item] = 0

    ratings_matrix = ratings_matrix[zero_matrix.columns]
    ratings_matrix.columns = range(ratings_matrix.shape[1])
    ratings_matrix = ratings_matrix.fillna(0)
    print("finished normalization")
    return ratings_matrix

# generate_synthetic_data

In [5]:
# Synthetic scenarios, keyed by the short name used in the experiments:
#   U  - uniform user groups and uniform observation probabilities
#   O  - uniform user groups and biased observation probabilities
#   P  - biased user groups and uniform observation probabilities
#   OP - biased user groups and biased observation probabilities
_scenario_settings = [
    ("U", 'uniform', 'uniform'),
    ("O", 'unbalanced', 'uniform'),
    ("P", 'uniform', 'imbalanced'),
    ("OP", 'unbalanced', 'imbalanced'),
]

synthetic_tests_df = {}
synthetic_tests_item_groups = {}
for _scenario, _observation_model, _user_distribution in _scenario_settings:
    synthetic_tests_df[_scenario], synthetic_tests_item_groups[_scenario] = generate_synthetic_data(
        observation_model=_observation_model, user_distribution=_user_distribution)

# Keep the per-scenario names bound for any cell that refers to them directly.
U_ratings, U_item_id_to_group = synthetic_tests_df["U"], synthetic_tests_item_groups["U"]
O_ratings, O_item_id_to_group = synthetic_tests_df["O"], synthetic_tests_item_groups["O"]
P_ratings, P_item_id_to_group = synthetic_tests_df["P"], synthetic_tests_item_groups["P"]
OP_ratings, OP_item_id_to_group = synthetic_tests_df["OP"], synthetic_tests_item_groups["OP"]

****************************rating probability****************************

╒════╤═══════╤════════╤════════╕
│    │   Fem │   STEM │   Masc │
╞════╪═══════╪════════╪════════╡
│ W  │   0.8 │    0.2 │    0.2 │
├────┼───────┼────────┼────────┤
│ WS │   0.8 │    0.8 │    0.2 │
├────┼───────┼────────┼────────┤
│ M  │   0.2 │    0.8 │    0.8 │
├────┼───────┼────────┼────────┤
│ MS │   0.2 │    0.2 │    0.8 │
╘════╧═══════╧════════╧════════╛
****************************observation probability****************************

╒════╤═══════╤════════╤════════╕
│    │   Fem │   STEM │   Masc │
╞════╪═══════╪════════╪════════╡
│ W  │   0.4 │    0.4 │    0.4 │
├────┼───────┼────────┼────────┤
│ WS │   0.4 │    0.4 │    0.4 │
├────┼───────┼────────┼────────┤
│ M  │   0.4 │    0.4 │    0.4 │
├────┼───────┼────────┼────────┤
│ MS │   0.4 │    0.4 │    0.4 │
╘════╧═══════╧════════╧════════╛
****************************rating probability****************************

╒════╤═══════╤════════╤════════╕
│    │  

In [6]:
# html_path = 'exp_results.html'
# zero_synthetic_matrix = np.full((NUM_USERS, NUM_ITEMS), 0)
# zero_synthetic_matrix = pd.DataFrame(zero_synthetic_matrix)
# res_all = pd.DataFrame()
# res_all_val = pd.DataFrame()

# for test_name, tmp_df_ratings in synthetic_tests_df.items():
#     res_per_test ={}
#     res_per_test_val ={}
#     for metric in ['val_score', 'abs_score', 'over_score', 'under_score', 'under_score', 'non_parity_score',None]:
#         train_set, test_set, tmp_user_gender, tmp_user_id_to_group, _ = preprocessing(tmp_df_ratings,zero_synthetic_matrix, columns="item_id",movie_len=False)
#         trainig_dict= run_nn_fairness_model('gmf',train_set,test_set,metric = metric,num_epochs=30, num_users=train_set.shape[0], num_items=train_set.shape[1], embed_dim=8,
#                               dis_group=tmp_user_gender['F'], adv_group=tmp_user_gender['M'], lr=0.1,layers=[64,32,16,8], early_stop=0.00000001,unfairness_reg=0.5,early_stop_tol=3)
        
#         res_per_test[metric] = trainig_dict['train_dict_unfair']
#         res_per_test_val[metric] =trainig_dict['val_dict_unfair']
#         print(f'done {metric}')
#     print(f'done {test_name}')
#     res_per_test = pd.DataFrame.from_dict(res_per_test)
#     res_per_test_val = pd.DataFrame.from_dict(res_per_test_val)

#     res_per_test['test'] = test_name
#     res_per_test_val['test'] = test_name

#     res_all = pd.concat([res_all,res_per_test], axis=0)
#     res_all_val = pd.concat([res_all_val,res_per_test_val], axis=0)

#     res_all.to_csv('res_gmf_30e.csv')
#     res_all_val.to_csv('res_gmf_30e_val.csv')



In [None]:
# Random-search hyper-parameter tuning of the fairness model on the synthetic scenarios.
# Every round draws one hyper-parameter set per scenario, trains once per fairness metric and
# checkpoints the accumulated train / validation results to CSV.
zero_synthetic_matrix = pd.DataFrame(np.full((NUM_USERS, NUM_ITEMS), 0))

# Each metric is listed exactly once; results are keyed by metric, so a repeated entry would only
# cost an extra training run whose result overwrites the previous one.
fairness_metrics = ['val_score', 'abs_score', 'over_score', 'under_score', 'non_parity_score', None]
hp_columns = ('model', 'early_stop', 'unfairness_reg', 'lr')

tuning_all = pd.DataFrame()
tuning_all_val = pd.DataFrame()
tuning_frames = []
tuning_frames_val = []
for i in range(50):
    print(i)
    round_frames = []
    round_frames_val = []
    for test_name, tmp_df_ratings in synthetic_tests_df.items():
        hp = get_random_hp_choises()
        # The hyper-parameters are fixed for the whole scenario, so record them once up front.
        # NOTE(review): hp['model'] is recorded, but the model trained below is always 'gmf' -
        # confirm whether hp['model'] should be passed to run_nn_fairness_model instead.
        res_per_test = {name: hp[name] for name in hp_columns}
        res_per_test_val = dict(res_per_test)
        for metric in fairness_metrics:
            # A fresh (unseeded) train/test split is drawn for every metric.
            train_set, test_set, tmp_user_gender, tmp_user_id_to_group, _ = preprocessing(
                tmp_df_ratings, zero_synthetic_matrix, columns="item_id", movie_len=False)
            trainig_dict = run_nn_fairness_model(
                'gmf', train_set, test_set, metric=metric, num_epochs=2,
                num_users=train_set.shape[0], num_items=train_set.shape[1], embed_dim=8,
                dis_group=tmp_user_gender['F'], adv_group=tmp_user_gender['M'], lr=hp['lr'],
                layers=[64, 32, 16, 8], early_stop=hp['early_stop'],
                unfairness_reg=hp['unfairness_reg'], early_stop_tol=3)

            res_per_test[metric] = trainig_dict['train_dict_unfair']
            res_per_test_val[metric] = trainig_dict['val_dict_unfair']
            print(f'done {metric}')
        print(f'done {test_name}')
        res_per_test = pd.DataFrame.from_dict(res_per_test)
        res_per_test_val = pd.DataFrame.from_dict(res_per_test_val)

        res_per_test['test'] = test_name
        res_per_test_val['test'] = test_name

        round_frames.append(res_per_test)
        round_frames_val.append(res_per_test_val)

    res_all = pd.concat(round_frames, axis=0)
    res_all_val = pd.concat(round_frames_val, axis=0)

    # Train and validation results are appended in the same (chronological) order so that
    # the rows of the two CSV files line up.
    tuning_frames.append(res_all)
    tuning_frames_val.append(res_all_val)
    tuning_all = pd.concat(tuning_frames, axis=0)
    tuning_all_val = pd.concat(tuning_frames_val, axis=0)

    # Checkpoint after every round so partial results survive an interrupted run.
    tuning_all.to_csv('tuning_syn.csv')
    tuning_all_val.to_csv('tuning_syn_val.csv')



0
start normalization
finished normalization
start normalization
finished normalization
train_set shape:  (400, 300)  test_set shape:  (400, 300)
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_id (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 item_id (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 8)            3200        ['user_id[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, 8)    