In [1]:
import datetime
import warnings
import pickle
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

from fairness_methods.methods import FairnessMethods
from models.basic_mf_model import BasicMatrixFactorization
from utils.data_generator import *
from utils.metrics import RecommendationSystemMetrics
from utils.util import *
from models.nn_fairness_model import run_nn_fairness_model

warnings.filterwarnings('ignore')

In [2]:
# Load the ratings data. `generate_real_data` is not defined in this notebook; it comes from
# one of the star imports above (presumably a MovieLens loader -- TODO confirm).
ml_data = generate_real_data()
# Uncomment to work on only the first 1000 rows for faster iteration.
# ml_data = ml_data.iloc[:1000,:]

In [3]:
# Normalize user and movie ids to dense integer codes 0..n-1, numbered in order of first appearance.
# factorize() yields the same numbering as enumerate(unique()) but in one vectorized pass,
# instead of a value-by-value replace() driven by a dictionary with one entry per id.
# Note: a missing id (NaN), if any, is coded as -1 by factorize().
ml_data["user_id"] = ml_data["user_id"].factorize()[0]
ml_data["movie_id"] = ml_data["movie_id"].factorize()[0]

In [4]:
def get_item_id_to_group(df):
    """
    Assign every movie to one of two genre-based groups.

    Movies are re-indexed 0..n-1 in order of first appearance in ``df['movie_id']``.
    A movie gets group 1 when the sum of its WOMEN_GENRES columns is positive, otherwise 0.
    The grouping follows the paper's observation:
    "Women and men both score action, crime, and sci-fi films about equally, but men rate these film much more frequently"
    :param df: input data containing 'movie_id' and the GENRES columns
    :return: dict with the movie index as key and the group id (0 or 1) as value
    """
    index_of_movie = {}
    for position, movie in enumerate(df['movie_id'].unique()):
        index_of_movie[movie] = position

    records = df[["movie_id"] + GENRES].to_dict('records')
    # When a movie occurs in several rows, the value computed from its last row is kept.
    return {
        index_of_movie[record['movie_id']]: int(sum(record[genre] for genre in WOMEN_GENRES) > 0)
        for record in records
    }

In [5]:
def get_user_ids_by_gender(df):
    """
    Split the input users into groups by gender.

    User ids are re-indexed 0..n-1 in order of first appearance in ``df['user_id']``.
    :param df: input data with 'user_id' and 'gender' columns
    :return: user_gender: dict of gender -> sorted list of unique user indices; the keys "F" and "M"
             are always present (with an empty list when the data has no such users),
             user_id_to_group: dict of user index -> group id (1 if gender is "F", otherwise 0)
    """
    user_id_to_index = {user_id: index for (index, user_id) in enumerate(df['user_id'].unique())}
    # Pre-seed both genders so callers can index user_gender["F"] / ["M"] even if one is absent.
    indices_by_gender = {"F": set(), "M": set()}
    user_id_to_group = {}
    for user_id, gender in zip(df["user_id"], df["gender"]):
        user_index = user_id_to_index[user_id]
        # Sets de-duplicate as we go: the data has one row per rating, not one row per user.
        indices_by_gender.setdefault(gender, set()).add(user_index)
        user_id_to_group[user_index] = 1 if gender == "F" else 0
    # Sort for a deterministic order across runs.
    user_gender = {gender: sorted(indices) for gender, indices in indices_by_gender.items()}
    return user_gender, user_id_to_group

In [6]:
def preprocessing(tmp_df, zero_matrix, columns="movie_id", test_ratio=0.2, movie_len=True, random_state=None):
    """
    Preprocess before training: split into train and test, pivot each split into a matrix aligned
    to zero_matrix's columns (missing items added as zeros), and build the group lookups.
    :param tmp_df: input data
    :param zero_matrix: output matrix shape with zero values
    :param columns: column name
    :param test_ratio: percentage of test data
    :param movie_len: is it movie_len dataset or synthetic dataset (when False, item_to_group is an empty dict)
    :param random_state: seed forwarded to train_test_split; None (the default) keeps the split random
    :return: train_set (DataFrame), test_set (numpy array with NaN in place of zeros),
             tmp_user_gender: gender to list of user ids, tmp_user_id_to_group: dict of user ids to group id,
             item_to_group: dict with movie_id as key group id as value
    """
    # Stratify on user_id so each user's ratings are divided between train and test.
    tmp_train, tmp_test = train_test_split(
        tmp_df, test_size=test_ratio, shuffle=True, stratify=tmp_df[["user_id"]], random_state=random_state)

    # Keep only training rows whose item also appears in the test split.
    tmp_train = tmp_train[tmp_train[columns].isin(tmp_test[columns].unique())]
    tmp_user_gender, tmp_user_id_to_group = get_user_ids_by_gender(tmp_df)
    item_to_group = get_item_id_to_group(tmp_df) if movie_len else {}
    tmp_train_set = normalize_train_test_matrix(columns, tmp_train, zero_matrix)
    tmp_test_set = normalize_train_test_matrix(columns, tmp_test, zero_matrix)
    # Turn zeros in the test matrix into NaN. Because the normalization step maps each column's
    # lowest rating to 0 and also fills unobserved cells with 0, both become NaN here.
    tmp_test_set = tmp_test_set.replace(0, np.nan)
    print("train_set shape: ", tmp_train_set.shape, " test_set shape: ", tmp_test_set.shape)
    return tmp_train_set, tmp_test_set.to_numpy(), tmp_user_gender, tmp_user_id_to_group, item_to_group


def normalize_train_test_matrix(columns, tmp_split_df, zero_matrix):
    """
    Pivot ratings into a user x item matrix, min-max scale every item column and align the
    columns to zero_matrix.
    :param columns: name of the column in tmp_split_df whose values become the matrix columns
    :param tmp_split_df: ratings data with 'user_id', 'rating' and the ``columns`` column
    :param zero_matrix: template frame; the result has exactly its columns, in its order
    :return: DataFrame indexed by user_id, columns renumbered 0..n-1, unobserved cells set to 0
    """
    print("start normalization")
    tmp_set = pd.pivot_table(tmp_split_df, values='rating', index='user_id', columns=columns)
    # Per-column min-max scaling. A column whose observed ratings are all equal gives 0/0 = NaN,
    # which is turned into 0 by the fillna below.
    tmp_set = (tmp_set - tmp_set.min()) / (tmp_set.max() - tmp_set.min())
    # One reindex adds the template's missing columns (as NaN, filled with 0 below), drops any
    # column the template lacks and applies the template's order -- instead of inserting the
    # missing columns one at a time, which is slow and fragments the frame.
    tmp_set = tmp_set.reindex(columns=zero_matrix.columns)
    tmp_set.columns = range(tmp_set.shape[1])
    tmp_set = tmp_set.fillna(0)
    print("finished normalization")
    return tmp_set

In [7]:
# Build a user x movie template from all of ml_data and overwrite every cell with 0;
# it is passed to preprocessing as `zero_matrix`, the reference layout for the train/test matrices.
empty_basic_matrix = pd.pivot_table(ml_data, values='rating', index='user_id', columns="movie_id")
empty_basic_matrix[:] = 0
# Split and normalize the ratings, and derive the user-gender and item-group lookups.
train_set, test_set, user_gender, user_id_to_group, item_to_group = preprocessing(ml_data,empty_basic_matrix)

start normalization
finished normalization
start normalization
finished normalization
train_set shape:  (4297, 1308)  test_set shape:  (4297, 1308)


In [None]:
# Train the 'nmf' model once per metric setting and persist what each run returns.
# (None presumably means "no fairness metric" -- TODO confirm against run_nn_fairness_model.)
for metric in [None, 'val_score', 'abs_score', 'over_score', 'under_score']:
    trainig_dict = run_nn_fairness_model(
        'nmf',
        train_set,
        test_set,
        metric=metric,
        num_epochs=200,
        num_users=train_set.shape[0],
        num_items=train_set.shape[1],
        embed_dim=8,
        dis_group=user_gender['F'],
        adv_group=user_gender['M'],
        lr=0.001,
        layers=[64, 32, 16, 8],
        early_stop=0.00000001,
        unfairness_reg=0.5,
        early_stop_tol=3,
    )
    print(trainig_dict)
    # One pickle file per metric; the metric name is embedded in the file name.
    with open(f'training_dict_{metric}_Exp2.pickle', 'wb') as handle:
        pickle.dump(trainig_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

Epoch 0, Loss: 0.03619726374745369
Epoch 1, Loss: 0.03529508784413338
Epoch 2, Loss: 0.03456813469529152
Epoch 3, Loss: 0.03401249274611473
Epoch 4, Loss: 0.03360583260655403
Epoch 5, Loss: 0.03331625461578369
Epoch 6, Loss: 0.033109888434410095
Epoch 7, Loss: 0.03295747935771942
Epoch 8, Loss: 0.032837286591529846
Epoch 9, Loss: 0.03273520618677139
Epoch 10, Loss: 0.032642606645822525
Epoch 11, Loss: 0.032554201781749725
Epoch 12, Loss: 0.03246687352657318
Epoch 13, Loss: 0.03237871080636978
Epoch 14, Loss: 0.03228846937417984
Epoch 15, Loss: 0.03219551220536232
Epoch 16, Loss: 0.03209982439875603
Epoch 17, Loss: 0.032002173364162445
Epoch 18, Loss: 0.03190397843718529
Epoch 19, Loss: 0.031807154417037964
Epoch 20, Loss: 0.031714048236608505
Epoch 21, Loss: 0.03162704035639763
Epoch 22, Loss: 0.03154800087213516
Epoch 23, Loss: 0.03147796168923378
Epoch 24, Loss: 0.03141707554459572
Epoch 25, Loss: 0.03136459365487099
Epoch 26, Loss: 0.03131904453039169
Epoch 27, Loss: 0.0312782339751

File Name                                             Modified             Size
config.json                                    2023-02-26 21:09:41         6101
metadata.json                                  2023-02-26 21:09:41           64
variables.h5                                   2023-02-26 21:09:41       947648
Epoch 0, Loss: 0.04433306306600571
Epoch 1, Loss: 0.04383361339569092
Epoch 2, Loss: 0.0433950200676918
Epoch 3, Loss: 0.04298018664121628
Epoch 4, Loss: 0.04256162792444229
Epoch 5, Loss: 0.04212784767150879
Epoch 6, Loss: 0.04167765751481056
Epoch 7, Loss: 0.04121653735637665
Epoch 8, Loss: 0.04075255244970322
Epoch 9, Loss: 0.040293995290994644
Epoch 10, Loss: 0.03984817862510681
Epoch 11, Loss: 0.039420854300260544
Epoch 12, Loss: 0.03901622071862221
Epoch 13, Loss: 0.038637615740299225
Epoch 14, Loss: 0.03828778490424156
Epoch 15, Loss: 0.03796924650669098
Epoch 16, Loss: 0.037684403359889984
Epoch 17, Loss: 0.03743534907698631
Epoch 18, Loss: 0.03722365200519562
Epo

In [None]:
# Reload and print the pickled results for every metric setting.
# NOTE(review): this reads 'trainig_dict_{metric}_exp_1.pickle' (note the "trainig" spelling),
# while the training cell in this notebook writes 'training_dict_{metric}_Exp2.pickle' --
# confirm which experiment's files are meant to be loaded here.
# pickle.load can execute arbitrary code: only load files produced by this project.
for metric in [None, 'val_score', 'abs_score', 'over_score', 'under_score']:
    results_path = f'trainig_dict_{metric}_exp_1.pickle'
    with open(results_path, 'rb') as handle:
        b = pickle.load(handle)
    print(b)



In [None]:
# Scratch note -- two spellings of the result file name ("trainig" vs "training"):
# trainig_dict_None_exp_1.pickle
# training_dict_None_exp_1.pickle

In [None]:
# Pickle round-trip sanity check: write a small dict, read the same file back and compare.
a = {'hello': 'world'}
pickle_path = 'training_dict+.pickle'

with open(pickle_path, 'wb') as handle:
    pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Read back the file that was just written. The previous version opened 'filename.pickle',
# which this cell never creates, so the comparison did not test the round trip.
with open(pickle_path, 'rb') as handle:
    b = pickle.load(handle)

print(a == b)