In [10]:
# This notebook is adapted from https://github.com/bhavik08/Group-movie-recommender-system/blob/master/Final_Project.ipynb

In [1]:
# Colab-only setup: mount Google Drive and change into the project folder so
# that the relative data-file paths used by later cells resolve against it.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Group-movie-recommender-system

Mounted at /content/drive
/content/drive/MyDrive/Group-movie-recommender-system


In [2]:
import numpy as np
import math
import warnings
from sklearn.metrics import mean_squared_error
import pandas as ps

In [3]:
# Load the tab-separated rating table; column names are supplied explicitly,
# so pandas treats the first line of the file as data rather than a header.
rate_dataset_df = ps.read_csv("EN_41K_rate.tsv", sep="\t", names=["user_id","music_id","ratings"])

In [4]:
# Summary statistics (count, mean, spread, quartiles) of the rating values.
rate_dataset_df["ratings"].describe()

count    41062.000000
mean         2.808472
std          1.005290
min          0.068966
25%          2.133055
50%          3.071038
75%          3.667727
max          4.000000
Name: ratings, dtype: float64

In [11]:
# Central place for every tunable setting used by the notebook.
class Config:
    """Hard-coded experiment settings: data files, group sizes and hyper-parameters."""

    def __init__(self):
        # Tab-separated rating files; the relative paths resolve against the
        # current working directory.
        self.training_file, self.testing_file = "EN41K_train.tsv", "EN41K_test.tsv"

        # Number of members in a small / medium / large group.
        self.small_grp_size, self.medium_grp_size, self.large_grp_size = 3, 5, 10

        # Matrix factorization: number of SGD passes, regularization strength
        # and learning rate.
        self.max_iterations_mf, self.lambda_mf, self.learning_rate_mf = 5, 0.7, 0.15

        # Length of every latent user/item vector.
        self.num_factors = 5

        # For each method: the predicted rating an item must exceed to be
        # recommended, and the maximum length of the recommendation list.
        # AF (after factorization)
        self.rating_threshold_af, self.num_recos_af = 3, 25
        # BF (before factorization)
        self.rating_threshold_bf, self.num_recos_bf = 3, 25
        # WBF (weighted before factorization)
        self.rating_threshold_wbf, self.num_recos_wbf = 3, 25

        # Debug switch (not read by the code visible in this notebook section).
        self.is_debug = False

In [None]:
# Collector for the results returned by gr.run_all_methods(); the append in
# the run loop further down is currently commented out.
evaluation_list = []

## Execution code

#### Variables
----
`small_groups` , `medium_groups` , `large_groups` , `group_set` , `group_type`

#### Instance
----

`gr(GroupRec())`

#### MF model training & Group Recommendation

In [17]:
# Build the recommender; its constructor reads the training and testing files
# named in Config and initializes the latent factors randomly.
gr = GroupRec()

Reading training data from  EN41K_train.tsv ...
Reading testing data from  EN41K_test.tsv ...


In [None]:
# Debug code
#print(gr.ratings[:2,:8])
#print(gr.test_ratings.sum())
#print(gr.ratings.shape)
#print(gr.test_ratings.shape)
#print(gr.groups)
#print(gr.predictions)

In [18]:
# Show how many SGD passes are configured, then train the factorization model.
print(gr.cfg.max_iterations_mf)
gr.sgd_factorize()

5
Doing matrix factorization...
Iteration:  0
training mse:  0.8496211581658373
test mse:  1.1484912676217782
Iteration:  1
training mse:  0.8415053856395047
test mse:  1.0813840433167832
Iteration:  2
training mse:  0.8364687937976634
test mse:  1.0596770922467882
Iteration:  3
training mse:  0.8404589659085276
test mse:  1.0490345810355939
Iteration:  4
training mse:  0.8382965500041312
test mse:  1.042928408439522


In [None]:
# Debug code
#print(gr.ratings.shape)
#print(gr.test_ratings.shape)
#print(gr.groups)
#print(gr.predictions.shape)
#print(gr.predictions)

In [23]:
# Generate 20 random groups for each of the three configured group sizes.
# `disjoint` is the flag for requesting groups that share no common members.
small_groups = Group.generate_groups(gr.cfg, gr.ratings, gr.test_ratings, gr.num_users, 20, gr.cfg.small_grp_size, disjoint=False)
medium_groups = Group.generate_groups(gr.cfg, gr.ratings, gr.test_ratings, gr.num_users, 20, gr.cfg.medium_grp_size, disjoint=False)
large_groups = Group.generate_groups(gr.cfg, gr.ratings, gr.test_ratings, gr.num_users, 20, gr.cfg.large_grp_size, disjoint=False)

# Parallel lists: group_type[i] is the label of group_set[i].
group_set = [small_groups, medium_groups, large_groups]
group_type = ['small', 'medium', 'large']

# number of example groups printed per size class
n = 5

for idx, groups in enumerate(group_set):
    # Skip size classes without any group. The previous `groups is []` compared
    # object identity with a brand-new list and therefore was never true.
    if not groups:
        continue

    print('\n******* Running for ', group_type[idx], ' groups *************')
    print('generated groups (only first %d are getting printed here): ' % n)
    for group in groups[:n]:
        print(group.members)


******* Running for  small  groups *************
generated groups (only first 5 are getting printed here): 
[324, 884, 1183]
[220, 1028, 1086]
[149, 422, 1191]
[904, 924, 1276]
[879, 1147, 1283]

******* Running for  medium  groups *************
generated groups (only first 5 are getting printed here): 
[119, 1173, 1174, 1249, 1255]
[48, 601, 976, 1028, 1252]
[512, 734, 808, 1134, 1270]
[345, 373, 692, 964, 1275]
[29, 925, 1073, 1115, 1232]

******* Running for  large  groups *************
generated groups (only first 5 are getting printed here): 
[343, 559, 643, 998, 1023, 1062, 1073, 1205, 1290, 1346]
[15, 95, 380, 461, 646, 767, 838, 1079, 1242, 1265]
[3, 24, 64, 247, 465, 692, 710, 1082, 1268, 1285]
[209, 369, 490, 505, 513, 516, 898, 1063, 1274, 1338]
[108, 163, 507, 550, 575, 735, 803, 1074, 1222, 1294]


In [24]:
# Run the AF, BF and WBF methods on every size class and print their metrics.
for idx, groups in enumerate(group_set):
    # Skip size classes without any group. The previous `groups is []` compared
    # object identity with a brand-new list and therefore was never true.
    if not groups:
        continue
    print('\n******* Running for ', group_type[idx], ' groups *************')

    gr.add_groups(groups)
    eval_results = gr.run_all_methods(groups)

    # Re-enable to collect the per-run results consumed by the averaging cells below.
    #evaluation_list.append(eval_results)

    gr.remove_groups(groups)


******* Running for  small  groups *************

#########-------For AF-------#########

AF method: mean precision:  0.5
AF method: mean recall:  0.001282051282051282

#########-------For BF-------#########

BF method: mean precision:  0.25
BF method: mean recall:  0.001282051282051282

#########-------For WBF-------#########

WBF method: mean precision:  0.0
WBF method: mean recall:  0.0

******* Running for  medium  groups *************

#########-------For AF-------#########

AF method: mean precision:  0.4166666666666667
AF method: mean recall:  0.00701412429378531

#########-------For BF-------#########

BF method: mean precision:  0.25
BF method: mean recall:  0.004166666666666667

#########-------For WBF-------#########

WBF method: mean precision:  0.4
WBF method: mean recall:  0.006166666666666666

******* Running for  large  groups *************

#########-------For AF-------#########

AF method: mean precision:  1.0
AF method: mean recall:  0.004041060291060291

#########-

#### Precision & recall data from 20 experiment runs

In [None]:
# Merge every three consecutive entries of `evaluation_list` into one flat row
# (presumably the small/medium/large results of a single run -- confirm against
# the run loop). A trailing, incomplete triple is dropped.
new_eval_list = []
complete_length = len(evaluation_list) - len(evaluation_list) % 3
for start in range(0, complete_length, 3):
    merged_row = []
    for entry in evaluation_list[start:start + 3]:
        merged_row.extend(entry)
    new_eval_list.append(merged_row)

In [None]:
# Number of result entries collected in evaluation_list.
len(evaluation_list)

60

In [None]:
# One row per merged entry of new_eval_list and one column per
# (group size, method) pair; each cell holds a [precision, recall] pair.
eval_columns = [
    "small_AF", "small_BF", "small_WBF",
    "medium_AF", "medium_BF", "medium_WBF",
    "large_AF", "large_BF", "large_WBF",
]
eval_df = ps.DataFrame(data=new_eval_list, columns=eval_columns)
eval_df

Unnamed: 0,small_AF,small_BF,small_WBF,medium_AF,medium_BF,medium_WBF,large_AF,large_BF,large_WBF
0,"[1.0, 0.004223227752639518]","[0.5, 0.0009259259259259259]","[1.0, 0.0009259259259259259]","[1.0, 0.0032700776235258995]","[0.3333333333333333, 0.0011627906976744186]","[0.5, 0.0020556478405315613]","[0.75, 0.001776485788113695]","[0.25, 0.0011627906976744186]","[0.6, 0.001776485788113695]"
1,"[0.6666666666666666, 0.0054487179487179484]","[nan, 0.0]","[0.0, 0.0]","[0.75, 0.0037321937321937323]","[1.0, 0.0027777777777777775]","[nan, 0.0]","[0.75, 0.0017717032282188708]","[0.6, 0.004135487528344671]","[0.75, 0.0028133698948855377]"
2,"[0.0, 0.0]","[0.0, 0.0]","[0.3333333333333333, 0.0016666666666666666]","[0.75, 0.004767533490937746]","[1.0, 0.0034647550776583033]","[0.6666666666666666, 0.0034143518518518516]","[1.0, 0.008756613756613758]","[0.5, 0.0037221332519891204]","[1.0, 0.004658303464755077]"
3,"[0.8, 0.00863104483794139]","[nan, 0.0]","[1.0, 0.0034722222222222225]","[0.7142857142857143, 0.008427186848239478]","[0.0, 0.0]","[1.0, 0.001808367071524966]","[0.5714285714285714, 0.004076743155660426]","[0.0, 0.0]","[0.8333333333333334, 0.005537431116348386]"
4,"[1.0, 0.006439393939393939]","[0.4, 0.002488425925925926]","[0.0, 0.0]","[1.0, 0.0033365570599613154]","[0.0, 0.0]","[0.5, 0.0010638297872340426]","[0.75, 0.003238738212142467]","[0.41666666666666663, 0.0026331719128329296]","[0.25, 0.0008928571428571428]"
5,"[0.6666666666666666, 0.003041958041958042]","[0.0, 0.0]","[0.3333333333333333, 0.0007692307692307692]","[0.5, 0.00078125]","[0.0, 0.0]","[0.0, 0.0]","[0.3333333333333333, 0.0025]","[0.5, 0.0023809523809523807]","[0.5, 0.0023809523809523807]"
6,"[1.0, 0.005441595441595441]","[0.0, 0.0]","[0.0, 0.0]","[0.6666666666666666, 0.004257246376811593]","[0.5, 0.005131578947368421]","[0.5, 0.0036309523809523805]","[0.7, 0.003693934734236412]","[0.5833333333333334, 0.0037640816609316183]","[0.5, 0.0011627906976744186]"
7,"[1.0, 0.002]","[0.5, 0.0020833333333333333]","[0.3333333333333333, 0.0022727272727272726]","[1.0, 0.0017241379310344827]","[0.16666666666666666, 0.0008771929824561403]","[0.5, 0.002943650126156434]","[0.6666666666666666, 0.0036363636363636364]","[0.6666666666666666, 0.003101503759398496]","[0.0, 0.0]"
8,"[0.0, 0.0]","[1.0, 0.0010638297872340426]","[0.0, 0.0]","[1.0, 0.0033288043478260864]","[0.5, 0.0029411764705882353]","[0.3333333333333333, 0.0007246376811594203]","[0.8333333333333334, 0.007052351384247936]","[0.0, 0.0]","[1.0, 0.0029472610722610726]"
9,"[0.625, 0.007446236559139785]","[0.5, 0.00125]","[0.5, 0.005779569892473118]","[0.3333333333333333, 0.0008928571428571428]","[0.75, 0.0043607660455486546]","[1.0, 0.0008928571428571428]","[0.6, 0.005705165130568356]","[0.25, 0.0008771929824561403]","[0.5, 0.0008771929824561403]"


In [None]:
# For every column, stack its [precision, recall] pairs into a 2-D array and
# average over the rows, ignoring NaN entries.
avg_eval_results = [
    np.nanmean(np.array(eval_df[name].to_list()), axis = 0)
    for name in eval_df.columns
]

In [None]:
# Summary table: rows are the two metrics, columns the (group size, method) pairs.
metric_labels = ["precision", "recall"]
setting_labels = [
    "small_AF", "small_BF", "small_WBF",
    "medium_AF", "medium_BF", "medium_WBF",
    "large_AF", "large_BF", "large_WBF",
]
avg_eval_results_df = ps.DataFrame(avg_eval_results).transpose()
avg_eval_results_df.index = metric_labels
avg_eval_results_df.columns = setting_labels
avg_eval_results_df

Unnamed: 0,small_AF,small_BF,small_WBF,medium_AF,medium_BF,medium_WBF,large_AF,large_BF,large_WBF
precision,0.656964,0.411111,0.51,0.729881,0.479167,0.465278,0.70244,0.4075,0.552917
recall,0.004194,0.001704,0.002349,0.003133,0.002062,0.001849,0.004312,0.002363,0.002459


#### Generating Explanations

In [None]:
# Member ids of the first large group.
large_groups[0].members

[91, 300, 302, 588, 812, 872, 1280, 1294, 1346, 1347]

In [None]:
# Re-bind large_groups to the third entry of group_set (the large groups).
large_groups = group_set[2]

In [34]:
# Inspect a single group: the third entry of group_set[1] (the medium groups).
inspected_group = group_set[1][2]
inspected_members = inspected_group.members
inspected_recos = inspected_group.reco_list_af

# aggregated AF latent vector and bias of the group
print(inspected_group.grp_factors_af)
print(inspected_group.bias_af)

# latent vectors and biases of the individual members
print(gr.user_factors[inspected_members, :])
print(gr.user_biases[inspected_members])

# ids of the items recommended by the AF method
print(inspected_recos)

# latent vectors of those recommended items
print(gr.item_factors[inspected_recos])

[ 0.02101781  0.04496065 -0.07088136  0.00342788 -0.02857114]
-0.0482328896752162
[[-0.02173481  0.01814226 -0.04460008  0.0063939   0.03168001]
 [ 0.07451212  0.08319631 -0.05291501 -0.00177048 -0.02293285]
 [-0.01362764  0.01744778 -0.11239957  0.00766848 -0.07983841]
 [-0.64848781  0.18999331  0.06307645  0.73891721 -0.2862293 ]
 [ 0.09736188  0.26146698  0.41860049  0.70812905  0.73932672]]
[ 0.25780084 -0.16143093 -0.13342482  0.          0.        ]
[1540 3471 4114 4125 3997 2732 3826 3731 3714 3610 1694 3870 4241 3339
 3696 4218   55 1669 1992 1475 2887  287  220 4002 3341]
[[ 1.43646815e-01  1.50027722e-01 -2.09192687e-01  4.56373730e-02
  -2.25255175e-01]
 [ 7.02079123e-02 -1.45957011e-01 -1.26354374e-01 -1.44345632e-01
   2.10027227e-01]
 [-2.72388266e-01  4.08759535e-01 -2.81465374e-01 -2.23222250e-01
  -6.71501049e-01]
 [ 2.55657181e-01  1.68266437e-01  3.27285849e-01  2.07544568e-01
   1.80651737e-02]
 [-2.78772323e-03  5.29975059e-01  6.82923680e-02  6.18029647e-01
  -3.6

In [32]:
from sklearn.metrics.pairwise import euclidean_distances
# Choose a group
present_group = group_set[1][2]
print("group members: ", present_group.members)
print("recommendation list: ", present_group.reco_list_af)

# Computing top influencers of the playlist:
# each member is scored by how close their latent vector lies to the aggregated
# AF group vector, similarity = 1 / (1 + Euclidean distance), so 1.0 means identical.
group_vector = present_group.grp_factors_af
member_vectors = gr.user_factors[present_group.members, :]
similarities = [
    1 / (1 + euclidean_distances([member_vector], [group_vector])[0, 0])
    for member_vector in member_vectors
]

# Members ordered from most to least similar, with the score as a whole percentage.
# (The dictionary name keeps its original spelling in case other cells reference it.)
top_influencers_idx = np.flip(np.array(similarities).argsort())
top_influecers = {}
for index in top_influencers_idx:
    # Truncate to an integer percentage. Slicing the first two characters of the
    # float's string form produced "9.%" for values below 10 and "10%" for 100.
    top_influecers[present_group.members[index]] = str(int(similarities[index] * 100)) + "%"
print(top_influecers)

group members:  [512, 734, 808, 1134, 1270]
recommendation list:  [1540 3471 4114 4125 3997 2732 3826 3731 3714 3610 1694 3870 4241 3339
 3696 4218   55 1669 1992 1475 2887  287  220 4002 3341]
{734: '93%', 808: '92%', 512: '92%', 1134: '48%', 1270: '45%'}


In [None]:
# Mean of the non-zero training ratings, as set by gr.sgd_factorize().
gr.ratings_global_mean

2.807252631311695

In [None]:
# Predicted rating of user 37 for item 2732.
gr.predict_user_rating(37,2732)

3.7020066990087575

In [35]:
# "Likely enjoyed by"
# For every recommended item, list the group members whose predicted rating is
# at least 3, ordered from the highest prediction to the lowest.

# recommendation list plus the latent vectors / biases of its items
# (the vectors and biases are kept for inspection; the loop below does not read them)
music_reco_list = present_group.reco_list_af
music_vectors = gr.item_factors[music_reco_list]
music_biases = gr.item_biases[music_reco_list]

# latent vectors / biases of the group members (likewise not read by the loop)
member_vectors = gr.user_factors[present_group.members, :]
member_biases = gr.user_biases[present_group.members]

music_top_enjoyed_user = {}

for every_music in music_reco_list:
    # predicted rating of each member for this item, in member order
    predicted_rates = [
        gr.predict_user_rating(member, every_music)
        for member in present_group.members
    ]

    # member positions sorted by descending predicted rating
    ranking = np.array(predicted_rates).argsort()[::-1].tolist()

    # keep only the members predicted to rate the item at 3 or above
    music_top_enjoyed_user[every_music] = [
        present_group.members[position]
        for position in ranking
        if predicted_rates[position] >= 3
    ]

In [36]:
# Per recommended item: the members predicted to rate it at least 3, best first.
music_top_enjoyed_user

{1540: [512, 1134, 808, 734, 1270],
 3471: [512, 1270, 808, 734, 1134],
 4114: [1134, 512, 808, 734],
 4125: [1270, 512, 1134, 734, 808],
 3997: [1134, 1270, 512, 808, 734],
 2732: [512, 1134, 1270, 808, 734],
 3826: [512, 1134, 808, 734],
 3731: [512, 1134, 808, 734, 1270],
 3714: [512, 1134, 1270, 808, 734],
 3610: [1134, 512, 1270, 808, 734],
 1694: [512, 808, 734],
 3870: [512, 1134, 808, 734, 1270],
 4241: [1134, 512, 1270, 808, 734],
 3339: [1134, 512, 1270, 808, 734],
 3696: [1134, 512, 808, 734],
 4218: [512, 1270, 1134, 808, 734],
 55: [512, 1134, 1270, 808, 734],
 1669: [1134, 512, 808, 1270, 734],
 1992: [512, 1270, 808, 734, 1134],
 1475: [1134, 512, 1270, 808, 734],
 2887: [1270, 512, 1134, 734, 808],
 287: [512, 1134, 1270, 808, 734],
 220: [512, 1134, 1270, 808, 734],
 4002: [512, 808, 734, 1270, 1134],
 3341: [512, 808, 734, 1270, 1134]}

## Group class

The class 'Group' is responsible for generating random groups of different sizes : small, medium and large and performing evaluations of different methods AF, BF and WBF used for recommendation for these groups.

#### Functions
----
`find_candidate_items()` , `non_testable_items()`, `generate_groups()` , 

`generate_actual_recommendations()` , `evaluate_af()` , `evaluate_bf()` , `evaluate_wbf()`

#### In-class variable
----

`candidate_items`

`actual_recos`,`false_positive` (tp and fp)

`ratings_per_member`

- AF
`grp_factors_af` , `bias_af` , `precision_af` , `recall_af` , `reco_list_af`

- BF
`grp_factors_bf` , `bias_bf` , `precision_bf` , `recall_bf` , `reco_list_bf`

- WBF
`grp_factors_wbf`, `bias_wbf` , `precision_wbf` , `recall_wbf` , `weight_matrix_wbf` , `reco_list_wbf`

In [12]:
class Group:
    """A set of users treated as one recommendation target.

    Holds the member ids, the items that may be recommended to the group, the
    ground truth used for evaluation, and one slot of results (group factors,
    bias, precision, recall, recommendation list) per method: AF, BF and WBF.
    """

    def __init__(self, members, candidate_items, ratings):
        """Create a group.

        members: iterable of user ids (row indices into `ratings`); stored sorted.
        candidate_items: items that can be recommended, i.e. items no member has
            rated yet.
        ratings: user x item rating matrix in which 0 marks "not rated".
        """
        self.members = sorted(members)
        self.candidate_items = candidate_items

        # Ground truth, filled in by generate_actual_recommendations().
        self.actual_recos = []
        self.false_positive = []

        # Number of non-zero ratings each member has, in member order.
        self.ratings_per_member = [
            np.count_nonzero(ratings[member]) for member in self.members
        ]

        # AF
        self.grp_factors_af, self.bias_af = [], 0
        self.precision_af, self.recall_af = 0, 0
        self.reco_list_af = []

        # BF
        self.grp_factors_bf, self.bias_bf = [], 0
        self.precision_bf, self.recall_bf = 0, 0
        self.reco_list_bf = []

        # WBF
        self.grp_factors_wbf, self.bias_wbf = [], 0
        self.precision_wbf, self.recall_wbf = 0, 0
        self.weight_matrix_wbf = []
        self.reco_list_wbf = []

# Group Class - find_candidate_items

@staticmethod
def find_candidate_items(ratings, members):
    """Return the items that none of `members` has rated.

    ratings: user x item matrix where 0 means "not rated".
    members: sequence of user ids (row indices into `ratings`).

    Returns a sorted 1-D array of item indices, or an empty list when
    `members` is empty.
    """
    if len(members) == 0:
        return []

    # indices of the zero entries in every member's row
    zero_columns = [np.flatnonzero(ratings[member] == 0) for member in members]

    common = zero_columns[0]
    for columns in zero_columns:
        common = np.intersect1d(common, columns)
    return common
Group.find_candidate_items = find_candidate_items

# Group Class - non_testable_items

@staticmethod
def non_testable_items(members, ratings):
    """Return the items for which no member has a rating in `ratings`.

    members: non-empty sequence of user ids (row indices into `ratings`);
        an empty sequence raises IndexError.
    ratings: user x item matrix where 0 means "not rated".

    Returns a sorted 1-D array of item indices. Predictions for these items
    cannot be checked against `ratings`.
    """
    unrated = np.flatnonzero(ratings[members[0]] == 0)
    for member_row in (ratings[member] for member in members):
        unrated = np.intersect1d(unrated, np.flatnonzero(member_row == 0))
    return unrated
Group.non_testable_items = non_testable_items

# Group Class - generate_groups

@staticmethod
def generate_groups(cfg, ratings, test_ratings, num_users, count, size, disjoint = True):
    """Randomly draw `count` groups of `size` users that can be evaluated.

    A drawn group is accepted only if it has at least one candidate item (an
    item no member rated in `ratings`) and at least 25 candidate items that
    some member rated in `test_ratings`. Rejected draws are simply retried, so
    the loop does not terminate if no group can satisfy those conditions.

    cfg: accepted for interface compatibility; not used here.
    ratings: training user x item matrix (0 = not rated).
    test_ratings: testing user x item matrix (0 = not rated).
    num_users: users are drawn from ids 0 .. num_users - 1.
    count: number of groups to return.
    size: number of members per group.
    disjoint: when True, members of an accepted group are removed from the pool
        so that no two groups share a user; when False, groups may overlap.

    Returns a list of `count` Group objects.
    Raises ValueError (from np.random.choice) when fewer than `size` users are
    left in the pool.
    """
    avbl_users = np.arange(num_users)
    groups = []
    testable_threshold = 25

    while len(groups) < count:
        group_members = np.random.choice(avbl_users, size = size, replace = False)
        candidate_items = Group.find_candidate_items(ratings, group_members)
        non_eval_items = Group.non_testable_items(group_members, test_ratings)
        # candidate items that at least one member rated in the test set
        testable_items = np.setdiff1d(candidate_items, non_eval_items)

        if len(candidate_items) != 0 and len(testable_items) >= testable_threshold:
            groups.append(Group(group_members, candidate_items, ratings))
            # Only shrink the pool when disjoint groups were requested; previously
            # the flag was ignored and members were always removed.
            if disjoint:
                avbl_users = np.setdiff1d(avbl_users, group_members)

    return groups
Group.generate_groups = generate_groups

# Group Class - generate_actual_recommendations

def generate_actual_recommendations(self, ratings, threshold):
    """Derive the group's ground truth from `ratings` and store it on the group.

    ratings: user x item matrix where 0 means "not rated".
    threshold: minimum rating that counts as "liked".

    Sets:
      self.actual_recos   -- items rated by at least one member for which every
                             member's rating is either >= threshold or missing.
      self.false_positive -- items that at least one member rated below
                             threshold (and above 0).
    """
    unrated_by_all = Group.non_testable_items(self.members, ratings)

    first_row = ratings[self.members[0]]
    acceptable = np.flatnonzero((first_row >= threshold) | (first_row == 0))
    disliked = np.flatnonzero((first_row > 0) & (first_row < threshold))

    for member in self.members:
        row = ratings[member]
        member_acceptable = np.flatnonzero((row >= threshold) | (row == 0))
        member_disliked = np.flatnonzero((row > 0) & (row < threshold))
        disliked = np.union1d(disliked, member_disliked)
        acceptable = np.intersect1d(acceptable, member_acceptable)

    # items nobody rated cannot be confirmed as good recommendations
    self.actual_recos = np.setdiff1d(acceptable, unrated_by_all)
    self.false_positive = disliked
Group.generate_actual_recommendations  = generate_actual_recommendations

# Group Class - evaluate_af, evaluate_bf, evaluate_wbf

def evaluate_af(self, is_debug=True):
    """Score the AF recommendation list against the group's ground truth.

    tp counts recommended items found in `self.actual_recos`, fp those found in
    `self.false_positive`; recommended items in neither set are ignored.

    is_debug: accepted for interface compatibility; not used.

    Stores and returns precision = tp / (tp + fp) and
    recall = tp / len(actual_recos); each is NaN when its denominator is 0.
    Returns (precision, recall, tp, fp).
    """
    tp = float(np.intersect1d(self.actual_recos, self.reco_list_af).size)
    fp = float(np.intersect1d(self.false_positive, self.reco_list_af).size)

    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.x.
    judged = tp + fp
    self.precision_af = tp / judged if judged else np.nan

    # np.size also accepts the plain list that actual_recos holds before
    # generate_actual_recommendations() has run.
    relevant = np.size(self.actual_recos)
    self.recall_af = tp / relevant if relevant else np.nan

    return self.precision_af, self.recall_af, tp, fp
Group.evaluate_af = evaluate_af

def evaluate_bf(self, is_debug=True):
    """Score the BF recommendation list against the group's ground truth.

    tp counts recommended items found in `self.actual_recos`, fp those found in
    `self.false_positive`; recommended items in neither set are ignored.

    is_debug: accepted for interface compatibility; not used.

    Stores and returns precision = tp / (tp + fp) and
    recall = tp / len(actual_recos); each is NaN when its denominator is 0.
    Returns (precision, recall, tp, fp).
    """
    tp = float(np.intersect1d(self.actual_recos, self.reco_list_bf).size)
    fp = float(np.intersect1d(self.false_positive, self.reco_list_bf).size)

    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.x.
    judged = tp + fp
    self.precision_bf = tp / judged if judged else np.nan

    # np.size also accepts the plain list that actual_recos holds before
    # generate_actual_recommendations() has run.
    relevant = np.size(self.actual_recos)
    self.recall_bf = tp / relevant if relevant else np.nan

    return self.precision_bf, self.recall_bf, tp, fp
Group.evaluate_bf = evaluate_bf

def evaluate_wbf(self, is_debug=True):
    """Score the WBF recommendation list against the group's ground truth.

    tp counts recommended items found in `self.actual_recos`, fp those found in
    `self.false_positive`; recommended items in neither set are ignored.

    is_debug: accepted for interface compatibility; not used.

    Stores and returns precision = tp / (tp + fp) and
    recall = tp / len(actual_recos); each is NaN when its denominator is 0.
    Returns (precision, recall, tp, fp).
    """
    tp = float(np.intersect1d(self.actual_recos, self.reco_list_wbf).size)
    fp = float(np.intersect1d(self.false_positive, self.reco_list_wbf).size)

    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.x.
    judged = tp + fp
    self.precision_wbf = tp / judged if judged else np.nan

    # np.size also accepts the plain list that actual_recos holds before
    # generate_actual_recommendations() has run.
    relevant = np.size(self.actual_recos)
    self.recall_wbf = tp / relevant if relevant else np.nan

    return self.precision_wbf, self.recall_wbf, tp, fp
Group.evaluate_wbf = evaluate_wbf

## Aggregator

#### Functions
----

`average()` , `average_bf()` , `weighted_average()`

In [13]:
class Aggregators:
    """Column-wise aggregation helpers; pass member ratings or member factors as input."""

    def __init__(self):
        pass

    @staticmethod
    def average(arr):
        """Plain mean over axis 0 (one value per column)."""
        return np.average(arr, axis = 0, weights = None)

    @staticmethod
    def average_bf(arr):
        """Mean over axis 0 that ignores zero entries.

        Zeros are treated as missing values. A column that contains only zeros
        yields NaN; the RuntimeWarning NumPy emits for that case is suppressed.
        The input is left untouched: the zero-to-NaN replacement is done on a
        private float copy, which also makes integer input work.
        """
        values = np.array(arr, dtype = float)
        values[values == 0] = np.nan
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            return np.nanmean(values, axis=0)

    @staticmethod
    def weighted_average(arr, weights):
        """Weighted mean over axis 0; `weights` holds one weight per row of `arr`."""
        return np.average(arr, axis = 0, weights = weights)

## GroupRec Class

This is our main class, responsible for reading the data, defining the methods for our approaches, and finally evaluating them.

#### In-class variables
----

`ratings` (all original ratings), `test_ratings`

`groups`(groups assigned from outside)

`num_users` , `num_items`

`predictions` (predicted ratings matrix based on factors)

`user_factors` , `item_factors` , `user_biases` , `item_biases`

`ratings_global_mean`

#### Functions
----

`read_data()` , `sgd_factorize()` , `sgd_mse()` , `predict_user_rating()` , `predict_group_rating()` ,

`predict_all_ratings()` , `af_runner()` , `bf_runner()` , `wbf_runner()` , `evaluation()`



In [14]:
# Make NumPy raise FloatingPointError on overflow instead of only warning.
np.seterr(over='raise')

class GroupRec:
    """Group recommender: owns the rating matrices, the factorization model and the groups."""

    def __init__(self):
        """Load the data named in Config and set up an untrained model."""
        self.cfg = Config()

        # Training and testing user x item matrices; populated by read_data().
        self.ratings = None
        self.test_ratings = None

        # Groups currently being processed (see add_groups / remove_groups).
        self.groups = []

        self.read_data()
        self.num_users, self.num_items = self.ratings.shape[0], self.ratings.shape[1]

        # Predicted rating of every user for every item, derived from the factors.
        self.predictions = np.zeros((self.num_users, self.num_items))

        # Latent factors learned by the factorization; they start as uniform
        # random values in [-1, 1). Users are drawn before items.
        factor_count = self.cfg.num_factors
        self.user_factors = np.random.uniform(-1, 1, (self.num_users, factor_count))
        self.item_factors = np.random.uniform(-1, 1, (self.num_items, factor_count))

        # Per-user and per-item bias terms, initially zero.
        self.user_biases = np.zeros(self.num_users)
        self.item_biases = np.zeros(self.num_items)

        # Global mean of the ratings (mu); computed by sgd_factorize().
        self.ratings_global_mean = 0

    def add_groups(self, groups):
        """Make `groups` the current list of groups."""
        self.groups = groups

    def remove_groups(self, groups):
        """Clear the current list of groups; the `groups` argument is ignored."""
        self.groups = []
        
###
# GroupRec Class - read_data
# We have used 'pandas' library for reading testing data and training data from the csv file. We will finally generate our user * item ratings matrix here.
###

# read training and testing data into matrices
def read_data(self):
    """Load the training and testing rating files into dense user x item matrices.

    Both files are tab-separated without a header row; each row is
    (user_id, item_id, rating[, timestamp]). The ids are used directly as row
    and column indices, and both matrices share the same shape, sized to the
    largest ids found in either file. Cells without a rating stay 0.
    """
    column_headers = ['user_id', 'item_id', 'rating', 'timestamp']

    print('Reading training data from ', self.cfg.training_file, '...')
    training_data = ps.read_csv(self.cfg.training_file, names=column_headers, sep='\t')

    print('Reading testing data from ', self.cfg.testing_file, '...')
    testing_data = ps.read_csv(self.cfg.testing_file, names=column_headers, sep='\t')

    def extent(column):
        # ids are zero-based, so the matrix needs (largest id + 1) slots
        return max(max(training_data[column].unique()), max(testing_data[column].unique())) + 1

    matrix_shape = (extent('user_id'), extent('item_id'))
    self.ratings = np.zeros(matrix_shape)
    self.test_ratings = np.zeros(matrix_shape)

    for frame, matrix in ((training_data, self.ratings), (testing_data, self.test_ratings)):
        for user_id, item_id, rating in zip(frame.user_id, frame.item_id, frame.rating):
            matrix[user_id, item_id] = rating
GroupRec.read_data = read_data

###
# GroupRec Class - sgd_factorize
# Now we would like to factorize the rating matrix. The number of latent factors is taken from `Config.num_factors` (currently 5), and stochastic gradient descent is used for error minimization.
###

def sgd_factorize(self):
    """Train the biased matrix-factorization model with stochastic gradient descent.

    Sets `self.ratings_global_mean` to the mean of the non-zero training
    ratings, then runs `cfg.max_iterations_mf` passes over the observed
    (non-zero) ratings in a freshly shuffled order, updating the user/item
    factors and biases in place. After every pass `self.sgd_mse()` refreshes
    `self.predictions` and prints the training and test MSE.

    A FloatingPointError (raised on overflow because of the module-level
    `np.seterr(over='raise')`) ends training early with only a printed
    message; the factors are left as they were at that point.
    """
    # coordinates of all observed (non-zero) training ratings
    ratings_row, ratings_col = self.ratings.nonzero()
    num_ratings = len(ratings_row)
    learning_rate = self.cfg.learning_rate_mf
    regularization = self.cfg.lambda_mf

    # mu: mean over the observed ratings only (zeros mean "not rated")
    self.ratings_global_mean = np.mean(self.ratings[np.where(self.ratings != 0)])

    print('Doing matrix factorization...')
    try:
        for iter in range(self.cfg.max_iterations_mf):
            print('Iteration: ', iter)
            # visit the observed ratings in a new random order on every pass
            rating_indices = np.arange(num_ratings)
            np.random.shuffle(rating_indices)

            for idx in rating_indices:
                user = ratings_row[idx]
                item = ratings_col[idx]

                pred = self.predict_user_rating(user, item)
                error = self.ratings[user][item] - pred

                # Regularized gradient steps. NOTE: the item update below reads
                # the user factors that were just updated, not their previous value.
                self.user_factors[user] += learning_rate \
                                            * ((error * self.item_factors[item]) - (regularization * self.user_factors[user]))
                self.item_factors[item] += learning_rate \
                                            * ((error * self.user_factors[user]) - (regularization * self.item_factors[item]))

                self.user_biases[user] += learning_rate * (error - regularization * self.user_biases[user])
                self.item_biases[item] += learning_rate * (error - regularization * self.item_biases[item])

            # report progress after each full pass
            self.sgd_mse()

    except FloatingPointError:
        print('Floating point Error: ')
GroupRec.sgd_factorize = sgd_factorize

###
# GroupRec Class - sgd_mse
###

def sgd_mse(self):
    """Refresh `self.predictions` and print the MSE on the training and test ratings.

    Only cells with a non-zero rating in the respective matrix are compared.
    """
    self.predict_all_ratings()

    train_cells = self.ratings.nonzero()
    test_cells = self.test_ratings.nonzero()

    train_predicted = self.predictions[train_cells].flatten()
    train_actual = self.ratings[train_cells].flatten()
    test_predicted = self.predictions[test_cells].flatten()
    test_actual = self.test_ratings[test_cells].flatten()

    print('training mse: ', mean_squared_error(train_predicted, train_actual))
    print('test mse: ', mean_squared_error(test_predicted, test_actual))
GroupRec.sgd_mse = sgd_mse


###
# GroupRec Class - predict_user_rating
###

def predict_user_rating(self, user, item):
    """Predict the rating of `user` for `item`.

    Returns global mean + user bias + item bias + dot(user factors, item factors).
    """
    baseline = self.ratings_global_mean + self.user_biases[user] + self.item_biases[item]
    interaction = self.user_factors[user, :].dot(self.item_factors[item, :].T)
    return baseline + interaction
GroupRec.predict_user_rating = predict_user_rating

###
# GroupRec Class - predict_group_rating
###

def predict_group_rating(self, group, item, method):
    """Predict the rating of `group` for `item` using one method's group model.

    group: object carrying `grp_factors_<method>` and `bias_<method>`.
    item: item index.
    method: 'af', 'bf' or 'wbf'.

    Returns global mean + group bias + item bias + dot(group factors, item factors).
    Raises ValueError for an unknown `method` (previously this surfaced as an
    UnboundLocalError).
    """
    if method == 'af':
        factors, bias_group = group.grp_factors_af, group.bias_af
    elif method == 'bf':
        factors, bias_group = group.grp_factors_bf, group.bias_bf
    elif method == 'wbf':
        factors, bias_group = group.grp_factors_wbf, group.bias_wbf
    else:
        raise ValueError("unknown method %r; expected 'af', 'bf' or 'wbf'" % (method,))

    # asarray lets list-valued factors work as well as arrays
    interaction = np.dot(np.asarray(factors).T, self.item_factors[item])
    return self.ratings_global_mean + bias_group + self.item_biases[item] + interaction
GroupRec.predict_group_rating = predict_group_rating

###
# GroupRec Class - predict_all_ratings (predict all user ratings)
###

def predict_all_ratings(self):
    """Fill `self.predictions` with the predicted rating of every user for every item.

    Computes, for all pairs at once, the same formula predict_user_rating()
    applies to one pair: global mean + user bias + item bias + dot(user factors,
    item factors). One matrix product replaces a Python loop over all
    user/item pairs. The existing `self.predictions` array is updated in place.
    """
    baseline = (self.ratings_global_mean
                + self.user_biases[:, np.newaxis]
                + self.item_biases[np.newaxis, :])
    self.predictions[:, :] = baseline + np.dot(self.user_factors, self.item_factors.T)
GroupRec.predict_all_ratings = predict_all_ratings

In [15]:

###
# GroupRec Class - run_all_methods
# running all our proposed methods and evaluating them altogether.
###

def run_all_methods(self, groups = None):
    """Run the AF, BF and WBF methods on `groups`, then evaluate them.

    groups: list of groups to process; when omitted or None, the groups
        previously registered through add_groups() are used.

    AF aggregates with Aggregators.weighted_average; BF and WBF use
    Aggregators.average_bf.

    Returns whatever self.evaluation() returns.
    """
    if groups is None:
        groups = self.groups

    self.af_runner(groups, Aggregators.weighted_average)
    self.bf_runner(groups, Aggregators.average_bf)
    self.wbf_runner(groups, Aggregators.average_bf)

    #evaluation
    return self.evaluation()
GroupRec.run_all_methods = run_all_methods


###
# GroupRec Class - af_runner
###

def af_runner(self, groups = None, aggregator = Aggregators.average):
    """AF (after factorization): aggregate member models into a group model and recommend.

    For every group the members' latent factors and biases are combined with
    `aggregator`, every candidate item is scored with that group model, and the
    top `cfg.num_recos_af` items whose predicted rating exceeds
    `cfg.rating_threshold_af` are stored in `group.reco_list_af`.

    groups: groups to process; defaults to self.groups.
    aggregator: Aggregators.weighted_average is called with the members' rating
        counts as weights; any other aggregator is called with the values only.
        (Previously an aggregator other than `average` / `weighted_average`
        was silently skipped, leaving the group factors unset.)
    """
    #if groups is not passed, use self.groups
    if groups is None:
        groups = self.groups

    for group in groups:
        member_factors = self.user_factors[group.members, :]
        member_biases = self.user_biases[group.members]

        #aggregate the factors
        if aggregator == Aggregators.weighted_average:
            weights = group.ratings_per_member
            group.grp_factors_af = aggregator(member_factors, weights = weights)
            group.bias_af = aggregator(member_biases, weights = weights)
        else:
            group.grp_factors_af = aggregator(member_factors)
            group.bias_af = aggregator(member_biases)

        #predict ratings for all candidate items, keeping those above the threshold
        group_candidate_ratings = {}
        for item in group.candidate_items:
            cur_rating = self.predict_group_rating(group, item, 'af')
            if cur_rating > self.cfg.rating_threshold_af:
                group_candidate_ratings[item] = cur_rating

        #sort and filter to keep top 'num_recos_af' recommendations
        ranked = sorted(group_candidate_ratings.items(), key=lambda pair: pair[1], reverse=True)
        group.reco_list_af = np.array([item for item, _ in ranked[:self.cfg.num_recos_af]])

GroupRec.af_runner = af_runner

###
# GroupRec Class - bf_runner
###

def bf_runner(self, groups=None, aggregator=Aggregators.average_bf):
    """Build 'before factorization' (BF) recommendations for each group.

    The members' rows of ``self.ratings`` are aggregated into one virtual
    user. Group factors and a group bias are then fitted by regularised
    least squares over the "watched" items, i.e. every item index that is
    not in ``group.candidate_items``:

        (A^T A + lambda * I) x = A^T s_g

    where each row of ``A`` is an item's factor vector with a trailing 1,
    and ``s_g`` is the aggregated rating minus the global mean and the
    item bias. The first ``num_factors`` entries of ``x`` are stored in
    ``group.grp_factors_bf`` and the last one in ``group.bias_bf``.

    Candidate items are scored with
    ``self.predict_group_rating(group, item, 'bf')``; those strictly above
    ``cfg.rating_threshold_bf`` are ranked (descending) and the top
    ``cfg.num_recos_bf`` item ids are stored in ``group.reco_list_bf``.

    Args:
        groups: Iterable of group objects; defaults to ``self.groups``.
        aggregator: Callable applied to the members' rating rows.
            Assumed to return one aggregated rating per item, indexable by
            item index -- TODO confirm against ``Aggregators``.
    """
    # The signature defaults to None, so resolve it like af_runner does
    # instead of failing on iteration.
    if groups is None:
        groups = self.groups

    lamb = self.cfg.lambda_mf
    num_factors = self.cfg.num_factors
    rating_threshold = self.cfg.rating_threshold_bf
    num_recos = self.cfg.num_recos_bf

    # Loop-invariant pieces: the full item index set and the ridge penalty.
    all_movies = set(np.arange(len(self.ratings.T)))
    ridge_penalty = lamb * np.identity(num_factors + 1)

    for group in groups:
        watched_items = sorted(all_movies - set(group.candidate_items))

        # aggregate user ratings into virtual group
        group_rating = self.ratings[group.members, :]
        agg_rating = aggregator(group_rating)
        s_g = [agg_rating[j] - self.ratings_global_mean - self.item_biases[j]
               for j in watched_items]

        # matrix A : rows of [item_factors of items in watched_list + '1' column].
        # Built in one go (the explicit reshape keeps the empty case 2-D)
        # rather than re-copying the matrix with vstack for every item.
        A = np.array([self.item_factors[item] for item in watched_items],
                     dtype=float).reshape(len(watched_items), num_factors)
        A = np.c_[A, np.ones((len(watched_items), 1))]

        # Solve the normal equations directly instead of forming an explicit inverse.
        factor_n_bias = np.linalg.solve(np.dot(A.T, A) + ridge_penalty, np.dot(A.T, s_g))
        group.grp_factors_bf = factor_n_bias[:-1]
        group.bias_bf = factor_n_bias[-1]

        # Making recommendations on candidate list :
        group_candidate_ratings = {}
        for item in group.candidate_items:
            cur_rating = self.predict_group_rating(group, item, 'bf')
            if cur_rating > rating_threshold:
                group_candidate_ratings[item] = cur_rating

        # sort and filter to keep top 'num_recos_bf' recommendations
        ranked = sorted(group_candidate_ratings.items(), key=lambda x: x[1], reverse=True)

        group.reco_list_bf = np.array([item for item, _ in ranked[:num_recos]])
GroupRec.bf_runner = bf_runner

###
# GroupRec Class - wbf_runner
###

def wbf_runner(self, groups=None, aggregator=Aggregators.average_bf):
    """Build 'weighted before factorization' (WBF) recommendations per group.

    Same fit as the BF method, but every "watched" item (an item index not
    in ``group.candidate_items``) gets a weight

        w = (members who rated the item / group size) / (1 + std)

    where ``std`` is the standard deviation of the item's non-zero entries
    in ``self.ratings``. Group factors and bias solve the weighted,
    regularised normal equations

        (A^T W A + lambda * I) x = A^T W s_g

    with ``W = diag(w)``. The result is stored in ``group.grp_factors_wbf``
    and ``group.bias_wbf``. Candidate items are scored with
    ``self.predict_group_rating(group, item, 'wbf')``; those strictly above
    ``cfg.rating_threshold_wbf`` are ranked (descending) and the top
    ``cfg.num_recos_wbf`` item ids are stored in ``group.reco_list_wbf``.

    Args:
        groups: Iterable of group objects; defaults to ``self.groups``.
        aggregator: Callable applied to the members' rating rows.
            Assumed to return one aggregated rating per item, indexable by
            item index -- TODO confirm against ``Aggregators``.
    """
    # The signature defaults to None, so resolve it like af_runner does
    # instead of failing on iteration.
    if groups is None:
        groups = self.groups

    lamb = self.cfg.lambda_mf
    num_factors = self.cfg.num_factors
    rating_threshold = self.cfg.rating_threshold_wbf
    num_recos = self.cfg.num_recos_wbf

    # Loop-invariant pieces: the full item index set and the ridge penalty.
    all_movies = set(np.arange(len(self.ratings.T)))
    ridge_penalty = lamb * np.identity(num_factors + 1)

    # item -> (indices of users with a non-zero rating, std of those ratings).
    # These depend only on the item, so they are computed once and shared
    # across groups (assumes self.ratings is not modified while this runs).
    item_stats = {}

    for group in groups:
        watched_items = sorted(all_movies - set(group.candidate_items))

        # aggregate user ratings into virtual group
        group_rating = self.ratings[group.members, :]
        agg_rating = aggregator(group_rating)
        s_g = [agg_rating[j] - self.ratings_global_mean - self.item_biases[j]
               for j in watched_items]

        # matrix A : rows of [item_factors of items in watched_list + '1' column];
        # the factor part has cfg.num_factors columns.
        A = np.array([self.item_factors[item] for item in watched_items],
                     dtype=float).reshape(len(watched_items), num_factors)
        A = np.c_[A, np.ones((len(watched_items), 1))]

        wt = []
        for item in watched_items:
            if item not in item_stats:
                item_column = self.ratings[:, item]
                rated = np.argwhere(item_column != 0)  # users who have rated this item
                item_ratings = item_column[item_column != 0]
                # np.std of an empty selection is NaN and would poison the
                # solve; such an item has no raters, so its weight is 0 anyway.
                std_dev = np.std(item_ratings) if item_ratings.size else 0.0
                item_stats[item] = (rated, std_dev)
            rated, std_dev = item_stats[item]
            watched = np.intersect1d(rated, group.members)  # group members who rated this item
            wt.append(len(watched) / float(len(group.members)) * 1 / (1 + std_dev))
        wt = np.asarray(wt, dtype=float)

        # A.T * wt scales column j of A.T by wt[j], i.e. A.T @ diag(wt),
        # without materialising the dense n_items x n_items diagonal matrix.
        weighted_At = A.T * wt

        # Solve the normal equations directly instead of forming an explicit inverse.
        factor_n_bias = np.linalg.solve(np.dot(weighted_At, A) + ridge_penalty,
                                        np.dot(weighted_At, s_g))
        group.grp_factors_wbf = factor_n_bias[:-1]
        group.bias_wbf = factor_n_bias[-1]

        # Making recommendations on candidate list :
        group_candidate_ratings = {}
        for item in group.candidate_items:
            cur_rating = self.predict_group_rating(group, item, 'wbf')
            if cur_rating > rating_threshold:
                group_candidate_ratings[item] = cur_rating

        # sort and filter to keep top 'num_recos_wbf' recommendations
        ranked = sorted(group_candidate_ratings.items(), key=lambda x: x[1], reverse=True)

        group.reco_list_wbf = np.array([item for item, _ in ranked[:num_recos]])
GroupRec.wbf_runner = wbf_runner

In [16]:

###
# GroupRec Class - evaluation
# Evaluating af,bf,wbf methods
###

def evaluation(self):
    """Score the AF, BF and WBF recommendation lists of ``self.groups``.

    For each method, in the order AF, BF, WBF, every group first
    regenerates its reference recommendations from ``self.test_ratings``
    using that method's rating threshold, and is then passed to the
    matching ``evaluate_*`` function. Per-group precision and recall are
    averaged with ``np.nanmean`` (NaN entries are ignored) and printed.

    Returns:
        list: ``[[af_precision, af_recall], [bf_precision, bf_recall],
        [wbf_precision, wbf_recall]]`` holding the mean values.
    """
    # (label used in the printed report, rating threshold, per-group evaluator)
    methods = (
        ("AF", self.cfg.rating_threshold_af, evaluate_af),
        ("BF", self.cfg.rating_threshold_bf, evaluate_bf),
        ("WBF", self.cfg.rating_threshold_wbf, evaluate_wbf),
    )

    eval_results = []

    for label, rating_threshold, evaluate_group in methods:
        print(f"\n#########-------For {label}-------#########")

        precision_list = []
        recall_list = []
        for grp in self.groups:
            grp.generate_actual_recommendations(self.test_ratings, rating_threshold)
            # The evaluators also return tp/fp counts, which are not used here.
            precision, recall, _tp, _fp = evaluate_group(grp)
            precision_list.append(precision)
            recall_list.append(recall)

        mean_precision = np.nanmean(np.array(precision_list))
        mean_recall = np.nanmean(np.array(recall_list))
        print(f'\n{label} method: mean precision: ', mean_precision)
        print(f'{label} method: mean recall: ', mean_recall)

        eval_results.append([mean_precision, mean_recall])

    return eval_results

GroupRec.evaluation = evaluation