In [5]:
import numpy as np
import pickle
import pandas as pd
from tqdm import tqdm
from math import log
from scipy.sparse import coo_matrix
from sklearn.neighbors import LocalOutlierFactor
import torch

In [2]:
# Names of the datasets to evaluate. Each name selects a sub-directory of the
# data and score folders and is the key used to look up the dataset's
# user/item counts in the statistics pickle.
datasets = [
    'BookCrossing',
    'Epinions',
    'LFM360K',
    'ML1M',
    'ML20M',
    'Yelp',
]

In [None]:
# Compute four per-user "mainstreaminess" scores for every dataset and save
# each one as a .npy file under ../mainstream_score/<dataset>/.

# The statistics file is the same for every dataset, so it is read once here
# instead of once per loop iteration.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only point this at a statistics file that comes from a trusted source.
with open('../../mod_data/data_statistic.pkl', 'rb') as f:
    info = pickle.load(f)

for dataset in datasets:
    print(dataset, "- start evaluating")
    # The first two entries of the per-dataset record are used as the
    # number of users and the number of items (matrix dimensions).
    num_user = info[dataset][0]
    num_item = info[dataset][1]

    train_df = pd.read_csv(f'../../mod_data/{dataset}/sep_data/train_df.csv')

    pos_user_array = train_df['userId'].values
    pos_item_array = train_df['itemId'].values

    # Dense user x item interaction matrix with a 1 per training row
    # (coo_matrix sums repeated (user, item) pairs). It is built once and
    # shared by all four approaches below; none of them modifies it.
    train_mat = coo_matrix(
        (np.ones(len(pos_user_array), dtype=int), (pos_user_array, pos_item_array)),
        shape=(num_user, num_item),
    ).toarray()
    user_pop = np.sum(train_mat, axis=1)  # interactions per user
    item_pop = np.sum(train_mat, axis=0)  # interactions per item

    # Evaluate mainstreaminess by similarity-based approach:
    # mean Jaccard-style similarity between a user and every *other* user.
    Jaccard_mat = np.matmul(train_mat, train_mat.T)  # row-by-row dot products (overlap)
    # |u| + |v| - overlap; the 1e-7 keeps the division defined when both
    # users have no interactions.
    deno = user_pop.reshape((-1, 1)) + user_pop.reshape((1, -1)) - Jaccard_mat + 1e-7
    deno = np.array(deno, dtype='float32')
    Jaccard_mat = np.array(Jaccard_mat, dtype='float32')
    Jaccard_mat /= deno
    # Exclude self-similarity: zero the diagonal in place and average over the
    # remaining num_user - 1 entries of each row. Accumulating in float64
    # keeps the saved scores in double precision.
    np.fill_diagonal(Jaccard_mat, 0)
    MS_similarity = Jaccard_mat.sum(axis=1, dtype=np.float64) / (num_user - 1)
    np.save(f'../mainstream_score/{dataset}/MS_similarity.npy', MS_similarity)
    print("complete the similarity-based approach")

    # Evaluate mainstreaminess by distribution-based approach:
    # cosine similarity between each user's row and the average user row.
    avg_user = np.mean(train_mat, axis=0)
    MS_distribution = np.matmul(train_mat, avg_user.reshape((-1, 1))).reshape(-1)
    deno1 = np.sum(train_mat ** 2, axis=1) ** 0.5  # L2 norm of each user row
    deno2 = np.sum(avg_user ** 2) ** 0.5           # L2 norm of the average row
    # Users with no interactions have deno1 == 0 and therefore get NaN.
    MS_distribution = MS_distribution / deno1 / deno2
    np.save(f'../mainstream_score/{dataset}/MS_distribution.npy', MS_distribution)
    print("complete the distribution-based approach")

    # Evaluate mainstreaminess by density-based approach:
    # Local Outlier Factor of each user row. negative_outlier_factor_ is
    # negated so the saved value is the (positive) LOF.
    clf = LocalOutlierFactor(n_neighbors=300, n_jobs=-1)
    clf.fit(train_mat)
    MS_density = -clf.negative_outlier_factor_
    np.save(f'../mainstream_score/{dataset}/MS_density.npy', MS_density)
    print("complete the density-based approach")

    # Evaluate mainstreaminess by our approach:
    # for each user, sum log2(item popularity + 1) over the items the user
    # interacted with, divided by the user's interaction count.
    item_logpop = np.log2(item_pop + 1)
    # `train_mat != 0` selects each interacted item once, so the product is
    # the per-user sum of log-popularities. Users with no interactions give
    # 0 / 0 = NaN.
    MS_ours = np.matmul(train_mat != 0, item_logpop) / user_pop
    np.save(f'../mainstream_score/{dataset}/MS_ours.npy', MS_ours)
    print("complete our approach")
    print("----------------------------------------------------")