In [1]:
import os
import time
import argparse
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.backends.cudnn as cudnn

import model
import evaluate
import data_utils
import pandas as pd
from item_side_utils import *

import random
# Seed every random number generator this notebook touches with the same
# value so that a fresh "Restart & Run All" starts from identical RNG states.
random_seed = 1

# Python standard library and NumPy global generators.
random.seed(random_seed)
np.random.seed(random_seed)

# PyTorch generators (default generator, then the CUDA one).
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

ModuleNotFoundError: No module named 'model'

In [None]:
# Root directory of the datasets. The trailing slash matters: the loading
# cell builds file paths by plain string concatenation with `dataset`.
data_path = "/storage/wjwang/filter_bubbles/data/"

# Name of the dataset sub-directory to process (alternative kept for reference).
# dataset = 'ml_1m'
dataset = "amazon_book_only_first"

In [None]:
# All inputs of this dataset live under the same directory prefix.
dataset_dir = data_path + dataset

# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, so these
# .npy files must come from a trusted source.
user_target = np.load(dataset_dir + '/user_target.npy', allow_pickle=True).item()
category_list = np.load(dataset_dir + '/category_list.npy', allow_pickle=True).tolist()
item_category = np.load(dataset_dir + '/item_category.npy', allow_pickle=True).tolist()

# Locations of the train / validation / test splits.
train_path = dataset_dir + '/training_list.npy'
valid_path = dataset_dir + '/validation_dict.npy'
test_path = dataset_dir + '/testing_dict.npy'

train_list = np.load(train_path, allow_pickle=True).tolist()
valid_dict = data_utils.loadData(valid_path)
test_dict = data_utils.loadData(test_path)

# Group the (userID, itemID) training pairs by user: userID -> [itemID, ...],
# keeping the items in the order they appear in train_list.
train_dict = {}
for userID, itemID in train_list:
    train_dict.setdefault(userID, []).append(itemID)

In [None]:
def get_group_distribution(user_list, interaction_dict, item_feature, category_len, is_category_avg = True):
    """Return the average per-user category distribution of a group of users.

    For each user, the categories of every item in ``interaction_dict[user]``
    are accumulated into a vector of length ``category_len``; that vector is
    divided by the user's number of interactions, and the per-user vectors are
    then averaged over ``user_list``.

    Args:
        user_list: sized collection of user ids (``len()`` is taken of it).
        interaction_dict: maps a user id to the collection of item ids the
            user interacted with.
        item_feature: maps an item id to its collection of category indices
            (each index is used directly as a position in the result).
        category_len: number of categories, i.e. length of the result.
        is_category_avg: if true, an item spreads a total weight of 1 evenly
            over its categories (``1 / len(categories)`` each); otherwise every
            category of the item receives a full count of 1.

    Returns:
        A list of ``category_len`` numbers. An empty ``user_list`` yields all
        zeros, and a user without interactions contributes an all-zero vector
        (both cases previously raised ``ZeroDivisionError``).
    """
    # Nothing to average over: avoid dividing by len(user_list) == 0.
    if len(user_list) == 0:
        return [0.0] * category_len

    distribution_user = [0] * category_len
    for user in user_list:
        interactions = interaction_dict[user]
        n_interactions = len(interactions)
        if n_interactions == 0:
            # No interactions -> zero contribution (still counted in the mean).
            continue

        # Category totals for this user only.
        distribution = [0] * category_len
        for item in interactions:
            categories = item_feature[item]
            for cate in categories:
                distribution[cate] += (1/len(categories)) if is_category_avg else 1

        # Normalise by the user's interaction count before adding to the group sum.
        distribution_user = [
            total + count/n_interactions
            for total, count in zip(distribution_user, distribution)
        ]

    return [total/len(user_list) for total in distribution_user]

category_len = len(category_list)

# Per-user results, all keyed by userID.
#   user_target_weights*: the user's category distribution on the test split
#   user_mask_main*:      [dominant training category]
#   user_target_main*:    [dominant test category]
# The *_small variants hold the subset of drifting users that additionally
# pass the `cnt > 2` filter below.
user_target_weights = {}
user_target_weights_small = {}
user_mask_main = {}
user_distribution_dict = {}
user_distribution_list = []

user_mask_main_small = {}
user_target_main = {}
user_target_main_small = {}

# A user is considered to have a dominant training category when that
# category's value in the training distribution exceeds threshold_max.
threshold_max = 0.5
# NOTE(review): threshold_min is not used anywhere in the visible code.
threshold_min = 0.1

for userID in test_dict:
    # Category distribution of the user's training interactions; with
    # is_category_avg=False every category of an item counts fully.
    distribution_avg = get_group_distribution([userID], train_dict, item_category, category_len, False)
    # Category indices sorted by decreasing value; indices[0] is the dominant one.
    indices = np.argsort(-np.array(distribution_avg))

    # Same computation on the user's test interactions.
    test_distribution_avg = get_group_distribution([userID], test_dict, item_category, category_len, False)
    test_indices = np.argsort(-np.array(test_distribution_avg))

    if distribution_avg[indices[0]] > threshold_max and indices[0] != test_indices[0]: # users with preference drifts
        user_target_weights[userID] = test_distribution_avg
        user_mask_main[userID] = [indices[0]]
        user_target_main[userID] = [test_indices[0]]

        # Number of test items that do NOT belong to the dominant training category.
        cnt = sum(1 for item in test_dict[userID] if indices[0] not in item_category[item])
        if cnt > 2:
            user_target_weights_small[userID] = test_distribution_avg
            # Stored as a one-element list, consistent with user_mask_main and
            # user_target_main_small (it used to be a bare scalar here).
            user_mask_main_small[userID] = [indices[0]]
            user_target_main_small[userID] = [test_indices[0]]

    # The training distribution is recorded for every test user, drifting or not.
    user_distribution_dict[userID] = distribution_avg
    user_distribution_list.append(distribution_avg)

print(len(user_target_weights))
print(len(user_mask_main_small))


# np.save(data_path+dataset+'/user_mask_main.npy', user_mask_main)
# np.save(data_path+dataset+'/user_target_weights.npy', user_target_weights)
# np.save(data_path+dataset+'/user_target_main.npy', user_target_main)

# np.save(data_path+dataset+'/user_mask_main_small.npy', user_mask_main_small)
# np.save(data_path+dataset+'/user_target_weights_small.npy', user_target_weights_small)
# np.save(data_path+dataset+'/user_target_main_small.npy', user_target_main_small)

# The dict is stored as a pickled object array: reading it back requires
# np.load(..., allow_pickle=True).item().
np.save(data_path+dataset+'/user_distribution_dict.npy', user_distribution_dict)
np.save(data_path+dataset+'/user_distribution_list.npy', user_distribution_list)

In [None]:
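# Grouping should not only require that the target category contains many items; the main category must contain many items as well. Alternatively, drop the cnt > 2 restriction.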