In [1]:
from scipy.io import loadmat
import pandas as pd
import os
import shutil
import sys

import numpy as np
from scipy import sparse

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sn
sn.set()

import bottleneck as bn

In [2]:
epimat = loadmat('/home/worldchanger01/Downloads/Data/epinions/rating.mat')
epimat.keys()

dict_keys(['__header__', '__version__', '__globals__', 'rating'])

In [3]:
epimat['rating'].shape

(922267, 4)

In [4]:
epimat['rating'][0]

array([1, 1, 3, 2], dtype=int32)

In [5]:
data_epimat=epimat['rating']
data_epimat = pd.DataFrame({"UserId":data_epimat[:, 0],"ProductId":data_epimat[:, 1],"CategoryId":data_epimat[:, 2],"Rating":data_epimat[:, 3]})

In [6]:
# binarize the data (only keep ratings >= 4)
dataset_epimat = data_epimat[data_epimat['Rating'] > 3.5]

In [7]:
dataset_epimat.head()

Unnamed: 0,UserId,ProductId,CategoryId,Rating
3,1,4,3,5
5,1,6,11,4
7,1,8,3,4
8,1,9,3,4
11,2,12,3,4


In [8]:
dataset_epimat.describe()

Unnamed: 0,UserId,ProductId,CategoryId,Rating
count,680730.0,680730.0,680730.0,680730.0
mean,11081.471221,75995.601288,8.709963,4.572588
std,6359.002329,81872.287838,6.10637,0.494703
min,1.0,1.0,1.0,4.0
25%,5551.0,9490.0,4.0,4.0
50%,11157.0,41047.5,7.0,5.0
75%,16451.0,124321.0,13.0,5.0
max,22166.0,296277.0,27.0,5.0


In [9]:
dataset_epimat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 680730 entries, 3 to 922265
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   UserId      680730 non-null  int32
 1   ProductId   680730 non-null  int32
 2   CategoryId  680730 non-null  int32
 3   Rating      680730 non-null  int32
dtypes: int32(4)
memory usage: 15.6 MB


In [10]:
dataset_epimat.groupby("UserId").size()

UserId
1         4
2        20
3        14
4        10
5        11
         ..
22162    16
22163    12
22164     8
22165    14
22166    10
Length: 22157, dtype: int64

In [11]:
users_set = set(dataset_epimat.UserId)
num_users = len(users_set)

In [12]:
dataset_epimat.groupby("ProductId").size()

ProductId
1         154
2         160
3          11
4          32
5          34
         ... 
296272      1
296273      1
296274      1
296275      1
296277      1
Length: 242317, dtype: int64

In [13]:
items_set = set(dataset_epimat.ProductId)
num_items = len(items_set)

In [14]:
num_inter = dataset_epimat.shape[0]
num_inter

680730

In [15]:
def get_count(tp, id):
    inter_count_groupbyid = tp[[id]].groupby(id, as_index=False)
    count = inter_count_groupbyid.size()
    return count

In [16]:
def filter_regular_users(tp, min_uc=20, min_sc=0):
    # Only keep the triplets for items which were clicked on by at least min_sc users. 
    if min_sc > 0:
        itemcount = get_count(tp, 'ProductId')
        tp = tp[tp['ProductId'].isin(itemcount.index[itemcount["size"] >= min_sc])]
    
    # Only keep the triplets for users who clicked on at least min_uc items
    # After doing this, some of the items will have less than min_uc users, but should only be a small proportion
    if min_uc > 0:
        usercount = get_count(tp, 'UserId')
        tp = tp[tp['UserId'].isin(usercount.index[usercount["size"] >= min_uc])]
    
    # Update both usercount and itemcount after filtering
    usercount, itemcount = get_count(tp, 'UserId'), get_count(tp, 'ProductId') 
    return tp, usercount, itemcount

In [17]:
regular_users_data, user_activity, item_popularity = filter_regular_users(dataset_epimat)

In [18]:
num_regular_users = user_activity.shape[0]
num_regular_users

7838

In [19]:
num_coldstart_users = num_users - num_regular_users
num_coldstart_users

14319

In [20]:
interaction_density = (1. * dataset_epimat.shape[0] / (num_users * num_items))*100
print("The interaction density is: %.4f%%" %interaction_density)

The interaction density is: 0.0127%


### TRUST NETWORK

In [21]:
epitrust = loadmat('/home/worldchanger01/Downloads/Data/epinions/trustnetwork.mat')
epitrust.keys()

dict_keys(['__header__', '__version__', '__globals__', 'trustnetwork'])

In [22]:
epitrust["trustnetwork"].shape

(355754, 2)

In [23]:
dataset_epitrust = epitrust['trustnetwork']
dataset_epitrust = pd.DataFrame({"Truster":dataset_epitrust[:, 0],"Trustee":dataset_epitrust[:, 1]})

In [24]:
dataset_epitrust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 355754 entries, 0 to 355753
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Truster  355754 non-null  uint16
 1   Trustee  355754 non-null  uint16
dtypes: uint16(2)
memory usage: 1.4 MB


In [25]:
dataset_epitrust.head()

Unnamed: 0,Truster,Trustee
0,15373,9831
1,4247,9831
2,4644,9831
3,6823,9831
4,7479,9831


In [26]:
trust_relation_density = (1. * dataset_epitrust.shape[0] / (num_users * num_users))*100
print("The trust relation density is: %.4f%%" %trust_relation_density)

The trust relation density is: 0.0725%


### DATA PROCESSING FOR REGULAR USERS

In [27]:
unique_uid = user_activity.index

np.random.seed(98765)
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]

In [28]:
#create train/validation/test users
n_users = unique_uid.size
print(n_users)
n_heldout_users = 1000

tr_users = unique_uid[:(n_users - n_heldout_users * 2)]
vd_users = unique_uid[(n_users - n_heldout_users * 2): (n_users - n_heldout_users)]
te_users = unique_uid[(n_users - n_heldout_users):]

7838


In [29]:
train_inter = regular_users_data.loc[regular_users_data['UserId'].isin(tr_users)]

In [30]:
unique_sid = pd.unique(train_inter['ProductId'])

In [31]:
unique_sid.shape

(42299,)

In [32]:
show2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))

In [33]:
DATA_DIR="/home/worldchanger01/Downloads/Data/epinions/core/regularusers"

In [34]:
pro_dir = os.path.join(DATA_DIR, 'pro_sg')

if not os.path.exists(pro_dir):
    os.makedirs(pro_dir)

with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:
    for sid in unique_sid:
        f.write('%s\n' % sid)

In [35]:
with open(os.path.join(pro_dir, 'unique_uid.txt'), 'w') as f:
    for uid in unique_uid:
        f.write('%s\n' % uid)

In [36]:
def split_train_test_proportion(data, test_prop=0.2):
    data_grouped_by_user = data.groupby('UserId')
    tr_list, te_list = list(), list()

    np.random.seed(98765)

    for i, (_, group) in enumerate(data_grouped_by_user):
        n_items_u = len(group)

        if n_items_u >= 5:
            idx = np.zeros(n_items_u, dtype='bool')
            idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])
        else:
            tr_list.append(group)

        if i % 1000 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    data_tr = pd.concat(tr_list)
    data_te = pd.concat(te_list)
    
    return data_tr, data_te

In [37]:
vad_inter = regular_users_data.loc[regular_users_data['UserId'].isin(vd_users)]
vad_inter = vad_inter.loc[vad_inter['ProductId'].isin(unique_sid)]

In [38]:
vad_inter_tr, vad_inter_te = split_train_test_proportion(vad_inter)

0 users sampled


In [39]:
test_inter = regular_users_data.loc[regular_users_data['UserId'].isin(te_users)]
test_inter = test_inter.loc[test_inter['ProductId'].isin(unique_sid)]

In [40]:
test_inter_tr, test_inter_te = split_train_test_proportion(test_inter)

0 users sampled


In [41]:
def numerize(tp):
        uid = [profile2id[x] for x in tp['UserId']]
        sid = [show2id[x] for x in tp['ProductId']]
        return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

In [42]:
train_data = numerize(train_inter)
train_data.to_csv(os.path.join(pro_dir, 'train.csv'), index=False)

In [43]:
vad_data_tr = numerize(vad_inter_tr)
vad_data_tr.to_csv(os.path.join(pro_dir, 'validation_tr.csv'), index=False)

In [44]:
vad_data_te = numerize(vad_inter_te)
vad_data_te.to_csv(os.path.join(pro_dir, 'validation_te.csv'), index=False)

In [45]:
test_data_tr = numerize(test_inter_tr)
test_data_tr.to_csv(os.path.join(pro_dir, 'test_tr.csv'), index=False)

In [46]:
test_data_te = numerize(test_inter_te)
test_data_te.to_csv(os.path.join(pro_dir, 'test_te.csv'), index=False)

### DATA PROCESSING FOR COLD-START USERS

In [47]:
rg_users = list(regular_users_data['UserId'])

In [48]:
coldst_df = dataset_epimat.loc[~dataset_epimat['UserId'].isin(rg_users)]

In [49]:
coldst_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 444307 entries, 11 to 922265
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   UserId      444307 non-null  int32
 1   ProductId   444307 non-null  int32
 2   CategoryId  444307 non-null  int32
 3   Rating      444307 non-null  int32
dtypes: int32(4)
memory usage: 10.2 MB


In [50]:
dataset_epimat.shape

(680730, 4)

In [51]:
regular_users_data.shape

(236423, 4)

In [52]:
user_activity.shape

(7838, 2)

In [53]:
user_activity_cs = get_count(coldst_df,"UserId")
user_activity_cs.shape

(14319, 2)

In [54]:
item_popularity.shape

(113739, 2)

In [55]:
len(set(coldst_df.UserId))

14319

In [56]:
item_activity_cs = get_count(coldst_df,"UserId")
item_activity_cs.shape

(14319, 2)

In [57]:
coldst_df.head()

Unnamed: 0,UserId,ProductId,CategoryId,Rating
11,2,12,3,4
13,2,14,3,5
14,2,15,5,5
15,2,16,5,4
16,2,1,3,4


In [58]:
unique_uid_cs = user_activity_cs.index

np.random.seed(98765)
idx_perm_cs = np.random.permutation(unique_uid_cs.size)
unique_uid_cs = unique_uid_cs[idx_perm_cs]

In [59]:
#create train/validation/test users
n_users_cs = unique_uid.size
print(n_users_cs)
n_heldout_users_cs = 1000

tr_cs_users = unique_uid[:(n_users_cs - n_heldout_users_cs * 2)]
vd_cs_users = unique_uid[(n_users_cs - n_heldout_users_cs * 2): (n_users_cs - n_heldout_users_cs)]
te_cs_users = unique_uid[(n_users_cs - n_heldout_users_cs):]

7838


In [60]:
train_inter_cs = coldst_df.loc[coldst_df['UserId'].isin(tr_cs_users)]

In [61]:
unique_sid_cs = pd.unique(train_inter_cs['ProductId'])

In [62]:
unique_sid_cs.shape

(71401,)

In [63]:
show2id_cs = dict((sid, i) for (i, sid) in enumerate(unique_sid_cs))
profile2id_cs = dict((pid, i) for (i, pid) in enumerate(unique_uid_cs))

In [64]:
DATA_DIR="/home/worldchanger01/Downloads/Data/epinions/core/coldstart"

In [65]:
pro_dir = os.path.join(DATA_DIR, 'pro_sg')

if not os.path.exists(pro_dir):
    os.makedirs(pro_dir)

with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:
    for sid in unique_sid_cs:
        f.write('%s\n' % sid)

In [66]:
with open(os.path.join(pro_dir, 'unique_uid.txt'), 'w') as f:
    for uid in unique_uid_cs:
        f.write('%s\n' % uid)

In [67]:
def split_train_test_proportion(data, test_prop=0.2):
    data_grouped_by_user = data.groupby('UserId')
    tr_list, te_list = list(), list()

    np.random.seed(98765)

    for i, (_, group) in enumerate(data_grouped_by_user):
        n_items_u = len(group)

        if n_items_u >= 5:
            idx = np.zeros(n_items_u, dtype='bool')
            idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])
        else:
            tr_list.append(group)

        if i % 1000 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    data_tr = pd.concat(tr_list)
    data_te = pd.concat(te_list)
    
    return data_tr, data_te

In [68]:
vad_inter_cs = coldst_df.loc[coldst_df['UserId'].isin(vd_cs_users)]
vad_inter_cs = vad_inter_cs.loc[vad_inter_cs['ProductId'].isin(unique_sid_cs)]

In [69]:
vad_inter_tr_cs, vad_inter_te_cs = split_train_test_proportion(vad_inter_cs)

0 users sampled


In [70]:
test_inter_cs = coldst_df.loc[coldst_df['UserId'].isin(te_cs_users)]
test_inter_cs = test_inter_cs.loc[test_inter_cs['ProductId'].isin(unique_sid_cs)]

In [71]:
test_inter_tr_cs, test_inter_te_cs = split_train_test_proportion(test_inter_cs)

0 users sampled


In [72]:
def numerize(tp):
        uid = [profile2id_cs[x] for x in tp['UserId']]
        sid = [show2id_cs[x] for x in tp['ProductId']]
        return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

In [73]:
train_data = numerize(train_inter_cs)
train_data.to_csv(os.path.join(pro_dir, 'train.csv'), index=False)

In [74]:
vad_data_tr = numerize(vad_inter_tr_cs)
vad_data_tr.to_csv(os.path.join(pro_dir, 'validation_tr.csv'), index=False)

In [75]:
vad_data_te = numerize(vad_inter_te_cs)
vad_data_te.to_csv(os.path.join(pro_dir, 'validation_te.csv'), index=False)

In [76]:
test_data_tr = numerize(test_inter_tr_cs)
test_data_tr.to_csv(os.path.join(pro_dir, 'test_tr.csv'), index=False)

In [77]:
test_data_te = numerize(test_inter_te_cs)
test_data_te.to_csv(os.path.join(pro_dir, 'test_te.csv'), index=False)