<a href="https://colab.research.google.com/github/SomdeepAcharyya/Recommender-Systems/blob/main/Social_Trust_Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Social Trust Ensemble 
# Learning to Recommend with Social Trust Ensemble
# Hao Ma, Irwin King and Michael R. Lyu

In [None]:
import numpy as np
import scipy.sparse as sp
import time
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error 
import scipy.spatial as spt
import statistics
import math

In [None]:
def RSTE(R,S,N,M,K,lambdaU,lambdaV,lambdaT,R_test,ul,il,steps=15,rate=1e-4,max_r=5.0):
    """Fit a Social Trust Ensemble factorisation by gradient descent and print test error.

    The predicted (normalised) rating of user i for item j is
    ``sigmoid(lambdaT * U_i.V_j + (1 - lambdaT) * sum_k S[i, k] * U_k.V_j)``,
    i.e. the user's own taste blended with the tastes of the users they trust.

    Parameters
    ----------
    R : scipy.sparse matrix, shape (N, M)
        Training ratings. The model output is a sigmoid, so values are expected
        to be scaled into (0, 1] (rating / max_r).
    S : scipy.sparse matrix, shape (N, N)
        Trust matrix; S[i, k] is the weight user i gives to user k.
    N, M, K : int
        Number of users, number of items, latent dimension.
    lambdaU, lambdaV : float
        L2 regularisation weights for the user and item factors.
    lambdaT : float
        Blend weight of the user's own taste (1 - lambdaT goes to trusted users).
    R_test : scipy.sparse matrix, shape (N, M)
        Ratings to evaluate on, on the same scale as R. May be empty.
    ul, il : array-like of int
        Row and column indices of the observed training ratings.
    steps : int, optional
        Maximum number of gradient steps (default 15, the previous hard-coded value).
    rate : float, optional
        Initial step size (default 1e-4, the previous hard-coded value).
    max_r : float, optional
        Rating scale used to report RMSE and MAE in original rating units.

    Returns
    -------
    int
        Always 0; results are reported through ``print``.
    """
    ul = np.asarray(ul)
    il = np.asarray(il)

    def sigmoid(z):
        return 1.0 / (1 + np.exp(-z))

    def blend(U):
        # (lambdaT*I + (1-lambdaT)*S) U. Multiplying this N x K matrix with V
        # gives the same scores as lambdaT*U V^T + (1-lambdaT)*S U V^T without
        # ever building the dense N x M score matrix.
        return lambdaT * U + (1 - lambdaT) * S.dot(U)

    def predict_at(U, V, rows, cols):
        # Ensemble prediction for the requested (row, col) pairs only.
        return sigmoid((blend(U)[rows] * V[cols]).sum(axis=1))

    def to_sparse(values, rows, cols):
        return sp.csr_matrix((values, (rows, cols)), shape=(N, M))

    def evaluate(U, V, R_eval):
        # RMSE and MAE over the stored entries of R_eval, both in rating units.
        held_out = R_eval.tocsr().tocoo()
        if held_out.nnz == 0:
            # Nothing to score: report NaN instead of failing on an empty index array.
            return float('nan'), float('nan')
        diff = predict_at(U, V, held_out.row, held_out.col) - held_out.data
        return max_r * np.sqrt(np.mean(diff ** 2)), max_r * np.mean(np.abs(diff))

    def costL(U, V):
        Rx = to_sparse(predict_at(U, V, ul, il), ul, il)
        cost = 0.5*((R - Rx).power(2)).sum()+0.5*lambdaU*np.linalg.norm(U)**2+0.5*lambdaV*np.linalg.norm(V)**2
        return cost

    def clip(grad):
        # Rescale by the largest magnitude; looking only at the positive max
        # would let large negative entries through unscaled.
        peak = np.max(np.abs(grad))
        return grad / peak if peak > 1 else grad

    def gradient(U, V):
        blended = blend(U)
        pred = sigmoid((blended[ul] * V[il]).sum(axis=1))
        Rx = to_sparse(pred, ul, il)
        # sigmoid'(z) = s * (1 - s); avoids the inf/inf of exp(-z) / (1 + exp(-z))**2.
        Rv = to_sparse(pred * (1 - pred), ul, il)
        weighted_err = Rv.multiply(Rx - R).tocsr()
        matx = weighted_err.dot(V)
        dU = lambdaU*U + lambdaT*matx + (1-lambdaT)*(S.T).dot(matx)
        dV = lambdaV*V + weighted_err.T.dot(blended)
        return clip(dU), clip(dV)

    def train(U, V):
        res = []
        step_rate = rate
        pregradU = 0
        pregradV = 0
        tol = 1e-3
        momentum = 0.9
        stage = max(steps // 100, 1)
        for step in range(steps):
            start = time.time()
            dU, dV = gradient(U, V)
            dU = dU + momentum*pregradU
            dV = dV + momentum*pregradV
            pregradU = dU
            pregradV = dV
            if not step % stage and step_rate > 0.0001:
                step_rate = 0.95*step_rate
            U -= step_rate * dU
            V -= step_rate * dV
            # Regularised objective per matrix cell. This is not an MAE or RMSE,
            # so it is logged as "cost".
            e = costL(U, V) / (len(U) * len(V))
            res.append(e)
            if not step % (stage*5):
                print(step, "cost", e, time.time()-start)
            # Compare the last three costs with the three from ten steps earlier,
            # but only once both windows are fully populated.
            if len(res) >= 13 and abs(sum(res[-3:])-sum(res[-13:-10])) < tol:
                print("====================")
                print("stop in %d step"%(step))
                print("error is ",e)
                print("====================")
                break
        return U, V

    U = np.random.normal(0,0.01,size=(N,K))
    V = np.random.normal(0,0.01,size=(M,K))
    U,V = train(U,V)
    test_rmse, test_mae = evaluate(U, V, R_test)
    print("=================RESULT=======================")
    print('K:%d,lambdaU:%s, lambdaV:%s,lambdaT:%s' % (K,lambdaU,lambdaV,lambdaT))
    print("rmse", test_rmse)
    print("mae", test_mae)
    return 0

# example

In [None]:
def t_yelp(limitu,limiti):
    """Load the files under ./yelp_data, keep ids below the given limits, and run RSTE.

    Parameters
    ----------
    limitu : int
        Only users with id < limitu are kept; also the number of rows of R and T.
    limiti : int
        Only items with id < limiti are kept; also the number of columns of R.

    Reads ``users.txt`` (trust lists), ``ratings-train.txt`` and
    ``ratings-test.txt``. Returns None; RSTE prints the result.
    """
    # Original note: data from http://www.trustlet.org/wiki/Epinions_datasets
    # NOTE(review): the paths below point at ./yelp_data — confirm the actual data source.
    max_r = 5.0

    def read_ratings(path, N, M):
        # Parse "user::item::rating::..." lines into an N x M matrix scaled by max_r.
        users, items, ratings = [], [], []
        with open(path, 'r') as handle:
            for line in handle:
                u, i, r = [int(x) for x in line.split('::')[:3]]
                if u < limitu and i < limiti:
                    users.append(u)
                    items.append(i)
                    ratings.append(r / max_r)
        # Explicit integer dtype so an empty selection still yields valid index arrays.
        users = np.array(users, dtype=int)
        items = np.array(items, dtype=int)
        return sp.csr_matrix((ratings, (users, items)), shape=(N, M)), users, items

    def getdata():
        N,M = limitu,limiti
        T = sp.dok_matrix((N,N))
        print('get T')
        with open('./yelp_data/users.txt','r') as handle:
            for line in handle:
                fields = line.split(':')
                u = int(fields[0])
                # Strip the first and last character of the list field, then split on commas.
                friends = fields[1][1:-1].split(',')
                if len(friends) > 1:
                    # The final element is skipped, exactly as before.
                    for x in friends[:-1]:
                        v = int(x)
                        if u < limitu and v < limitu:
                            T[u,v] = 1.0
        T = T.tocsr()
        print('get R_test')
        R_test, _, _ = read_ratings('./yelp_data/ratings-test.txt', N, M)
        print('get R')
        R, ul, il = read_ratings('./yelp_data/ratings-train.txt', N, M)
        return R,T,N,M,R_test,ul,il

    R,T,N,M,R_test,ul,il = getdata()
    lambdaU,lambdaV,lambdaT,K = 0.01, 0.01, 0.01, 5
    RSTE(R,T,N,M,K,lambdaU,lambdaV,lambdaT,R_test,ul,il)

In [None]:
def t_toy_correct():
    """Run RSTE on a hand-written 5-user x 4-item example, testing on the training data."""
    max_r = 5.0                                  # highest possible rating
    raw_ratings = np.array([
        [5, 3, 0, 1],
        [4, 0, 0, 1],
        [1, 1, 0, 5],
        [1, 0, 0, 4],
        [0, 1, 5, 4],
    ])                                           # 0 means "not rated"
    trusted_by_user = [[3, 2], [1, 3, 4], [2], [1, 5], [3]]   # 1-based ids of trusted users
    N, M, K = 5, 4, 4                            # users, items, latent dimension
    lambdaU, lambdaV, lambdaT = 0.02, 0.02, 0.1

    # (row, col) of every observed rating, in row-major order.
    keys = np.argwhere(raw_ratings > 0)

    R = sp.dok_matrix((N, M))                    # normalised user x item ratings
    for user, item in keys:
        R[user, item] = 1.0 * raw_ratings[user][item] / max_r

    T = sp.dok_matrix((N, N))                    # user x user trust indicator
    for user, friends in enumerate(trusted_by_user):
        for friend in friends:
            T[user, friend - 1] = 1.0            # convert the 1-based id to a 0-based column

    R = R.tocsr()
    T = T.tocsr()
    RSTE(R, T, N, M, K, lambdaU, lambdaV, lambdaT, R, keys[:, 0], keys[:, 1])

In [None]:
# Run the toy example.
if __name__ == "__main__":
#   t_epinion()   # NOTE(review): no t_epinion function is defined in this notebook
   t_toy_correct()

0 0.7850324398340742 0.008330821990966797
50 0.7850309156871187 0.0016372203826904297
100 0.785029102448922 0.002238750457763672
stop in 101 step
error is  0.7850290661750954
K:4,lambdaU:0.02, lambdaV:0.02,lambdaT:0.1
rmse 1.7375782356560296
mae 0.3307669730422547


# target

In [None]:
# Load pers_fashion_filtered.csv and keep its columns '0'..'4' as a NumPy array.
# NOTE(review): the earlier comment said "magazines", but the path names a fashion file.
# NOTE(review): presumably these five columns are the personality scores that are
# later named open/cons/extra/agree/neuro — confirm.
path = r'/content/drive/MyDrive/Per_CD_RS/pers_fashion_filtered.csv'

# errors='ignore' silently drops bytes that are not valid UTF-8.
with open(path, encoding="utf-8", errors='ignore') as infile:
  df = pd.read_csv(infile)
arr = np.array(df[['0', '1', '2', '3', '4']])

In [None]:
# Load Amazon_fashion_filtered.csv and rename its id/rating columns to the
# userId / itemId / rating names used in the rest of the notebook.
# NOTE(review): the earlier comment said "magazines", but the path names a fashion file.
path = r'/content/drive/MyDrive/Per_CD_RS/Amazon_fashion_filtered.csv'
# errors='ignore' silently drops bytes that are not valid UTF-8.
with open(path, encoding="utf-8", errors='ignore') as infile:
  az = pd.read_csv(infile)
az = az.rename(columns={"reviewerID":"userId", "asin":"itemId", "overall":"rating"})

In [None]:
# Tripadvisor review dataset: load it and rename the user/item columns.
path = r'/content/drive/MyDrive/Per_CD_RS/tripadvisor_reviews_with_country.csv'
# errors='ignore' silently drops bytes that are not valid UTF-8.
with open(path, encoding="utf-8", errors='ignore') as infile:
  tr = pd.read_csv(infile)
tr = tr.rename(columns={"username":"userId", "taObject":"itemId"})
# NOTE(review): this rebinds `arr`, replacing the five-column array loaded from
# pers_fashion_filtered.csv with an eight-column Tripadvisor frame. The recorded
# output of the next cell shows arr with 5 columns, so this line was presumably
# not in effect for the recorded run — confirm which dataset is intended.
arr = tr[['open', 'cons', 'extra', 'agree', 'neuro', 'userId', 'itemId', 'rating']]

In [None]:
# Show both shapes, cut az down to the first len(arr) rows, then show them again.
print(arr.shape, az.shape)
# NOTE(review): truncating by position assumes row i of arr describes the same
# review as row i of az even though the two files have different lengths — confirm.
az = az[0:len(arr)]
print(arr.shape, az.shape)

(7841, 5) (7891, 6)
(7841, 5) (7841, 6)


In [None]:
# Wrap arr in a DataFrame so that columns can be added to it in the following cells.
arr = pd.DataFrame(arr)

In [None]:
# Copy the user id, item id and rating columns from az onto arr.
# pandas aligns these assignments on the row index, so row i of arr is paired with
# the az row labelled i. NOTE(review): assumes both frames list the same reviews in
# the same order — confirm.
arr['userId'] = az['userId']
arr['itemId'] = az['itemId']
arr['rating'] = az['rating']

In [None]:
# Give all eight columns their final names.
# NOTE(review): assumes the first five columns are the personality scores in exactly
# this order — confirm against the source file.
arr.columns = ['open', 'cons', 'extra', 'agree', 'neuro', 'userId', 'itemId', 'rating']

In [None]:
# Module-level rating matrix ru (one row per userId, one column per itemId) and its
# 0/1 "has a rating" mask ru_m, both converted to NumPy arrays at the end.
# tgt_pred_log and the zero-argument tgt_pred read these two globals.
# Duplicate (userId, itemId) pairs are combined by pivot_table's default aggregation.
ru = arr.pivot_table(index='userId',columns='itemId',values='rating')
ru = ru.fillna(0)            # a missing rating becomes 0
ru_m = ru > 0                # True where a rating exists
ru_m = ru_m.replace(True, 1)
ru_m = ru_m.replace(False, 0)
ru = np.array(ru)            # the userId / itemId labels are dropped here
ru_m = np.array(ru_m)

In [None]:
# with only trust and rating test split

def tgt_pred(lr=0.01, lambda_p=0.01, lambda_t=0.1, t=0.8, k=10, seed=None):
    """Train RSTE on an 80/20 split of the module-level ``arr`` frame and score the held-out part.

    Trust between two users is 1 when the cosine similarity of their mean
    personality vectors (open, cons, extra, agree, neuro) exceeds ``t``, else 0.

    Parameters
    ----------
    lr : float
        Printed with the other settings; this function does not forward it to RSTE.
    lambda_p : float
        Regularisation weight used for both the user and the item factors.
    lambda_t : float
        RSTE blend weight between a user's own taste and that of trusted users.
    t : float
        Cosine-similarity threshold above which one user trusts another.
    k : int
        Latent dimension.
    seed : int or None, optional
        Random state for the train/test split; None keeps the split random.

    Returns None; RSTE prints the result.
    """
    feature_cols = ['open', 'cons', 'extra', 'agree', 'neuro']
    train_size = 0.8
    max_r = 5.0    # highest rating; ratings are scaled to (0, 1] because RSTE predicts a sigmoid

    # Split BEFORE touching the index. Dropping by the sampled labels is what makes
    # the two parts disjoint; resetting the index first would drop the first 80% of
    # rows instead of the sampled ones.
    train_set = arr.sample(frac=train_size, random_state=seed)
    test_set = arr.drop(train_set.index)

    # User x item training matrix. Repeated (user, item) ratings are averaged so
    # that no cell can exceed max_r.
    ru = train_set.groupby(['userId', 'itemId'])['rating'].mean().unstack().fillna(0)
    user_ids, item_ids = ru.index, ru.columns
    R0 = ru.to_numpy(dtype=float)
    N, M = R0.shape
    rows, cols = np.nonzero(R0 > 0)              # observed ratings, row-major order
    R = sp.csr_matrix((R0[rows, cols] / max_r, (rows, cols)), shape=(N, M))

    # Mean personality vector per training user, in the same row order as R.
    user_features = (
        train_set[feature_cols].fillna(0)
        .groupby(train_set['userId']).mean()
        .reindex(user_ids).fillna(0)
        .to_numpy(dtype=float)
    )

    # Binary trust: cosine similarity above t. All-zero profiles give NaN, which
    # nan_to_num maps to 0 (no trust). A user's similarity to themselves is 1, so
    # for t < 1 the diagonal is set for every non-zero profile.
    trust_uv = np.nan_to_num(1 - spt.distance.cdist(user_features, user_features, 'cosine'))
    T = sp.csr_matrix((trust_uv > t).astype(float))
    print(T.shape)

    # Held-out ratings, mapped onto the training row/column numbering. Pairs whose
    # user or item never appears in the training split cannot be scored and are dropped.
    test_pairs = test_set.groupby(['userId', 'itemId'])['rating'].mean().reset_index()
    test_rows = user_ids.get_indexer(test_pairs['userId'])
    test_cols = item_ids.get_indexer(test_pairs['itemId'])
    test_vals = test_pairs['rating'].to_numpy(dtype=float)
    usable = (test_rows >= 0) & (test_cols >= 0) & (test_vals > 0)
    if usable.any():
        R_test = sp.csr_matrix(
            (test_vals[usable] / max_r, (test_rows[usable], test_cols[usable])),
            shape=(N, M),
        )
    else:
        print("no held-out rating shares both its user and its item with the training split; "
              "evaluating on the training ratings instead")
        R_test = R

    print("entring rste")
    print(lr, lambda_p, lambda_t, t, k)
    RSTE(R, T, N, M, k, lambda_p, lambda_p, lambda_t, R_test, rows, cols)

In [None]:
if __name__ == "__main__":
    # Pass the hyperparameters explicitly, in the order (lr, lambda_p, lambda_t, t, k),
    # so this cell does not rely on a zero-argument tgt_pred being defined by a
    # later cell and works on a top-to-bottom run.
    tgt_pred(0.01, 0.01, 0.1, 0.8, 10)

In [None]:
# with tf idf of rating

def tgt_pred_log():
    """Run RSTE on the full module-level rating matrix with a blended trust signal.

    Trust is the average of (a) a 0/1 indicator that two users' mean personality
    vectors have cosine similarity above 0.7 and (b) the cosine similarity of their
    rated-item masks (``ru_m``). Pairs whose blended score reaches 70% of the largest
    blended score are kept. There is no train/test split: RSTE is scored on the
    ratings it was trained on. Returns None; RSTE prints the result.

    NOTE(review): ``tgt_pers`` and ``tgt`` are not defined anywhere in this notebook,
    so this function raises NameError on a fresh kernel. Presumably they are the
    personality array and the frame holding ``userId`` for the same users as ``ru`` —
    confirm and define them before calling.
    """
    feature_cols = ['open', 'cons', 'extra', 'agree', 'neuro']
    max_r = 5.0    # highest rating; ratings are scaled to (0, 1] because RSTE predicts a sigmoid
    R0 = np.asarray(ru, dtype=float)             # user x item ratings, 0 = not rated
    N, M = R0.shape
    K = 10                                       # latent dimension
    lambdaU, lambdaV, lambdaT = 0.02, 0.02, 0.1

    # Observed ratings in row-major order, normalised by max_r.
    rows, cols = np.nonzero(R0 > 0)
    R = sp.csr_matrix((R0[rows, cols] / max_r, (rows, cols)), shape=(N, M))

    # Mean personality vector per user.
    df = pd.DataFrame(tgt_pers)
    df.columns = feature_cols
    df['userId'] = tgt.userId
    per_user = df.groupby(by='userId').mean().reset_index()
    profiles = np.array(per_user[feature_cols])

    # 0/1 personality trust; NaN similarities (all-zero profiles) count as no trust.
    trust_uv = np.nan_to_num(1 - spt.distance.cdist(profiles, profiles, 'cosine'))
    trust_uv = (trust_uv > 0.7).astype(float)

    # Cosine similarity of the "has rated" masks.
    sim_uv = np.nan_to_num(1 - spt.distance.cdist(ru_m, ru_m, 'cosine'))
    t_uv = (trust_uv + sim_uv) / 2
    print(t_uv.shape)

    # Keep pairs whose blended score is at least 70% of the maximum.
    T = sp.csr_matrix((t_uv >= 0.7 * t_uv.max()).astype(float))
    print("entering RSTE")
    RSTE(R, T, N, M, K, lambdaU, lambdaV, lambdaT, R, rows, cols)

In [None]:
# only trust factor

def tgt_pred():
    """Run RSTE on the full module-level rating matrix using personality trust only.

    Two users trust each other when the cosine similarity of their mean personality
    vectors exceeds 0.7. There is no train/test split: RSTE is scored on the ratings
    it was trained on. Returns None; RSTE prints the result.

    NOTE(review): this zero-argument definition reuses the name of the
    five-parameter ``tgt_pred`` defined earlier in the notebook and replaces it once
    this cell runs, so later calls that pass hyperparameters fail with TypeError.
    One of the two should be renamed.

    NOTE(review): ``tgt_pers`` and ``src`` are not defined anywhere in this notebook,
    so this function raises NameError on a fresh kernel. The sibling ``tgt_pred_log``
    reads ``tgt.userId`` where this one reads ``src.userId`` — confirm which is meant.
    """
    feature_cols = ['open', 'cons', 'extra', 'agree', 'neuro']
    max_r = 5.0    # highest rating; ratings are scaled to (0, 1] because RSTE predicts a sigmoid
    R0 = np.asarray(ru, dtype=float)             # user x item ratings, 0 = not rated
    N, M = R0.shape
    K = 10                                       # latent dimension
    lambdaU, lambdaV, lambdaT = 0.02, 0.02, 0.01

    # Observed ratings in row-major order, normalised by max_r.
    rows, cols = np.nonzero(R0 > 0)
    R = sp.csr_matrix((R0[rows, cols] / max_r, (rows, cols)), shape=(N, M))

    # Mean personality vector per user.
    df = pd.DataFrame(tgt_pers)
    df.columns = feature_cols
    df['userId'] = src.userId
    per_user = df.groupby(by='userId').mean().reset_index()
    profiles = np.array(per_user[feature_cols])

    # 0/1 personality trust; NaN similarities (all-zero profiles) count as no trust.
    # The matrix is already binary, so it is used directly. A second
    # ">= 0.7 * max" pass would change nothing unless no pair passed at all, in
    # which case it would wrongly mark every pair as trusted.
    trust_uv = np.nan_to_num(1 - spt.distance.cdist(profiles, profiles, 'cosine'))
    trust_uv = (trust_uv > 0.7).astype(float)
    print(trust_uv.shape)

    T = sp.csr_matrix(trust_uv)
    print("entering RSTE")
    RSTE(R, T, N, M, K, lambdaU, lambdaV, lambdaT, R, rows, cols)

In [None]:
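# Recorded results. The two numbers are presumably RMSE then MAE, the order in
# which RSTE prints them — confirm.
# without log     3.4493965705680547     0.10282609135755258
# log             3.4503007416729585     0.11241594781533616
# yakchi pacis    3.4490502620041434     0.08701762665204069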

In [None]:
# @params: 
# trust threshold (0.7)
# trust factor lambda (0.5)
# lambdaU,lambdaV,lambdaT (0.02, 0.02, 0.1)
# no of iterations (100)
# no of latent features K (10)
# train test split

# metrics
# MAE
# RMSE

# benchmark
# yakchi pacis
# tobias umap
# p2mf cdrup

# variations
# numerical personality values (0,1)
# binary personality values [0 and 1]
# with demography dbscan clustering
  # ensemble through average voting
  # ensemble through plurality voting 
# without clustering

In [None]:
# tuning

In [None]:
# Candidate values for each hyperparameter, one single-element list apiece.
# NOTE(review): nothing in the visible notebook iterates over these lists — confirm
# whether a grid-search loop is missing.
lr = [0.01]
K = [10]
lambda_p = [0.01]
lambda_t = [0.1]
thres = [0.8]

In [None]:
# Settings passed to tgt_pred in the next cell, in the order
# (lr, lambda_p, lambda_t, t, k).
l, lp, lt, th, k = 0.01, 0.01, 0.1, 0.8, 10

In [None]:
# Run the split-based experiment with the settings from the previous cell.
# NOTE(review): tgt_pred is defined twice in this notebook (a five-parameter version
# and a later zero-parameter version). This call needs the five-parameter one, so on
# a top-to-bottom run the later definition must not be the one in effect.
# The five-parameter tgt_pred has no return statement, so `model` is None.
if __name__ == "__main__":
     model = tgt_pred(l, lp, lt, th, k)

(2343, 2343)
entring rste
0.01 0.01 0.1 0.8 10
0 mae 0.04761326546100986 rmse 1.6493718978557383 2.679325819015503
5 mae 0.04728160692871591 rmse 1.637882909281794 2.63616943359375
10 mae 0.04601757319156814 rmse 1.5940954961773437 2.670991897583008
K:10,lambdaU:0.01, lambdaV:0.01,lambdaT:0.1
rmse 20.023104470335646
mae 3.7879757522079087
