<a href="https://colab.research.google.com/github/SomdeepAcharyya/Recommender-Systems/blob/main/SocialMF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# SocialMF
# A Matrix Factorization Technique with Trust Propagation for Recommendation in Social Networks
# M Jamali, M Ester

In [None]:
import numpy as np
import numba as nb
import scipy.sparse as sp
import time
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import scipy.spatial as spt
import statistics
import math
import json
from sklearn.model_selection import GridSearchCV
from random import sample

In [None]:
!pip install tweet-preprocessor
import preprocessor as p
# Configure tweet-preprocessor with the token classes listed below.
# NOTE(review): `p` is not used again in the visible part of this notebook.
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY,p.OPT.MENTION,p.OPT.HASHTAG, p.OPT.ESCAPE_CHAR, p.OPT.RESERVED)
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK resources used by the review-cleaning cell further down.
nltk.download('stopwords')
# English stop words as a set, used for membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('wordnet')
from textblob import Word
import re
# Characters removed from every lemmatised token: the listed punctuation
# marks and all digits.
punctuation = re.compile(r'[-.?&!,:;()|0-9]')

In [None]:
def socialMF(R,S,N,M,K,lambdaU,lambdaV,lambdaT,R_test,ul,il):
    """Train a SocialMF model by gradient descent and print its test error.

    The objective combines a squared error on the observed ratings, L2
    penalties on both factor matrices, and a social term that penalises the
    distance between each user's factors ``U`` and ``S.dot(U)``.

    Args:
        R: scipy sparse (N, M) matrix holding the observed training ratings.
        S: scipy sparse (N, N) trust matrix.
            NOTE(review): the SocialMF paper assumes row-normalised trust
            weights; this function uses S exactly as given.
        N: number of users (rows of R).
        M: number of items (columns of R).
        K: number of latent dimensions.
        lambdaU: L2 regularisation weight for the user factors.
        lambdaV: L2 regularisation weight for the item factors.
        lambdaT: weight of the social regularisation term.
        R_test: scipy sparse (N, M) matrix of held-out ratings used for the
            final RMSE / MAE report.
        ul: 1-D integer array, row (user) index of every observed entry of R.
        il: 1-D integer array, column (item) index of every observed entry.

    Returns:
        Always 0; all results are reported through ``print``.
    """
    ul = np.asarray(ul, dtype=np.intp)
    il = np.asarray(il, dtype=np.intp)

    def sigmoid(z):
        # Logistic function stretched to the rating range [1, 5].
        return 1 + ((5.0 - 1.0) / (1+np.exp(-z)))

    def dsigmoid(z):
        # Derivative of sigmoid().  Written as s * (1 - s) so that very
        # negative z gives 0 instead of the inf / inf = NaN produced by
        # exp(-z) / (1 + exp(-z)) ** 2.
        s = 1.0 / (1.0 + np.exp(-z))
        return (5.0 - 1.0) * s * (1.0 - s)

    def pair_scores(U, V, rows, cols):
        # Row-wise dot products U[u] . V[i] for the listed (u, i) pairs only;
        # avoids materialising the dense N x M product U.dot(V.T).
        return np.einsum('ij,ij->i', U[rows], V[cols])

    def as_sparse(values, rows, cols):
        # (N, M) sparse matrix carrying `values` at the (rows, cols) entries.
        return sp.csr_matrix((values, (rows, cols)), shape=(N, M))

    def residual(U, V, R_eval):
        # Prediction minus rating on the stored entries of R_eval.
        coo = R_eval.tocoo()
        predicted = sigmoid(pair_scores(U, V, coo.row, coo.col))
        return as_sparse(predicted, coo.row, coo.col) - R_eval

    def rmse(U, V, R):
        if R.nnz == 0:
            return float('nan')
        # sigmoid() already predicts on the 1-5 scale, so the error is
        # reported as is (no extra rescaling by the maximum rating).
        return np.sqrt(residual(U, V, R).power(2).sum() / R.nnz)

    def mae(U, V, R):
        if R.nnz == 0:
            return float('nan')
        return abs(residual(U, V, R)).sum() / R.nnz

    def costL(U, V):
        Rx = as_sparse(sigmoid(pair_scores(U, V, ul, il)), ul, il)
        cost = 0.5*((R - Rx).power(2)).sum()
        cost += 0.5*lambdaU*np.linalg.norm(U)**2
        cost += 0.5*lambdaV*np.linalg.norm(V)**2
        cost += 0.5*lambdaT*np.power(U-S.dot(U),2).sum()
        return cost

    def gradient(U, V):
        scores = pair_scores(U, V, ul, il)
        Rv = as_sparse(dsigmoid(scores), ul, il)
        Rx = as_sparse(sigmoid(scores), ul, il)
        # Prediction error on the observed entries, weighted by the slope of
        # the link function (chain rule).
        weighted_err = Rv.multiply(Rx - R)
        social = U - S.dot(U)

        dU = lambdaU*U + weighted_err.dot(V)
        dU += lambdaT*social - lambdaT*S.T.dot(social)
        dV = lambdaV*V + weighted_err.T.dot(U)

        # Rescale when the largest entry exceeds 1.
        # NOTE(review): only the largest *positive* entry is checked, so a
        # gradient dominated by large negative values is not rescaled.
        if np.max(dU)>1:
            dU = dU/np.max(dU)
        if np.max(dV)>1:
            dV = dV/np.max(dV)
        return dU,dV

    def train(U, V):
        steps = 10
        rate = 1e-2
        momentum = 0.9
        # Report (and decay the learning rate) every `stage` steps.
        stage = max(steps // 100, 1)
        prev_dU = 0
        prev_dV = 0
        for step in range(steps):
            tic = time.time()
            dU, dV = gradient(U, V)
            dU = dU + momentum*prev_dU
            dV = dV + momentum*prev_dV
            prev_dU, prev_dV = dU, dV
            report = step % stage == 0
            if report and rate>0.001:
                rate = 0.95*rate
            # In-place updates: the caller's arrays are modified.
            U -= rate * dU
            V -= rate * dV
            e = costL(U,V) / (len(U) * len(V))
            if report:
                print(step,e,time.time() - tic)
        return U, V

    U = np.random.normal(0,0.01,size=(N,K))
    V = np.random.normal(0,0.01,size=(M,K))
    start = time.time()
    U,V = train(U,V)
    print("=================RESULT=======================")
    print('K:%d,lambdaU:%s, lambdaV:%s,lambdaT:%s' \
            %(K,lambdaU,lambdaV,lambdaT))
    print("rmse",rmse(U,V,R_test))
    print("mae",mae(U,V,R_test))
    print("time",time.time() - start)
    print("========================================")
    return 0

In [None]:
def calc_distance(A,A1):
  """Return the pairwise cosine distance between the rows of A and A1.

  Entry ``[i, j]`` of the result is ``1 - cos(A[i], A1[j])``, the same
  convention as ``scipy.spatial.distance.cdist(A, A1, 'cosine')``, which is
  how the callers in this notebook use it (``1 - calc_distance(...)`` to get
  a similarity).  The previous implementation returned the cosine
  *similarity* and took the norms from the diagonal of ``A . A1.T``, which
  is only correct when ``A1`` is ``A``.

  Rows with zero norm are treated as having similarity 0 (distance 1) to
  everything instead of producing NaN.

  Args:
    A: array-like of shape (n, d).
    A1: array-like of shape (m, d).

  Returns:
    ndarray of shape (n, m) with the cosine distances.
  """
  A = np.asarray(A, dtype=float)
  A1 = np.asarray(A1, dtype=float)
  dots = A.dot(A1.T)
  norms_a = np.sqrt(np.einsum('ij,ij->i', A, A))
  norms_b = np.sqrt(np.einsum('ij,ij->i', A1, A1))
  scale = np.outer(norms_a, norms_b)
  # Divide only where both norms are non-zero; the rest stays at 0.
  cosine = np.divide(dots, scale, out=np.zeros_like(dots), where=scale > 0)
  return 1.0 - cosine

## examples

In [None]:
def t_yelp(limitu,limiti):
    """Run SocialMF on the text files under ``./yelp_data``.

    Only users with id < ``limitu`` and items with id < ``limiti`` are kept.

    Args:
        limitu: exclusive upper bound on user ids; also the number of users.
        limiti: exclusive upper bound on item ids; also the number of items.
    """
    # NOTE(review): the original comment pointed at
    # http://www.trustlet.org/wiki/Epinions_datasets although the files read
    # here live in ./yelp_data - confirm the real source of the data.
    N, M = limitu, limiti

    def read_trust(path):
        # Each line is split as "<user>:<friends>"; the first and last
        # character of the friends field are dropped, the rest is split on
        # ',' and the final piece is ignored.
        T = sp.dok_matrix((N, N))
        with open(path, 'r') as fh:
            for line in fh:
                fields = line.split(':')
                u = int(fields[0])
                friends = fields[1][1:-1].split(',')
                if len(friends) > 1:
                    for x in friends[:-1]:
                        v = int(x)
                        if u<limitu and v<limitu:
                            T[u,v] = 1.0
        return T.tocsr()

    def read_ratings(path):
        # Lines look like "<user>::<item>::<rating>[::...]" with integer
        # fields.  Ratings are divided by 5.0 before being stored.
        # NOTE(review): socialMF's link function predicts in [1, 5], which
        # does not match ratings scaled to (0, 1] - confirm the intended scale.
        rows, cols, vals = [], [], []
        with open(path, 'r') as fh:
            for line in fh:
                u,i,r = [int(x) for x in line.split('::')[:3]]
                if u<limitu and i<limiti:
                    rows.append(u)
                    cols.append(i)
                    vals.append(r/5.0)
        # Explicit integer dtype so an empty selection still yields valid
        # index arrays.
        rows = np.array(rows, dtype=int)
        cols = np.array(cols, dtype=int)
        return sp.csr_matrix((vals,(rows,cols)),shape=(N,M)), rows, cols

    def getdata():
        print('get T')
        T = read_trust('./yelp_data/users.txt')
        print('get R_test')
        R_test, _, _ = read_ratings('./yelp_data/ratings-test.txt')
        print('get R')
        R, ul, il = read_ratings('./yelp_data/ratings-train.txt')
        return R,T,N,M,R_test,ul,il

    R,T,N,M,R_test,ul,il = getdata()

    lambdaU,lambdaV,lambdaT,K = 0.01, 0.01, 0.3, 5
    socialMF(R,T,N,M,K,lambdaU,lambdaV,lambdaT,R_test,ul,il)

In [None]:
def t_toy():
    """Run SocialMF on a hand-written 5-user x 4-item example.

    Prints the normalised rating matrix, the trust matrix, the user index of
    every rating and the matrix shapes, then trains and evaluates on the
    training ratings themselves.
    """
    R0 = [
         [5,3,0,1],
         [4,0,0,1],
         [1,1,0,5],
         [1,0,0,4],
         [0,1,5,4],
        ]     # rating matrix, 0 = not rated
    max_r = 5.0    # maximum rating, used to normalise R0
    T0 = [[3,2],[1,3,4],[2],[1,5],[3]]     # 1-based ids of the users each user trusts
    N,M,K=5,4,4     # N users, M items, K latent dimensions
    lambdaU,lambdaV,lambdaT=0.02, 0.02, 0.1

    R=sp.dok_matrix((N,M))   # user x item ratings
    T=sp.dok_matrix((N,N))   # user x user trust
    for i, row in enumerate(R0):
        for j, rating in enumerate(row):
            if rating>0:    # only observed ratings are stored
                R[i,j] = 1.0 * rating / max_r
    print(R.toarray())
    for i, trusted in enumerate(T0):
        for j in trusted:
            T[i,j-1]=1.0        # ids in T0 are 1-based
    print(T.toarray())
    # Materialise the (row, col) keys as a list first: np.array() on the
    # keys view itself yields a 0-d object array that cannot be indexed.
    keys = np.array(list(R.keys())).T
    print(np.array(keys[0]))
    R,T = R.tocsr(),T.tocsr()
    print(R.shape, T.shape)
    socialMF(R,T,N,M,K,lambdaU,lambdaV,lambdaT,R,np.array(keys[0]),np.array(keys[1]))

In [None]:
def t_toy_correct():
    """Train and evaluate SocialMF on the 5-user x 4-item toy example.

    Ratings are divided by the maximum rating before training, the trust
    lists use 1-based user ids, and the training matrix doubles as the test
    matrix.
    """
    ratings = np.array([
        [5, 3, 0, 1],
        [4, 0, 0, 1],
        [1, 1, 0, 5],
        [1, 0, 0, 4],
        [0, 1, 5, 4],
    ])                                   # 0 marks "not rated"
    trusted_by_user = [[3, 2], [1, 3, 4], [2], [1, 5], [3]]
    max_r = 5.0
    N, M, K = 5, 4, 4                    # users, items, latent dimensions
    lambdaU, lambdaV, lambdaT = 0.02, 0.02, 0.1

    # Positions of the observed ratings, in row-major order.
    user_idx, item_idx = np.nonzero(ratings > 0)
    normalised = 1.0 * ratings[user_idx, item_idx] / max_r
    R = sp.csr_matrix((normalised, (user_idx, item_idx)), shape=(N, M))

    trust = sp.dok_matrix((N, N))
    for truster, friends in enumerate(trusted_by_user):
        for friend in friends:
            trust[truster, friend - 1] = 1.0   # shift to 0-based ids
    T = trust.tocsr()

    socialMF(R, T, N, M, K, lambdaU, lambdaV, lambdaT, R, user_idx, item_idx)

In [None]:
# Entry point: run the toy example; the Yelp run is kept commented out.
if __name__ == "__main__":
     t_toy_correct()
     # t_yelp(1000,20000)

# target

In [None]:
# Tripadvisor review Dataset
# Read the CSV through a text handle that silently drops undecodable bytes.
path = r'/content/drive/MyDrive/Per_CD_RS/tripadvisor_reviews_with_country.csv'
with open(path, encoding="utf-8", errors='ignore') as infile:
  tr = pd.read_csv(infile)
# Use the userId / itemId column names shared by the rest of the notebook.
tr = tr.rename(columns={"username":"userId", "taObject":"itemId"})
# Keep five trait columns plus the (user, item, rating) triple.
# presumably open/cons/extra/agree/neuro are Big Five personality scores - TODO confirm
arr = tr[['open', 'cons', 'extra', 'agree', 'neuro', 'userId', 'itemId', 'rating']]

In [None]:
# amazon review dataset magazines csv
# NOTE(review): the label above says "magazines" but both paths below point
# at fashion files - confirm which dataset is meant.
path1 = r'/content/drive/MyDrive/Per_CD_RS/pers_fashion_filtered.csv'
path2 = r'/content/drive/MyDrive/Per_CD_RS/Aaamzon_fashion_ru_tf.csv'

# Columns '0'..'4' of the first file become the rows of `arr`; a later cell
# names them open/cons/extra/agree/neuro.
with open(path1, encoding="utf-8", errors='ignore') as infile:
  df = pd.read_csv(infile)
arr = np.array(df[['0', '1', '2', '3', '4']])

# Second file: drop the 'Unnamed: 0' column and keep the rest as a plain
# array.  tgt_pred / tgt_pred_log multiply user-by-user matrices by `ru_tf`.
with open(path2, encoding="utf-8", errors='ignore') as infile:
  ru_tf_df = pd.read_csv(infile)
ru_tf_df = ru_tf_df.drop(columns=['Unnamed: 0'])
ru_tf = np.array(ru_tf_df)

In [None]:
# amazon review dataset movies json
# NOTE(review): despite the label above, the file read here is the filtered
# fashion CSV.
path = r'/content/drive/MyDrive/Per_CD_RS/Amazon_fashion_filtered.csv'
with open(path, encoding="utf-8", errors='ignore') as infile:
  az = pd.read_csv(infile)
# Column renaming is disabled here (and refers to `am`, not `az`).
#am = am.rename(columns={"reviewerID":"userId", "asin":"itemId", "overall":"rating"})

In [None]:
input_file =  r'/content/drive/MyDrive/Per_CD_RS/Amazon_Text_Video_Games.json'
# The file is read as one JSON document per line.  The raw lines are kept in
# `df_inter`; each line is decoded exactly once.
with open(input_file) as f:
    lines = f.read().splitlines()
df_inter = pd.DataFrame(lines)
df_inter.columns = ['json_element']
az = pd.json_normalize(df_inter['json_element'].apply(json.loads).tolist())


def _at_least(counts, minimum, key):
    """Return a (key, 'count') frame for the values seen >= `minimum` times.

    `counts` is the result of ``Series.value_counts()``.  Naming the columns
    explicitly keeps this independent of how a given pandas version labels
    the columns produced by ``value_counts().reset_index()``.
    """
    kept = counts[counts >= minimum]
    return kept.rename_axis(key).reset_index(name='count')


def _clean_review(text):
    """Tokenise, drop stop words, lemmatise, strip punctuation/digits, lower-case.

    Every surviving token is prefixed with a single space, so a non-empty
    result starts with a space.  Non-string input yields ''.
    """
    tokens = word_tokenize(text) if isinstance(text, str) else []
    cleaned = []
    for token in tokens:
        if token.lower() in stop_words:
            continue
        word = punctuation.sub("", Word(token).lemmatize())
        if word:
            cleaned.append(word.lower())
    return "".join(" " + word for word in cleaned)


arr1 = az.rename(columns={"reviewerID":"userId", "asin":"itemId", "overall":"rating"})

# Keep users with at least 5 reviews, then ratings strictly between 0 and 6,
# then items with at least 5 remaining reviews.
arr2 = _at_least(arr1['userId'].value_counts(), 5, 'userId')
df = pd.merge(arr1, arr2, on='userId', how='inner')
df = df[df['rating']<6]
df = df[df['rating']>0]
arr3 = _at_least(df['itemId'].value_counts(), 5, 'itemId')
df2 = pd.merge(df, arr3, on='itemId', how='inner')
az = df2

tgt = az

# Assign the result back: fillna() returns a new Series, so the bare call
# used previously had no effect on the frame.
tgt['reviewText'] = tgt['reviewText'].fillna(" ")
array_text = [_clean_review(text) for text in tgt['reviewText']]

tgt['processed_text'] = array_text
az = tgt[['rating', 'userId', 'itemId', 'reviewText', 'processed_text']]
az = az.drop_duplicates(subset=None, keep='first')

In [None]:
# Show both shapes, cut `az` down to the first len(arr) rows, show them again.
print(arr.shape, az.shape)
az = az[0:len(arr)]
print(arr.shape, az.shape)

(7841, 5) (7891, 6)
(7841, 5) (7841, 6)


In [None]:
# amazon review dataset movies json
# NOTE(review): the file read here is the filtered fashion CSV, not a movies
# JSON file.  Running this cell replaces any `az` built by earlier cells.
path = r'/content/drive/MyDrive/Per_CD_RS/Amazon_fashion_filtered.csv'
with open(path, encoding="utf-8", errors='ignore') as infile:
  az = pd.read_csv(infile)
# Use the userId / itemId / rating column names shared by the notebook.
az = az.rename(columns={"reviewerID":"userId", "asin":"itemId", "overall":"rating"})

In [None]:
# amazon review dataset magazines csv
# NOTE(review): the path below is the Movies_and_TV JSON-lines file.
# Only the first 12000 lines are read; this also replaces `az`.
path = r'/content/drive/MyDrive/Per_CD_RS/Amazon_Text_Movies_and_TV.json'
with open(path, encoding="utf-8", errors='ignore') as infile:
  az = pd.read_json(infile, lines=True, nrows=12000)
az = az.rename(columns={"reviewerID":"userId", "asin":"itemId", "overall":"rating"})

In [None]:
# `src` keeps the (user, item, rating) triple; tgt_pred_log reads src.userId.
src = az[['userId', 'itemId', 'rating']]
# Wrap `arr` in a DataFrame so columns can be attached in the next cells.
arr = pd.DataFrame(arr)

In [None]:
# Attach ids and ratings from `az` to the rows of `arr`.
# NOTE(review): userId and rating are assigned as Series (aligned on index
# labels) while itemId is assigned positionally via np.array - confirm that
# `az` has the same 0..n-1 index as `arr`, otherwise the columns can disagree.
arr['userId'] = az['userId']
arr['itemId'] = np.array(az['itemId'])
arr['rating'] = az['rating']

In [None]:
# Name all eight columns: five trait columns followed by the ids and rating.
arr.columns = ['open', 'cons', 'extra', 'agree', 'neuro', 'userId', 'itemId', 'rating']

In [None]:
# rating matrix wrt user u
# Dense user x item matrix of ratings; missing pairs become 0.
ru = arr.pivot_table(index='userId',columns='itemId',values='rating')
ru = ru.fillna(0)
# ru_m: 1 where a rating is present, 0 elsewhere.
ru_m = ru > 0
ru_m = ru_m.replace(True, 1)
ru_m = ru_m.replace(False, 0)
# Both are converted to plain arrays; tgt_pred_log reads them as globals.
ru = np.array(ru)
ru_m = np.array(ru_m)

In [None]:
# Alternative construction of the user x item matrix: ratings of the same
# (user, item) pair are summed and missing pairs stay NaN.
# NOTE(review): this rebinds `ru` to a DataFrame, replacing the array built
# by the pivot_table cell.
ru = arr.groupby(['userId', 'itemId'])['rating'].sum().unstack()
ru = pd.DataFrame(ru)

In [None]:
# Display (number of users, number of items).
ru.shape

(40639, 16794)

In [None]:
# First tenth of the item columns, with missing ratings set to 0.
set1  = ru.iloc[: , :int(ru.shape[1]/10)]
set1 = set1.replace(np.nan, 0)

In [None]:
# Append the second and third tenth of the item columns to `set1`, one
# slice at a time, printing the slice shape and the running shape.
for i in range(1,3):
  set2 = ru.iloc[: , i*int(ru.shape[1]/10):(i+1)*int(ru.shape[1]/10)]
  set2 = set2.replace(np.nan, 0)
  set1 = pd.concat([set1, set2], axis=1)
  print(set2.shape, set1.shape)

(40639, 1679) (40639, 3358)
(40639, 1679) (40639, 5037)


In [None]:
# Rebuild `set1` from the first tenth of the item columns only and display it.
set1  = ru.iloc[: , :int(ru.shape[1]/10)]
set1 = set1.replace(np.nan, 0)
set1

In [None]:
# Display the current `set1`.
set1

In [None]:
# Second tenth of the item columns, with missing ratings set to 0; displayed.
set2  = ru.iloc[: , int(ru.shape[1]/10): 2*int(ru.shape[1]/10)]
set2 = set2.replace(np.nan, 0)
set2

In [None]:
# Display the two slices side by side; the result is not stored.
pd.concat([set1, set2], axis=1)

In [None]:
# with only trust and rating use this

def _ratings_to_csr(ratings, user_index, item_index):
    """Sum the ratings of every (userId, itemId) pair into a sparse matrix.

    Rows follow ``user_index`` and columns follow ``item_index``.  Pairs whose
    user or item is missing from those indexes, and pairs whose summed rating
    is not positive, are left out.
    """
    summed = ratings.groupby(['userId', 'itemId'])['rating'].sum().reset_index()
    rows = user_index.get_indexer(summed['userId'])
    cols = item_index.get_indexer(summed['itemId'])
    vals = summed['rating'].to_numpy(dtype=float)
    keep = (rows >= 0) & (cols >= 0) & (vals > 0)
    shape = (len(user_index), len(item_index))
    return sp.csr_matrix((vals[keep], (rows[keep], cols[keep])), shape=shape)


def tgt_pred(l, k, a, lp, t, th):
    """Split the global ``arr`` 80/20, derive trust from the trait columns and run SocialMF.

    Training uses the first tenth of the (sorted) item ids seen in the
    training split.  The trust matrix is the cosine similarity between the
    users' mean trait vectors, min-max normalised and thresholded at ``th``.

    Fixes relative to the previous version:
      * the split uses ``DataFrame.sample`` (``random.sample`` has no
        ``frac`` argument) and the test rows are selected by the original
        row labels, so train and test no longer overlap;
      * ``lp`` / ``t`` are passed to SocialMF instead of being overwritten by
        constants;
      * test ratings are indexed with the training user/item positions and
        share the training shape, as ``socialMF`` requires; test ratings of
        users or items unseen in training are dropped;
      * the threshold ``th`` is applied to the normalised trust scores
        (the old expression compared every score with its own normalised
        value and never used ``th``);
      * unused intermediate results (ragged ``tknn`` array, rating-based
        similarity, test-set trait means) are no longer computed.

    Args:
        l: learning rate; accepted for interface compatibility, not used.
        k: number of latent dimensions.
        a: mixing factor alpha; accepted for interface compatibility, not
            used (the trust matrix is built from the trait similarity only).
        lp: L2 regularisation weight for both user and item factors.
        t: weight of the social regularisation term.
        th: threshold on the normalised trust scores, in [0, 1].

    Returns:
        None; results are printed by ``socialMF``.

    Raises:
        ValueError: if no training or no usable test rating remains.
    """
    train_size = 0.8
    traits = ['open', 'cons', 'extra', 'agree', 'neuro']
    K, lambda_p, lambda_t, thr = k, lp, t, th

    # Random split by row label.
    train_set = arr.sample(frac=train_size)
    test_set = arr.drop(train_set.index)

    # Row / column order of the rating matrix: sorted ids of the training
    # split, items restricted to the first tenth.
    rated = train_set.dropna(subset=['userId', 'itemId'])
    user_index = pd.Index(rated['userId'].unique()).sort_values()
    all_items = pd.Index(rated['itemId'].unique()).sort_values()
    item_index = all_items[:int(len(all_items) / 10)]

    R = _ratings_to_csr(train_set, user_index, item_index)
    Rt = _ratings_to_csr(test_set, user_index, item_index)
    N, M = R.shape
    if R.nnz == 0:
        raise ValueError("no training ratings left after selecting the item subset")
    if Rt.nnz == 0:
        raise ValueError("no test rating refers to a user and item seen in training")

    # Mean trait vector per training user, in the row order of R.
    profiles = (train_set[traits].fillna(0)
                .groupby(train_set['userId']).mean()
                .reindex(user_index).fillna(0)
                .to_numpy())

    print("done0")
    trust_uv = 1 - spt.distance.cdist(profiles, profiles, 'cosine')
    print("done1")
    # All-zero trait vectors give NaN similarities; treat them as 0.
    trust_uv = np.nan_to_num(trust_uv)
    print("done2")
    lowest, highest = trust_uv.min(), trust_uv.max()
    span = highest - lowest
    print("done3")
    if span > 0:
        scaled = (trust_uv - lowest) / span
    else:
        # Every pair is equally similar: nothing to rank.
        scaled = np.ones_like(trust_uv)
    print("done4")
    T = sp.csr_matrix((scaled >= thr).astype(float))
    print("done5")
    print((N, N), trust_uv.shape, R.shape)

    # Coordinates of the observed training ratings, row-major.  int64 keeps
    # index arithmetic downstream from overflowing 32-bit integers.
    observed = R.tocoo()
    ul = observed.row.astype(np.int64)
    il = observed.col.astype(np.int64)

    print("entring smf")
    print(l, k, a, lp, t, th)
    socialMF(R,T,N,M,K,lambda_p,lambda_p,lambda_t,Rt,ul,il)

In [None]:
# Single-value candidate lists for each hyper-parameter.
# NOTE(review): these lists are not read by any visible cell.
lr = [0.0001]
K = [10]
alpha = [0.3]
lambda_p = [0.04]
lambda_t = [0.1]
thres = [0.8]

In [None]:
# thresh 0.7, 0.8*, 
# lambda_t 0.1*, 0.5, 0.8
# lambda_p 0.01
# alpha = 0.5

# run till 30 k

In [None]:
# Arguments passed to tgt_pred by the driver cell below, in its parameter
# order (l, k, a, lp, t, th).
l, k, a, lp, lt, th = 0.0001, 20, 0.5, 0.03, 0.8, 0.8

In [None]:
# Run tgt_pred five times with the same settings.
# NOTE(review): tgt_pred has no return statement, so `model` is always None.
if __name__ == "__main__":
   for i in range(5): 
    model = tgt_pred(l, k, a, lp, lt, th)

done0
done1
done2
done3
done4
done5
(2344, 2344) (2344, 2344) (2344, 44)
entring smf
0.0001 20 0.5 0.03 0.8 0.8
0 mae 0.02305873117024209 rmse 0.7843752572910565 0.020981311798095703
1 mae 0.02307682992193197 rmse 0.7849909118519508 0.030782699584960938
2 mae 0.0231272568709124 rmse 0.7867062556316629 0.020181655883789062
3 mae 0.023126990225893273 rmse 0.7866971853253247 0.01968669891357422
4 mae 0.023081741371106922 rmse 0.7851579817215735 0.019634485244750977
5 mae 0.023039178111184472 rmse 0.7837101324142403 0.019206523895263672
6 mae 0.023055098298457363 rmse 0.784251680034363 0.020754337310791016
7 mae 0.023075952277290264 rmse 0.7849610575318425 0.019543886184692383
8 mae 0.023050867368834774 rmse 0.7841077589969565 0.02220749855041504
9 mae 0.023004290810543 rmse 0.7825233916861919 0.019937515258789062
K:20,lambdaU:0.03, lambdaV:0.03,lambdaT:0.8
rmse 10.332567816053073
mae 1.743289557450823
time 0.2563180923461914


In [None]:
# @params: 
# trust threshold (0.7) t  ***
# mixing factor alpha (0.5)   ***
# lambdaU,lambdaV,lambdaT (0.02, 0.02, 0.1)
# no of iterations (100) +++
# no of latent features K (10) ***
# train test split  
# learning rate

# metrics
# MAE
# RMSE

# benchmark
# yakchi pacis
# tobias umap
# p2mf cdrup

# variations
# numerical personality values (0,1)
# binary personality values [0 and 1]
# with demography dbscan clustering
  # ensemble through average voting
  # ensemble through plurality voting 
# without clustering

In [None]:
# with tf idf of rating

def tgt_pred_log():
    """Run SocialMF on the global rating matrix with a blended trust matrix.

    Reads the notebook globals ``ru`` (user x item ratings), ``ru_m``
    (user x item presence mask), ``ru_tf`` (user x user weights), ``src``
    (frame with a ``userId`` column) and ``src_pers``.
    NOTE(review): ``src_pers`` is not defined in the visible part of the
    notebook - confirm where it comes from.

    The trust matrix is built as follows:
      1. cosine similarity between the users' mean trait vectors, binarised
         at 0.7;
      2. cosine similarity between the users' rating-presence rows,
         multiplied element-wise by ``ru_tf``;
      3. the average of (1) and (2), thresholded at 70% of its maximum.

    Training and evaluation both use the full rating matrix.
    """
    ratings = np.asarray(ru)
    N, M = ratings.shape
    K = 10                          # latent dimensions
    lambdaU,lambdaV,lambdaT=0.02, 0.02, 0.1

    # Observed ratings (> 0) in row-major order, stored sparsely.
    rows, cols = np.nonzero(ratings > 0)
    R = sp.csr_matrix((ratings[rows, cols].astype(float), (rows, cols)), shape=(N, M))

    # Mean trait vector per user, ordered by userId.
    df = pd.DataFrame(src_pers[0])
    df.columns = ['open', 'cons', 'extra', 'agree', 'neuro']
    df['userId'] = src.userId
    df2 = df.groupby(by='userId').mean().reset_index()
    df2 = np.array(df2[["open", 'cons', 'extra', 'agree', 'neuro']])

    # Trait-based trust: 1 where the cosine similarity exceeds 0.7.
    trust_uv = 1 - spt.distance.cdist(df2, df2, 'cosine')
    trust_uv = np.nan_to_num(trust_uv)
    trust_uv = (trust_uv > 0.7).astype(float)

    # Rating-based similarity between users.
    sim_uv = 1 - spt.distance.cdist(ru_m, ru_m, 'cosine')
    sim_uv = np.nan_to_num(sim_uv)

    t_uv = np.add(trust_uv, np.multiply(sim_uv, ru_tf)) / 2
    print(t_uv.shape)

    # Keep the links whose blended score reaches 70% of the maximum.
    T = sp.csr_matrix((t_uv >= 0.7 * t_uv.max()).astype(float))

    print("entring smf")
    socialMF(R,T,N,M,K,lambdaU,lambdaV,lambdaT,R,rows,cols)