<a href="https://colab.research.google.com/github/SomdeepAcharyya/Recommender-Systems/blob/main/Probabilistic_matrix_factorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import scipy, cmake
import numpy as np
import warnings
from sklearn import svm
warnings.filterwarnings('ignore')

In [None]:
# amazon review dataset magazines csv
path = r'/content/drive/MyDrive/Per_CD_RS/digital_music_pers_num.csv'
path2 = r'/content/drive/MyDrive/Per_CD_RS/Aaamazon_fashion_ru_tf.csv'

with open(path, encoding="utf-8", errors='ignore') as infile:
  df = pd.read_csv(infile)
arr = np.array(df[['0', '1', '2', '3', '4']])

In [None]:
# amazon review dataset magazines csv
path = r'/content/drive/MyDrive/Per_CD_RS/Amazon_Text_Digital_Music.json'
with open(path, encoding="utf-8", errors='ignore') as infile:
  az = pd.read_json(infile, lines=True, nrows=12000)
az = az.rename(columns={"reviewerID":"userId", "asin":"itemId", "overall":"rating"})

In [None]:
# Tripadvisor review Dataset
path = r'/content/drive/MyDrive/Per_CD_RS/tripadvisor_reviews_with_country.csv'
with open(path, encoding="utf-8", errors='ignore') as infile:
  tr = pd.read_csv(infile)
tr = tr.rename(columns={"username":"userId", "taObject":"itemId"})

In [None]:
src = az[['userId', 'itemId', 'rating']]

In [None]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# matrix factorization(MF) via Probabilistic Matrix Factorization model
import surprise

reader = surprise.Reader(rating_scale=(1,5)) 
#into surprise:
data_s = surprise.Dataset.load_from_df(src,reader)

class ProbabilisticMatrixFactorization(surprise.AlgoBase):
# Randomly initializes two Matrices, Stochastic Gradient Descent to be able to optimize the best factorization for ratings.
    def __init__(self,learning_rate,num_epochs,num_factors):
       # super(surprise.AlgoBase)
        self.alpha = learning_rate #learning rate for Stochastic Gradient Descent
        self.num_epochs = num_epochs
        self.num_factors = num_factors
    def fit(self,train):
        #randomly initialize user/item factors from a Gaussian
        P = np.random.normal(0,.1,(train.n_users,self.num_factors))
        Q = np.random.normal(0,.1,(train.n_items,self.num_factors))
        #print('fit')

        for epoch in range(self.num_epochs):
            for u,i,r_ui in train.all_ratings():
                residual = r_ui - np.dot(P[u],Q[i])
                temp = P[u,:] # we want to update them at the same time, so we make a temporary variable. 
                P[u,:] +=  self.alpha * residual * Q[i]
                Q[i,:] +=  self.alpha * residual * temp 

                
        self.P = P
        self.Q = Q

        self.trainset = train
    
    
    def estimate(self,u,i):
        #returns estimated rating for user u and item i. Prerequisite: Algorithm must be fit to training set.
        #check to see if u and i are in the train set:
        #print('gahh')

        if self.trainset.knows_user(u) and self.trainset.knows_item(i):
            #print(u,i, '\n','yep:', self.P[u],self.Q[i])
            #return scalar product of P[u] and Q[i]
            nanCheck = np.dot(self.P[u],self.Q[i])
            
            if np.isnan(nanCheck):
                return self.trainset.global_mean
            else:
                return np.dot(self.P[u,:],self.Q[i,:])
        else:# if its not known we'll return the general average. 
           # print('global mean')
            return self.trainset.global_mean

In [None]:
Alg_s = ProbabilisticMatrixFactorization(learning_rate=0.0001,num_epochs=100,num_factors=10)
data_s1 = data_s.build_full_trainset()
Alg_s.fit(data_s1)

In [None]:
us = Alg_s.P
vs = Alg_s.Q

In [None]:
train_size = 0.8
df_copy = src.copy()
train_set = df_copy.sample(frac=train_size).reset_index()
#user_features_train = np.array(train_set[['open', 'cons', 'extra', 'agree', 'neuro']].fillna(0))
test_set = df_copy.drop(train_set.index).reset_index()
#user_features_test = np.array(test_set[['open', 'cons', 'extra', 'agree', 'neuro']].fillna(0))

In [None]:
# rating matrix wrt user u
ru_src = test_set.pivot_table(index='userId',columns='itemId',values='rating')    # U
ru_src = ru_src.fillna(0)
ru_m_src = ru_src > 0
ru_m_src = ru_m_src.replace(True, 1)
ru_m_src = ru_m_src.replace(False, 0)
ru_src = np.array(ru_src)
ru_m_src = np.array(ru_m_src)   # U

In [None]:
arr = test_set.pivot_table(index='userId',columns='itemId',values='rating').fillna(0).reset_index()
cols = arr.columns
cols = cols[1:len(arr.T)]

In [None]:
pred = []
for i in range(len(arr)):
  row = []
  for j in range(len(cols)):
    p = Alg_s.estimate(arr.userId.iloc[i],cols[j])
    row.append(p)
  pred.append(row)

In [None]:
pred = np.array(pred)
pred.shape

(2295, 157)

In [None]:
ru_src.shape

(2295, 157)

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
mae = mean_absolute_error(ru_src, pred)
rmse = mean_squared_error(ru_src, pred)
print(mae, rmse)

3.9299098113224895 15.503761853845731


In [None]:
# tuning

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
learning_rate = [0.01, 0.001, 0.0001]
epochs=[60,80,100,120,140,150]
num_factors = [10,20,30,40]

In [None]:
for l in learning_rate:
  for e in epochs:
    for k in num_factors:
      Alg_s = ProbabilisticMatrixFactorization(learning_rate=l,num_epochs=e,num_factors=k)
      data_s1 = data_s.build_full_trainset()
      Alg_s.fit(data_s1)
      us = Alg_s.P
      vs = Alg_s.Q
      pred = []
      for i in range(len(arr)):
        row = []
        for j in range(len(cols)):
          p = Alg_s.estimate(arr.userId.iloc[i],cols[j])
          row.append(p)
        pred.append(row)
      pred = np.array(pred)
      mae = mean_absolute_error(ru_src, pred)
      rmse = mean_squared_error(ru_src, pred) 
      print("lr", l,"epochs", e, "k", k)
      print("mae", mae)
      print("rmse", rmse)
      print("----------------------------------------")

lr 0.01 epochs 60 k 10
mae 3.820819276239892
rmse 16.436914185049062
----------------------------------------
lr 0.01 epochs 60 k 20
mae 3.820819276239892
rmse 16.436914185049062
----------------------------------------
lr 0.01 epochs 60 k 30
mae 3.820819276239892
rmse 16.436914185049062
----------------------------------------
lr 0.01 epochs 60 k 40
mae 3.820819276239892
rmse 16.436914185049062
----------------------------------------
lr 0.01 epochs 80 k 10
mae 3.820819276239892
rmse 16.436914185049062
----------------------------------------
lr 0.01 epochs 80 k 20
mae 3.820819276239892
rmse 16.436914185049062
----------------------------------------
lr 0.01 epochs 80 k 30
mae 3.820819276239892
rmse 16.436914185049062
----------------------------------------
lr 0.01 epochs 80 k 40
mae 3.820819276239892
rmse 16.436914185049062
----------------------------------------
lr 0.01 epochs 100 k 10
mae 3.820819276239892
rmse 16.436914185049062
----------------------------------------
lr 0.01 e