In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

%matplotlib inline

# Datos

In [2]:
users = pd.read_csv('users_table.csv')
ratings = pd.read_csv('users_ratings_table.csv')
items = pd.read_csv('base_final_lugares.csv')

In [3]:
ratings = ratings[ratings['rating']!='None']

In [4]:
ratings.loc[:, 'rating'] = ratings.loc[:, 'rating'].astype('int')

# Modelo

In [5]:
#Se usar columna de reviewer ya que user id es muy pesada
n_users = users.reviewer_id.unique().shape[0]
n_items = ratings.place_id.unique().shape[0]

In [6]:
n_users

1000

In [7]:
n_items

278

### Crear matriz de elementos de usuario

In [8]:
table = pd.pivot_table(ratings, values='rating', index=['reviewer_id'],
                    columns=['place_id'], aggfunc=np.mean)

In [9]:
table = table.fillna(0)

In [10]:
data_matrix = table.values

### Calcular similutud del coseno

In [11]:
data_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 5., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 4., 0.]])

In [12]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

In [26]:
user_similarity.shape

(1000, 1000)

In [14]:
item_similarity.shape

(278, 278)

In [27]:
data_matrix.shape

(1000, 278)

### Hacer predicciones basadas en las similitudes

In [17]:
def predict(ratings, similarity, type='user'):
    if type=='user':
        mean_user_rating = ratings.mean(axis=1)
        
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type=='item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [18]:
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')

In [29]:
data_matrix[0]

array([0., 0., 0., 0., 0., 0., 5., 0., 0., 0., 0., 0., 0., 5., 2., 0., 0.,
       0., 0., 0., 5., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 5., 0., 0., 0., 0., 3., 0., 5., 0., 0., 0., 5., 0., 0.,
       5., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 3., 0., 0., 0., 5.,
       0., 0., 0., 0., 0., 5., 0., 4., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 4., 0., 0., 4., 5., 0., 0., 5., 0., 0., 0.,
       5., 3., 0., 0., 5., 0., 5., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 5., 0., 0., 0., 0., 4., 0., 0., 0., 0., 5., 0.,
       0., 0., 0., 0., 0., 0., 0., 4., 5., 0., 0., 5., 0., 5., 0., 0., 0.,
       5., 0., 3., 0., 4., 0., 0., 0., 0., 0., 0., 0., 5., 0., 0., 0., 0.,
       0., 0., 5., 4., 0., 0., 3., 0., 3., 5., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 5., 0., 5., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 5., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 4.,
       0., 5., 5., 5., 0.

In [30]:
item_similarity[0]

array([0.        , 0.8450772 , 0.84727639, 0.77651841, 0.8668098 ,
       0.82386739, 0.80143115, 0.89492943, 0.864087  , 0.85358901,
       0.79179914, 0.84711108, 0.83243534, 0.84677839, 0.80145413,
       0.82562445, 0.85147617, 0.8431737 , 0.78148862, 0.82655341,
       0.82521403, 0.87372337, 0.83242096, 0.85426345, 0.8011639 ,
       0.87066407, 0.86297936, 0.88008392, 0.80169455, 0.86452397,
       0.84062319, 0.82215808, 0.85658329, 0.82530618, 0.84517706,
       0.87504733, 0.86516225, 0.91537964, 0.83427825, 0.89033837,
       0.84205111, 0.88945357, 0.867504  , 0.87321584, 0.71535296,
       0.84659953, 0.87332178, 0.96486737, 0.85870032, 0.86336954,
       0.8535765 , 0.87772339, 0.86169388, 0.81828968, 0.8382857 ,
       0.85142193, 0.80436164, 0.88137808, 0.85967672, 0.83804271,
       0.85828269, 0.8565112 , 0.79889365, 0.83303436, 0.85197784,
       0.90554473, 0.84829576, 0.88786256, 0.8176395 , 0.82922585,
       0.83248614, 0.86674554, 0.87203213, 0.78526128, 0.85029

In [24]:
user_prediction[0]

array([0.98141568, 0.92306954, 0.92816324, 0.92584508, 0.59782414,
       0.95970945, 0.93547475, 0.49206728, 0.62990954, 0.92788945,
       0.9343175 , 0.96546343, 0.93945503, 0.87750311, 0.90258553,
       0.92034788, 0.92798472, 0.92076506, 0.9194557 , 0.85963635,
       0.87184337, 0.91780846, 0.90314368, 0.98022581, 0.91797108,
       0.9184031 , 0.88773162, 0.74753362, 0.92113609, 0.92325015,
       0.95841994, 0.98851706, 0.84270426, 0.9111804 , 0.92119788,
       0.53627702, 0.95350497, 0.92794688, 0.91218257, 0.32324483,
       0.88076361, 0.87239466, 0.95057458, 0.90861729, 2.14785338,
       0.96023602, 0.95389834, 0.23508466, 0.9117353 , 0.89680079,
       0.91144658, 0.62577239, 0.90520156, 0.96787316, 0.90589967,
       0.89784631, 0.95211284, 0.95556781, 0.89604001, 0.961631  ,
       0.90382868, 0.68582728, 0.94858152, 0.83953378, 0.92574598,
       0.6191252 , 0.98108247, 0.60713553, 0.96687137, 0.89746163,
       1.59123361, 0.75128555, 0.91624271, 0.92793798, 0.89212

# Motor de recomendaciones utilizando factorización de matrices

In [55]:
class MF():

  # Inicializando la matriz de ratings user-movie, no. de características latentes, alpha y beta.
    def __init__(self, R, K, alpha, beta, iterations):
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        
    # Inicializando user-feature y movie-feature matrix 
    def train(self):
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))
        
        # Inicializando los terminos bias 
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])
        
        # List of training samples
        self.samples = [
        (i, j, self.R[i, j])
        for i in range(self.num_users)
        for j in range(self.num_items)
        if self.R[i, j] > 0
        ]
        
        # Stochastic gradient descent for given number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
        self.sgd()
        mse = self.mse()
        training_process.append((i, mse))
        if (i+1) % 20 == 0:
            print("Iteration: %d ; error = %.4f" % (i+1, mse))

        return training_process
    
    # Computing total mean squared error
    def mse(self):
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        return np.sqrt(error)
    
    # Stochastic gradient descent to get optimized P and Q matrix
    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j,:])
            
    # Ratings for user i and moive j
    def get_rating(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction
    
    # Full user-movie rating matrix
    def full_matrix(self):
        return mf.b + mf.b_u[:,np.newaxis] + mf.b_i[np.newaxis:,] + mf.P.dot(mf.Q.T)

In [56]:
mf = MF(data_matrix, K=20, alpha=0.001, beta=0.01, iterations=100)
training_process = mf.train()
print()
print("P x Q:")
print(mf.full_matrix())
print()

Iteration: 100 ; error = 166.7259

P x Q:
[[4.60209848 4.58003542 4.58266375 ... 4.58115797 4.55518431 4.58200066]
 [4.61085851 4.58551757 4.59965225 ... 4.57731785 4.56635112 4.58701583]
 [4.61178664 4.58278715 4.59202618 ... 4.58805295 4.54720992 4.57832123]
 ...
 [4.60452328 4.56607337 4.56663106 ... 4.57400445 4.54011379 4.59218177]
 [4.61849226 4.59966058 4.58032301 ... 4.56230443 4.53378029 4.58050659]
 [4.58535637 4.57392836 4.58979775 ... 4.55175156 4.53438135 4.57758209]]



In [57]:
mf.full_matrix().mean(axis=1)

array([4.57342063, 4.57159713, 4.57708222, 4.57126378, 4.58075804,
       4.56867966, 4.5798508 , 4.5811524 , 4.5690783 , 4.57072207,
       4.5786838 , 4.57961702, 4.57285577, 4.57332341, 4.5779751 ,
       4.58146246, 4.57373582, 4.58315589, 4.57837839, 4.58055218,
       4.57418298, 4.58152593, 4.57398011, 4.58078336, 4.57258953,
       4.57588747, 4.57515964, 4.57338657, 4.56950239, 4.56324736,
       4.57408551, 4.56749409, 4.56520319, 4.57008194, 4.57756093,
       4.57053323, 4.57561234, 4.57375206, 4.57963488, 4.57387719,
       4.57075838, 4.57031871, 4.57620751, 4.56971265, 4.56904515,
       4.58340205, 4.58249334, 4.56809478, 4.57585153, 4.57092782,
       4.56835521, 4.57539923, 4.57756355, 4.57856905, 4.57117001,
       4.57180007, 4.57975516, 4.57712837, 4.58019791, 4.57610613,
       4.58114804, 4.57215262, 4.56771252, 4.57261928, 4.5723324 ,
       4.56540907, 4.57146975, 4.5663908 , 4.57422388, 4.57773931,
       4.57320579, 4.57372473, 4.57962763, 4.57984771, 4.57187