In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

%matplotlib inline

# Datos

In [2]:
users = pd.read_csv('users_table.csv')
ratings = pd.read_csv('users_ratings_table.csv')
items = pd.read_csv('base_final_lugares.csv')

In [13]:
ratings = ratings[ratings['rating']!='None']

In [17]:
ratings.loc[:, 'rating'] = ratings.loc[:, 'rating'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


# Modelo

In [18]:
#Se usar columna de reviewer ya que user id es muy pesada
n_users = users.reviewer_id.unique().shape[0]
n_items = ratings.place_id.unique().shape[0]

In [19]:
n_users

1000

In [20]:
n_items

278

### Crear matriz de elementos de usuario

In [48]:
table = pd.pivot_table(ratings, values='rating', index=['reviewer_id'],
                    columns=['place_id'], aggfunc=np.mean)

In [49]:
table = table.fillna(0)

In [50]:
data_matrix = table.values

### Calcular similutud del coseno

In [51]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

### Hacer predicciones basadas en las similitudes

In [52]:
def predict(ratings, similarity, type='user'):
    if type=='user':
        mean_user_rating = ratings.mean(axis=1)
        
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type=='item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [53]:
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')

In [54]:
item_prediction

array([[0.86185948, 0.86367306, 0.86456184, ..., 0.86731638, 0.86100663,
        0.86651566],
       [0.60374162, 0.62974484, 0.6284047 , ..., 0.63529782, 0.63104744,
        0.61253486],
       [0.71355034, 0.71215463, 0.71557001, ..., 0.71710585, 0.71388903,
        0.71864901],
       ...,
       [0.72875393, 0.73379097, 0.7272502 , ..., 0.72563372, 0.72900475,
        0.73467554],
       [0.63920509, 0.64756895, 0.61726957, ..., 0.64886562, 0.63736887,
        0.64402185],
       [0.83304051, 0.83210362, 0.82648208, ..., 0.82748311, 0.807267  ,
        0.82026265]])

# Motor de recomendaciones utilizando factorización de matrices

In [55]:
class MF():

  # Inicializando la matriz de ratings user-movie, no. de características latentes, alpha y beta.
    def __init__(self, R, K, alpha, beta, iterations):
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        
    # Inicializando user-feature y movie-feature matrix 
    def train(self):
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))
        
        # Inicializando los terminos bias 
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])
        
        # List of training samples
        self.samples = [
        (i, j, self.R[i, j])
        for i in range(self.num_users)
        for j in range(self.num_items)
        if self.R[i, j] > 0
        ]
        
        # Stochastic gradient descent for given number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
        self.sgd()
        mse = self.mse()
        training_process.append((i, mse))
        if (i+1) % 20 == 0:
            print("Iteration: %d ; error = %.4f" % (i+1, mse))

        return training_process
    
    # Computing total mean squared error
    def mse(self):
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        return np.sqrt(error)
    
    # Stochastic gradient descent to get optimized P and Q matrix
    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j,:])
            
    # Ratings for user i and moive j
    def get_rating(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction
    
    # Full user-movie rating matrix
    def full_matrix(self):
        return mf.b + mf.b_u[:,np.newaxis] + mf.b_i[np.newaxis:,] + mf.P.dot(mf.Q.T)

In [56]:
mf = MF(data_matrix, K=20, alpha=0.001, beta=0.01, iterations=100)
training_process = mf.train()
print()
print("P x Q:")
print(mf.full_matrix())
print()

Iteration: 100 ; error = 166.7259

P x Q:
[[4.60209848 4.58003542 4.58266375 ... 4.58115797 4.55518431 4.58200066]
 [4.61085851 4.58551757 4.59965225 ... 4.57731785 4.56635112 4.58701583]
 [4.61178664 4.58278715 4.59202618 ... 4.58805295 4.54720992 4.57832123]
 ...
 [4.60452328 4.56607337 4.56663106 ... 4.57400445 4.54011379 4.59218177]
 [4.61849226 4.59966058 4.58032301 ... 4.56230443 4.53378029 4.58050659]
 [4.58535637 4.57392836 4.58979775 ... 4.55175156 4.53438135 4.57758209]]



In [57]:
mf.full_matrix().mean(axis=1)

array([4.57342063, 4.57159713, 4.57708222, 4.57126378, 4.58075804,
       4.56867966, 4.5798508 , 4.5811524 , 4.5690783 , 4.57072207,
       4.5786838 , 4.57961702, 4.57285577, 4.57332341, 4.5779751 ,
       4.58146246, 4.57373582, 4.58315589, 4.57837839, 4.58055218,
       4.57418298, 4.58152593, 4.57398011, 4.58078336, 4.57258953,
       4.57588747, 4.57515964, 4.57338657, 4.56950239, 4.56324736,
       4.57408551, 4.56749409, 4.56520319, 4.57008194, 4.57756093,
       4.57053323, 4.57561234, 4.57375206, 4.57963488, 4.57387719,
       4.57075838, 4.57031871, 4.57620751, 4.56971265, 4.56904515,
       4.58340205, 4.58249334, 4.56809478, 4.57585153, 4.57092782,
       4.56835521, 4.57539923, 4.57756355, 4.57856905, 4.57117001,
       4.57180007, 4.57975516, 4.57712837, 4.58019791, 4.57610613,
       4.58114804, 4.57215262, 4.56771252, 4.57261928, 4.5723324 ,
       4.56540907, 4.57146975, 4.5663908 , 4.57422388, 4.57773931,
       4.57320579, 4.57372473, 4.57962763, 4.57984771, 4.57187