In [33]:
import numpy as np
from numpy import dot
from numpy.linalg import norm
import numpy.ma as ma

class KNNRec():

    def __init__(self,R,k=10,mode="item",similarity_func="cosine",damping_factor=25):
        """
        User-based or Item-based collaborative filtering model that operates on
        dataframes with at least a user-like, item-like, and a rating-like column

        Parameters
        ----------
        mode : str, ['item | 'user'], default='item'
            Parameter to decide between use-based content filtering or item-based content filtering
        k : int, default=10
            Number of neares neighboors
        similarity_func : str, ['cosine | 'pearson'], default='cosine'
            Tells the model to choose either cosine simiarity or pearson correlation to compute the 
            "distances" between items or users.

        Attributes
        ----------
        rating_matrix : 2d numpy array, shape=(n_users, n_items)
            Rating matrix minus baselines
        user_map : pandas Series, shape=(n_users,)
            Mapping from the original user id to an integer in the range [0,n_users)
        item_map : pandas Series, shape=(n_items,)
            Mapping from the original item id to an integer in the range [0,n_items)
        similarities: pandas Dataframe, shape=(index=[movieId|userId],columns=[movieId,userId])
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        self.k = k
        self.mu = self.R["rating"].mean()
        self.damping_factor = damping_factor
        
    def show(self):
        self.baseline_predictor()
        print(self.b_um)
    
    def _compute_similarity_matrix(self):
        arr = np.array(np.meshgrid(self.R["userId"].unique(),self.R["userId"].unique())).T
        print(arr)
    def fit(self):
        self._compute_similarity_matrix()
        return 1
    def mse(self):
        """
        A function to compute the total mean square error
        """
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        return np.sqrt(error)
    
    def baseline_predictor(self):
        num_ratings_per_movie = self.R["movieId"].value_counts()
        num_ratings_per_user = self.R["userId"].value_counts()
        sum_ratings_per_user = self.R.groupby("userId")["rating"].sum()
        sum_ratings_per_movie = self.R.groupby("movieId")["rating"].sum()
        
        self.b_u = (sum_ratings_per_user-(self.mu*num_ratings_per_user))/(num_ratings_per_user+self.damping_factor)
        
        self.b_m = (sum_ratings_per_movie-(self.mu*num_ratings_per_movie)-self.b_u.sum())/(num_ratings_per_movie+self.damping_factor)
    
    def create_baseline(self):
        self.baseline_predictor()
        
        single_baseline = lambda user,movie: self.b_u[user]+self.b_m[movie]+self.mu
        
        self.R["baseline"] = self.R.apply (lambda row: single_baseline(row["userId"],row["movieId"]), axis=1)
        print(self.R)
        


    def compute_similarity(x,y,simil_func="cosine",pearson_simil_threshold=50.0):
        if simil_func == "cosine":
            x_nan_mask = isnan(x)
            y_nan_mask = isnan(y)
            x[x_nan_mask] = 0
            y[y_nan_mask] = 0
            
            return np.dot(x, y)/(norm(x)*norm(y))
        
        if simil_func == "pearson":
            xm = ma.masked_invalid(x)
            ym = ma.masked_invalid(y)

            msk = (~xm.mask & ~ym.mask)
            
            num_common = msk.size()
            
            pears_sim = ma.corrcoef(xm[msk],ym[msk])
            
            #scale similarity by threshold to prevent the inflation of the similarity scores when comparing users with fewer co-rated items
            
            return pears_sim*min(num_common/co_simil_threshold,1)


    def sgd(self):
        
        for row in X.itertuples():
            index, user, item, rating = row[:4]
            pred = self.predict_1_train(user, item)
            err = pred - self.R[user, item]
            self.b_u[user] -= self.learning_rate * (err + self.user_bias_reg * self.b_u[user])
            self.b_i[item] -= self.learning_rate * (err + self.item_bias_reg * self.b_i[item])
            self.U[user, :] -= self.learning_rate * (err * self.I[item, :] + self.user_reg * self.U[user, :])
            self.I[item, :] -= self.learning_rate * (err * self.U[user, :] + self.item_reg * self.I[item, :])
            
    def sgd(self):
        """
        Perform stochastic graident descent
        """
        for i, j, r in self.samples:
            # Computer prediction and error
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            # Update biases
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            # Update user and item latent feature matrices
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j,:])

    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    def full_matrix(self):
        """
        Computer the full matrix using the resultant biases, P and Q
        """
        return self.b + self.b_u[:,np.newaxis] + self.b_i[np.newaxis:,] + self.P.dot(self.Q.T)

In [125]:
3/4

0.75

In [6]:
import pandas as pd
ratings = pd.read_csv("datasets_small/ratings.csv")
ratings = ratings.loc[:,["userId","movieId","rating"]]
ratings.head(10)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
5,1,70,3.0
6,1,101,5.0
7,1,110,4.0
8,1,151,5.0
9,1,157,5.0


In [30]:
import numpy as np
# R = np.array([
#     [5, 3, np.nan, 1],
#     [4, np.nan, np.nan, 1],
#     [1, 1, np.nan, 5],
#     [1, np.nan, np.nan, 4],
#     [np.nan, 1, 5, 4],
# ])
KNNRec(ratings).create_baseline()

        userId  movieId  rating  baseline
0            1        1     4.0  4.391650
1            1        3     4.0  3.288865
2            1        6     4.0  4.136043
3            1       47     5.0  4.423806
4            1       50     5.0  4.658988
5            1       70     3.0  3.488559
6            1      101     5.0  3.085468
7            1      110     4.0  4.517829
8            1      151     5.0  3.384015
9            1      157     5.0  2.312058
10           1      163     5.0  3.622773
11           1      216     5.0  3.302710
12           1      223     3.0  4.072394
13           1      231     5.0  3.506196
14           1      235     4.0  3.739949
15           1      260     5.0  4.714134
16           1      296     3.0  4.732892
17           1      316     3.0  3.787538
18           1      333     5.0  3.615750
19           1      349     4.0  3.892763
20           1      356     4.0  4.717501
21           1      362     5.0  3.215086
22           1      367     4.0  3

In [34]:
KNNRec(ratings).fit()

[[[  1   1]
  [  1   2]
  [  1   3]
  ...
  [  1 608]
  [  1 609]
  [  1 610]]

 [[  2   1]
  [  2   2]
  [  2   3]
  ...
  [  2 608]
  [  2 609]
  [  2 610]]

 [[  3   1]
  [  3   2]
  [  3   3]
  ...
  [  3 608]
  [  3 609]
  [  3 610]]

 ...

 [[608   1]
  [608   2]
  [608   3]
  ...
  [608 608]
  [608 609]
  [608 610]]

 [[609   1]
  [609   2]
  [609   3]
  ...
  [609 608]
  [609 609]
  [609 610]]

 [[610   1]
  [610   2]
  [610   3]
  ...
  [610 608]
  [610 609]
  [610 610]]]


1

In [112]:
class DampedUserMovieBaselineModel():
    """Baseline model that of the form mu + b_u + b_i,
    where mu is the overall average, b_u is a damped user
    average rating residual, and b_i is a damped item (movie)
    average rating residual. See eqn 2.1 of
    http://files.grouplens.org/papers/FnT%20CF%20Recsys%20Survey.pdf
    Parameters
    ----------
    damping_factor : float, default=0
        Factor to bring residuals closer to 0. Must be positive.
    Attributes
    ----------
    mu : float
        Average rating over all training samples
    b_u : pandas Series, shape = [n_users]
        User residuals
    b_i : pandas Series, shape = [n_movies]
        Movie residuals
    damping_factor : float, default=0
        Factor to bring residuals closer to 0. Must be >= 0.
    """
    def __init__(self, damping_factor=25):
        self.damping_factor = damping_factor

    def fit(self, X):
        """Fit training data.
        Parameters
        ----------
        X : DataFrame, shape = [n_samples, >=3]
            User, movie, rating dataFrame. Columns beyond 3 are ignored
        Returns
        -------
        self : object
        """
        X = X.iloc[:, :3].copy()
        X.columns = ['user', 'item', 'rating']
        self.mu = np.mean(X['rating'])
        print(self.mu)
        user_counts = X['user'].value_counts()
        movie_counts = X['item'].value_counts()
        b_u = (
            X[['user', 'rating']]
            .groupby('user')['rating']
            .sum()
            .subtract(user_counts * self.mu)
            .divide(user_counts + self.damping_factor)
            .rename('b_u')
        )
        X = X.join(b_u, on='user')
        X['item_residual'] = X['rating'] - X['b_u'] - self.mu
        b_i = (
            X[['item', 'item_residual']]
            .groupby('item')['item_residual']
            .sum()
            .divide(movie_counts + self.damping_factor)
            .rename('b_i')
        )
        self.b_u = b_u
        self.b_i = b_i
        return self

    def predict(self, X):
        """Return rating predictions
        Parameters
        ----------
        X : DataFrame, shape = (n_ratings, 2)
            User, item dataframe
        Returns
        -------
        y_pred : numpy array, shape = (n_ratings,)
            Array of n_samples rating predictions
        """
        X = X.iloc[:, :2].copy()
        X.columns = ['user', 'item']
        X = X.join(self.b_u, on='user').fillna(0)
        X = X.join(self.b_i, on='item').fillna(0)
        return (self.mu + X['b_u'] + X['b_i']).values

In [113]:
baseline_algo = DampedUserMovieBaselineModel()
baseline_algo.fit(ratings)

3.501556983616962


<__main__.DampedUserMovieBaselineModel at 0x29de51d2ef0>

In [114]:
baseline_algo.b_u

1      0.780696
2      0.239905
3     -0.649386
4      0.048397
5      0.085964
6     -0.007342
7     -0.232975
8      0.047595
9     -0.155938
10    -0.189200
11     0.201128
12     0.499126
13     0.079495
14    -0.069517
15    -0.045064
16     0.177621
17     0.571819
18     0.219579
19    -0.863454
20     0.080986
21    -0.227970
22    -0.768648
23     0.121997
24     0.120954
25     0.665873
26    -0.120276
27     0.039311
28    -0.461155
29     0.489376
30     0.710967
         ...   
581    0.537503
582    0.363121
583   -0.143052
584    0.466396
585    0.591919
586    0.771142
587    0.411806
588   -0.173916
589    0.322119
590   -0.140948
591   -0.152963
592    0.065997
593   -0.188753
594    0.381863
595    0.310419
596   -0.006055
597    0.450449
598    0.140594
599   -0.850922
600   -0.493893
601    0.740815
602   -0.091939
603    0.006231
604   -0.017246
605   -0.261561
606    0.152425
607    0.250985
608   -0.356652
609   -0.138026
610    0.183476
Name: b_u, Length: 610, 

In [4]:
[[3.23297966 3.03297966 3.03297966 3.366313  ]
 [3.09504863 2.89504863 2.89504863 3.22838196]
 [3.16401415 2.96401415 2.96401415 3.29734748]
 [3.09504863 2.89504863 2.89504863 3.22838196]
 [3.26746242 3.06746242 3.06746242 3.40079576]]


NameError: name 'num_items' is not defined