# Collaborative Filtering (User-User based) Recommendation with movies dataset

## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [2]:
!pip install pymongo




## Load data from database (user & rating)

In [9]:
from pymongo import MongoClient

# Connect to MongoDB
client = MongoClient("mongodb+srv://baosurgeous:testDatabase@testcluster.dfxfjru.mongodb.net/?")
db = client["moviesDB"]
movies_collection = db["movies"]
ratings_collection = db["ratings"]
users_collection = db["users"]
def convert_collection_to_pandas_dataframe(collection):
    # Retrieve data from MongoDB
    data_from_mongo = collection.find({}, {"_id": 0})

    # Convert data to a list of dictionaries
    data_list = list(data_from_mongo)

    # Convert to DataFrame
    df = pd.DataFrame(data_list)
    
    return df
    
movies_df = convert_collection_to_pandas_dataframe(movies_collection)
ratings_df = convert_collection_to_pandas_dataframe(ratings_collection)
users_df = convert_collection_to_pandas_dataframe(users_collection)

## Class Collaborative Filtering

In [10]:
import time
class CF(object):
    """docstring for CF"""
    def __init__(self, Y_data, k, dist_func = cosine_similarity, uuCF = 1):
        self.uuCF = uuCF # user-user (1) or item-item (0) CF
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k = k # number of neighbor points
        self.dist_func = dist_func
        self.Ybar_data = None
        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1
        self.items_id = [int(item) for item in np.unique(self.Y_data[:, 1])]

    def add(self, new_data):
        """
        Update Y_data matrix when new ratings come.
        For simplicity, suppose that there is no new user or item.
        """
        self.Y_data = np.concatenate((self.Y_data, new_data), axis = 0)

    def normalize_Y(self):
        users = self.Y_data[:, 0] # all users - first col of the Y_data
        self.Ybar_data = self.Y_data.copy()
        self.mu = np.zeros((self.n_users,))
        for n in range(self.n_users):
            # row indices of rating done by user n
            # since indices need to be integers, we need to convert
            ids = np.where(users == n)[0].astype(np.int32)
            # indices of all ratings associated with user n
            item_ids = self.Y_data[ids, 1]
            # and the corresponding ratings
            ratings = self.Y_data[ids, 2]
            # take mean
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0 # to avoid empty array and nan value
            # normalize
            self.Ybar_data[ids, 2] = ratings - self.mu[n]

        ################################################
        # form the rating matrix as a sparse matrix. Sparsity is important
        # for both memory and computing efficiency. For example, if #user = 1M,
        # #item = 100k, then shape of the rating matrix would be (100k, 1M),
        # you may not have enough memory to store this. Then, instead, we store
        # nonzeros only, and, of course, their locations.
        self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2],
            (self.Ybar_data[:, 1], self.Ybar_data[:, 0])), (self.n_items, self.n_users))
        self.Ybar = self.Ybar.tocsr()

    def similarity(self):
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)

    def refresh(self):
        """
        Normalize data and calculate similarity matrix again (after
        some few ratings added)
        """
        self.normalize_Y()
        self.similarity()

    def fit(self):
        self.refresh()

    def __pred(self, u, i, normalized = 1):
        """
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        # Step 1: find all users who rated i
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)
        # Step 2:
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        users_rated_i_to_list = [int(x) for x in users_rated_i]
        # Step 3: find similarity btw the current user and others
        # who already rated i
        sim = self.S[int(u), users_rated_i_to_list]
        # Step 4: find the k most similarity users
        a = np.argsort(sim)[-self.k:]
        # and the corresponding similarity levels
        nearest_s = sim[a]
        # How did each of 'near' users rated item i
        r = self.Ybar[i, users_rated_i[a]]
        if normalized:
            # add a small number, for instance, 1e-8, to avoid dividing by 0
            return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)
        return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8) + self.mu[int(u)]


    def pred(self, u, i, normalized = 1):
        """
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        if self.uuCF: return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized)

    def recommend(self, u, normalized=1):
        """
        Determine all items should be recommended for user u. (uuCF = 1)
        or all users who might have interest in item u (uuCF = 0)
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which
        have not been rated by u yet.
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()
        recommended_items = []

        for i in self.items_id:
            if i not in items_rated_by_u:
                rating = self.__pred(u, i)
                if rating > 0:
                    recommended_items.append((i, rating))

        # Sort recommended items based on ratings in descending order
        recommended_items.sort(key=lambda x: x[1], reverse=True)

        # Return only the top 5 items
        top5_recommendations = [item[0] + 1 for item in recommended_items[:5]]

        return top5_recommendations

    def print_recommendation(self):
        """
        print all items which should be recommended for each user
        """
        print('Recommendation: ')
        for u in range(self.n_users):
            recommended_items = self.recommend(u)
            if self.uuCF:
                print('Recommend item(s):', recommended_items, 'to user', u)
            else:
                print('Recommend item', u, 'to user(s) : ', recommended_items)

        # def recommend(self, u, normalized = 1):
    #     """
    #     Determine all items should be recommended for user u. (uuCF =1)
    #     or all users who might have interest on item u (uuCF = 0)
    #     The decision is made based on all i such that:
    #     self.pred(u, i) > 0. Suppose we are considering items which
    #     have not been rated by u yet.
    #     """
    #     ids = np.where(self.Y_data[:, 0] == u)[0]
    #     items_rated_by_u = self.Y_data[ids, 1].tolist()
    #     recommended_items = []
    #     for i in range(self.n_items):
    #         if i not in items_rated_by_u:
    #             rating = self.__pred(u, i)
    #             if rating > 0:
    #                 recommended_items.append(i)

    #     return recommended_items

## Divide the ratings data

In [11]:
from sklearn.model_selection import train_test_split

def split_data(df, test_size=0.1):
    train_set, test_set = pd.DataFrame(), pd.DataFrame()
    unique_users = ratings_df['user_id'].unique()

    for user in unique_users:
        user_data = df[df['user_id'] == user]
        train_user, test_user = train_test_split(user_data, test_size=test_size, random_state=42)

        train_set = pd.concat([train_set, train_user])
        test_set = pd.concat([test_set, test_user])

    return train_set, test_set

rating_train, rating_test = split_data(ratings_df, test_size=0.1)

In [12]:
rate_train = rating_train.values
rate_test = rating_test.values

# # indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

In [13]:
rs = CF(rate_train, k = 30, uuCF = 1)
rs.fit()

n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in range(n_tests):
    pred = rs.pred(int(rate_test[n, 0]), int(rate_test[n, 1]), normalized = 0)
    SE += (pred - rate_test[n, 2])**2

RMSE = np.sqrt(SE/n_tests)
print('User-user CF, RMSE =', RMSE)

User-user CF, RMSE = 1.068754618399911


## Update the result into database

In [14]:
# # Get the unique users in database
# # users_id = users_df['id'].unique()
# users_id = [1, 2, 672]
# # Update recommendation list for each user
# for user_id in users_id:
#     print(rs.recommend(user_id - 1))
#     users_collection.update_one(
#     {"id": int(user_id)},
#     {"$set": {"recommendation_list": rs.recommend(user_id - 1)}}
#     )

[4789, 3112, 4140, 2984, 1939]
[309, 764, 3112, 9010, 2330]
[309, 2330, 3112, 764, 9010]
