# Collaborative Filtering

In [1]:
import numpy as np
import pandas as pd

In [2]:
class SimilarityMetric:
    """Common interface for similarity measures between two rating vectors."""

    def calculateSimilarity(self, u, v, u_mean, v_mean):
        """
        Placeholder to be overridden by concrete metrics.

        Parameters:
        - u, v: the two rating vectors to compare
        - u_mean, v_mean: mean rating associated with u and v respectively

        Returns:
        - None in this base implementation
        """
        return None

In [3]:
class CosineSimilarity(SimilarityMetric):
    """Cosine similarity in which missing (NaN) ratings count as zero."""

    def calculateSimilarity(self, u, v, u_mean, v_mean):
        """
        Return the cosine of the angle between u and v.

        Parameters:
        - u, v: rating vectors; NaN entries are replaced by 0 before comparing
        - u_mean, v_mean: unused, accepted to match the SimilarityMetric interface

        Returns:
        - dot(u, v) / (|u| * |v| + 1e-9); the small constant avoids division by zero
        """
        left = np.nan_to_num(u, nan=0)
        right = np.nan_to_num(v, nan=0)
        norm_product = np.linalg.norm(left) * np.linalg.norm(right)
        return np.dot(left, right) / (norm_product + 1e-9)

class PCCSimilarity(SimilarityMetric):
    """Pearson-style similarity: cosine of the mean-centred rating vectors."""

    def calculateSimilarity(self, u, v, u_mean, v_mean):
        """
        Return the correlation-like similarity between u and v.

        Parameters:
        - u, v: rating vectors, possibly containing NaN for missing ratings
        - u_mean, v_mean: values subtracted from u and v before comparing

        Returns:
        - dot(u', v') / (|u'| * |v'| + 1e-9) where u' and v' are the centred vectors
        """
        # Centre first, then zero the gaps: a missing rating therefore
        # contributes nothing to the dot product or to either norm.
        centred_u = np.nan_to_num(u - u_mean, nan=0)
        centred_v = np.nan_to_num(v - v_mean, nan=0)

        scale = np.linalg.norm(centred_u) * np.linalg.norm(centred_v)
        return np.dot(centred_u, centred_v) / (scale + 1e-9)

In [4]:
class WeightedCosineSimilarity(SimilarityMetric):
    """Cosine similarity where every vector position carries its own weight."""

    def __init__(self, weights):
        """
        Parameters:
        - weights: per-position weights, broadcastable against the rating vectors
        """
        self.weights = weights

    def calculateSimilarity(self, u, v, u_mean, v_mean):
        """
        Return the weighted cosine similarity between u and v.

        Parameters:
        - u, v: rating vectors; NaN entries are replaced by 0
        - u_mean, v_mean: unused, accepted to match the SimilarityMetric interface

        Returns:
        - cosine similarity of u * sqrt(w) and v * sqrt(w), with 1e-9 added to
          the denominator to avoid division by zero
        """
        # Scaling both sides by sqrt(w) makes each product term carry weight w.
        root_weights = np.sqrt(self.weights)
        scaled_u = np.nan_to_num(u, nan=0) * root_weights
        scaled_v = np.nan_to_num(v, nan=0) * root_weights

        norm_product = np.linalg.norm(scaled_u) * np.linalg.norm(scaled_v)
        return np.dot(scaled_u, scaled_v) / (norm_product + 1e-9)


In [5]:
class WeightedPCCSimilarity(SimilarityMetric):
    """Mean-centred (Pearson-style) similarity with per-position weights."""

    def __init__(self, weights):
        """
        Parameters:
        - weights: per-position weights, broadcastable against the rating vectors
        """
        self.weights = weights

    def calculateSimilarity(self, u, v, u_mean, v_mean):
        """
        Return the weighted, mean-centred similarity between u and v.

        Parameters:
        - u, v: rating vectors, possibly containing NaN for missing ratings
        - u_mean, v_mean: values subtracted from u and v before comparing

        Returns:
        - cosine similarity of (u - u_mean) * sqrt(w) and (v - v_mean) * sqrt(w),
          with 1e-9 added to the denominator to avoid division by zero
        """
        root_weights = np.sqrt(self.weights)

        # Missing ratings become 0 after centring, so they drop out entirely.
        scaled_u = np.nan_to_num(u - u_mean, nan=0) * root_weights
        scaled_v = np.nan_to_num(v - v_mean, nan=0) * root_weights

        norm_product = np.linalg.norm(scaled_u) * np.linalg.norm(scaled_v)
        return np.dot(scaled_u, scaled_v) / (norm_product + 1e-9)
        

In [6]:
class CollaborativeFiltering:
    """Shared state and helpers for the collaborative-filtering recommenders."""

    # The annotation is a quoted forward reference so that defining this class
    # does not require SimilarityMetric to exist yet.
    def __init__(self, data, metric: "SimilarityMetric"):
        """
        Initialize CollaborativeFiltering object with user-item rating data.

        Reads "movies.csv" from the working directory and indexes it by MovieID.

        Parameters:
        - data: DataFrame containing user-item ratings
        - metric: SimilarityMetric used to compare rating vectors
        """
        self.data = data
        self.similarity_matrix = None
        self.metric = metric
        self.movies = pd.read_csv("movies.csv")
        self.movies.set_index('MovieID', inplace=True)

    def calculate_similarity_matrix(self):
        """Placeholder; subclasses compute and store the similarity matrix."""
        pass

    def predict_ratings(self, user_id):
        """Placeholder; subclasses return predicted ratings for user_id."""
        pass

    def getMovies(self, user_id, movie_id_list):
        """
        Look up the user's rating and the movie metadata for a list of movies.

        Parameters:
        - user_id: row label of the user in the rating data
        - movie_id_list: iterable of movie IDs (column labels of the rating data)

        Returns:
        - list of [movie_id, rating, title, genres], one entry per movie ID;
          rating is NaN where the user has no rating for that movie
        """
        # Read the rating straight from the user's row. Wrapping the row (a
        # Series named after the user) in DataFrame(..., columns=['a']) selects
        # a column that does not exist and yields NaN for every movie.
        user_ratings = self.data.loc[user_id]

        recommendations = []
        for movie_id in movie_id_list:
            movie = self.movies.loc[movie_id]
            recommendations.append(
                [movie_id, user_ratings.loc[movie_id], movie['Title'], movie['Genres']]
            )

        return recommendations

    def getSimilarityMatrix(self):
        """Return the currently stored similarity matrix."""
        return self.similarity_matrix

In [7]:
class CollaborativeFilteringItemItem(CollaborativeFiltering):
    """Item-item collaborative filtering: similarities are between columns of the rating data."""

    def __init__(self, data, metric:SimilarityMetric):
        """
        Parameters:
        - data: DataFrame containing user-item ratings (one column per item)
        - metric: SimilarityMetric used to compare two item columns
        """
        super().__init__(data, metric)
        # Per-column mean rating; pandas skips NaN when averaging.
        self.means = self.data.mean(axis=0)
        item_count = self.data.shape[1]
        self.similarity_matrix = np.zeros((item_count, item_count))

    def calculate_similarity_matrix(self):
        """
        Compute the similarity of every pair of item columns and store the
        result as a DataFrame labelled by the column labels on both axes.
        """
        item_ids = self.data.columns
        item_count = self.data.shape[1]
        scores = np.zeros((item_count, item_count))

        # Only the lower triangle (diagonal included) is evaluated; each value
        # is mirrored into the upper triangle.
        for a in range(item_count):
            ratings_a = self.data.iloc[:, a]
            mean_a = self.means[item_ids[a]]
            for b in range(a + 1):
                score = self.metric.calculateSimilarity(
                    ratings_a,
                    self.data.iloc[:, b],
                    mean_a,
                    self.means[item_ids[b]],
                )
                scores[a, b] = score
                scores[b, a] = score

        self.similarity_matrix = pd.DataFrame(scores, index=item_ids, columns=item_ids)

    def predict_ratings(self, user_id):
        """
        Predict ratings for items for a given user.

        Each prediction is the similarity-weighted average of the user's
        existing ratings on all other items.

        Parameters:
        - user_id: ID of the user for whom to predict ratings

        Returns:
        - DataFrame containing predicted ratings for each item
        """
        ratings_of_user = self.data.loc[user_id]
        predicted_ratings = pd.DataFrame(index=self.data.columns, columns=['PredictedRating'])

        for target_id in predicted_ratings.index:
            weighted_sum = 0
            weight_total = 0

            for neighbour_id in predicted_ratings.index:
                # An item never votes for itself.
                if neighbour_id == target_id:
                    continue

                sim = self.similarity_matrix.loc[target_id, neighbour_id]
                neighbour_rating = ratings_of_user[neighbour_id]

                # Items the user has not rated carry no information.
                if np.isnan(neighbour_rating):
                    continue

                weighted_sum += sim * (neighbour_rating)
                weight_total += abs(sim)

            # 1e-9 keeps the division defined when no neighbour contributed.
            predicted_ratings.loc[target_id, 'PredictedRating'] = weighted_sum / (weight_total + 1e-9)

        return predicted_ratings

    def setSimilarityMatrix(self, matrix):
        """
        Install an externally supplied similarity matrix.

        Parameters:
        - matrix: square array-like whose rows and columns follow the order of the data columns
        """
        item_ids = self.data.columns
        self.similarity_matrix = pd.DataFrame(matrix, index=item_ids, columns=item_ids)
        

In [8]:
class CollaborativeFilteringUserUser(CollaborativeFiltering):
    """User-user collaborative filtering: similarities are between rows of the rating data."""

    def __init__(self, data, metric: SimilarityMetric):
        """
        Parameters:
        - data: DataFrame containing user-item ratings (one row per user)
        - metric: SimilarityMetric used to compare two user rows
        """
        super().__init__(data, metric)
        # Per-row mean rating; pandas skips NaN when averaging.
        self.means = self.data.mean(axis=1)

        n_users = self.data.shape[0]
        self.similarity_matrix = np.zeros((n_users, n_users))

    def calculate_similarity_matrix(self):
        """
        Compute the similarity of every pair of user rows and store the result
        as a DataFrame labelled by the row labels on both axes.
        """
        user_ids = self.data.index
        n_users = self.data.shape[0]
        similarity_matrix = np.zeros((n_users, n_users))

        # Only the lower triangle (diagonal included) is evaluated; each value
        # is mirrored into the upper triangle.
        for i in range(n_users):
            for j in range(i + 1):
                temp = self.metric.calculateSimilarity(
                    self.data.iloc[i, :],
                    self.data.iloc[j, :],
                    self.means[user_ids[i]],
                    self.means[user_ids[j]],
                )
                similarity_matrix[i, j] = temp
                similarity_matrix[j, i] = temp

        self.similarity_matrix = pd.DataFrame(similarity_matrix, index=user_ids, columns=user_ids)

    def convert_user_to_row(self, user_dict):
        """
        Turn a {movie_id: rating} mapping into a rating row.

        Parameters:
        - user_dict: mapping from movie ID to the rating given by the user

        Returns:
        - Series indexed by the data columns, NaN wherever no rating was supplied
        """
        row = pd.Series(np.nan, index=self.data.columns)
        for movie_id, rating in user_dict.items():
            row[movie_id] = rating

        return row

    def add_new_row(self, row, user_id):
        """
        Append a new user and extend the similarity matrix by one row/column.

        Note that self.data is modified in place, so the DataFrame passed to
        the constructor gains the new row as well.

        Parameters:
        - row: ratings of the new user, aligned with the data columns
        - user_id: label for the new user; must not already be present

        Raises:
        - ValueError: if user_id already exists in the rating data
        """
        # Fail before touching any state: overwriting an existing user would
        # leave data, means and the similarity matrix with mismatched sizes.
        if user_id in self.data.index:
            raise ValueError(f"user_id {user_id!r} already exists in the rating data")

        self.data.loc[user_id] = row

        # Series.append was removed in pandas 2.0; pd.concat is the supported
        # replacement. The mean is taken from the stored row (NaN skipped).
        new_mean = pd.Series(self.data.loc[user_id].mean(), index=[user_id])
        self.means = pd.concat([self.means, new_mean])

        n, _ = self.similarity_matrix.shape

        # Copy the existing similarities into the top-left corner of a matrix
        # that is one row and one column larger.
        similarity_matrix_new = np.zeros((n + 1, n + 1))
        similarity_matrix_new[:n, :n] = self.similarity_matrix

        # Fill the new last row/column, including the new user's self-similarity.
        new_ratings = self.data.iloc[-1]
        new_user_mean = self.means[self.data.index[-1]]
        for i in range(n + 1):
            temp = self.metric.calculateSimilarity(
                self.data.iloc[i, :],
                new_ratings,
                self.means[self.data.index[i]],
                new_user_mean,
            )
            similarity_matrix_new[i, -1] = temp
            similarity_matrix_new[-1, i] = temp

        self.similarity_matrix = pd.DataFrame(similarity_matrix_new, index=self.data.index, columns=self.data.index)

    def predict_ratings(self, user_id):
        """
        Predict ratings for items for a given user.

        Each prediction is the user's mean rating plus the similarity-weighted
        average of the other users' deviations from their own means.

        Parameters:
        - user_id: ID of the user for whom to predict ratings

        Returns:
        - DataFrame containing predicted ratings for each item
        """
        predicted_ratings = pd.DataFrame(index=self.data.columns, columns=['PredictedRating'])

        # The target user's similarities and mean do not change per movie.
        similarities = self.similarity_matrix.loc[user_id]
        user_mean = self.means[user_id]

        for movie_id in predicted_ratings.index:
            numerator = 0
            denominator = 0

            for other_user_id in self.data.index:
                other_user_rating = self.data.loc[other_user_id, movie_id]

                # Users who have not rated this movie carry no information.
                if np.isnan(other_user_rating):
                    continue

                similarity = similarities.loc[other_user_id]
                numerator += similarity * (other_user_rating - self.means[other_user_id])
                denominator += abs(similarity)

            # 1e-9 keeps the division defined when nobody rated the movie.
            predicted_ratings.loc[movie_id, 'PredictedRating'] = numerator / (denominator + 1e-9) + user_mean

        return predicted_ratings

    def setSimilarityMatrix(self, matrix):
        """
        Install an externally supplied similarity matrix.

        Parameters:
        - matrix: square array-like whose rows and columns follow the order of the data rows
        """
        self.similarity_matrix = pd.DataFrame(matrix, index=self.data.index, columns=self.data.index)


In [9]:
class WeightsProvider:
    """Common interface for objects that supply per-item weights."""

    def getWeightsArray(self):
        """
        Placeholder to be overridden by concrete providers.

        Returns:
        - None in this base implementation
        """
        return None

In [10]:
class IDFWeightsProvider(WeightsProvider):
    """Inverse-frequency weights: columns with fewer non-zero entries weigh more."""

    def __init__(self, data):
        """
        Compute one weight per column as log(row count / non-zero count).

        Parameters:
        - data: DataFrame of ratings; NaN is treated as "no rating"
          (presumably rows are users and columns are movies - confirm with caller)
        """
        filled = data.fillna(0)
        row_count, column_count = filled.shape[0], filled.shape[1]

        totals = np.full((column_count,), row_count)
        # After fillna(0) a non-zero cell is a cell that held a rating.
        rated_counts = np.count_nonzero(filled, axis=0)
        # 1e-9 keeps the ratio finite for columns with no ratings at all.
        idf = np.log(totals / (rated_counts + 1e-9))

        self.weights = np.nan_to_num(idf, nan=0)

        print(self.weights.shape)

    def getWeightsArray(self):
        """Return the weight array computed in the constructor."""
        return self.weights

In [11]:
class VarianceWeightsProvider(WeightsProvider):
    """Variance weights: each column is weighted by the variance of its ratings."""

    def __init__(self, data):
        """
        Compute one weight per column as the variance of its ratings.

        Parameters:
        - data: DataFrame of ratings; both 0 and NaN are treated as "no rating"
        """
        # np.nan is the supported spelling; the np.NaN alias was removed in
        # NumPy 2.0 and raises AttributeError there.
        temp = data.replace(0, np.nan)
        weights = np.array(temp.var(axis=0))
        # Columns whose variance is undefined (NaN) get weight 0.
        self.weights = np.nan_to_num(weights, nan=0)

    def getWeightsArray(self):
        """Return the weight array computed in the constructor."""
        return self.weights

In [12]:
# Load the ratings table; it must provide the UserID, MovieID and Rating
# columns used by the pivot below.
data=pd.read_csv("EncodedCombined1.csv")

# Reshape to a wide matrix: one row per UserID, one column per MovieID, cells
# holding Rating. User/movie pairs without a rating become NaN.
user_item_matrix = data.pivot(index='UserID', columns='MovieID', values='Rating')
# user_item_matrix = user_item_matrix.loc[0:1000,0:500]

## Cosine Similarity with Item Item Filtering

In [13]:
# metric = CosineSimilarity()
# # Create CollaborativeFiltering instance
# cf = CollaborativeFilteringItemItem(user_item_matrix_new, metric)

# cf.calculate_similarity_matrix()

# cf.getSimilarityMatrix()

In [14]:
# cf.getSimilarityMatrix().to_csv("Item_Item_Cosine_collaborative_filtering_similarity_matix.csv")

In [15]:
# prediction_df = cf.predict_ratings(2)

# ids = prediction_df.sort_values(by=['PredictedRating']).tail(10).index
# movies = cf.getMovies(2, ids)

# for movie in movies:
#     print(f"Title : {movie[2]} , MovieID : {movie[0]} , Rating : {movie[1]} , Genres : {movie[3]}")

## PCC Similarity with Item Item Filtering 

In [16]:
# metric = PCCSimilarity()
# cf = CollaborativeFilteringItemItem(user_item_matrix, metric)

# cf.calculate_similarity_matrix()
# cf.getSimilarityMatrix()

In [17]:
# cf.getSimilarityMatrix().to_csv("Item_Item_PCC_collaborative_filtering_similarity_matix.csv")

In [18]:
# prediction_df = cf.predict_ratings(2)

# ids = prediction_df.sort_values(by=['PredictedRating']).tail(10).index
# movies = cf.getMovies(2, ids)

# for movie in movies:
#     print(f"Title : {movie[2]} , MovieID : {movie[0]}, Genres : {movie[3]}")

## Cosine Similarity with User User Filtering

In [19]:
# metric = CosineSimilarity()
# # Create CollaborativeFiltering instance
# cf = CollaborativeFilteringUserUser(user_item_matrix_new, metric)

# cf.calculate_similarity_matrix()
# cf.getSimilarityMatrix()

In [20]:
# cf.getSimilarityMatrix().to_csv("User_User_cosine_collaborative_filtering_similarity_matix.csv")

In [21]:
# prediction_df = cf.predict_ratings(2)

# ids = prediction_df.sort_values(by=['PredictedRating']).tail(10).index
# movies = cf.getMovies(2, ids)

# for movie in movies:
#     print(f"Title : {movie[2]} , MovieID : {movie[0]} , Rating : {movie[1]} , Genres : {movie[3]}")

## PCC Similarity with User User Filtering

In [22]:
# Build a user-user recommender that compares users with PCCSimilarity.
metric = PCCSimilarity()
cf = CollaborativeFilteringUserUser(user_item_matrix, metric)

# cf.calculate_similarity_matrix()
# cf.getSimilarityMatrix()


# Load a similarity matrix from disk instead of computing it in this cell
# (presumably one saved from an earlier run - confirm the file's origin).
matrix = pd.read_csv("Similarity_demo.csv")
matrix.set_index("UserID", inplace=True)
# matrix.shape

# Only the raw values are passed; the row/column labels are re-applied from
# the rating data's index inside setSimilarityMatrix.
cf.setSimilarityMatrix(matrix.values)

# Register a new user from a {MovieID: rating} mapping.
# NOTE(review): the rating 10 for movie 594 looks out of range if ratings use
# a 1-5 scale - confirm.
user_id = 6041
row = {1:4, 2:5, 1193:2, 661:5, 914:4, 594:10, 919:1, 2321:1, 720:1, 3105:4}
new_row = cf.convert_user_to_row(row)

# add_new_row also appends the row to user_item_matrix itself, because cf.data
# is the same DataFrame object.
cf.add_new_row(new_row, user_id)
cf.getSimilarityMatrix()

  self.means = self.means.append(pd.Series(temp_df['a'].mean(), index=[user_id]))


UserID,1,2,3,4,5,6,7,8,9,10,...,6032,6033,6034,6035,6036,6037,6038,6039,6040,6041
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.082041,0.098679,0.119309,0.076328,0.194363,0.050620,0.131492,0.198703,0.240358,...,0.074222,0.053300,0.024611,0.094191,0.169677,0.118343,0.000000,0.183805,0.113509,0.072682
2,0.082041,1.000000,0.140620,0.160555,0.107722,0.103986,0.292533,0.200150,0.181151,0.220953,...,0.079917,0.259086,0.011187,0.180327,0.215216,0.180011,0.076486,0.067280,0.198564,-0.001882
3,0.098679,0.140620,1.000000,0.127482,0.058443,0.074325,0.137137,0.076719,0.122259,0.195897,...,0.122041,0.139584,0.000000,0.090789,0.127601,0.098741,0.112675,0.082593,0.123667,0.000000
4,0.119309,0.160555,0.127482,1.000000,0.041318,0.013341,0.130671,0.096879,0.091798,0.111051,...,0.101594,0.358500,0.000000,0.080044,0.159024,0.118856,0.066695,0.061785,0.132121,0.000000
5,0.076328,0.107722,0.058443,0.041318,1.000000,0.045247,0.115625,0.212391,0.239664,0.110586,...,0.032713,0.060225,0.045592,0.173867,0.293889,0.166299,0.022439,0.021742,0.231778,-0.020187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6037,0.118343,0.180011,0.098741,0.118856,0.166299,0.062156,0.107781,0.140843,0.209928,0.224630,...,0.266286,0.113203,0.103547,0.132241,0.327988,1.000000,0.043878,0.230415,0.403680,-0.012088
6038,0.000000,0.076486,0.112675,0.066695,0.022439,0.070347,0.000000,0.018253,0.075615,0.097827,...,0.096957,0.040765,0.000000,0.078436,0.121435,0.043878,1.000000,0.138670,0.099446,0.000000
6039,0.183805,0.067280,0.082593,0.061785,0.021742,0.187522,0.013298,0.045258,0.038281,0.282949,...,0.232977,0.044041,0.080142,0.069883,0.202747,0.230415,0.138670,1.000000,0.209328,-0.021528
6040,0.113509,0.198564,0.123667,0.132121,0.231778,0.080297,0.071899,0.138700,0.198517,0.229803,...,0.281321,0.100043,0.105750,0.161167,0.391048,0.403680,0.099446,0.209328,1.000000,-0.032709


In [23]:
# cf.getSimilarityMatrix().to_csv("User_User_PCC_collaborative_filtering_similarity_matix.csv")

In [24]:
# prediction_df = cf.predict_ratings(2)

# ids = prediction_df.sort_values(by=['PredictedRating']).tail(10).index
# movies = cf.getMovies(2, ids)

# for movie in movies:
#     print(f"Title : {movie[2]} , MovieID : {movie[0]} , Genres : {movie[3]}")

## Variance Weighted PCC Similarity with User User Filtering 

In [25]:
# WeightProvider = VarianceWeightsProvider(user_item_matrix)

# metric = WeightedPCCSimilarity(WeightProvider.getWeightsArray())
# cf = CollaborativeFilteringUserUser(user_item_matrix, metric)
# cf.calculate_similarity_matrix()

# cf.getSimilarityMatrix()

In [26]:
# cf.getSimilarityMatrix().to_csv("Variance_Weighted_PCC_collaborative_filtering_similarity_matix.csv")

In [27]:
# prediction_df = cf.predict_ratings(2)

# ids = prediction_df.sort_values(by=['PredictedRating']).tail(10).index
# movies = cf.getMovies(2, ids)

# for movie in movies:
#     print(f"Title : {movie[2]} , MovieID : {movie[0]}, Genres : {movie[3]}")

## IDF Weighted PCC Similarity with User User Filtering

In [28]:
# WeightProvider = IDFWeightsProvider(user_item_matrix)

# metric = WeightedPCCSimilarity(WeightProvider.getWeightsArray())
# cf = CollaborativeFilteringUserUser(user_item_matrix, metric)

# cf.calculate_similarity_matrix()
# cf.getSimilarityMatrix()

In [29]:
# cf.getSimilarityMatrix().to_csv("IDF_Weighted_PCC_collaborative_filtering_similarity_matix.csv")

In [30]:
# prediction_df = cf.predict_ratings(2)

# ids = prediction_df.sort_values(by=['PredictedRating']).tail(10).index
# movies = cf.getMovies(2, ids)

# for movie in movies:
#     print(f"Title : {movie[2]} , MovieID : {movie[0]}, Genres : {movie[3]}")