In [81]:
import numpy as np
import pandas as pd

In [82]:
class SimilarityMetric:
    """Interface for similarity measures between two rating vectors.

    Subclasses override ``calculateSimilarity``; the version defined here is a
    placeholder that performs no computation.
    """

    def calculateSimilarity(self, u, v, u_mean, v_mean):
        """Placeholder: always returns ``None``.

        Parameters:
        - u, v: the two rating vectors to compare
        - u_mean, v_mean: mean rating associated with ``u`` and ``v``
        """
        return None

In [83]:
class WeightedCosineSimilarity(SimilarityMetric):
    """Cosine similarity in which each vector component carries a weight."""

    def __init__(self, weights):
        """Keep the per-component weights used by ``calculateSimilarity``."""
        self.weights = weights

    def calculateSimilarity(self, u, v, u_mean, v_mean):
        """Return the weighted cosine similarity of ``u`` and ``v``.

        NaN entries are replaced by 0 before weighting. ``u_mean`` and
        ``v_mean`` belong to the shared metric signature and are not used.
        """
        def _scaled(vector):
            # Scaling both vectors by sqrt(w) puts a factor w on every term of
            # their dot product.
            return np.nan_to_num(vector, nan=0) * np.sqrt(self.weights)

        lhs = _scaled(u)
        rhs = _scaled(v)

        # The 1e-9 keeps the division defined when either vector is all zeros.
        magnitude = np.linalg.norm(lhs) * np.linalg.norm(rhs) + 1e-9
        return np.dot(lhs, rhs) / magnitude
    
class WeightedPCCSimilarity(SimilarityMetric):
    """Weighted Pearson-style similarity: mean-centre, then weighted cosine."""

    def __init__(self, weights):
        """Keep the per-component weights used by ``calculateSimilarity``."""
        self.weights = weights

    def calculateSimilarity(self, u, v, u_mean, v_mean):
        """Return the weighted correlation-style similarity of ``u`` and ``v``.

        Each vector is shifted by its supplied mean; entries that are NaN after
        the shift are replaced by 0, so they add nothing to the dot product or
        to the norms.
        """
        centred_u = np.nan_to_num(u - u_mean, nan=0)
        centred_v = np.nan_to_num(v - v_mean, nan=0)

        # Scaling both vectors by sqrt(w) puts a factor w on every term of
        # their dot product.
        lhs = centred_u * np.sqrt(self.weights)
        rhs = centred_v * np.sqrt(self.weights)

        numerator = np.dot(lhs, rhs)
        # The 1e-9 keeps the division defined when either vector is all zeros.
        denominator = np.linalg.norm(lhs) * np.linalg.norm(rhs) + 1e-9
        return numerator / denominator
        
    

In [84]:
class CollaborativeFiltering:
    """Base class for collaborative-filtering recommenders over a rating matrix.

    Holds the rating data, the similarity metric and the movie catalogue;
    subclasses supply ``calculate_similarity_matrix`` and ``predict_ratings``.
    """

    def __init__(self, data, metric: "SimilarityMetric"):
        """
        Initialize CollaborativeFiltering object with user-item rating data.

        Reads ``movies.csv`` from the current working directory and indexes it
        by its ``MovieID`` column so titles and genres can be looked up by id.

        Parameters:
        - data: DataFrame containing user-item ratings; ``getMovies`` looks up
          rows by user id and columns by movie id
        - metric: SimilarityMetric used to compare rating vectors
        """
        self.data = data
        self.similarity_matrix = None
        self.metric = metric
        # Chained instead of inplace=True: same result, no in-place mutation.
        self.movies = pd.read_csv("movies.csv").set_index('MovieID')

    def calculate_similarity_matrix(self):
        """Hook for subclasses; this base version does nothing."""
        pass

    def predict_ratings(self, user_id):
        """Hook for subclasses; this base version returns ``None``."""
        pass

    def getMovies(self, user_id, movie_id_list):
        """
        Collect rating and catalogue details for the given movies.

        Parameters:
        - user_id: row label of the user in ``self.data``
        - movie_id_list: iterable of movie ids (column labels of ``self.data``
          that are also present in the ``self.movies`` index)

        Returns:
        - list of ``[movie_id, rating, title, genres]`` entries in input order,
          where ``rating`` is the user's stored rating (NaN if unrated)
        """
        recommendations = []

        for movie_id in movie_id_list:
            # Direct label lookup. Wrapping the user's row in
            # pd.DataFrame(..., columns=['a']) does not work: the row Series is
            # named after the user id, so a column called 'a' matches nothing
            # and comes back entirely NaN.
            rating = self.data.loc[user_id, movie_id]
            movie = self.movies.loc[movie_id]
            recommendations.append([movie_id, rating, movie['Title'], movie['Genres']])

        return recommendations

    def getSimilarityMatrix(self):
        """Return the stored similarity matrix (``None`` until one is computed)."""
        return self.similarity_matrix

In [85]:
class CollaborativeFilteringUserUser(CollaborativeFiltering):
    """User-user collaborative filtering driven by a pluggable similarity metric."""

    def __init__(self, data, metric: SimilarityMetric):
        """
        Parameters:
        - data: DataFrame of ratings with one row per user and one column per item
        - metric: SimilarityMetric used to compare two users' rating rows
        """
        super().__init__(data, metric)
        # Per-row mean rating; DataFrame.mean skips NaN cells by default.
        self.means = self.data.mean(axis=1)

    def calculate_similarity_matrix(self):
        """
        Compute the symmetric user-user similarity matrix.

        Calls the metric once for every unordered pair of rows (diagonal
        included) and stores the result in ``self.similarity_matrix`` as a
        DataFrame labelled by ``self.data.index`` on both axes.
        """
        n_users = self.data.shape[0]
        labels = self.data.index
        similarity_matrix = np.zeros((n_users, n_users))

        for i in range(n_users):
            # Row i and its mean do not change across the inner loop; fetch once.
            row_i = self.data.iloc[i, :]
            mean_i = self.means.loc[labels[i]]
            for j in range(i + 1):
                score = self.metric.calculateSimilarity(
                    row_i, self.data.iloc[j, :], mean_i, self.means.loc[labels[j]]
                )
                # The metric is treated as symmetric: mirror across the diagonal.
                similarity_matrix[i, j] = score
                similarity_matrix[j, i] = score

        self.similarity_matrix = pd.DataFrame(similarity_matrix, index=labels, columns=labels)

    def predict_ratings(self, user_id):
        """
        Predict ratings for items for a given user.

        For every item the prediction is the user's mean rating plus the
        similarity-weighted average deviation (rating minus own mean) of all
        users who rated that item. The target user's own row is part of that
        set, and items the user has already rated are predicted as well.

        Parameters:
        - user_id: ID of the user for whom to predict ratings

        Returns:
        - DataFrame indexed by item with one float column 'PredictedRating'

        Raises:
        - RuntimeError: if the similarity matrix has not been computed yet
        - KeyError: if ``user_id`` is not a known user
        """
        if self.similarity_matrix is None:
            raise RuntimeError(
                "calculate_similarity_matrix() must be called before predict_ratings()"
            )

        users = self.data.index
        ratings = self.data.to_numpy(dtype=float)                # (users, items)
        rated = ~np.isnan(ratings)
        neighbour_means = self.means.loc[users].to_numpy(dtype=float)
        sims = self.similarity_matrix.loc[user_id, users].to_numpy(dtype=float)

        # Vectorised form of the per-(item, user) accumulation: only cells that
        # hold a rating contribute to either sum.
        deviations = ratings - neighbour_means[:, np.newaxis]
        numerator = np.where(rated, sims[:, np.newaxis] * deviations, 0.0).sum(axis=0)
        denominator = np.where(rated, np.abs(sims)[:, np.newaxis], 0.0).sum(axis=0)

        # The 1e-9 avoids dividing by zero for items nobody has rated.
        predictions = numerator / (denominator + 1e-9) + self.means.loc[user_id]

        return pd.DataFrame({'PredictedRating': predictions}, index=self.data.columns)
    
    

In [86]:
class WeightsProvider:
    """Interface for objects that supply an array of weights.

    Subclasses override ``getWeightsArray``; the version defined here is a
    placeholder.
    """

    def getWeightsArray(self):
        """Placeholder: always returns ``None``."""
        return None

In [87]:
class IDFWeightsProvider(WeightsProvider):
    """Inverse-frequency weights, one per column of the supplied frame."""

    def __init__(self, data):
        """Compute ``log(row count / non-zero count)`` for every column.

        NaN cells are counted as zero. The shape of the resulting weight
        array is printed once the weights are computed.

        Parameters:
        - data: DataFrame whose columns are the entities to weight
        """
        filled = data.fillna(0)
        row_totals = np.full((filled.shape[1],), filled.shape[0])
        nonzero_per_column = np.count_nonzero(filled, axis=0)

        # The 1e-9 guards the division for columns with no non-zero entry.
        idf = np.log(row_totals / (nonzero_per_column + 1e-9))
        self.weights = np.nan_to_num(idf, nan=0)

        print(self.weights.shape)

    def getWeightsArray(self):
        """Return the weight array computed in the constructor."""
        return self.weights

In [88]:
# Load the long-format ratings file; the pivot below relies on its
# UserID, MovieID and Rating columns.
data=pd.read_csv("EncodedCombined.csv")

# Reshape to one row per UserID and one column per MovieID, with Rating as
# the cell value. (user, movie) pairs absent from the file come out as NaN.
user_item_matrix = data.pivot(index='UserID', columns='MovieID', values='Rating')
# Uncomment to work on a small slice of the matrix while experimenting.
# user_item_matrix = user_item_matrix.loc[0:100,0:500]

  data=pd.read_csv("EncodedCombined.csv")


In [89]:
# Derive one inverse-frequency weight per column of the rating matrix.
WeightProvider = IDFWeightsProvider(user_item_matrix)

# Plug the weights into the weighted Pearson-style metric and build the
# user-user model on the full rating matrix.
metric = WeightedPCCSimilarity(WeightProvider.getWeightsArray())
cf = CollaborativeFilteringUserUser(user_item_matrix, metric)

# Calls the metric for every pair of users in a Python loop, so this is the
# expensive step of the notebook.
cf.calculate_similarity_matrix()
# Last expression of the cell: displays the user x user similarity DataFrame.
cf.getSimilarityMatrix()

(3706,)


UserID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.021146,-0.008175,0.005383,-0.013787,0.042910,0.006870,0.006475,0.018927,-0.010583,...,0.004656,0.014650,0.030834,-0.001537,0.035058,-0.040415,0.000528,0.000000,0.016035,-0.001365
2,0.021146,1.000000,0.013503,0.027119,-0.019769,-0.007677,0.060327,-0.006276,0.034259,-0.009728,...,-0.003423,0.006305,0.045960,-0.007353,0.050340,0.055355,0.035613,-0.013776,0.024301,-0.005660
3,-0.008175,0.013503,1.000000,0.021900,-0.016451,-0.005590,0.043988,-0.022609,-0.008825,0.004226,...,-0.007465,0.007002,-0.019640,0.000000,0.021344,0.004241,-0.003356,0.037131,0.017791,-0.025360
4,0.005383,0.027119,0.021900,1.000000,0.003468,0.014386,-0.006658,0.024799,0.014559,-0.037057,...,-0.018660,-0.001913,0.001916,0.000000,-0.013799,0.035240,0.009808,-0.057017,-0.005423,0.013867
5,-0.013787,-0.019769,-0.016451,0.003468,1.000000,-0.017507,0.008468,0.024069,0.022458,-0.001244,...,0.006033,0.008758,0.003673,0.010518,0.033688,0.025639,-0.011555,-0.025203,0.012816,0.043316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,-0.040415,0.055355,0.004241,0.035240,0.025639,-0.021150,0.027825,0.057204,0.023984,0.032471,...,0.001971,0.024155,0.045578,0.031718,0.072574,1.000000,0.012543,0.000140,0.030844,0.088522
6037,0.000528,0.035613,-0.003356,0.009808,-0.011555,-0.002215,-0.014379,-0.002931,0.036959,0.001779,...,-0.017184,0.043127,0.007101,0.015585,0.018597,0.012543,1.000000,-0.015011,0.027634,0.029184
6038,0.000000,-0.013776,0.037131,-0.057017,-0.025203,-0.043552,0.000000,0.000157,0.031946,0.058128,...,-0.059637,-0.012599,0.033721,0.000000,0.021893,0.000140,-0.015011,1.000000,0.053320,-0.041784
6039,0.016035,0.024301,0.017791,-0.005423,0.012816,-0.012164,-0.000552,0.000799,0.001215,0.008950,...,-0.041333,-0.003412,0.002555,0.051849,0.004470,0.030844,0.027634,0.053320,1.000000,0.056116


In [90]:
# Write the similarity matrix to CSV in the working directory.
# NOTE(review): "matix" in the file name looks like a typo for "matrix"; the
# path is left untouched because other code may already read this exact name.
cf.getSimilarityMatrix().to_csv("IDF_Weighted_PCC_collaborative_filtering_similarity_matix.csv")

In [91]:
# Predict a rating for every movie column for user 2.
prediction_df = cf.predict_ratings(2)

# sort_values is ascending, so tail(10) keeps the ten highest predictions,
# ordered from lowest to highest.
# NOTE(review): movies user 2 has already rated are not filtered out here.
ids = prediction_df.sort_values(by=['PredictedRating']).tail(10).index
movies = cf.getMovies(2, ids)

# Each entry is [movie id, rating, title, genres]; the rating slot is not printed.
for movie in movies:
    print(f"Title : {movie[2]} , MovieID : {movie[0]}, Genres : {movie[3]}")

Title : Even Dwarfs Started Small (Auch Zwerge haben klein angefangen) (1971) , MovieID : 3202, Genres : Drama
Title : Identification of a Woman (Identificazione di una donna) (1982) , MovieID : 1360, Genres : Drama
Title : Trois (2000) , MovieID : 3291, Genres : Thriller
Title : Low Life, The (1994) , MovieID : 730, Genres : Drama
Title : Foreign Student (1994) , MovieID : 572, Genres : Drama
Title : Zachariah (1971) , MovieID : 3236, Genres : Western
Title : Wirey Spindell (1999) , MovieID : 3228, Genres : Comedy
Title : Crude Oasis, The (1995) , MovieID : 821, Genres : Romance
Title : Little Indian, Big City (Un indien dans la ville) (1994) , MovieID : 641, Genres : Comedy
Title : Loves of Carmen, The (1948) , MovieID : 3209, Genres : Drama
