In [269]:
import numpy as np
import pandas as pd
import math

In [270]:
class SimilarityMetric:
    """Interface for pairwise similarity measures between two rating vectors.

    Subclasses override ``calculateSimilarity``; this base implementation does
    nothing and returns ``None``.
    """

    def calculateSimilarity(self, u, v, u_mean, v_mean):
        """Return the similarity between ``u`` and ``v``.

        Parameters:
        - u, v: rating vectors to compare
        - u_mean, v_mean: mean rating associated with ``u`` and ``v``

        Returns:
        - None in the base class (placeholder for subclasses)
        """
        return None

In [271]:
class WeightedCosineSimilarity(SimilarityMetric):
    """Cosine similarity in which every item contributes with a per-item weight."""

    def __init__(self, weights):
        """Store the per-item weights (one entry per element of a rating vector)."""
        self.weights = weights

    def calculateSimilarity(self, u, v, u_mean, v_mean):
        """Return the weighted cosine similarity between ``u`` and ``v``.

        Missing ratings (NaN) are treated as 0. ``u_mean`` and ``v_mean`` are
        accepted for interface compatibility but are not used.
        """
        # Scaling both vectors by sqrt(w) makes their dot product sum(w * u * v).
        root_weights = np.sqrt(self.weights)
        scaled_u = np.nan_to_num(u, nan=0) * root_weights
        scaled_v = np.nan_to_num(v, nan=0) * root_weights

        numerator = np.dot(scaled_u, scaled_v)
        # The small epsilon keeps the division defined for all-zero vectors.
        denominator = np.linalg.norm(scaled_u) * np.linalg.norm(scaled_v) + 1e-9
        return numerator / denominator
    

In [272]:
class WeightedPCCSimilarity(SimilarityMetric):
    """Weighted Pearson correlation computed over the items rated by both users."""

    def __init__(self, weights):
        """Store the per-item weights.

        Parameters:
        - weights: array-like with one weight per item, in the same positional
          order as the rating vectors passed to ``calculateSimilarity``
        """
        self.weights = weights

    def calculateSimilarity(self, u, v, u_mean, v_mean):
        """Return the weighted Pearson correlation between ``u`` and ``v``.

        Parameters:
        - u, v: rating vectors (NaN marks an unrated item)
        - u_mean, v_mean: mean rating of each user, used to centre the ratings

        Returns:
        - the correlation, or 0 when it is undefined (no co-rated items or a
          zero weighted deviation for either user)
        """
        return self.__weighted_pearson_correlation(u, v, u_mean, v_mean)

    def __weighted_pearson_correlation(self, x, y, x_mean, y_mean):
        """Compute sum(w*dx*dy) / (sqrt(sum(w*dx^2)) * sqrt(sum(w*dy^2)))."""
        # Work on plain float arrays so that entries are matched by position.
        # Building DataFrames from named Series with a different column name
        # produces all-NaN columns, and index labels are not valid positions.
        x_values = np.asarray(x, dtype=float)
        y_values = np.asarray(y, dtype=float)
        w_values = np.asarray(self.weights, dtype=float)

        # Only items rated by both users take part in the correlation.
        co_rated = ~np.isnan(x_values) & ~np.isnan(y_values)
        if not co_rated.any():
            return 0

        x_dev = x_values[co_rated] - x_mean
        y_dev = y_values[co_rated] - y_mean
        w = w_values[co_rated]

        # Weighted covariance pairs x's deviation with y's deviation.
        weighted_covariance = np.sum(w * x_dev * y_dev)

        # Weighted standard deviations (unnormalised; the factor cancels out).
        weighted_std_x = math.sqrt(np.sum(w * x_dev ** 2))
        weighted_std_y = math.sqrt(np.sum(w * y_dev ** 2))

        if weighted_std_x == 0 or weighted_std_y == 0:
            return 0  # Correlation is undefined; avoid division by zero

        return weighted_covariance / (weighted_std_x * weighted_std_y)

In [273]:
class CollaborativeFiltering:
    """Base class holding the rating matrix, the similarity metric and movie metadata."""

    def __init__(self, data, metric: "SimilarityMetric", movies_path="movies.csv"):
        """
        Initialize CollaborativeFiltering object with user-item rating data.

        Parameters:
        - data: DataFrame containing user-item ratings (rows are looked up by
          user id, columns by movie id)
        - metric: similarity metric used by subclasses to compare rating vectors
        - movies_path: CSV file with at least the columns 'MovieID', 'Title'
          and 'Genres'; defaults to "movies.csv" in the working directory
        """
        self.data = data
        self.similarity_matrix = None
        self.metric = metric
        # Index by MovieID so metadata can be fetched with .loc[movie_id].
        self.movies = pd.read_csv(movies_path).set_index('MovieID')

    def calculate_similarity_matrix(self):
        """Placeholder; subclasses fill ``self.similarity_matrix``."""
        pass

    def predict_ratings(self, user_id):
        """Placeholder; subclasses return predicted ratings for ``user_id``."""
        pass

    def getMovies(self, user_id, movie_id_list):
        """
        Look up metadata and the user's own rating for each requested movie.

        Parameters:
        - user_id: row label of the user in ``self.data``
        - movie_id_list: iterable of movie ids (column labels of ``self.data``)

        Returns:
        - list of ``[movie_id, rating, title, genres]`` entries in input order;
          ``rating`` is NaN when the user has not rated the movie
        """
        # The user's ratings do not change between movies, so fetch them once.
        user_ratings = self.data.loc[user_id]

        recommendations = []
        for movie_id in movie_id_list:
            movie = self.movies.loc[movie_id]
            recommendations.append(
                [movie_id, user_ratings.loc[movie_id], movie['Title'], movie['Genres']]
            )

        return recommendations

    def getSimilarityMatrix(self):
        """Return the similarity matrix (None until it has been calculated)."""
        return self.similarity_matrix

In [274]:
class CollaborativeFilteringUserUser(CollaborativeFiltering):
    """User-user collaborative filtering: predictions come from similar users' ratings."""

    def __init__(self, data, metric:SimilarityMetric):
        super().__init__(data, metric)
        # Mean rating of every row (user); pandas skips NaN entries by default.
        self.means = self.data.mean(axis=1)

    def calculate_similarity_matrix(self):
        """Fill ``self.similarity_matrix`` with the metric value for every pair of users.

        Only the lower triangle (diagonal included) is evaluated; each value is
        mirrored into the upper triangle.
        """
        user_ids = self.data.index
        user_count = self.data.shape[0]
        pairwise = np.zeros((user_count, user_count))

        for row in range(user_count):
            for col in range(row + 1):
                score = self.metric.calculateSimilarity(
                    self.data.iloc[row, :],
                    self.data.iloc[col, :],
                    self.means[user_ids[row]],
                    self.means[user_ids[col]],
                )
                pairwise[row, col] = pairwise[col, row] = score

        self.similarity_matrix = pd.DataFrame(pairwise, index=user_ids, columns=user_ids)

    def predict_ratings(self, user_id):
        """
        Predict a rating for every item for the given user.

        Each prediction is the user's mean rating plus the similarity-weighted
        average of the other raters' mean-centred ratings for that item.
        ``calculate_similarity_matrix`` must have been run first.

        Parameters:
        - user_id: ID of the user for whom to predict ratings

        Returns:
        - DataFrame indexed by item with a single 'PredictedRating' column
        """
        predicted_ratings = pd.DataFrame(index=self.data.columns, columns=['PredictedRating'])

        for movie_id in predicted_ratings.index:
            weighted_sum = 0
            weight_total = 0

            for neighbour_id in self.data.index:
                neighbour_rating = self.data.loc[neighbour_id, movie_id]
                sim = self.similarity_matrix.loc[user_id, neighbour_id]

                # Users who have not rated this item contribute nothing.
                if np.isnan(neighbour_rating):
                    continue

                weighted_sum += sim * (neighbour_rating - self.means[neighbour_id])
                weight_total += abs(sim)

            # The epsilon keeps the division defined when nobody rated the item.
            predicted_ratings.loc[movie_id, 'PredictedRating'] = (
                weighted_sum / (weight_total + 1e-9) + self.means[user_id]
            )

        return predicted_ratings
    
    

In [275]:
class WeightsProvider:
    """Interface for objects that supply per-item weights to a similarity metric."""

    def getWeightsArray(self):
        """Return the weights; the base class is a placeholder and returns None."""
        return None

In [276]:
class IDFWeightsProvider(WeightsProvider):
    """Inverse-frequency weights: log(number of rows / number of non-zero entries) per column."""

    def __init__(self, data):
        """Compute one weight per column of ``data``.

        Parameters:
        - data: DataFrame of ratings; NaN entries are treated as 0 (not rated)
        """
        filled = data.fillna(0)
        row_count, column_count = filled.shape

        # Number of non-zero entries in each column.
        rated_counts = np.count_nonzero(filled, axis=0)

        # The epsilon avoids dividing by zero for columns without any rating.
        totals = np.full((column_count,), row_count)
        self.weights = np.log(totals / (rated_counts + 1e-9))
        print(self.weights.shape)

    def getWeightsArray(self):
        """Return the weights computed in the constructor."""
        return self.weights

In [277]:
class VarianceWeightsProvider(WeightsProvider):
    """Per-item weights equal to the variance of the ratings each item received."""

    def __init__(self, data):
        """Compute one weight per column of ``data``.

        Parameters:
        - data: DataFrame of ratings; entries equal to 0 are treated as
          missing and excluded from the variance, as are NaN entries
        """
        # np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
        temp = data.replace(0, np.nan)
        # Column-wise variance with pandas defaults (NaN entries are skipped).
        self.weights = temp.var(axis=0)

    def getWeightsArray(self):
        """Return the weights (a pandas Series indexed by the columns of ``data``)."""
        return self.weights

In [278]:
# Load the ratings table and reshape it into a UserID x MovieID matrix;
# combinations without a rating become NaN.
data = pd.read_csv("EncodedCombined.csv")

# Keep the first 500 rows (users) and every movie column. Slicing a DataFrame
# with [a:b] selects rows only, so [0:1000][0:500] reduces to this.
# NOTE(review): if the intent was 1000 users x 500 movies, use .iloc[:1000, :500].
user_item_matrix = data.pivot(index='UserID', columns='MovieID', values='Rating').iloc[:500]

  data=pd.read_csv("EncodedCombined.csv")


In [279]:
# Build the IDF-weighted cosine metric and the user-user model that uses it.
WeightProvider = IDFWeightsProvider(user_item_matrix)
idf_weights = WeightProvider.getWeightsArray()

metric = WeightedCosineSimilarity(idf_weights)
cf = CollaborativeFilteringUserUser(data=user_item_matrix, metric=metric)

# Compute all pairwise user similarities, then display the resulting matrix.
cf.calculate_similarity_matrix()
cf.getSimilarityMatrix()

# Evaluate the model
# mse = cf.evaluate()
# print("Mean Squared Error:", mse)

(3706,)


UserID,1,2,3,4,5,6,7,8,9,10,...,491,492,493,494,495,496,497,498,499,500
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.081493,0.076210,0.079299,0.052797,0.161451,0.031941,0.112351,0.171782,0.208806,...,0.054271,0.013976,0.039000,0.023347,0.093862,0.100674,0.054685,0.134661,0.108862,0.239587
2,0.081493,1.000000,0.081954,0.132941,0.071850,0.066957,0.221516,0.139305,0.124168,0.175331,...,0.141056,0.037066,0.168473,0.098759,0.117804,0.091393,0.026310,0.096652,0.193828,0.022019
3,0.076210,0.081954,1.000000,0.071651,0.030472,0.059627,0.090351,0.037591,0.078428,0.142340,...,0.065968,0.030792,0.058716,0.120616,0.095596,0.167706,0.017989,0.085765,0.098026,0.042372
4,0.079299,0.132941,0.071651,1.000000,0.031545,0.005146,0.079988,0.058403,0.055796,0.074111,...,0.057258,0.024580,0.029120,0.005916,0.076674,0.070385,0.025714,0.093442,0.047176,0.040518
5,0.052797,0.071850,0.030472,0.031545,1.000000,0.025351,0.083827,0.161202,0.186885,0.072158,...,0.057272,0.097763,0.174967,0.043408,0.084744,0.098886,0.018230,0.166319,0.067516,0.099930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,0.100674,0.091393,0.167706,0.070385,0.098886,0.042155,0.145761,0.099118,0.158280,0.191693,...,0.088420,0.079224,0.068071,0.135181,0.276555,1.000000,0.088661,0.126345,0.089528,0.061304
497,0.054685,0.026310,0.017989,0.025714,0.018230,0.000000,0.024122,0.034129,0.028298,0.106786,...,0.022316,0.010392,0.022291,0.027865,0.017683,0.088661,1.000000,0.029010,0.087474,0.044322
498,0.134661,0.096652,0.085765,0.093442,0.166319,0.072135,0.027916,0.113257,0.198154,0.182670,...,0.045766,0.106277,0.131095,0.041947,0.097179,0.126345,0.029010,1.000000,0.173487,0.150298
499,0.108862,0.193828,0.098026,0.047176,0.067516,0.160406,0.035174,0.154516,0.160265,0.254573,...,0.057857,0.050069,0.126341,0.092509,0.053355,0.089528,0.087474,0.173487,1.000000,0.126314


In [280]:
# cf.getSimilarityMatrix().to_csv("IDF_Weighted_Cosine_collaborative_filtering_similarity_matix.csv")

In [281]:
# Predict every movie's rating for user 2 and keep the ten highest predictions
# (ascending sort, so the best prediction is printed last).
prediction_df = cf.predict_ratings(2)
top_predictions = prediction_df.sort_values(by=['PredictedRating']).tail(10)
ids = top_predictions.index

movies = cf.getMovies(2, ids)
for movie in movies:
    print(f"Title : {movie[2]} , MovieID : {movie[0]}, Genres : {movie[3]}")

Title : When Night Is Falling (1995) , MovieID : 49, Genres : Drama|Romance
Title : Perfect Blue (1997) , MovieID : 2810, Genres : Animation|Mystery
Title : West Beirut (West Beyrouth) (1998) , MovieID : 2839, Genres : Drama
Title : Curdled (1996) , MovieID : 1000, Genres : Crime
Title : Dream With the Fishes (1997) , MovieID : 1563, Genres : Drama
Title : Onegin (1999) , MovieID : 3161, Genres : Drama
Title : Hour of the Pig, The (1993) , MovieID : 578, Genres : Drama|Mystery
Title : Time of the Gypsies (Dom za vesanje) (1989) , MovieID : 2931, Genres : Drama
Title : Soft Fruit (1999) , MovieID : 3410, Genres : Comedy|Drama
Title : Vie est belle, La (Life is Rosey) (1987) , MovieID : 771, Genres : Comedy|Drama


In [282]:
# WeightProvider = VarianceWeightsProvider(user_item_matrix)
# # print(WeightProvider.getWeightsArray())

# metric = WeightedPCCSimilarity(WeightProvider.getWeightsArray())
# cf = CollaborativeFilteringUserUser(user_item_matrix, metric)
# cf.calculate_similarity_matrix()

# cf.getSimilarityMatrix()

In [283]:
# cf.getSimilarityMatrix().to_csv("Variance_Weighted_PCC_collaborative_filtering_similarity_matix.csv")

In [284]:
# prediction_df = cf.predict_ratings(2)

# ids = prediction_df.sort_values(by=['PredictedRating']).tail(10).index
# movies = cf.getMovies(2, ids)

# for movie in movies:
#     print(f"Title : {movie[2]} , MovieID : {movie[0]} , Rating : {movie[1]} , Genres : {movie[3]}")