In [237]:
import numpy as np
import pandas as pd
import math

In [238]:
class SimilarityMetric:
    """Interface for similarity measures between two rating vectors."""

    def calculateSimilarity(self, u, v, u_mean, v_mean):
        """
        Return the similarity between rating vectors ``u`` and ``v``.

        The base implementation is a placeholder: it ignores its arguments
        and returns None. Concrete metrics override it.

        Parameters:
        - u, v: the two rating vectors to compare
        - u_mean, v_mean: mean rating associated with ``u`` and ``v``
        """
        return None

In [239]:
class WeightedCosineSimilarity(SimilarityMetric):
    """Cosine similarity in which every vector component carries its own weight."""

    def __init__(self, weights):
        """
        Store the per-component weights and echo them to stdout.

        Parameters:
        - weights: one weight per vector component; multiplied element-wise
          with the rating vectors
        """
        self.weights = weights
        print(self.weights)

    def calculateSimilarity(self, u, v, u_mean, v_mean):
        """
        Weighted cosine similarity between ``u`` and ``v``.

        NaN entries are replaced by 0 before the computation. ``u_mean`` and
        ``v_mean`` are accepted for interface compatibility but not used.

        Returns:
        - dot(u', v') / (|u'| * |v'| + 1e-9), where x' = x * sqrt(weights)
        """
        # Scaling each component by sqrt(w) turns the ordinary dot product of
        # the scaled vectors into the w-weighted dot product of the originals.
        scaled_u, scaled_v = [
            np.nan_to_num(vector, nan=0) * np.sqrt(self.weights)
            for vector in (u, v)
        ]

        # The small constant keeps the division defined for all-zero vectors.
        norm_product = np.linalg.norm(scaled_u) * np.linalg.norm(scaled_v)
        return np.dot(scaled_u, scaled_v) / (norm_product + 1e-9)

    def weighted_norm(self, array):
        """Return sqrt(sum(weights * array**2)), the weighted Euclidean norm of ``array``."""
        return np.sqrt(np.sum(array * array * self.weights))

In [240]:
class WeightedPCCSimilarity(SimilarityMetric):
    """Weighted Pearson correlation restricted to the items both vectors rate."""

    def __init__(self, weights):
        """
        Parameters:
        - weights: one weight per item, positionally aligned with the rating
          vectors passed to calculateSimilarity
        """
        self.weights = weights

    def calculateSimilarity(self, u, v, u_mean, v_mean):
        """
        Weighted Pearson correlation between rating vectors ``u`` and ``v``.

        Parameters:
        - u, v: rating vectors of equal length; NaN marks a missing rating
        - u_mean, v_mean: the value each vector is centred on

        Returns:
        - the correlation, or 0 when either weighted spread is zero (which
          includes the case of no usable co-rated item)
        """
        return self.__weighted_pearson_correlation(u, v, u_mean, v_mean)

    def __weighted_pearson_correlation(self, x, y, x_mean, y_mean):
        """
        Compute sum(w*dx*dy) / (sqrt(sum(w*dx^2)) * sqrt(sum(w*dy^2))) with
        dx = x - x_mean and dy = y - y_mean, over the positions where x, y
        and the weight are all present.
        """
        # Work on plain positional arrays rather than wrapping the inputs in
        # DataFrames: a *named* Series wrapped with a different column label
        # does not keep its values, and label-valued indexes must not be fed
        # to positional indexing.
        x_values = np.asarray(x, dtype=float)
        y_values = np.asarray(y, dtype=float)
        w_values = np.asarray(self.weights, dtype=float)

        # An item contributes only if both users rated it and its weight is
        # defined; a NaN weight would otherwise turn every sum into NaN.
        usable = ~(np.isnan(x_values) | np.isnan(y_values) | np.isnan(w_values))

        w = w_values[usable]
        x_dev = x_values[usable] - x_mean
        y_dev = y_values[usable] - y_mean

        # Weighted covariance pairs x's deviations with y's deviations.
        weighted_covariance = np.sum(w * x_dev * y_dev)

        # math.sqrt (rather than np.sqrt) keeps raising ValueError if negative
        # weights make a sum of squares negative, instead of yielding NaN.
        weighted_std_x = math.sqrt(np.sum(w * x_dev ** 2))
        weighted_std_y = math.sqrt(np.sum(w * y_dev ** 2))

        if weighted_std_x == 0 or weighted_std_y == 0:
            return 0  # Correlation is undefined; avoid dividing by zero

        return float(weighted_covariance / (weighted_std_x * weighted_std_y))

In [241]:
class CollaborativeFiltering:
    """
    Base class for collaborative-filtering recommenders.

    Holds the user-item rating table, the similarity metric and the movie
    catalogue; subclasses provide the similarity matrix and the predictions.
    """

    def __init__(self, data, metric:SimilarityMetric, movies_path="movies.csv"):
        """
        Initialize CollaborativeFiltering object with user-item rating data.

        Parameters:
        - data: DataFrame containing user-item ratings (getMovies looks up
          users by row label and movies by column label)
        - metric: SimilarityMetric used to compare rating vectors
        - movies_path: CSV file with at least the columns 'MovieID', 'Title'
          and 'Genres'; defaults to "movies.csv" in the working directory
        """
        self.data = data
        self.similarity_matrix = None
        self.metric = metric
        # Index the catalogue by MovieID so getMovies can look movies up by id.
        self.movies = pd.read_csv(movies_path).set_index('MovieID')

    def calculate_similarity_matrix(self):
        """Placeholder: subclasses compute and store ``self.similarity_matrix``."""
        pass

    def predict_ratings(self, user_id):
        """Placeholder: subclasses return predicted ratings for ``user_id``."""
        pass

    def getMovies(self, user_id, movie_id_list):
        """
        Describe the given movies together with the user's own rating.

        Parameters:
        - user_id: row label of the user in ``self.data``
        - movie_id_list: iterable of movie ids (column labels of ``self.data``
          that are also present in the movie catalogue)

        Returns:
        - list of [movie_id, rating, title, genres], in input order; rating is
          the value stored in ``self.data`` for (user_id, movie_id), i.e. NaN
          when the user has not rated the movie

        Raises:
        - KeyError if the user or one of the movie ids is unknown
        """
        # One row lookup for the whole call; it does not depend on the movie.
        user_ratings = self.data.loc[user_id]

        recommendations = []
        for movie_id in movie_id_list:
            movie = self.movies.loc[movie_id]
            recommendations.append(
                [movie_id, user_ratings.loc[movie_id], movie['Title'], movie['Genres']]
            )

        return recommendations

    def getSimilarityMatrix(self):
        """Return the stored similarity matrix (None until one has been calculated)."""
        return self.similarity_matrix

In [242]:
class CollaborativeFilteringUserUser(CollaborativeFiltering):
    """User-user collaborative filtering: users are compared row by row."""

    def __init__(self, data, metric:SimilarityMetric):
        """
        Parameters:
        - data: DataFrame of ratings with one row per user and one column per
          item; NaN marks a missing rating
        - metric: SimilarityMetric used to compare two user rows
        """
        super().__init__(data, metric)
        # Per-user mean rating (pandas skips NaN cells by default).
        self.means = self.data.mean(axis=1)

    def calculate_similarity_matrix(self):
        """
        Compute the symmetric user-user similarity matrix and store it in
        ``self.similarity_matrix`` as a DataFrame labelled by user on both axes.

        The metric is evaluated once per unordered pair (diagonal included)
        and mirrored, so it is assumed to be symmetric.
        """
        n_users = self.data.shape[0]
        # self.means shares the row order of self.data, so position i in one
        # is the same user as position i in the other.
        user_means = self.means.to_numpy()

        similarity_matrix = np.zeros((n_users, n_users))
        for i in range(n_users):
            # The outer user's row is the same for every j; extract it once.
            row_i = self.data.iloc[i, :]
            for j in range(i + 1):
                score = self.metric.calculateSimilarity(
                    row_i, self.data.iloc[j, :], user_means[i], user_means[j]
                )
                similarity_matrix[i, j] = score
                similarity_matrix[j, i] = score

        self.similarity_matrix = pd.DataFrame(
            similarity_matrix, index=self.data.index, columns=self.data.index
        )

    def predict_ratings(self, user_id):
        """
        Predict ratings for items for a given user.

        For every item the prediction is

            mean(user) + sum_v sim(user, v) * (r_v - mean(v)) / (sum_v |sim(user, v)| + 1e-9)

        where v runs over all users that rated the item. Items nobody rated
        therefore get the user's own mean.

        calculate_similarity_matrix() must have been called first.

        Parameters:
        - user_id: ID of the user for whom to predict ratings

        Returns:
        - DataFrame indexed by item with a float column 'PredictedRating'
        """
        # NOTE(review): the target user is part of the sum when they rated the
        # item themselves - confirm whether that is intended.
        ratings = self.data.to_numpy(dtype=float)
        rated = ~np.isnan(ratings)

        similarities = self.similarity_matrix.loc[user_id, self.data.index].to_numpy(dtype=float)
        user_means = self.means.to_numpy(dtype=float)

        # np.where keeps only the cells that hold a rating, so an unrated cell
        # contributes exactly 0 to both sums whatever the similarity is.
        contributions = np.where(
            rated, similarities[:, None] * (ratings - user_means[:, None]), 0.0
        )
        numerator = contributions.sum(axis=0)
        denominator = np.where(rated, np.abs(similarities)[:, None], 0.0).sum(axis=0)

        # The small constant keeps the division defined for unrated items.
        predictions = numerator / (denominator + 1e-9) + self.means.loc[user_id]

        return pd.DataFrame({'PredictedRating': predictions}, index=self.data.columns)
    
    

In [243]:
class WeightsProvider:
    """Interface for objects that supply one weight per item."""

    def getWeightsArray(self):
        """Return the weights; the base implementation is a placeholder returning None."""
        return None

In [244]:
class IDFWeightsProvider(WeightsProvider):
    """Inverse-frequency weights: items rated by fewer users weigh more."""

    def __init__(self, data):
        """
        Parameters:
        - data: DataFrame of ratings with one row per user and one column per
          item; NaN (or 0) marks a missing rating
        """
        n_users = data.shape[0]

        # After fillna(0) every non-zero cell counts as one user who rated the item.
        watched = np.count_nonzero(data.fillna(0).to_numpy(), axis=0)

        # The small constant keeps the division defined for items nobody rated.
        idf = np.log(n_users / (watched + 1e-9))

        # That same constant pushes the ratio just below 1 for an item rated by
        # every user, giving a tiny negative log. Clip at 0 so consumers that
        # take the square root of the weights do not get NaN.
        self.weights = np.maximum(idf, 0.0)

    def getWeightsArray(self):
        """Return the per-item weights as a NumPy array, in column order of the input."""
        return self.weights

In [245]:
class VarianceWeightsProvider(WeightsProvider):
    """Weights each item by the variance of the ratings it received."""

    def __init__(self, data):
        """
        Parameters:
        - data: DataFrame of ratings with one row per user and one column per
          item; NaN or 0 marks a missing rating
        """
        # Treat 0 as "not rated" so it does not enter the variance.
        # (np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.)
        observed = data.replace(0, np.nan)

        # Column-wise variance over the observed ratings. pandas uses ddof=1 by
        # default, so an item with fewer than two ratings gets a NaN weight.
        self.weights = observed.var(axis=0)

    def getWeightsArray(self):
        """Return the per-item weights (a pandas Series indexed by the input's columns)."""
        return self.weights

In [246]:
# Ratings in long format; the pivot below uses its UserID, MovieID and Rating columns.
data = pd.read_csv("EncodedCombined.csv")

# One row per user, one column per movie; user/movie pairs without a rating become NaN.
user_item_matrix = data.pivot(
    index='UserID',
    columns='MovieID',
    values='Rating',
)

  data=pd.read_csv("EncodedCombined.csv")


In [247]:
# Weight every movie by its inverse rating frequency.
WeightProvider = IDFWeightsProvider(user_item_matrix)
idf_weights = WeightProvider.getWeightsArray()

# User-user collaborative filtering driven by the weighted cosine metric.
metric = WeightedCosineSimilarity(idf_weights)
cf = CollaborativeFilteringUserUser(user_item_matrix, metric)

# Compares every pair of users once, then displays the resulting matrix.
cf.calculate_similarity_matrix()
cf.getSimilarityMatrix()

# Evaluate the model
# mse = cf.evaluate()
# print("Mean Squared Error:", mse)

[1.06747947 2.1536514  2.53654856 ... 4.71717524 5.01727984 2.74515395]


UserID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.084484,0.075832,0.083963,0.055020,0.162170,0.030650,0.113073,0.172809,0.209859,...,0.111214,0.063873,0.037573,0.015217,0.062753,0.138885,0.075537,0.000000,0.140437,0.077882
2,0.084484,1.000000,0.086194,0.134384,0.074114,0.070908,0.224665,0.141356,0.125606,0.177867,...,0.081615,0.072911,0.189741,0.016869,0.123112,0.170843,0.164086,0.035828,0.050346,0.175118
3,0.075832,0.086194,1.000000,0.076058,0.030401,0.060753,0.095328,0.040380,0.080430,0.145451,...,0.055673,0.092572,0.092500,0.000000,0.061018,0.083294,0.058600,0.079645,0.048914,0.082035
4,0.083963,0.134384,0.076058,1.000000,0.033972,0.005451,0.084441,0.064194,0.062287,0.072383,...,0.104544,0.072529,0.308233,0.000000,0.046726,0.115305,0.083844,0.034987,0.027546,0.085477
5,0.055020,0.074114,0.030401,0.033972,1.000000,0.026793,0.085494,0.161550,0.188045,0.072803,...,0.054139,0.021286,0.040539,0.031561,0.131879,0.244968,0.104380,0.010099,0.018522,0.181709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.138885,0.170843,0.083294,0.115305,0.244968,0.069811,0.077426,0.179164,0.174382,0.262761,...,0.084944,0.166798,0.132074,0.089192,0.207149,1.000000,0.273745,0.113259,0.190880,0.350160
6037,0.075537,0.164086,0.058600,0.083844,0.104380,0.041654,0.075001,0.089862,0.151109,0.175699,...,0.084459,0.235793,0.080206,0.093854,0.082221,0.273745,1.000000,0.029959,0.194890,0.321674
6038,0.000000,0.035828,0.079645,0.034987,0.010099,0.048498,0.000000,0.006568,0.086326,0.081185,...,0.064381,0.071678,0.018975,0.000000,0.037622,0.113259,0.029959,1.000000,0.155490,0.094032
6039,0.140437,0.050346,0.048914,0.027546,0.018522,0.139573,0.006967,0.026475,0.033776,0.252113,...,0.077685,0.208198,0.025421,0.094645,0.041759,0.190880,0.194890,0.155490,1.000000,0.170260


In [248]:
# Persist the user-user similarity matrix next to the notebook.
similarity_matrix = cf.getSimilarityMatrix()
similarity_matrix.to_csv("IDF_Weighted_Cosine_collaborative_filtering_similarity_matix.csv")

In [249]:
prediction_df = cf.predict_ratings(2)

# Ascending sort, so the last ten rows are the ten highest predictions
# (listed from lowest to highest among them).
ranked_predictions = prediction_df.sort_values(by=['PredictedRating'])
ids = ranked_predictions.tail(10).index
movies = cf.getMovies(2, ids)

for movie_id, _rating, title, genres in movies:
    print(f"Title : {title} , MovieID : {movie_id}, Genres : {genres}")

Title : Chain of Fools (2000) , MovieID : 3323, Genres : Comedy|Crime
Title : Apple, The (Sib) (1998) , MovieID : 2503, Genres : Drama
Title : Gate of Heavenly Peace, The (1995) , MovieID : 787, Genres : Documentary
Title : Jar, The (Khomreh) (1992) , MovieID : 758, Genres : Drama
Title : I Am Cuba (Soy Cuba/Ya Kuba) (1964) , MovieID : 3245, Genres : Drama
Title : Follow the Bitch (1998) , MovieID : 1830, Genres : Comedy
Title : Schlafes Bruder (Brother of Sleep) (1995) , MovieID : 989, Genres : Drama
Title : Foreign Student (1994) , MovieID : 572, Genres : Drama
Title : Mamma Roma (1962) , MovieID : 557, Genres : Drama
Title : Song of Freedom (1936) , MovieID : 3382, Genres : Drama


In [250]:
# WeightProvider = VarianceWeightsProvider(user_item_matrix)
# # print(WeightProvider.getWeightsArray())

# metric = WeightedPCCSimilarity(WeightProvider.getWeightsArray())
# cf = CollaborativeFilteringUserUser(user_item_matrix, metric)
# cf.calculate_similarity_matrix()

# cf.getSimilarityMatrix()

In [251]:
# cf.getSimilarityMatrix().to_csv("Variance_Weighted_PCC_collaborative_filtering_similarity_matix.csv")

In [252]:
# prediction_df = cf.predict_ratings(2)

# ids = prediction_df.sort_values(by=['PredictedRating']).tail(10).index
# movies = cf.getMovies(2, ids)

# for movie in movies:
#     print(f"Title : {movie[2]} , MovieID : {movie[0]} , Rating : {movie[1]} , Genres : {movie[3]}")