In [32]:
import numpy as np
import pandas as pd
import math

In [33]:
class SimilarityMetric:
    """Interface for similarity measures between two rating vectors.

    Concrete metrics subclass this and override `calculateSimilarity`.
    """

    def calculateSimilarity(self, u, v, u_mean=None, v_mean=None):
        """
        Compute the similarity between rating vectors `u` and `v`.

        Parameters:
        - u, v: rating vectors to compare
        - u_mean, v_mean: optional mean ratings belonging to `u` and `v`;
          accepted so the base signature matches the four-argument form
          the concrete metrics implement

        Returns:
        - None in this base class; subclasses return the similarity score
        """
        pass

In [34]:
class CosineSimilarity(SimilarityMetric):
    """Cosine similarity in which missing ratings (NaN) count as 0."""

    def calculateSimilarity(self, u, v, u_mean, v_mean):
        """
        Return the cosine of the angle between `u` and `v`.

        Parameters:
        - u, v: rating vectors; NaN entries are replaced by 0 first
        - u_mean, v_mean: accepted for signature compatibility, not used

        Returns:
        - dot(u, v) / (|u| * |v| + 1e-9)
        """
        u_filled = np.nan_to_num(u, nan=0)
        v_filled = np.nan_to_num(v, nan=0)

        norm_product = np.linalg.norm(u_filled) * np.linalg.norm(v_filled)

        # The small constant keeps the division defined when either vector is all zeros.
        return np.dot(u_filled, v_filled) / (norm_product + 1e-9)

class PCCSimilarity(SimilarityMetric):
    """Pearson-correlation similarity restricted to co-rated positions."""

    def calculateSimilarity(self, u, v, u_mean, v_mean):
        """
        Return the Pearson correlation between `u` and `v`.

        Parameters:
        - u, v: rating vectors aligned by position; NaN marks a missing rating
        - u_mean, v_mean: mean ratings subtracted from `u` and `v` respectively

        Returns:
        - correlation over the positions rated in both vectors, or 0 when
          either centred vector has zero spread over those positions
        """
        return self.__pearson_correlation(u, v, u_mean, v_mean)

    def __pearson_correlation(self, x, y, x_mean, y_mean):
        """Correlate `x` and `y` over the positions where neither is NaN."""

        # Work on plain positional arrays. Wrapping the input in
        # pd.DataFrame(x, columns=['a']) performs column *selection* when x
        # is a named Series, which yields an empty frame and therefore a
        # similarity of 0 for every pair; it also mixed label indices with
        # positional .iloc lookups.
        x_values = np.asarray(x, dtype=float)
        y_values = np.asarray(y, dtype=float)

        # Only positions that exist in both vectors can be co-rated.
        common_length = min(len(x_values), len(y_values))
        x_values = x_values[:common_length]
        y_values = y_values[:common_length]

        co_rated = ~(np.isnan(x_values) | np.isnan(y_values))
        x_centered = x_values[co_rated] - x_mean
        y_centered = y_values[co_rated] - y_mean

        std_x = math.sqrt(float(np.sum(x_centered ** 2)))
        std_y = math.sqrt(float(np.sum(y_centered ** 2)))

        # No spread (or no co-rated positions at all): correlation is undefined, report 0.
        if std_x == 0 or std_y == 0:
            return 0

        covariance = float(np.sum(x_centered * y_centered))
        return covariance / (std_x * std_y)

In [35]:
class CollaborativeFiltering:
    """Base class holding the state shared by collaborative-filtering recommenders."""

    def __init__(self, data, metric: "SimilarityMetric", movies_path="movies.csv"):
        """
        Initialize CollaborativeFiltering object with user-item rating data.

        Parameters:
        - data: DataFrame containing user-item ratings (rows are looked up by
          user id, entries within a row by movie id)
        - metric: SimilarityMetric used to compare rating vectors
        - movies_path: CSV file with 'MovieID', 'Title' and 'Genres' columns;
          defaults to "movies.csv" in the working directory
        """
        self.data = data
        self.similarity_matrix = None
        self.metric = metric
        # Index by MovieID so getMovies can look a movie up by its id.
        self.movies = pd.read_csv(movies_path).set_index('MovieID')

    def calculate_similarity_matrix(self):
        """Populate `self.similarity_matrix`; a no-op here, overridden by subclasses."""
        pass

    def predict_ratings(self, user_id):
        """Predict ratings for `user_id`; a no-op here, overridden by subclasses."""
        pass

    def getMovies(self, user_id, movie_id_list):
        """
        Describe the given movies together with the user's own rating of each.

        Parameters:
        - user_id: row label of the user in `self.data`
        - movie_id_list: iterable of movie ids to describe

        Returns:
        - list of [movie_id, rating, title, genres]; `rating` is the value
          stored in `self.data` for that user and movie (NaN when unrated)
        """
        # Fetch the user's row once and read it directly. Rebuilding it with
        # pd.DataFrame(row, index=..., columns=['a']) selects a column the
        # named row does not have, so every rating came back as NaN.
        user_ratings = self.data.loc[user_id]

        recommendations = []
        for movie_id in movie_id_list:
            movie = self.movies.loc[movie_id]
            recommendations.append(
                [movie_id, user_ratings.loc[movie_id], movie['Title'], movie['Genres']]
            )

        return recommendations

    def getSimilarityMatrix(self):
        """Return the stored similarity matrix (None until one has been calculated)."""
        return self.similarity_matrix

In [36]:
class CollaborativeFilteringUserUser(CollaborativeFiltering):
    """User-user collaborative filtering over a user x item rating matrix."""

    def __init__(self, data, metric:SimilarityMetric):
        """
        Parameters:
        - data: DataFrame of ratings with one row per user and one column per item
        - metric: SimilarityMetric used to compare two users' rating rows
        """
        super().__init__(data, metric)
        # Mean rating of each user; DataFrame.mean skips NaN (unrated) entries by default.
        self.means = self.data.mean(axis=1)

    def calculate_similarity_matrix(self):
        """
        Compute the similarity of every pair of users and store the result in
        `self.similarity_matrix`, a DataFrame labelled by user id on both axes.

        Only the lower triangle is evaluated and then mirrored, so the metric
        is assumed to be symmetric.
        """
        n_users = self.data.shape[0]
        similarity_matrix = np.zeros((n_users, n_users))

        for i in range(n_users):
            # Row i and its mean do not change across the inner loop.
            ratings_i = self.data.iloc[i, :]
            mean_i = self.means.iloc[i]

            for j in range(i + 1):
                # Each vector is paired with its own user's mean (the second
                # mean used to be user i's as well).
                similarity = self.metric.calculateSimilarity(
                    ratings_i, self.data.iloc[j, :], mean_i, self.means.iloc[j]
                )
                similarity_matrix[i, j] = similarity
                similarity_matrix[j, i] = similarity

        self.similarity_matrix = pd.DataFrame(similarity_matrix, index=self.data.index, columns=self.data.index)

    def predict_ratings(self, user_id):
        """
        Predict ratings for items for a given user.

        For each item the prediction is the user's mean rating plus the
        similarity-weighted average of the other users' mean-centred ratings
        of that item, taken over the users who rated it.

        Parameters:
        - user_id: ID of the user for whom to predict ratings

        Returns:
        - DataFrame indexed like the columns of `self.data` with a single
          float column 'PredictedRating'
        """
        # Similarity of `user_id` to every user, aligned by label with the rows of self.data.
        similarities = self.similarity_matrix.loc[user_id].reindex(self.data.index).to_numpy(dtype=float)

        ratings = self.data.to_numpy(dtype=float)
        rated = ~np.isnan(ratings)
        user_means = self.means.to_numpy(dtype=float)[:, np.newaxis]

        # Unrated cells contribute 0 to the numerator, i.e. they are skipped.
        deviations = np.where(rated, ratings - user_means, 0.0)

        # One matrix product per sum replaces the users x items Python loop.
        numerator = similarities @ deviations
        denominator = np.abs(similarities) @ rated.astype(float)

        # The small constant keeps items nobody rated from dividing by zero;
        # they fall back to the user's own mean.
        predicted = numerator / (denominator + 1e-9) + self.means.loc[user_id]

        return pd.DataFrame({'PredictedRating': predicted}, index=self.data.columns)

In [37]:
# Ratings table; the pivot below relies on its UserID, MovieID and Rating columns.
data = pd.read_csv("EncodedCombined.csv")

# One row per user, one column per movie; pairs without a rating come out as NaN.
# Both names refer to the same DataFrame object.
user_item_matrix_new = user_item_matrix = data.pivot(
    values='Rating',
    index='UserID',
    columns='MovieID',
)

  data=pd.read_csv("EncodedCombined.csv")


In [38]:
# metric = CosineSimilarity()
# # Create CollaborativeFiltering instance
# cf = CollaborativeFilteringUserUser(user_item_matrix_new, metric)

# cf.calculate_similarity_matrix()
# cf.getSimilarityMatrix()

In [39]:
# prediction_df = cf.predict_ratings(2)

# ids = prediction_df.sort_values(by=['PredictedRating']).tail(10).index
# movies = cf.getMovies(2, ids)

# for movie in movies:
#     print(f"Title : {movie[2]} , MovieID : {movie[0]} , Rating : {movie[1]} , Genres : {movie[3]}")

PredictedRating

MovieID   

3382           4.999999

1830                5.0

2480                5.0

3656                5.0

989                 5.0

3881                5.0

3607                5.0

3172                5.0

3233                5.0

787                 5.0

In [40]:
# Build the user-user recommender on top of cosine similarity.
metric = CosineSimilarity()
cf = CollaborativeFilteringUserUser(data=user_item_matrix_new, metric=metric)

# Fill the pairwise user similarity table, then display it as the cell output.
cf.calculate_similarity_matrix()
cf.getSimilarityMatrix()

UserID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.096382,0.120610,0.132455,0.090158,0.179222,0.059678,0.138241,0.226148,0.255288,...,0.170588,0.082006,0.069807,0.033663,0.114877,0.186329,0.135979,0.000000,0.174604,0.133590
2,0.096382,1.000000,0.151479,0.171176,0.114394,0.100865,0.305787,0.203337,0.190198,0.226861,...,0.112503,0.091222,0.268565,0.014286,0.183384,0.228241,0.206274,0.066118,0.066457,0.218276
3,0.120610,0.151479,1.000000,0.151227,0.062907,0.074603,0.138332,0.077656,0.126457,0.213655,...,0.092960,0.125864,0.161507,0.000000,0.097308,0.143264,0.107744,0.120234,0.094675,0.133144
4,0.132455,0.171176,0.151227,1.000000,0.045094,0.013529,0.130339,0.100856,0.093651,0.120738,...,0.163629,0.093041,0.382803,0.000000,0.082097,0.170583,0.127464,0.062907,0.064634,0.137968
5,0.090158,0.114394,0.062907,0.045094,1.000000,0.047449,0.126257,0.220817,0.261330,0.117052,...,0.100652,0.035732,0.061806,0.054151,0.179083,0.293365,0.172686,0.020459,0.027689,0.241437
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.186329,0.228241,0.143264,0.170583,0.293365,0.093583,0.122441,0.227400,0.239607,0.338072,...,0.131294,0.209843,0.186426,0.103431,0.267405,1.000000,0.341462,0.124174,0.219115,0.411891
6037,0.135979,0.206274,0.107744,0.127464,0.172686,0.065788,0.111673,0.144395,0.225055,0.246902,...,0.142309,0.276134,0.129985,0.118749,0.141676,0.341462,1.000000,0.049015,0.252146,0.428240
6038,0.000000,0.066118,0.120234,0.062907,0.020459,0.065711,0.000000,0.019242,0.093470,0.113789,...,0.108837,0.106897,0.040689,0.000000,0.063967,0.124174,0.049015,1.000000,0.161714,0.099300
6039,0.174604,0.066457,0.094675,0.064634,0.027689,0.167303,0.014977,0.044660,0.046434,0.296776,...,0.118776,0.250994,0.053750,0.102168,0.068399,0.219115,0.252146,0.161714,1.000000,0.228332


In [41]:
# Write the current similarity matrix to a CSV file in the working directory.
# NOTE(review): the file name says "PCC" (and spells "matix"), but the `cf` built in the
# visible cells uses CosineSimilarity — confirm which metric is intended before renaming
# the file or relying on its name.
cf.getSimilarityMatrix().to_csv("User_User_PCC_collaborative_filtering_similarity_matix.csv")

In [42]:
# Predicted rating of every movie for user 2.
prediction_df = cf.predict_ratings(2)

# The sort is ascending, so the last ten rows are the highest predictions.
ids = (
    prediction_df
    .sort_values(by=['PredictedRating'])
    .tail(10)
    .index
)
movies = cf.getMovies(2, ids)

# Each entry is [movie id, rating, title, genres]; the rating is not printed.
for movie in movies:
    print(f"Title : {movie[2]} , MovieID : {movie[0]} , Genres : {movie[3]}")

Title : Apple, The (Sib) (1998) , MovieID : 2503 , Genres : Drama
Title : Gate of Heavenly Peace, The (1995) , MovieID : 787 , Genres : Documentary
Title : Hour of the Pig, The (1993) , MovieID : 578 , Genres : Drama|Mystery
Title : Jar, The (Khomreh) (1992) , MovieID : 758 , Genres : Drama
Title : I Am Cuba (Soy Cuba/Ya Kuba) (1964) , MovieID : 3245 , Genres : Drama
Title : Follow the Bitch (1998) , MovieID : 1830 , Genres : Comedy
Title : Schlafes Bruder (Brother of Sleep) (1995) , MovieID : 989 , Genres : Drama
Title : Foreign Student (1994) , MovieID : 572 , Genres : Drama
Title : Mamma Roma (1962) , MovieID : 557 , Genres : Drama
Title : Song of Freedom (1936) , MovieID : 3382 , Genres : Drama
