In [116]:
import numpy as np
import pandas as pd

In [117]:
class SimilarityMetric:

    def calculateSimilarity(self, u, v):
        pass

In [118]:
class CosineSimilarity(SimilarityMetric):
    def calculateSimilarity(self, u, v, u_mean, v_mean):
        u = np.nan_to_num(u, nan=0)
        v = np.nan_to_num(v, nan=0)
        return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-9)

class PCCSimilarity(SimilarityMetric):
    def calculateSimilarity(self, u, v, u_mean, v_mean):

        u_new = u - u_mean
        v_new = v - v_mean

        u_new = np.nan_to_num(u_new, nan=0)
        v_new = np.nan_to_num(v_new, nan=0)

        return np.dot(u_new, v_new) / (np.linalg.norm(u_new) * np.linalg.norm(v_new) + 1e-9)

In [119]:
class CollaborativeFiltering:

    def __init__(self, data, metric:SimilarityMetric):
        """
        Initialize CollaborativeFiltering object with user-item rating data.
        
        Parameters:
        - data: DataFrame containing user-item ratings
        """
        self.data = data
        self.similarity_matrix = None
        self.metric = metric
        self.movies = pd.read_csv("movies.csv")
        self.movies.set_index('MovieID', inplace=True)
    
    def calculate_similarity_matrix(self):
        pass

    def predict_ratings(self, user_id):
        pass
    
    def getMovies(self, user_id, movie_id_list):

        recommendations = []

        for i in movie_id_list:

            user = pd.DataFrame(self.data.loc[user_id], index = self.data.columns, columns = ['a'])
            rating = user.loc[i]
            movie = self.movies.loc[i]
            recommendations.append([i, rating['a'], movie['Title'], movie['Genres']])

        return recommendations
    
    def getSimilarityMatrix(self):
        return self.similarity_matrix

In [120]:
class CollaborativeFilteringUserUser(CollaborativeFiltering):
    def __init__(self, data, metric:SimilarityMetric):
        super().__init__(data, metric)
        self.means = self.data.mean(axis=1)

    def calculate_similarity_matrix(self):
        
        # user_item_matrix = self.train_data.pivot(index='UserID', columns='MovieID', values='Rating').fillna(0)

        n_users = self.data.shape[0]
        # print(n_users)
        similarity_matrix = np.zeros((n_users, n_users))
        for i in range(n_users):
            for j in range(i+1):
                temp = self.metric.calculateSimilarity(self.data.iloc[i, :], self.data.iloc[j, :], self.means[self.data.index[i]], self.means[self.data.index[i]])
                similarity_matrix[i, j] = temp
                similarity_matrix[j, i] = temp
                
        self.similarity_matrix = pd.DataFrame(similarity_matrix, index=self.data.index, columns=self.data.index)

    def predict_ratings(self, user_id):
        """
        Predict ratings for items for a given user.

        Parameters:
        - user_id: ID of the user for whom to predict ratings

        Returns:
        - DataFrame containing predicted ratings for each item
        """

        predicted_ratings = pd.DataFrame(index=self.data.columns, columns=['PredictedRating'])

        for movie_id in predicted_ratings.index:
            numerator = 0
            denominator = 0

            for other_user_id in self.data.index:
                other_user_rating = self.data.loc[other_user_id, movie_id]
                similarity = self.similarity_matrix.loc[user_id , other_user_id]

                if not pd.isnull(other_user_rating):
                    numerator += similarity * (other_user_rating - self.means[other_user_id])
                    denominator += abs(similarity)

            predicted_ratings.loc[movie_id, 'PredictedRating'] = numerator / (denominator + 1e-9) + self.means[user_id] 
            
        return predicted_ratings

In [121]:
data=pd.read_csv("EncodedCombined.csv")

user_item_matrix = data.pivot(index='UserID', columns='MovieID', values='Rating')
# user_item_matrix = user_item_matrix.loc[0:1000, 0:500]

  data=pd.read_csv("EncodedCombined.csv")


In [122]:
# metric = CosineSimilarity()
# # Create CollaborativeFiltering instance
# cf = CollaborativeFilteringUserUser(user_item_matrix_new, metric)

# cf.calculate_similarity_matrix()
# cf.getSimilarityMatrix()

In [123]:
# prediction_df = cf.predict_ratings(2)

# ids = prediction_df.sort_values(by=['PredictedRating'].tail(10)).index
# movies = cf.getMovies(2, ids)

# for movie in movies:
#     print(f"Title : {movie[2]} , MovieID : {movie[0]} , Rating : {movie[1]} , Genres : {movie[3]}")

PredictedRating

MovieID   

3382           4.999999

1830                5.0

2480                5.0

3656                5.0

989                 5.0

3881                5.0

3607                5.0

3172                5.0

3233                5.0

787                 5.0

In [124]:
metric = PCCSimilarity()
cf = CollaborativeFilteringUserUser(user_item_matrix, metric)

cf.calculate_similarity_matrix()
cf.getSimilarityMatrix()

UserID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.054691,-0.017442,0.016189,-0.019737,0.072479,0.011723,0.039008,0.123948,0.007161,...,0.040488,0.033284,0.070327,-0.004119,0.099887,0.082070,0.046890,0.000000,0.002317,0.025816
2,0.054691,1.000000,0.025116,-0.010935,-0.036202,-0.007277,0.017834,-0.015903,0.059291,-0.018796,...,-0.000964,0.006078,0.060800,0.002037,0.104585,0.100390,0.054392,-0.012413,0.032326,-0.007675
3,-0.017442,0.025116,1.000000,0.056862,-0.002398,-0.011015,0.052288,-0.039915,-0.006267,-0.001387,...,-0.010679,0.002722,-0.025340,0.000000,0.051272,0.045843,0.003531,0.050993,0.048950,-0.046980
4,0.016189,-0.010935,0.056862,1.000000,-0.006186,0.023384,-0.029337,0.044195,0.040960,-0.039347,...,0.010149,0.001805,0.034095,0.000000,0.020035,0.122240,0.049415,-0.093673,0.007107,0.018185
5,-0.019737,-0.036202,-0.002398,-0.006186,1.000000,-0.013689,0.033956,-0.005112,-0.023651,0.007656,...,0.012490,0.003658,-0.006942,0.011725,0.098661,0.039679,-0.011637,-0.005426,0.008427,0.051407
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.082070,0.100390,0.045843,0.122240,0.039679,-0.008010,0.024948,0.060516,0.079440,0.068960,...,0.008615,0.118290,0.062269,0.085592,0.101248,1.000000,0.043366,-0.012250,0.025279,0.100180
6037,0.046890,0.054392,0.003531,0.049415,-0.011637,-0.022855,-0.057099,0.001183,0.047877,0.031351,...,-0.004327,0.085792,0.030493,0.006251,0.033030,0.043366,1.000000,-0.014615,0.040580,0.091949
6038,0.000000,-0.012413,0.050993,-0.093673,-0.005426,-0.037948,0.000000,0.000761,0.028573,0.064949,...,-0.045550,-0.030354,0.044443,0.000000,0.009820,-0.012250,-0.014615,1.000000,0.037705,-0.051672
6039,0.002317,0.032326,0.048950,0.007107,0.008427,-0.018885,0.000385,0.005016,0.001080,0.024885,...,-0.026724,-0.013113,0.005442,0.076257,0.010459,0.025279,0.040580,0.037705,1.000000,0.093119


In [125]:
cf.getSimilarityMatrix().to_csv("User_User_PCC_collaborative_filtering_similarity_matix.csv")

In [126]:
prediction_df = cf.predict_ratings(2)

ids = prediction_df.sort_values(by=['PredictedRating']).tail(10).index
movies = cf.getMovies(2, ids)

for movie in movies:
    print(f"Title : {movie[2]} , MovieID : {movie[0]} , Genres : {movie[3]}")

Title : Gate of Heavenly Peace, The (1995) , MovieID : 787 , Genres : Documentary
Title : I Am Cuba (Soy Cuba/Ya Kuba) (1964) , MovieID : 3245 , Genres : Drama
Title : Leather Jacket Love Story (1997) , MovieID : 1851 , Genres : Drama|Romance
Title : Identification of a Woman (Identificazione di una donna) (1982) , MovieID : 1360 , Genres : Drama
Title : Wirey Spindell (1999) , MovieID : 3228 , Genres : Comedy
Title : Trois (2000) , MovieID : 3291 , Genres : Thriller
Title : Foreign Student (1994) , MovieID : 572 , Genres : Drama
Title : Zachariah (1971) , MovieID : 3236 , Genres : Western
Title : Low Life, The (1994) , MovieID : 730 , Genres : Drama
Title : Loves of Carmen, The (1948) , MovieID : 3209 , Genres : Drama
