In [30]:
import numpy as np
import pandas as pd

In [31]:
class SimilarityMetric:
    """Interface for pairwise similarity measures between two rating vectors.

    Subclasses override ``calculateSimilarity``; this base version performs no
    computation and yields ``None``.
    """

    def calculateSimilarity(self, u, v, u_mean, v_mean):
        """Return the similarity of ``u`` and ``v``.

        Parameters:
        - u, v: the two vectors to compare
        - u_mean, v_mean: the mean associated with each vector (a subclass may
          ignore them)

        Returns:
        - None in this base class
        """
        return None

In [32]:
class CosineSimilarity(SimilarityMetric):
    """Cosine similarity in which missing entries (NaN) count as 0."""

    def calculateSimilarity(self, u, v, u_mean, v_mean):
        """Return the cosine of the angle between ``u`` and ``v``.

        Parameters:
        - u, v: numeric vectors; NaN entries are replaced with 0 first
        - u_mean, v_mean: accepted for interface compatibility, not used

        Returns:
        - dot(u, v) / (|u| * |v| + 1e-9)
        """
        left = np.nan_to_num(u, nan=0)
        right = np.nan_to_num(v, nan=0)

        # The small constant keeps the division defined for all-zero vectors.
        magnitude = np.linalg.norm(left) * np.linalg.norm(right)
        return np.dot(left, right) / (magnitude + 1e-9)

class PCCSimilarity(SimilarityMetric):
    """Mean-centred (Pearson-style) similarity; missing entries count as 0."""

    def calculateSimilarity(self, u, v, u_mean, v_mean):
        """Return the cosine of the two vectors after subtracting their means.

        Parameters:
        - u, v: numeric vectors, possibly containing NaN
        - u_mean, v_mean: value subtracted from every entry of ``u`` / ``v``

        Returns:
        - dot(u', v') / (|u'| * |v'| + 1e-9), where u' and v' are the centred
          vectors with NaN replaced by 0
        """
        # Centre first, then zero-fill: NaN entries stay NaN through the
        # subtraction and therefore end up contributing nothing.
        centred_u = np.nan_to_num(u - u_mean, nan=0)
        centred_v = np.nan_to_num(v - v_mean, nan=0)

        # The small constant keeps the division defined for all-zero vectors.
        scale = np.linalg.norm(centred_u) * np.linalg.norm(centred_v) + 1e-9
        return np.dot(centred_u, centred_v) / scale

In [33]:
class CollaborativeFiltering:
    """Common state and helpers for collaborative-filtering recommenders.

    Subclasses implement ``calculate_similarity_matrix`` and
    ``predict_ratings``; the versions here do nothing and return ``None``.
    """

    def __init__(self, data, metric: "SimilarityMetric", movies_path="movies.csv"):
        """
        Initialize CollaborativeFiltering object with user-item rating data.

        Parameters:
        - data: DataFrame containing user-item ratings (rows are looked up by
          user id, columns by movie id)
        - metric: SimilarityMetric used to compare rating vectors
        - movies_path: CSV file with at least the columns 'MovieID', 'Title'
          and 'Genres'; defaults to "movies.csv" in the working directory
        """
        self.data = data
        self.similarity_matrix = None
        self.metric = metric
        # Index by MovieID so getMovies can look titles up by label.
        self.movies = pd.read_csv(movies_path).set_index('MovieID')

    def calculate_similarity_matrix(self):
        """Compute and store ``self.similarity_matrix`` (no-op in the base class)."""
        pass

    def predict_ratings(self, user_id):
        """Predict ratings for ``user_id`` (no-op in the base class)."""
        pass

    def getMovies(self, user_id, movie_id_list):
        """
        Describe the given movies together with the user's own rating of each.

        Parameters:
        - user_id: row label of the user in ``self.data``
        - movie_id_list: iterable of movie ids (column labels of ``self.data``
          and index labels of ``self.movies``)

        Returns:
        - list of ``[movie_id, rating, title, genres]`` in the order given;
          ``rating`` is the value stored in ``self.data`` (NaN when unrated)
        """
        # Read the user's row once and index it directly. Wrapping the row in
        # pd.DataFrame(..., columns=['a']) selects a column named 'a' from a
        # Series named after the user id, which yields NaN for every rating.
        user_ratings = self.data.loc[user_id]

        recommendations = []
        for movie_id in movie_id_list:
            movie = self.movies.loc[movie_id]
            recommendations.append(
                [movie_id, user_ratings.loc[movie_id], movie['Title'], movie['Genres']]
            )

        return recommendations

    def getSimilarityMatrix(self):
        """Return the stored similarity matrix (``None`` until computed)."""
        return self.similarity_matrix

In [34]:
class CollaborativeFilteringItemItem(CollaborativeFiltering):
    """Item-item collaborative filtering over the columns of ``self.data``."""

    def __init__(self, data, metric:SimilarityMetric):
        """
        Parameters:
        - data: DataFrame of ratings; rows are looked up by user id, columns by
          movie id
        - metric: SimilarityMetric used to compare two item columns
        """
        super().__init__(data, metric)
        # Per-column mean, handed to the metric as u_mean / v_mean.
        self.means = self.data.mean(axis=0)

    def calculate_similarity_matrix(self):
        """
        Fill ``self.similarity_matrix`` with the metric value for every pair of
        item columns.

        The metric is evaluated once per unordered pair (including each item
        with itself) and mirrored, so the stored DataFrame is symmetric. Both
        its index and its columns are ``self.data.columns``.
        """
        item_ids = self.data.columns
        n_movies = self.data.shape[1]
        similarity_matrix = np.zeros((n_movies, n_movies))

        for i in range(n_movies):
            # Loop-invariant for the inner loop: fetch column i and its mean once.
            ratings_i = self.data.iloc[:, i]
            mean_i = self.means[item_ids[i]]

            for j in range(i + 1):
                value = self.metric.calculateSimilarity(
                    ratings_i,
                    self.data.iloc[:, j],
                    mean_i,
                    self.means[item_ids[j]],
                )
                similarity_matrix[i, j] = value
                similarity_matrix[j, i] = value

        self.similarity_matrix = pd.DataFrame(similarity_matrix, index=item_ids, columns=item_ids)

    def predict_ratings(self, user_id):
        """
        Predict ratings for items for a given user.

        For each item the prediction is the similarity-weighted average of the
        user's existing ratings of all *other* items:

            sum(sim * rating) / (sum(|sim|) + 1e-9)

        Items the user has already rated are predicted too.

        Parameters:
        - user_id: ID of the user for whom to predict ratings

        Returns:
        - DataFrame indexed by item id with one float column 'PredictedRating'

        Raises:
        - RuntimeError: if calculate_similarity_matrix() has not been run
        """
        if self.similarity_matrix is None:
            raise RuntimeError(
                "similarity matrix not computed; call calculate_similarity_matrix() first"
            )

        item_ids = self.data.columns
        user_ratings = self.data.loc[user_id].to_numpy(dtype=float)

        # Align by label (as scalar .loc lookups would) and work on a private
        # copy so the stored matrix is never modified.
        similarity = self.similarity_matrix.loc[item_ids, item_ids].to_numpy(dtype=float, copy=True)
        # An item must not vote for itself: a zero weight drops it from both
        # the numerator and the denominator.
        np.fill_diagonal(similarity, 0.0)

        # Only items the user has actually rated contribute.
        rated = ~np.isnan(user_ratings)
        rated_similarity = similarity[:, rated]

        numerator = rated_similarity @ user_ratings[rated]
        denominator = np.abs(rated_similarity).sum(axis=1)

        # The small constant keeps the division defined when no rated
        # neighbour has non-zero similarity.
        return pd.DataFrame(
            {'PredictedRating': numerator / (denominator + 1e-9)},
            index=item_ids,
        )
        

In [35]:
# Long-format ratings table; the pivot below relies on the columns
# 'UserID', 'MovieID' and 'Rating'.
data = pd.read_csv("EncodedCombined.csv")

# Wide user x movie matrix; user/movie pairs without a rating become NaN.
user_item_matrix = data.pivot(
    index='UserID',
    columns='MovieID',
    values='Rating',
)
# user_item_matrix_new = user_item_matrix.loc[0:1000,0:500]

  data=pd.read_csv("EncodedCombined.csv")


In [36]:
# metric = CosineSimilarity()
# # Create CollaborativeFiltering instance
# cf = CollaborativeFilteringItemItem(user_item_matrix_new, metric)

# cf.calculate_similarity_matrix()

# cf.getSimilarityMatrix()

In [37]:
# prediction_df = cf.predict_ratings(2)

# ids = prediction_df.sort_values(by=['PredictedRating']).tail(10).index
# movies = cf.getMovies(2, ids)

# for movie in movies:
#     print(f"Title : {movie[2]} , MovieID : {movie[0]} , Rating : {movie[1]} , Genres : {movie[3]}")

        PredictedRating

MovieID             

1316            4.28598

3290           4.347007

1709           4.347007

3209           4.384623

1815           4.563809

133            4.676238

642            4.854247

127            4.999999

3323           4.999999

3382           4.999999

In [38]:
# Item-item recommender over the full user-item matrix, using the
# mean-centred (PCC) similarity between item columns.
metric = PCCSimilarity()
cf = CollaborativeFilteringItemItem(data=user_item_matrix, metric=metric)

# Computes every pairwise item similarity.
cf.calculate_similarity_matrix()

# Left as a bare expression so the notebook renders the matrix.
cf.getSimilarityMatrix()

# Evaluate the model
# mse = cf.evaluate()
# print("Mean Squared Error:", mse)

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.075213,0.045759,0.072382,0.042931,0.019035,0.043848,0.022814,-0.003923,0.055628,...,-0.012756,0.000674,0.016965,0.011334,0.032400,0.044028,0.032424,0.013296,0.011609,0.028432
2,0.075213,1.000000,0.035494,0.011151,0.087786,0.026290,0.062671,0.056895,0.072512,0.105428,...,-0.012555,-0.008621,0.015643,0.010805,-0.005961,0.051580,0.006089,-0.012165,-0.006408,0.005518
3,0.045759,0.035494,1.000000,0.055360,0.131041,0.045253,0.070055,0.033114,0.026078,0.072420,...,0.002717,0.010599,-0.015799,0.015002,0.016092,0.058393,0.000644,0.026179,0.017762,0.014745
4,0.072382,0.011151,0.055360,1.000000,0.125895,0.002592,0.016793,0.007391,-0.048294,-0.001551,...,0.033399,0.015378,0.009584,0.014684,-0.005587,0.032913,-0.052771,0.010593,-0.011509,0.011976
5,0.042931,0.087786,0.131041,0.125895,1.000000,0.017884,0.082446,0.050699,0.084568,0.052407,...,-0.017954,0.027439,0.000280,0.014614,0.028859,0.067740,-0.016527,0.012363,0.000163,-0.003150
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.044028,0.051580,0.058393,0.032913,0.067740,0.022106,0.012014,0.028173,0.002223,0.044335,...,0.011368,0.015026,0.039762,0.050507,0.001428,1.000000,0.027872,0.005337,-0.021239,0.042972
3949,0.032424,0.006089,0.000644,-0.052771,-0.016527,0.011033,0.001405,-0.043032,-0.000663,0.006650,...,0.049955,0.062052,-0.027785,0.020629,0.028917,0.027872,1.000000,0.040611,0.136187,0.049344
3950,0.013296,-0.012165,0.026179,0.010593,0.012363,-0.020330,0.015483,0.072943,-0.006883,0.005883,...,-0.007589,0.057976,-0.068629,-0.097038,0.044449,0.005337,0.040611,1.000000,0.039772,0.066690
3951,0.011609,-0.006408,0.017762,-0.011509,0.000163,-0.005432,0.029534,-0.000114,0.000000,-0.006369,...,0.035248,0.112513,-0.039542,0.071272,0.021298,-0.021239,0.136187,0.039772,1.000000,0.110400


In [39]:
# Persist the item-item similarity matrix to CSV.
cf.getSimilarityMatrix().to_csv(
    path_or_buf="Item_Item_PCC_collaborative_filtering_similarity_matix.csv"
)

In [40]:
# Predicted ratings of every movie for user 2.
prediction_df = cf.predict_ratings(2)

# Ascending sort, so the last ten labels are the ten highest predictions.
ids = (
    prediction_df
    .sort_values(by='PredictedRating')
    .index[-10:]
)
movies = cf.getMovies(2, ids)

# Each entry is [movie id, user's rating, title, genres].
for movie in movies:
    print(f"Title : {movie[2]} , MovieID : {movie[0]}, Genres : {movie[3]}")

Title : Pawnbroker, The (1965) , MovieID : 3789, Genres : Drama
Title : Cinema Paradiso (1988) , MovieID : 1172, Genres : Comedy|Drama|Romance
Title : Rain Man (1988) , MovieID : 1961, Genres : Drama
Title : E.T. the Extra-Terrestrial (1982) , MovieID : 1097, Genres : Children's|Drama|Fantasy|Sci-Fi
Title : My Fair Lady (1964) , MovieID : 914, Genres : Musical|Romance
Title : King and I, The (1956) , MovieID : 2565, Genres : Musical
Title : Good Will Hunting (1997) , MovieID : 1704, Genres : Drama
Title : Shawshank Redemption, The (1994) , MovieID : 318, Genres : Drama
Title : Schindler's List (1993) , MovieID : 527, Genres : Drama|War
Title : Jeanne and the Perfect Guy (Jeanne et le garon formidable) (1998) , MovieID : 2591, Genres : Comedy|Romance
