In [167]:
import numpy as np
import pandas as pd
import numpy.ma as ma

In [168]:
class SimilarityMetric:

    def calculateSimilarity(self, u, v):
        pass

In [169]:
class WeightedPCCSimilarity(SimilarityMetric):

    def __init__(self, weights):
        self.weights = weights

    def calculateSimilarity(self, u, v):
        return self.__weighted_pearson_correlation(u, v)
    
    def __weighted_pearson_correlation(self, x, y):
        """
        Calculate the weighted Pearson's correlation coefficient between two arrays x and y.

        Parameters:
        - x: First array of values
        - y: Second array of values
        - weights: Array of weights for each pair of values

        Returns:
        - Weighted Pearson's correlation coefficient
        """

        x_masked = ma.masked_invalid(x)
        y_masked = ma.masked_invalid(y)

        # Create a mask where both x_masked and y_masked are null
        combined_mask = ~x_masked.mask & ~y_masked.mask

        # Find the indices where both x_masked and y_masked are null
        non_null_indices = np.where(combined_mask)
                
        if len(non_null_indices) <= 2:  # Require at least 2 common items for correlation
            return 0

        # Compute weighted means
        weighted_mean_x = ma.average(x_masked, weights=self.weights)
        weighted_mean_y = ma.average(y_masked, weights=self.weights)

        # Compute weighted covariance
        weighted_covariance = ma.sum(self.weights * (x_masked - weighted_mean_x) * (y_masked - weighted_mean_y))

        # Compute weighted standard deviations
        weighted_std_x = ma.sqrt(ma.sum(self.weights * (x_masked - weighted_mean_x)**2))
        weighted_std_y = ma.sqrt(ma.sum(self.weights * (y_masked - weighted_mean_y)**2))

        # Compute weighted Pearson's correlation coefficient
        if weighted_std_x == 0 or weighted_std_y == 0:
            return 0  # Handle division by zero
        else:
            weighted_corr = weighted_covariance / (weighted_std_x * weighted_std_y)
            return weighted_corr

In [170]:
class CollaborativeFiltering:

    def __init__(self, data, metric:SimilarityMetric):
        """
        Initialize CollaborativeFiltering object with user-item rating data.
        
        Parameters:
        - data: DataFrame containing user-item ratings
        """
        self.data = data
        self.train_data = None
        self.test_data = None
        self.similarity_matrix = None
        self.metric = metric
    
    def calculate_similarity_matrix(self):
        pass

    def predict_ratings(self, user_id):
        pass

    def train_test_split(self, test_size = 0.2):
        """
        Split the data into training and test sets.
        
        Parameters:
        - test_size: Fraction of the data to be used for testing
        """
        np.random.seed(42)  # for reproducibility
        mask = np.random.rand(len(self.data)) < 1 - test_size
        self.train_data = self.data[mask]
        self.test_data = self.data[~mask]

    def evaluate(self):
        """
        Evaluate the Collaborative Filtering model on the test set.

        Returns:
        - Mean squared error (MSE) of the predictions
        """

        mse_sum = 0
        total_predictions = 0

        for user_id in self.test_data.index:
            user_test_ratings = self.test_data.loc[user_id]
            user_predicted_ratings = self.predict_ratings(user_id)
            for movie_id, actual_rating in user_test_ratings.items():
                predicted_rating = user_predicted_ratings.loc[movie_id, 'PredictedRating']
                if (not np.isnan(predicted_rating)) and (not np.isnan(actual_rating)):
                    total_predictions += 1
                    mse_sum += (actual_rating - predicted_rating) ** 2

        mse = mse_sum / total_predictions
        return mse

In [171]:
class CollaborativeFilteringUserUser(CollaborativeFiltering):
    def __init__(self, data, metric:SimilarityMetric):
        
        super().__init__(data, metric)

    def calculate_similarity_matrix(self):
        
        # user_item_matrix = self.train_data.pivot(index='UserID', columns='MovieID', values='Rating').fillna(0)

        n_users = self.data.shape[0]
        similarity_matrix = np.zeros((n_users, n_users))
        for i in range(n_users):
            for j in range(i+1):
                temp = self.metric.calculateSimilarity(self.data.iloc[i, :], self.data.iloc[j, :])
                similarity_matrix[i, j] = temp
                similarity_matrix[j, i] = temp
                
        self.similarity_matrix = pd.DataFrame(similarity_matrix, index=self.data.index, columns=self.data.index)

    def predict_ratings(self, user_id):
        """
        Predict ratings for items for a given user.

        Parameters:
        - user_id: ID of the user for whom to predict ratings

        Returns:
        - DataFrame containing predicted ratings for each item
        """

        predicted_ratings = pd.DataFrame(index=self.data.columns, columns=['PredictedRating'])

        for movie_id in predicted_ratings.index:
            numerator = 0
            denominator = 0

            for other_user_id in self.data.index:
                other_user_rating = self.data.loc[other_user_id, movie_id]
                similarity = self.similarity_matrix.loc[user_id , other_user_id]

                if not np.isnan(other_user_rating):
                    numerator += similarity * (other_user_rating)
                    denominator += abs(similarity)

            predicted_ratings.loc[movie_id, 'PredictedRating'] = numerator / (denominator + 1e-9) 
            
        return predicted_ratings
    
    def getSimilarityMatrix(self):
        return self.similarity_matrix

In [172]:
class WeightsProvider:
    
    def getWeightsArray(self):
        pass

In [173]:
class IDFWeightsProvider(WeightsProvider):
    
    def __init__(self, data):

        arrayM = np.full((data.shape[1],), data.shape[1])
        watched = np.count_nonzero(data, axis=0)

        self.weights = np.log(arrayM / (watched + 1e-9))
    
    def getWeightsArray(self):
        return self.weights

In [174]:
class VarianceWeightsProvider(WeightsProvider):
    
    def __init__(self, data):

        temp = data.replace(0, np.NaN)
        self.weights = temp.var(axis=1)
    
    def getWeightsArray(self):
        return self.weights

In [175]:
data=pd.read_csv("EncodedCombined.csv")

user_item_matrix = data.pivot(index='UserID', columns='MovieID', values='Rating')

# user_item_matrix_new = user_item_matrix.iloc[0:2000, 0:1500]

  data=pd.read_csv("EncodedCombined.csv")


In [176]:
WeightProvider = IDFWeightsProvider(user_item_matrix)

metric = WeightedPCCSimilarity(WeightProvider.getWeightsArray())
cf = CollaborativeFilteringUserUser(user_item_matrix, metric)

cf.train_test_split(test_size=0.2)
cf.calculate_similarity_matrix()

cf.getSimilarityMatrix()

# Evaluate the model
# mse = cf.evaluate()
# print("Mean Squared Error:", mse)

UserID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [177]:
prediction_df = cf.predict_ratings(2)
print(prediction_df.sort_values(by=['PredictedRating']).tail(10))

        PredictedRating
MovieID                
1332                0.0
1333                0.0
1334                0.0
1335                0.0
1336                0.0
1337                0.0
1339                0.0
1340                0.0
1342                0.0
3952                0.0


In [178]:
WeightProvider = VarianceWeightsProvider(user_item_matrix)
# print(WeightProvider.getWeightsArray())

metric = WeightedPCCSimilarity(WeightProvider.getWeightsArray())
cf = CollaborativeFilteringUserUser(user_item_matrix, metric)

cf.train_test_split(test_size=0.2)
cf.calculate_similarity_matrix()

cf.getSimilarityMatrix()
# # Evaluate the model
# mse = cf.evaluate()
# print("Mean Squared Error:", mse)

In [None]:
prediction_df = cf.predict_ratings(2)
print(prediction_df.sort_values(by=['PredictedRating']).tail(10))