In [None]:
import pandas as pd
import numpy as np
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold

# Read the MovieLens 100K ratings file: tab-separated, column names supplied here
url = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.data'
columns = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_csv(
    url,
    names=columns,
    sep='\t',
)

# Reshape the long ratings table into a user x item matrix:
# one row per user_id, one column per item_id, each cell holding the rating
# (cells with no rating for that user/item pair come out as NaN).
matrix = data.pivot_table(
    values='rating',
    index='user_id',
    columns='item_id',
)
print(matrix)

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold

# Define a function to perform user-based collaborative filtering with K-fold cross-validation
def user_based_cf(matrix, n_splits=5, K_values=[10, 20, 30, 40, 50]):
    """Cross-validate user-based k-NN collaborative filtering and print the MAE.

    The rows (users) of ``matrix`` are split into ``n_splits`` folds with
    ``KFold``.  For every held-out user the ``K`` training users with the
    highest cosine similarity are selected (similarity is computed on the
    rating vectors with missing ratings replaced by 0).  Each item is then
    predicted as the similarity-weighted average of the ratings given by
    those neighbours who actually rated that item.

    The MAE of a fold is taken only over (user, item) pairs where the
    held-out user has a real rating and at least one selected neighbour with
    a positive weight rated the item.  Missing ratings are never scored as if
    they were a rating of 0.

    Parameters
    ----------
    matrix : pandas.DataFrame
        User-item rating matrix, one row per user and one column per item,
        with NaN marking a missing rating.
    n_splits : int, default 5
        Number of cross-validation folds passed to ``KFold``.
    K_values : sequence of int
        Neighbourhood sizes to evaluate.  The sequence is only iterated,
        never modified.

    Returns
    -------
    None
        One ``Fold f, K=k: MAE = ...`` line is printed per fold and K,
        followed by one ``Overall, K=k: MAE = mean ± std`` line per K.

    Notes
    -----
    The held-out user's own ratings are used both to find the neighbours and
    as the evaluation targets, so the reported error is optimistic compared
    with a protocol that hides the target ratings before computing
    similarities.
    """
    kf = KFold(n_splits=n_splits)

    # mae_values[f][j] is the MAE of fold f for K_values[j]
    mae_values = []

    # enumerate() supplies the real fold number for the progress line
    for fold_number, (train_index, test_index) in enumerate(kf.split(matrix), start=1):
        train = matrix.iloc[train_index]
        test = matrix.iloc[test_index]

        # Keep "was rated" separate from the zero-filled values so that a
        # missing rating is never confused with a rating of 0.
        train_rated = train.notna().to_numpy(dtype=float)
        test_rated = test.notna().to_numpy()
        train_ratings = train.fillna(0).to_numpy(dtype=float)
        test_ratings = test.fillna(0).to_numpy(dtype=float)

        # Similarities do not depend on K: compute them (and the neighbour
        # ranking) once per fold instead of once per K and per user.
        similarities = cosine_similarity(test_ratings, train_ratings, dense_output=True)
        neighbor_order = np.argsort(similarities, axis=1)  # ascending similarity

        fold_mae_values = []
        for K in K_values:
            abs_error_total = 0.0
            n_scored = 0

            for row in range(test_ratings.shape[0]):
                # The last K entries of the ascending ranking are the K most similar users
                neighbor_indices = neighbor_order[row, -K:]
                weights = similarities[row, neighbor_indices]

                # Per item: sum of weight * rating over the neighbours, and the
                # sum of the weights of only those neighbours who rated it.
                weighted_sum = weights @ train_ratings[neighbor_indices]
                weight_total = weights @ train_rated[neighbor_indices]

                # Score an item only if the test user rated it and a prediction exists
                scorable = test_rated[row] & (weight_total > 0)
                if not scorable.any():
                    continue

                predicted = weighted_sum[scorable] / weight_total[scorable]
                abs_error_total += np.abs(test_ratings[row, scorable] - predicted).sum()
                n_scored += int(scorable.sum())

            # NaN (rather than a ZeroDivisionError) when nothing could be scored
            mae = float(abs_error_total / n_scored) if n_scored else float('nan')
            print(f"Fold {fold_number}, K={K}: MAE = {mae}")
            fold_mae_values.append(mae)

        mae_values.append(fold_mae_values)

    # Aggregate across folds: axis 0 is the fold axis, axis 1 the K axis
    mae_values = np.array(mae_values)
    mean_mae_values = np.mean(mae_values, axis=0)
    std_mae_values = np.std(mae_values, axis=0)

    for i, K in enumerate(K_values):
        print(f"Overall, K={K}: MAE = {mean_mae_values[i]} ± {std_mae_values[i]}")

# Usage example



In [None]:
# Evaluate plain user-based CF on the user-item matrix with the function's defaults
# (n_splits=5 and K_values=[10, 20, 30, 40, 50]); results are printed by the function.
user_based_cf(matrix)

## SIGNIFICANCE WEIGHTING

In [22]:
import warnings
# NOTE(review): no category or module is given, so this ignores every warning,
# and the filter stays installed for later cells too — consider a narrower
# filter or a `warnings.catch_warnings()` block around the noisy call.
warnings.filterwarnings('ignore')
def user_based_cf1(matrix, n_splits=5, K_values=[10, 20, 30, 40, 50]):
    """Cross-validate mean-centred, weight-adjusted user-based k-NN CF and print the MAE.

    The rows (users) of ``matrix`` are split into ``n_splits`` folds with
    ``KFold``.  For every held-out user the ``K`` training users with the
    highest cosine similarity are selected (similarity is computed on rating
    vectors with missing ratings replaced by 0).  Each neighbour's weight is
    its similarity multiplied by ``1 / (1 + std)``, where ``std`` is the
    sample standard deviation of that neighbour's observed ratings.  An item
    is predicted as::

        mean(test user) + sum(w_k * (r_k,item - mean_k)) / sum(w_k)

    with both sums running only over the neighbours who rated the item.

    The MAE of a fold is taken only over (user, item) pairs where the
    held-out user has a real rating and at least one selected neighbour with
    a positive weight rated the item.  The measured MAE is reported as is.

    Parameters
    ----------
    matrix : pandas.DataFrame
        User-item rating matrix, one row per user and one column per item,
        with NaN marking a missing rating.
    n_splits : int, default 5
        Number of cross-validation folds passed to ``KFold``.
    K_values : sequence of int
        Neighbourhood sizes to evaluate.  The sequence is only iterated,
        never modified.

    Returns
    -------
    None
        One ``K=k: MAE = ...`` line is printed per fold and K, followed by
        one ``Overall, K=k: MAE = mean ± std`` line per K.

    Notes
    -----
    The held-out user's own ratings are used for the similarity, for the
    user's mean and as the evaluation targets, so the reported error is
    optimistic compared with a protocol that hides the target ratings first.
    """
    # Mean of each user's observed ratings (pandas skips NaN by default)
    user_means = matrix.mean(axis=1)

    kf = KFold(n_splits=n_splits)

    # mae_values[f][j] is the MAE of fold f for K_values[j]
    mae_values = []

    for train_index, test_index in kf.split(matrix):
        train = matrix.iloc[train_index]
        test = matrix.iloc[test_index]

        # Keep "was rated" separate from the zero-filled values so that a
        # missing rating is never confused with a rating of 0.
        train_rated = train.notna().to_numpy(dtype=float)
        test_rated = test.notna().to_numpy()
        train_ratings = train.fillna(0).to_numpy(dtype=float)
        test_ratings = test.fillna(0).to_numpy(dtype=float)

        # Per-user statistics from observed ratings only.  Computing them on
        # the zero-filled matrix would mix the unrated zeros into the mean and
        # variance.  Users with too few ratings yield NaN; those are mapped to
        # 0 (mean 0 is harmless because such rows have no rated cells to
        # centre; variance 0 gives a weight factor of 1).
        train_user_means = train.mean(axis=1).fillna(0.0).to_numpy(dtype=float)
        train_user_variances = train.var(axis=1, ddof=1).fillna(0.0).to_numpy(dtype=float)
        train_significance_weights = 1 / (1 + np.sqrt(train_user_variances))

        # Mean-centred ratings, forced to 0 where the user did not rate the item
        train_deviations = (train_ratings - train_user_means[:, np.newaxis]) * train_rated

        # Positional lookup keeps each mean aligned with its row of `test`
        test_user_means = user_means.iloc[test_index].to_numpy(dtype=float)

        # Similarities do not depend on K: compute them (and the neighbour
        # ranking) once per fold instead of once per K and per user.
        similarities = cosine_similarity(test_ratings, train_ratings, dense_output=True)
        neighbor_order = np.argsort(similarities, axis=1)  # ascending similarity

        fold_mae_values = []
        for K in K_values:
            abs_error_total = 0.0
            n_scored = 0

            for row in range(test_ratings.shape[0]):
                # The last K entries of the ascending ranking are the K most similar users
                neighbor_indices = neighbor_order[row, -K:]

                # Plain numpy arrays throughout: indexing a pandas Series with
                # `[:, np.newaxis]` is not supported by recent pandas versions.
                weights = (similarities[row, neighbor_indices]
                           * train_significance_weights[neighbor_indices])

                # Per item: weighted sum of neighbour deviations, and the sum
                # of the weights of only those neighbours who rated it.
                deviation_sum = weights @ train_deviations[neighbor_indices]
                weight_total = weights @ train_rated[neighbor_indices]

                # Score an item only if the test user rated it and a prediction exists
                scorable = test_rated[row] & (weight_total > 0)
                if not scorable.any():
                    continue

                predicted = (test_user_means[row]
                             + deviation_sum[scorable] / weight_total[scorable])
                abs_error_total += np.abs(test_ratings[row, scorable] - predicted).sum()
                n_scored += int(scorable.sum())

            # Report the measured error unchanged; NaN when nothing could be scored
            mae = float(abs_error_total / n_scored) if n_scored else float('nan')
            print(f"K={K}: MAE = {mae}")
            fold_mae_values.append(mae)

        mae_values.append(fold_mae_values)

    # Aggregate across folds: axis 0 is the fold axis, axis 1 the K axis
    mae_values = np.array(mae_values)
    mean_mae_values = np.mean(mae_values, axis=0)
    std_mae_values = np.std(mae_values, axis=0)

    for i, K in enumerate(K_values):
        print(f"Overall, K={K}: MAE = {mean_mae_values[i]} ± {std_mae_values[i]}")


In [None]:
# Evaluate the weight-adjusted variant on the same matrix with the function's defaults
# (n_splits=5 and K_values=[10, 20, 30, 40, 50]); results are printed by the function.
user_based_cf1(matrix)