In [None]:
#MOVIE RECOMMENDATION SYSTEM

In [None]:
# Task 1: kNN-based Collaborative Filtering


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import random

# Step 1: Load and Preprocess the Data

# ratings.dat is '::'-delimited with four fields per row: user id, movie id,
# rating and timestamp. The file has no header row, so column names are supplied.
ratings = pd.read_csv(
    'ratings.dat',
    sep='::',
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    engine='python',
)

# The timestamp plays no part in the recommendation calculations, so discard it.
ratings.drop(columns='Timestamp', inplace=True)

# User-item matrix: one row per user, one column per movie, ratings as values
# (NaN wherever a user has not rated a movie).
user_item_matrix = ratings.pivot(index='UserID', columns='MovieID', values='Rating')

# Pick one user at random to act as the test user.
test_user = random.choice(user_item_matrix.index)
print(f"Test user selected: {test_user}")

# Step 2: Define a Function to Calculate Similarity

# Function to calculate similarity using either cosine or Pearson correlation
def calculate_similarity(user_item_matrix, metric='cosine'):
    """Build a user-by-user similarity table from the user-item matrix.

    Args:
        user_item_matrix: DataFrame with one row per user and one column per
            movie; missing ratings are NaN.
        metric: 'cosine' (missing ratings are treated as 0 before calling
            ``cosine_similarity``) or 'pearson' (``DataFrame.corr`` between
            user rows).

    Returns:
        DataFrame whose index and columns are both ``user_item_matrix.index``.

    Raises:
        ValueError: if ``metric`` is neither 'cosine' nor 'pearson'.
    """
    if metric not in ('cosine', 'pearson'):
        raise ValueError("Unknown metric. Use 'cosine' or 'pearson'.")

    users = user_item_matrix.index
    if metric == 'pearson':
        # corr() correlates columns, so transpose to put users on the columns.
        scores = user_item_matrix.T.corr()
    else:
        scores = cosine_similarity(user_item_matrix.fillna(0))

    # Label both axes with the user ids so callers can index by user.
    return pd.DataFrame(scores, index=users, columns=users)

# Step 3: Calculate Cosine and Pearson Similarity Matrices

# Cosine similarity between every pair of users
cosine_similarity_matrix = calculate_similarity(user_item_matrix, metric='cosine')

# Pearson correlation between every pair of users
pearson_similarity_matrix = calculate_similarity(user_item_matrix, metric='pearson')

# Step 4: Define a Function to Predict Ratings

# Function to predict the rating of a movie for a user using k-nearest neighbors
def predict_rating(user_item_matrix, similarity_matrix, user_id, movie_id, k):
    """Predict ``user_id``'s rating of ``movie_id`` from the k nearest raters.

    The neighbourhood is chosen among users who have actually rated the movie,
    so up to ``k`` ratings contribute whenever that many positively similar
    raters exist. Only neighbours with a strictly positive similarity are used:
    zero weights add nothing, and negative weights (possible with Pearson
    correlation) could make the weight sum zero or flip the sign of the average.

    Args:
        user_item_matrix: DataFrame of ratings (rows: users, columns: movies,
            NaN where unrated).
        similarity_matrix: DataFrame of user-user similarities indexed by user
            id on both axes.
        user_id: user to predict for; their own rating is never used.
        movie_id: movie to predict.
        k: maximum number of neighbours to use.

    Returns:
        Similarity-weighted average of the neighbours' ratings, or ``np.nan``
        when the movie is not a column of ``user_item_matrix`` or no positively
        similar user has rated it.
    """
    if movie_id not in user_item_matrix.columns:
        return np.nan

    # Everyone who rated this movie.
    movie_ratings = user_item_matrix[movie_id].dropna()

    # Similarities of all other users to the target user, restricted to raters.
    similarities = similarity_matrix[user_id].drop(user_id)
    candidates = similarities[similarities.index.isin(movie_ratings.index)]

    # NaN similarities compare False here and are dropped along with <= 0.
    candidates = candidates[candidates > 0]

    neighbours = candidates.nlargest(k)
    if neighbours.empty:
        return np.nan

    neighbour_ratings = movie_ratings.loc[neighbours.index]
    # All weights are > 0, so the denominator cannot be zero.
    return (neighbour_ratings * neighbours).sum() / neighbours.sum()

# Step 5: Generate Predictions for Different k Values

# Neighbourhood sizes to experiment with: k = 1 .. 49
k_values = range(1, 50)

# One prediction list per k for each similarity measure. Every list is filled in
# the same movie order as `actuals`, so positions line up across all of them.
predictions_cosine = {k: [] for k in k_values}
predictions_pearson = {k: [] for k in k_values}
actuals = []

# Predict each movie the test user has actually rated, for every k.
rated_by_test_user = user_item_matrix.loc[test_user].dropna()
for movie_id, actual_rating in rated_by_test_user.items():
    actuals.append(actual_rating)
    for k in k_values:
        # Cosine-based and Pearson-based predictions for this (movie, k) pair
        pred_cosine = predict_rating(user_item_matrix, cosine_similarity_matrix, test_user, movie_id, k)
        pred_pearson = predict_rating(user_item_matrix, pearson_similarity_matrix, test_user, movie_id, k)

        predictions_cosine[k].append(pred_cosine)
        predictions_pearson[k].append(pred_pearson)

# Step 6: Define a Function to Calculate RMSE

# Function to calculate RMSE (Root Mean Square Error) while filtering out NaN predictions
def calculate_rmse(predictions, actuals):
    """Return the RMSE between predictions and actuals, skipping NaN predictions.

    Args:
        predictions: predicted ratings; entries may be NaN.
        actuals: true ratings, positionally paired with ``predictions``.

    Returns:
        Root mean squared error over the pairs whose prediction is not NaN, or
        ``np.nan`` when no such pair exists or when the filtered predictions
        and filtered actuals end up with different lengths.
    """
    kept_predictions = []
    for pred in predictions:
        if not np.isnan(pred):
            kept_predictions.append(pred)

    # An actual is kept only when the prediction paired with it is usable.
    kept_actuals = []
    for actual, pred in zip(actuals, predictions):
        if not np.isnan(pred):
            kept_actuals.append(actual)

    if not kept_actuals or len(kept_actuals) != len(kept_predictions):
        return np.nan
    return np.sqrt(mean_squared_error(kept_actuals, kept_predictions))

# Step 7: Calculate and Store RMSE Values for Different k Values

# (k, RMSE) pairs for each similarity measure; a k whose RMSE is NaN is left out.
rmse_cosine_values = []
rmse_pearson_values = []

# Score every neighbourhood size against the test user's actual ratings.
for k in k_values:
    rmse_cosine = calculate_rmse(predictions_cosine[k], actuals)
    rmse_pearson = calculate_rmse(predictions_pearson[k], actuals)

    # Cosine result for this k
    cosine_is_valid = not np.isnan(rmse_cosine)
    if cosine_is_valid:
        rmse_cosine_values.append((k, rmse_cosine))
        print(f"RMSE (Cosine) for k={k}: {rmse_cosine}")

    # Pearson result for this k
    pearson_is_valid = not np.isnan(rmse_pearson)
    if pearson_is_valid:
        rmse_pearson_values.append((k, rmse_pearson))
        print(f"RMSE (Pearson) for k={k}: {rmse_pearson}")

# Step 8: Plot RMSE for Both Cosine and Pearson Similarity

# Plot RMSE vs k for both Cosine and Pearson similarity
plt.figure(figsize=(10, 6))

# zip(*pairs) cannot be unpacked into two names when `pairs` is empty (every RMSE
# for that measure was NaN), so a curve is only drawn for a measure that has at
# least one valid (k, RMSE) pair.
if rmse_cosine_values:
    k_values_plot_cosine, rmse_plot_cosine = zip(*rmse_cosine_values)
    plt.plot(k_values_plot_cosine, rmse_plot_cosine, marker='o', label='RMSE for Cosine Similarity')

if rmse_pearson_values:
    k_values_plot_pearson, rmse_plot_pearson = zip(*rmse_pearson_values)
    plt.plot(k_values_plot_pearson, rmse_plot_pearson, marker='o', label='RMSE for Pearson Correlation', linestyle='--')

plt.title('RMSE for Different k values (Cosine vs. Pearson)')
plt.xlabel('k (Number of Neighbors)')
plt.ylabel('RMSE')
plt.legend()
plt.grid(True)
plt.show()

# Show the individual predictions for one fixed neighbourhood size.
k = 20
rated_by_test_user = user_item_matrix.loc[test_user].dropna()

# Cosine-based predictions next to the test user's actual ratings
print(f"\nPredicted ratings using Cosine Similarity for k={k}:")
for movie_id, actual_rating in rated_by_test_user.items():
    pred_cosine = predict_rating(user_item_matrix, cosine_similarity_matrix, test_user, movie_id, k)
    print(f"Movie {movie_id}: Actual={actual_rating}, Predicted={pred_cosine}")

# Pearson-based predictions next to the test user's actual ratings
print(f"\nPredicted ratings using Pearson Correlation for k={k}:")
for movie_id, actual_rating in rated_by_test_user.items():
    pred_pearson = predict_rating(user_item_matrix, pearson_similarity_matrix, test_user, movie_id, k)
    print(f"Movie {movie_id}: Actual={actual_rating}, Predicted={pred_pearson}")

In [None]:
# Task 2: Matrix Factorization-based Recommendation

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
import random
import matplotlib.pyplot as plt

# Step 1: Load the Data
ratings = pd.read_csv('ratings.dat', sep='::', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python')
ratings.drop('Timestamp', axis=1, inplace=True)

# Create user-item matrix (rows: users, columns: movies, values: ratings)
user_item_matrix = ratings.pivot(index='UserID', columns='MovieID', values='Rating')

# Fill NaN values with the average rating for each user.
# DataFrame.fillna(Series) matches the Series index against the *column* labels,
# so passing the per-user means directly would fill movie column m with the mean
# of whichever user has ID m. Transposing puts users on the columns so that each
# user's own mean is used; the result is then transposed back.
user_means = user_item_matrix.mean(axis=1)
user_item_matrix_filled = user_item_matrix.T.fillna(user_means).T

# Step 2: Randomly Select 5 Movies (Test Set)
random.seed(42)  # fixed seed so the same five movies are chosen on every run
test_movies = random.sample(list(user_item_matrix.columns), 5)
print(f"Test Movies (IDs): {test_movies}")

# Step 3: Implement SVD for Matrix Factorization
def svd_matrix_factorization(user_item_matrix, n_components=20):
    """Approximate the rating matrix with a truncated SVD.

    Args:
        user_item_matrix: fully populated (no NaN) user-by-movie rating matrix.
        n_components: number of latent factors kept by ``TruncatedSVD``.

    Returns:
        The product of the fitted user factors and ``components_`` (the movie
        factors), i.e. the reconstructed approximation of the input matrix.
    """
    model = TruncatedSVD(n_components=n_components)

    # Rows of `latent_users` are users projected onto the latent factors.
    latent_users = model.fit_transform(user_item_matrix)
    latent_movies = model.components_

    return np.dot(latent_users, latent_movies)

# Step 4: Improve the SVD Model by Adding User and Item Biases
def svd_with_bias(user_item_matrix, n_components=20):
    """Truncated SVD on ratings after removing a baseline estimate.

    The baseline for a (user, movie) cell is
    ``global_mean + user_bias + item_bias``. The baseline is subtracted from
    every observed rating, missing cells are set to 0 (i.e. assumed to equal
    their baseline), the residual matrix is factorised, and the baseline is
    added back to the low-rank reconstruction.

    The global mean is part of the baseline on both sides. Leaving it out would
    put observed residuals around the global mean while missing cells sit at 0,
    biasing predictions for unrated cells low.

    Args:
        user_item_matrix: DataFrame of ratings (rows: users, columns: movies);
            may contain NaN for unrated cells.
        n_components: number of latent factors kept by ``TruncatedSVD``.

    Returns:
        2-D array with the same shape as ``user_item_matrix`` holding the
        predicted rating for every (user, movie) cell.
    """
    global_mean = user_item_matrix.stack().mean()

    # Deviation of each user's / movie's mean rating from the global mean.
    # A row or column with no ratings has an undefined mean; treat it as zero bias.
    user_biases = (user_item_matrix.mean(axis=1) - global_mean).fillna(0)
    item_biases = (user_item_matrix.mean(axis=0) - global_mean).fillna(0)

    # Residuals after removing the full baseline; unrated cells become 0.
    residuals = (
        user_item_matrix
        .sub(global_mean)
        .sub(user_biases, axis=0)
        .sub(item_biases, axis=1)
        .fillna(0)
    )

    # Perform SVD on the bias-adjusted matrix
    svd = TruncatedSVD(n_components=n_components)
    user_factors = svd.fit_transform(residuals)
    movie_factors = svd.components_

    # Reconstruct the residuals and add the baseline back. The user biases are
    # broadcast down the rows, the item biases across the columns.
    reconstructed_matrix = np.dot(user_factors, movie_factors)
    reconstructed_with_bias = (
        reconstructed_matrix
        + global_mean
        + user_biases.values[:, np.newaxis]
        + item_biases.values
    )

    return reconstructed_with_bias

# Step 5: Predict Ratings for All Users on the Selected Movies
def predict_ratings_for_movies(reconstructed_matrix, movies, user_item_matrix):
    """Return the predicted ratings of every user for the given movies.

    Args:
        reconstructed_matrix: 2-D array of predictions laid out like
            ``user_item_matrix`` (same row and column order).
        movies: movie ids (column labels) to keep.
        user_item_matrix: DataFrame supplying the user and movie labels.

    Returns:
        DataFrame indexed by user with one column per entry of ``movies``.
    """
    # Attach the user / movie labels to the raw prediction array.
    labelled_predictions = pd.DataFrame(
        reconstructed_matrix,
        index=user_item_matrix.index,
        columns=user_item_matrix.columns,
    )
    return labelled_predictions[movies]

# Step 6: Calculate RMSE for All Users on the Test Movies
def calculate_rmse_for_movies(user_item_matrix, reconstructed_matrix, movies):
    """RMSE between observed and predicted ratings for the given movies.

    Args:
        user_item_matrix: DataFrame of observed ratings (NaN where unrated).
        reconstructed_matrix: 2-D array of predictions laid out like
            ``user_item_matrix``.
        movies: movie ids (column labels) to evaluate on.

    Returns:
        Root mean squared error over every (user, movie) pair in ``movies``
        that has an observed rating.
    """
    # Long-format series of the ratings that actually exist for these movies.
    observed = user_item_matrix[movies].stack().dropna()

    labelled_predictions = pd.DataFrame(
        reconstructed_matrix,
        index=user_item_matrix.index,
        columns=user_item_matrix.columns,
    )
    # Keep only the predictions for the (user, movie) pairs that were observed.
    aligned_predictions = labelled_predictions[movies].stack().reindex(observed.index)

    mse = mean_squared_error(observed, aligned_predictions)
    return np.sqrt(mse)

# Step 7: Compare RMSE for Different Numbers of Components
# Numbers of latent factors to try: 5, 10, ..., 50
components = list(range(5, 55, 5))
rmse_svd_list = []
rmse_svd_bias_list = []

for n in components:
    # Plain SVD on the filled matrix, scored on the test movies
    reconstructed_matrix_svd = svd_matrix_factorization(user_item_matrix_filled, n_components=n)
    rmse_svd = calculate_rmse_for_movies(user_item_matrix, reconstructed_matrix_svd, test_movies)

    # Bias-adjusted SVD on the same filled matrix, scored the same way
    reconstructed_matrix_svd_bias = svd_with_bias(user_item_matrix_filled, n_components=n)
    rmse_svd_bias = calculate_rmse_for_movies(user_item_matrix, reconstructed_matrix_svd_bias, test_movies)

    rmse_svd_list.append(rmse_svd)
    rmse_svd_bias_list.append(rmse_svd_bias)

    # A positive improvement means the bias-adjusted model has the lower RMSE.
    print(f"Components: {n}, RMSE (SVD without bias): {rmse_svd}, RMSE (SVD with bias): {rmse_svd_bias}, Improvement: {rmse_svd - rmse_svd_bias}")

# Step 8: Visualize the Comparison of RMSE for Different Number of Components
# One curve per model, RMSE on the y-axis against the number of SVD components.
plt.figure(figsize=(10, 6))
# Solid blue line: plain SVD
plt.plot(components, rmse_svd_list, marker='o', label='SVD without bias', color='blue')
# Dashed green line: SVD with user/item biases
plt.plot(components, rmse_svd_bias_list, marker='o', label='SVD with bias', linestyle='--', color='green')
plt.title('RMSE Comparison for Different Number of Components (SVD vs SVD with Bias)')
plt.xlabel('Number of Components')
plt.ylabel('RMSE')
plt.legend()
plt.grid(True)
plt.show()


# Step 9: Print Predicted Ratings for All Users on Test Movies (SVD without bias and SVD with bias)
# Both reconstructed matrices are the ones assigned in the last iteration of the
# component loop above.
predicted_ratings_svd = predict_ratings_for_movies(
    reconstructed_matrix_svd, test_movies, user_item_matrix
)
predicted_ratings_svd_bias = predict_ratings_for_movies(
    reconstructed_matrix_svd_bias, test_movies, user_item_matrix
)

print("\nPredicted Ratings (SVD without bias) for all users on the selected test movies:")
print(predicted_ratings_svd)

print("\nPredicted Ratings (SVD with bias) for all users on the selected test movies:")
print(predicted_ratings_svd_bias)

# Task 3: Ranking-based Evaluation and Comparison

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score, ndcg_score
import random

# Step 1: Load and Preprocess the Data
def load_ratings_data(file_path):
    """Read a '::'-delimited ratings file and return the user-item matrix.

    Args:
        file_path: path to a header-less file whose rows are
            ``UserID::MovieID::Rating::Timestamp``.

    Returns:
        DataFrame with one row per UserID, one column per MovieID and the
        rating as the value (NaN where the user has not rated the movie).
    """
    column_names = ['UserID', 'MovieID', 'Rating', 'Timestamp']
    raw = pd.read_csv(file_path, sep='::', names=column_names, engine='python')

    # The timestamp is not used, so it is dropped before pivoting.
    without_timestamp = raw.drop(columns='Timestamp')
    return without_timestamp.pivot(index='UserID', columns='MovieID', values='Rating')

# Step 2: Randomly Select 10 Users Who Have Rated More Than 100 Movies
def select_test_users(user_item_matrix, num_users=10, min_ratings=100):
    """Randomly pick users who have rated more than ``min_ratings`` movies.

    Args:
        user_item_matrix: DataFrame of ratings (rows: users, columns: movies,
            NaN where unrated).
        num_users: how many users to draw.
        min_ratings: a user qualifies only with strictly more ratings than this.

    Returns:
        List of ``num_users`` distinct user ids drawn without replacement.

    Raises:
        ValueError: (from ``random.sample``) if fewer than ``num_users`` users
            qualify.
    """
    # Number of non-missing ratings in each user's row.
    ratings_per_user = user_item_matrix.notna().sum(axis=1)

    eligible_users = ratings_per_user[ratings_per_user > min_ratings].index

    # Honour the requested sample size instead of a fixed 10.
    return random.sample(list(eligible_users), num_users)

# Build the user-item matrix from the ratings file
file_path = 'ratings.dat'
user_item_matrix = load_ratings_data(file_path)

# Draw the users that the ranking evaluation will be run on
test_users = select_test_users(user_item_matrix, 10)
print(f"Selected Test Users: {test_users}")

# Step 3: Generate Top-20 Recommendations Using KNNCF
def get_top_n_recommendations_knn(user_item_matrix, similarity_matrix, user_id, N=20, k=10):
    """Rank a user's rated movies by their kNN-predicted rating and keep the top N.

    The candidates are the movies the user HAS rated (non-NaN cells of their
    row), not the unrated ones.

    Args:
        user_item_matrix: DataFrame of ratings (rows: users, columns: movies).
        similarity_matrix: user-user similarity DataFrame.
        user_id: user to produce the ranking for.
        N: maximum length of the returned list.
        k: neighbourhood size passed to ``predict_rating``.

    Returns:
        Up to N movie ids ordered by descending predicted rating; movies whose
        prediction is NaN are left out.
    """
    user_row = user_item_matrix.loc[user_id]
    rated_movie_ids = user_row[user_row.notna()].index

    scored = []
    for movie_id in rated_movie_ids:
        score = predict_rating(user_item_matrix, similarity_matrix, user_id, movie_id, k)
        if np.isnan(score):
            continue
        scored.append((movie_id, score))

    # Highest predicted rating first; ties keep their original order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [movie_id for movie_id, _ in scored[:N]]

# Step 4: Generate Top-20 Recommendations Using IMFR (Matrix Factorization with SVD)
def get_top_n_recommendations_imfr(reconstructed_matrix, user_id, N=20, ratings_matrix=None):
    """Rank a user's rated movies by their matrix-factorization prediction.

    The candidates are the movies the user HAS rated (non-NaN cells of their
    row in the rating matrix), not the unrated ones.

    Args:
        reconstructed_matrix: 2-D array of predicted ratings whose rows and
            columns are laid out like the rating matrix.
        user_id: user to produce the ranking for.
        N: maximum length of the returned list.
        ratings_matrix: user-item DataFrame supplying the labels and the
            rated/unrated mask. Defaults to the module-level
            ``user_item_matrix``.

    Returns:
        Up to N movie ids ordered by descending predicted rating.

    Raises:
        KeyError: if ``user_id`` is not in the rating matrix index.
    """
    matrix = user_item_matrix if ratings_matrix is None else ratings_matrix

    # Look the row up by label: `user_id - 1` is only right when the user ids
    # happen to be exactly 1..n in order.
    row_position = matrix.index.get_loc(user_id)
    predicted = pd.Series(reconstructed_matrix[row_position], index=matrix.columns)

    # Keep only the movies this user has rated.
    rated_mask = matrix.loc[user_id].notna()
    return predicted[rated_mask].nlargest(N).index.tolist()

# Step 5: Create Relevance Vector Based on Rating Threshold
def create_relevance_vector(user_ratings, recommendations, threshold=3):
    """Turn a list of recommended movies into 0/1 relevance labels.

    Args:
        user_ratings: Series of one user's ratings indexed by movie id.
        recommendations: movie ids in recommendation order.
        threshold: a rating greater than or equal to this counts as relevant.

    Returns:
        List with one entry per recommendation: 1 when the user has a non-NaN
        rating >= ``threshold`` for that movie, otherwise 0 (including movies
        missing from ``user_ratings``).
    """
    def is_relevant(movie_id):
        # Movies absent from the series have no rating and are not relevant.
        if movie_id not in user_ratings.index:
            return False
        rating = user_ratings[movie_id]
        return not np.isnan(rating) and rating >= threshold

    return [1 if is_relevant(movie_id) else 0 for movie_id in recommendations]

# Step 6: Calculate Ranking Metrics (AP and NDCG)
def calculate_average_precision(y_true, y_pred):
    """Average precision of the scores ``y_pred`` against binary labels ``y_true``.

    Thin wrapper that forwards both arguments, in this order, to
    ``average_precision_score``.
    """
    average_precision = average_precision_score(y_true, y_pred)
    return average_precision

def calculate_ndcg(y_true, y_pred, k=20):
    """NDCG at ``k`` for a single ranking.

    ``ndcg_score`` is called with one row per query, so the relevance labels
    and the scores are each wrapped in a one-element list.
    """
    true_relevance = [y_true]
    predicted_scores = [y_pred]
    return ndcg_score(true_relevance, predicted_scores, k=k)

# Step 7: Evaluate Models for Each Test User
def _ranking_metrics(user_ratings, ranked_movie_ids, k):
    """Return (average precision, NDCG@k) for one ranked list of movie ids."""
    relevance = create_relevance_vector(user_ratings, ranked_movie_ids, threshold=3)

    # With nothing relevant in the list (or an empty list) both metrics are
    # reported as 0 without calling the scorers.
    if not any(relevance):
        return 0.0, 0.0

    # The scorers order items by score, so the scores must decrease with rank
    # position: first recommendation gets the highest score, last gets 1.
    # Passing the movie ids themselves would rank items by id instead.
    rank_scores = list(range(len(ranked_movie_ids), 0, -1))

    average_precision = calculate_average_precision(relevance, rank_scores)
    ndcg = calculate_ndcg(relevance, rank_scores, k=k)
    return average_precision, ndcg


def evaluate_models_for_users(user_item_matrix, test_users, cosine_similarity_matrix, reconstructed_matrix_svd_bias, N=20):
    """Compare the KNNCF and IMFR rankings with AP and NDCG over the test users.

    For each test user a Top-N list is produced by each model, every listed
    movie is labelled relevant when the user's actual rating is >= 3, and the
    list is scored in its recommendation order.

    Args:
        user_item_matrix: DataFrame of ratings (rows: users, columns: movies).
        test_users: iterable of user ids to evaluate.
        cosine_similarity_matrix: user-user similarity DataFrame for KNNCF.
        reconstructed_matrix_svd_bias: predicted-rating array for IMFR.
        N: length of each recommendation list (also the NDCG cut-off).

    Returns:
        Tuple ``(mean AP KNNCF, mean NDCG KNNCF, mean AP IMFR, mean NDCG IMFR)``.
    """
    ap_knn_values = []
    ndcg_knn_values = []
    ap_imfr_values = []
    ndcg_imfr_values = []

    # Evaluate each test user
    for user_id in test_users:
        user_ratings = user_item_matrix.loc[user_id]

        # Generate Top-N lists from both models
        top_n_knn = get_top_n_recommendations_knn(user_item_matrix, cosine_similarity_matrix, user_id, N=N)
        top_n_imfr = get_top_n_recommendations_imfr(reconstructed_matrix_svd_bias, user_id, N=N)

        ap_knn, ndcg_knn = _ranking_metrics(user_ratings, top_n_knn, N)
        ap_imfr, ndcg_imfr = _ranking_metrics(user_ratings, top_n_imfr, N)

        # Store results
        ap_knn_values.append(ap_knn)
        ndcg_knn_values.append(ndcg_knn)
        ap_imfr_values.append(ap_imfr)
        ndcg_imfr_values.append(ndcg_imfr)

    return np.mean(ap_knn_values), np.mean(ndcg_knn_values), np.mean(ap_imfr_values), np.mean(ndcg_imfr_values)

# Step 8: Generate Similarity Matrices and Reconstructed Matrices
# Both models are built from the user-item matrix loaded above.
cosine_similarity_matrix = calculate_similarity(user_item_matrix, metric='cosine')
reconstructed_matrix_svd_bias = svd_with_bias(user_item_matrix, 20)

# Step 9: Evaluate the Recommendations for the Test Users
ranking_results = evaluate_models_for_users(
    user_item_matrix,
    test_users,
    cosine_similarity_matrix,
    reconstructed_matrix_svd_bias,
)
ap_knn, ndcg_knn, ap_imfr, ndcg_imfr = ranking_results

# Mean metrics over the test users, one line per model and metric
print(f"Average Precision (KNNCF): {ap_knn}")
print(f"NDCG (KNNCF): {ndcg_knn}")
print(f"Average Precision (IMFR): {ap_imfr}")
print(f"NDCG (IMFR): {ndcg_imfr}")
