In [2]:
import numpy as np 
import pandas as pd 

In [3]:
# # Read data from files
# ratings = pd.read_csv('./DATASET/ratings.dat', sep='::', engine='python', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
# users = pd.read_csv('./DATASET/users.dat', sep='::', engine='python', header=None, names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])
# movies = pd.read_csv('./DATASET/movies.dat', sep='::', engine='python', header=None, names=['MovieID', 'Title', 'Genres'], encoding='ISO-8859-1')
# ratings.head()
# users.head()
# movies.head()

In [5]:
# # Merge datasets
# merged_data = pd.merge(pd.merge(ratings, users, on='UserID'), movies, on='MovieID')

# # Save the merged dataset
# merged_data.to_csv('./DATASET/merged_dataset.csv', index=False)

# merged_data.head()

# Load the pre-merged dataset from disk (the commented-out cell above shows
# how this CSV was written: ratings joined with users and movies).
MERGED_DATASET_PATH = './DATASET/merged_dataset.csv'
merge_dataset = pd.read_csv(MERGED_DATASET_PATH)

In [6]:
# Build the UserID x MovieID rating table. Missing ratings are filled with 0,
# and the table is cast to float because dividing an integer NumPy array by
# floating-point values in place raised an error.
user_item_matrix = (
    merge_dataset
    .pivot_table(index='UserID', columns='MovieID', values='Rating', fill_value=0)
    .astype(float)
)

# Plain NumPy array holding the same ratings
user_item_matrix_np = user_item_matrix.values

# Cosine similarity between users:
#   1. all pairwise dot products between user rating vectors;
#   2. divide entry (i, j) by sqrt(dot(i, i) * dot(j, j)).
user_similarity = np.dot(user_item_matrix_np, user_item_matrix_np.T)
squared_norms = np.diag(user_similarity)  # each user's dot product with itself
np.divide(
    user_similarity,
    np.sqrt(np.outer(squared_norms, squared_norms)),
    out=user_similarity,
)

# Label the similarity matrix with user ids on both axes
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)

In [12]:
# Function to get the top N similar users for a given user
def get_top_similar_users(user_id, n=5):
    """Return the ``n`` users most similar to ``user_id``.

    Similarities are read from the ``user_id`` row of ``user_similarity_df``.
    The user is removed from the candidates *by label* before ranking, so the
    result never contains ``user_id`` itself, even when another user ties
    with (or, through rounding, exceeds) the user's self-similarity.

    Args:
        user_id: Row label of the user in ``user_similarity_df``.
        n: Maximum number of neighbours to return (default 5). Values below
            zero are treated as zero.

    Returns:
        A pandas Series indexed by user id holding the similarity scores,
        sorted from most to least similar, with at most ``n`` entries.

    Raises:
        KeyError: propagated from the label lookup when ``user_id`` is not in
            ``user_similarity_df``.
    """
    similarities = user_similarity_df.loc[user_id]

    # Skipping position 0 after sorting is unreliable: a tie at the top can
    # leave the user at another position, which would drop a real neighbour
    # and keep the user as their own neighbour.
    candidates = similarities.drop(labels=user_id)

    return candidates.sort_values(ascending=False).iloc[:max(n, 0)]

# Function to predict the rating for a movie for a given user
def predict_rating(user_id, movie_id, n=5):
    """Predict a user's rating for a movie from the ratings of similar users.

    The prediction is the similarity-weighted average of the ratings that the
    ``n`` most similar users gave to ``movie_id``. Neighbours who have not
    rated the movie are excluded from both the weighted sum and the
    normalising total. ``user_item_matrix`` is built with ``fill_value=0``,
    so a stored 0 means "no rating"; counting those neighbours in the
    denominator would pull every prediction toward 0.

    Args:
        user_id: Label of the user whose rating is being predicted.
        movie_id: Column label of the movie in ``user_item_matrix``.
        n: Number of nearest neighbours to consult (default 5).

    Returns:
        The predicted rating, or 0 when none of the neighbours has rated the
        movie (or the similarities of those who did sum to zero).

    Raises:
        KeyError: propagated from the label lookups when ``user_id`` or
            ``movie_id`` is not present.
    """
    similar_users = get_top_similar_users(user_id, n)

    # Ratings the neighbours gave this movie, in the same order as similar_users
    neighbour_ratings = user_item_matrix.loc[similar_users.index, movie_id]

    # Keep only neighbours that actually rated the movie (0 == unrated)
    has_rated = (neighbour_ratings != 0).to_numpy()
    weights = similar_users.to_numpy()[has_rated]
    ratings = neighbour_ratings.to_numpy()[has_rated]

    total_similarity = weights.sum()
    if total_similarity == 0:
        return 0  # no usable neighbour rating for this movie

    return np.dot(weights, ratings) / total_similarity



In [13]:
# Worked example: predict how UserID=2 would rate MovieID=1193
user_id_example = 2
movie_id_example = 1193
predicted_rating_example = predict_rating(user_id_example, movie_id_example)

# Look up the rating this user actually gave the movie in the dataset
is_example_row = (
    (merge_dataset['UserID'] == user_id_example)
    & (merge_dataset['MovieID'] == movie_id_example)
)
actual_rating = merge_dataset.loc[is_example_row, 'Rating'].values[0]

# Absolute difference between the prediction and the actual rating
error = abs(predicted_rating_example - actual_rating)

# Report the prediction, the actual rating and the error, one per line
print(
    f'Predicted rating for MovieID={movie_id_example} by UserID={user_id_example}: {predicted_rating_example}',
    f'Actual rating: {actual_rating}',
    f'Prediction error: {error}',
    sep='\n',
)

Predicted rating for MovieID=1193 by UserID=2: 4.804680162792959
Actual rating: 5
Prediction error: 0.19531983720704105
