In [11]:
import pandas as pd
import numpy as np

Predicted rating for MovieID=1193 by UserID=2: 4.804680162792959
Actual rating: 5
Prediction error: 0.19531983720704105


In [12]:
def create_dataframe(column_names, file_name):
    """Load a '::'-delimited text file into a DataFrame.

    Parameters
    ----------
    column_names : list of str
        Labels assigned, in order, to the fields of every line.
    file_name : str
        Path of the file to read; its bytes are decoded as latin-1.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file. The 'UserID', 'MovieID' and 'Rating'
        columns, when present in ``column_names``, are converted with
        ``pd.to_numeric``; all other columns keep the raw string fields.
    """
    with open(file_name, 'r', encoding='latin-1') as handle:
        raw_text = handle.read()

    # Strip first so a trailing newline does not yield an empty final record.
    records = []
    for line in raw_text.strip().split('\n'):
        records.append(line.split('::'))
    frame = pd.DataFrame(records, columns=column_names)

    # Only the identifier and rating columns are made numeric.
    for numeric_column in ('UserID', 'MovieID', 'Rating'):
        if numeric_column in column_names:
            frame[numeric_column] = pd.to_numeric(frame[numeric_column])

    return frame

In [13]:
# Field names of each '::'-delimited data file, in the order they appear on a line.
ratings_columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
users_columns = ['UserID', 'Gender', 'Age', 'Occupation', 'ZipCode']
movies_columns = ['MovieID', 'Title', 'Genres']

# Read the three raw tables.
ratings_data_df = create_dataframe(column_names=ratings_columns, file_name='./DATASET/ratings.dat')
users_data_df = create_dataframe(column_names=users_columns, file_name='./DATASET/users.dat')
movies_data_df = create_dataframe(column_names=movies_columns, file_name='./DATASET/movies.dat')

# Attach the user fields to every rating, then the movie fields.
merged_df = (
    ratings_data_df
    .merge(users_data_df, on='UserID')
    .merge(movies_data_df, on='MovieID')
)

In [14]:
def predict_rating(user_id, movie_id):
    """Predict a user's rating for a movie from their most similar users.

    The prediction is the similarity-weighted mean of the ratings that the
    neighbours returned by ``get_top_similar_users(user_id)`` gave to
    ``movie_id``.

    Only neighbours who actually rated the movie take part in the average.
    This assumes ``user_item_matrix`` stores 0 for user/movie pairs without a
    rating (as a pivot built with ``fill_value=0`` does); counting those
    zeros as real ratings would drag every prediction toward 0.

    Parameters
    ----------
    user_id
        Row label of the target user in the similarity table.
    movie_id
        Column label of the movie in ``user_item_matrix``.

    Returns
    -------
    float or int
        The weighted-average rating, or 0 when no neighbour has rated the
        movie or the similarities of the neighbours who did sum to 0.
    """
    similar_users = get_top_similar_users(user_id)

    # Ratings the neighbours gave this movie, in the same order as the weights.
    similar_users_ratings = user_item_matrix.loc[similar_users.index, movie_id]

    # Work on plain arrays so the mask cannot be affected by label alignment.
    weights = similar_users.to_numpy(dtype=float)
    ratings = similar_users_ratings.to_numpy(dtype=float)

    # Keep only neighbours with a real rating (0 is the "not rated" filler).
    has_rated = ratings > 0
    weights = weights[has_rated]
    ratings = ratings[has_rated]

    total_similarity = weights.sum()
    if total_similarity == 0:
        return 0  # Nothing usable to average: no rated neighbours or zero weight

    return np.dot(weights, ratings) / total_similarity

In [15]:
def get_top_similar_users(user_id, n=5):
    """Return the ``n`` users most similar to ``user_id``, excluding the user.

    The user's own entry is removed by label before ranking. Skipping the
    first element after sorting is not reliable: when another user ties with
    the self-similarity, the sort may place that user first, which would drop
    a real neighbour and keep ``user_id`` as its own neighbour.

    Parameters
    ----------
    user_id
        Row label in ``user_similarity_df``.
    n : int, default 5
        Maximum number of neighbours to return.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by the other users' labels, sorted from
        most to least similar, with at most ``n`` entries.
    """
    similarities = user_similarity_df.loc[user_id]
    # errors='ignore' keeps this safe if the columns do not include user_id.
    neighbours = similarities.drop(labels=user_id, errors='ignore')
    return neighbours.sort_values(ascending=False).iloc[:n]

In [16]:
def convert_1indexed_df_to_matrix(df):
    """Return the values of ``df`` with an all-NaN row and column in front.

    A row labelled 0 and a column labelled 0, both filled with NaN, are added
    and moved to the first position, so the returned array has one more row
    and one more column than ``df``. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to pad; presumably its row and column labels start at 1, as the
        function name suggests (not checked here).

    Returns
    -------
    numpy.ndarray
        The padded values.
    """
    padded = df.copy()

    # Both additions land at the end of their axis.
    padded.loc[0] = np.nan
    padded[0] = np.nan

    # Bring column 0 to the front.
    other_columns = [label for label in padded.columns if label != 0]
    padded = padded[[0, *other_columns]]

    # Bring row 0 to the front: prepending 0 to the current labels lists it
    # twice (front and back), so the trailing copy is cut off afterwards.
    row_order = [0, *padded.index]
    padded = padded.reindex(row_order).iloc[:-1]

    return padded.to_numpy()

In [17]:
# Pivot the ratings into a users x movies table of floats; user/movie pairs
# without a rating are filled with 0.
user_item_matrix = pd.pivot_table(
    merged_df,
    values='Rating',
    index='UserID',
    columns='MovieID',
    fill_value=0,
).astype(float)

# Plain NumPy array of the table for the linear algebra below.
user_item_matrix_np = user_item_matrix.to_numpy()

# Cosine similarity between users: the pairwise dot products, divided by the
# product of the two users' vector norms. The diagonal of the dot-product
# matrix holds each user's squared norm, so the square root of the outer
# product of the diagonal with itself gives exactly those norm products.
user_similarity = np.dot(user_item_matrix_np, user_item_matrix_np.T)
user_similarity /= np.sqrt(
    np.multiply.outer(np.diag(user_similarity), np.diag(user_similarity))
)

# Label the similarity matrix with user IDs on both axes.
user_similarity_df = pd.DataFrame(
    user_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)

In [18]:
# Worked example: predict the rating UserID=2 would give MovieID=1193
user_id_example = 2
movie_id_example = 1193
predicted_rating_example = predict_rating(user_id=user_id_example, movie_id=movie_id_example)

# Look up the rating this user actually gave the movie in the dataset
actual_rating = merged_df.loc[
    (merged_df['UserID'] == user_id_example) & (merged_df['MovieID'] == movie_id_example),
    'Rating',
].iloc[0]

# Absolute difference between the prediction and the actual rating
error = abs(predicted_rating_example - actual_rating)

# Report prediction, ground truth and error, one per line
print(
    f'Predicted rating for MovieID={movie_id_example} by UserID={user_id_example}: {predicted_rating_example}',
    f'Actual rating: {actual_rating}',
    f'Prediction error: {error}',
    sep='\n',
)

Predicted rating for MovieID=1193 by UserID=2: 4.804680162792959
Actual rating: 5
Prediction error: 0.19531983720704105
