# Task
Build a movie recommendation system on the MovieLens dataset that uses user-based collaborative filtering to generate recommendations for a target user, then summarize the implemented system.

In [3]:
import pandas as pd

# Download the dataset if not already present
!wget -nc https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip -n ml-latest-small.zip

# Read the MovieLens ratings table from the extracted archive directory.
ratings_path = 'ml-latest-small/ratings.csv'
ratings_df = pd.read_csv(ratings_path)

# Peek at the leading rows to confirm the columns parsed as expected.
ratings_preview = ratings_df.head()
print('First 5 rows of the DataFrame:')
print(ratings_preview)

# info() writes its own summary to stdout, so it is not wrapped in print().
print('\nDataFrame Info:')
ratings_df.info()

# Count nulls column by column so any gaps are visible before filtering.
missing_per_column = ratings_df.isnull().sum()
print('\nMissing values per column:')
print(missing_per_column)

--2026-02-03 14:26:37--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.96.204
Connecting to files.grouplens.org (files.grouplens.org)|128.101.96.204|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2026-02-03 14:26:37 (3.30 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  
First 5 rows of the DataFrame:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931

DataFrame Info:
<cl

In [4]:
import numpy as np

# Sparsity thresholds: keep users with at least 20 ratings and movies with
# at least 10 ratings. Very sparse rows/columns carry little signal for
# neighbour-based methods.
user_threshold = 20
movie_threshold = 10

# Tally how many ratings each user gave and each movie received.
# value_counts() orders the result from most to fewest ratings.
user_ratings_count = ratings_df['userId'].value_counts()
movie_ratings_count = ratings_df['movieId'].value_counts()

# Show both ends of each distribution to see how long the sparse tail is.
print('\nNumber of ratings per user (top 10):')
print(user_ratings_count.head(10))
print('\nNumber of ratings per user (bottom 10):')
print(user_ratings_count.tail(10))

print('\nNumber of ratings per movie (top 10):')
print(movie_ratings_count.head(10))
print('\nNumber of ratings per movie (bottom 10):')
print(movie_ratings_count.tail(10))

# Labels of the users / movies that meet their threshold.
active_users = user_ratings_count.loc[user_ratings_count.ge(user_threshold)].index
popular_movies = movie_ratings_count.loc[movie_ratings_count.ge(movie_threshold)].index

# A rating survives only if both its user and its movie were kept.
# Both masks are computed once from the original counts (single pass).
keep_user = ratings_df['userId'].isin(active_users)
keep_movie = ratings_df['movieId'].isin(popular_movies)
filtered_ratings_df = ratings_df.loc[keep_user & keep_movie]

# Before/after sizes to quantify what the filter removed.
print(f'\nOriginal DataFrame shape: {ratings_df.shape}')
print(f'Filtered DataFrame shape: {filtered_ratings_df.shape}')
print(f'Number of users before filtering: {ratings_df["userId"].nunique()}')
print(f'Number of users after filtering: {filtered_ratings_df["userId"].nunique()}')
print(f'Number of movies before filtering: {ratings_df["movieId"].nunique()}')
print(f'Number of movies after filtering: {filtered_ratings_df["movieId"].nunique()}')


Number of ratings per user (top 10):
userId
414    2698
599    2478
474    2108
448    1864
274    1346
610    1302
68     1260
380    1218
606    1115
288    1055
Name: count, dtype: int64

Number of ratings per user (bottom 10):
userId
406    20
576    20
569    20
595    20
207    20
442    20
278    20
147    20
320    20
53     20
Name: count, dtype: int64

Number of ratings per movie (top 10):
movieId
356     329
318     317
296     307
593     279
2571    278
260     251
480     238
110     237
589     224
527     220
Name: count, dtype: int64

Number of ratings per movie (bottom 10):
movieId
160684    1
173317    1
179135    1
184245    1
188675    1
188833    1
189381    1
3899      1
2848      1
147002    1
Name: count, dtype: int64

Original DataFrame shape: (100836, 4)
Filtered DataFrame shape: (81116, 4)
Number of users before filtering: 610
Number of users after filtering: 610
Number of movies before filtering: 9724
Number of movies after filtering: 2269


In [5]:
# Reshape the long ratings table into a wide grid: one row per user, one
# column per movie, cell = rating. User/movie pairs with no rating are NaN.
user_movie_matrix = filtered_ratings_df.pivot_table(
    values='rating',
    index='userId',
    columns='movieId',
)

# Only the top-left corner is printed; the full matrix is far too wide.
matrix_corner = user_movie_matrix.iloc[:5, :10]
print('User-Item Interaction Matrix (first 5 rows and a few columns):')
print(matrix_corner)

User-Item Interaction Matrix (first 5 rows and a few columns):
movieId   1   2    3   5    6   7   9   10  11  12
userId                                            
1        4.0 NaN  4.0 NaN  4.0 NaN NaN NaN NaN NaN
2        NaN NaN  NaN NaN  NaN NaN NaN NaN NaN NaN
3        NaN NaN  NaN NaN  NaN NaN NaN NaN NaN NaN
4        NaN NaN  NaN NaN  NaN NaN NaN NaN NaN NaN
5        4.0 NaN  NaN NaN  NaN NaN NaN NaN NaN NaN


In [6]:
# Replace missing ratings with 0 so the matrix is fully numeric.
# The NaN version is kept separately for telling "unrated" from "rated".
user_movie_matrix_filled = user_movie_matrix.fillna(value=0)

filled_corner = user_movie_matrix_filled.iloc[:5, :10]
print('User-Item Interaction Matrix after filling NaNs with zeros (first 5 rows and a few columns):')
print(filled_corner)

User-Item Interaction Matrix after filling NaNs with zeros (first 5 rows and a few columns):
movieId   1    2    3    5    6    7    9    10   11   12
userId                                                   
1        4.0  0.0  4.0  0.0  4.0  0.0  0.0  0.0  0.0  0.0
2        0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
3        0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
4        0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
5        4.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the zero-filled user rating vectors,
# wrapped directly in a DataFrame labelled by userId on both axes so that
# similarities can be looked up by user label rather than by position.
user_similarity_matrix = pd.DataFrame(
    cosine_similarity(user_movie_matrix_filled),
    index=user_movie_matrix.index,
    columns=user_movie_matrix.index,
)

similarity_corner = user_similarity_matrix.iloc[:5, :5]
print('User Similarity Matrix (first 5 rows and columns):')
print(similarity_corner)

User Similarity Matrix (first 5 rows and columns):
userId         1         2         3         4         5
userId                                                  
1       1.000000  0.029977  0.115634  0.220630  0.134869
2       0.029977  1.000000  0.000000  0.004256  0.017471
3       0.115634  0.000000  1.000000  0.004532  0.009302
4       0.220630  0.004256  0.004532  1.000000  0.139754
5       0.134869  0.017471  0.009302  0.139754  1.000000


In [8]:
def predict_rating(user_id, movie_id, user_movie_matrix, user_similarity_matrix, k=5):
    """Predict a user's rating for a movie from their most similar raters.

    The prediction is the similarity-weighted average of the ratings given
    to ``movie_id`` by the (at most) ``k`` users most similar to ``user_id``.
    Only users with strictly positive similarity are considered.

    Args:
        user_id: Label of the target user; looked up as a column of
            ``user_similarity_matrix``.
        movie_id: Label of the target movie; looked up as a column of
            ``user_movie_matrix``.
        user_movie_matrix: DataFrame of ratings indexed by user with one
            column per movie; NaN marks a missing rating.
        user_similarity_matrix: DataFrame of user-user similarities labelled
            by user on both axes.
        k: Maximum number of neighbours to average over.

    Returns:
        The predicted rating, or ``np.nan`` when no prediction can be made:
        the user or movie is unknown to the matrices, nobody other than the
        target user rated the movie, or no rater has positive similarity to
        the target user.
    """
    # Unknown labels (e.g. ids that only occur in a held-out set) cannot be
    # predicted; report that with NaN, like every other "no prediction" case,
    # instead of letting the column lookup raise KeyError.
    if movie_id not in user_movie_matrix.columns:
        return np.nan
    if user_id not in user_similarity_matrix.columns:
        return np.nan

    # Similarity of every user to the target user
    user_sims = user_similarity_matrix[user_id]

    # Ratings actually given to the target movie (NaNs removed)
    movie_ratings = user_movie_matrix[movie_id].dropna()

    # Raters other than the target user; the target's own rating must not
    # feed its own prediction.
    rated_by_others = movie_ratings.index.drop(user_id, errors='ignore')
    if rated_by_others.empty:
        return np.nan

    # Keep only raters with positive similarity; zero or negative similarity
    # does not contribute positively to the estimate.
    relevant_sims = user_sims.loc[rated_by_others]
    positive_sims = relevant_sims[relevant_sims > 0]
    if positive_sims.empty:
        return np.nan

    # The k most similar raters. They are drawn from movie_ratings.index, so
    # every one of them has a rating for this movie.
    sim_scores = positive_sims.sort_values(ascending=False).head(k)
    if sim_scores.empty:
        return np.nan

    neighbor_ratings = movie_ratings.loc[sim_scores.index]

    # Weighted average. All weights are strictly positive here, so the
    # denominator cannot be zero.
    return (sim_scores * neighbor_ratings).sum() / sim_scores.sum()

# Demo: predict one rating for user 1 on a movie they have not rated yet.
target_user_id = 1

# NaN cells in the user's row are exactly the movies they have not rated.
target_user_row = user_movie_matrix.loc[target_user_id]
unrated_movies_for_target_user = target_user_row[target_user_row.isna()]

if unrated_movies_for_target_user.empty:
    print(f"User {target_user_id} has rated all available movies or no unrated movies found with current filtering for prediction.")
else:
    # Use the first unrated movie (in column order) as the example.
    target_movie_id = unrated_movies_for_target_user.index[0]
    print(f"\nAttempting to predict rating for User {target_user_id} for Movie {target_movie_id}.")

    predicted_rating = predict_rating(
        target_user_id,
        target_movie_id,
        user_movie_matrix,
        user_similarity_matrix,
    )

    # NaN is the function's signal that no neighbour-based estimate exists.
    if np.isnan(predicted_rating):
        print(f"Could not predict rating for User {target_user_id} for Movie {target_movie_id} (not enough similar users or data).")
    else:
        print(f"Predicted rating for User {target_user_id} for Movie {target_movie_id}: {predicted_rating:.2f}")



Attempting to predict rating for User 1 for Movie 2.
Predicted rating for User 1 for Movie 2: 2.50


In [9]:
def generate_recommendations(user_id, user_movie_matrix, user_similarity_matrix, num_recommendations=10, k=5):
    """Rank the movies a user has not rated by their predicted rating.

    Args:
        user_id: Row label of the target user in ``user_movie_matrix``.
        user_movie_matrix: DataFrame of ratings (users x movies) with NaN
            for missing ratings.
        user_similarity_matrix: DataFrame of user-user similarities.
        num_recommendations: Maximum number of movies to return.
        k: Neighbourhood size forwarded to ``predict_rating``.

    Returns:
        A list of ``(movie_id, predicted_rating)`` tuples, highest predicted
        rating first, at most ``num_recommendations`` long. Movies for which
        ``predict_rating`` returns NaN are left out.
    """
    # Movies with a non-NaN entry in the user's row are already rated.
    already_rated = user_movie_matrix.loc[user_id].dropna().index

    predictions = {}
    for candidate in user_movie_matrix.columns:
        if candidate in already_rated:
            continue
        estimate = predict_rating(user_id, candidate, user_movie_matrix, user_similarity_matrix, k)
        # NaN means no neighbour-based estimate exists for this movie.
        if np.isnan(estimate):
            continue
        predictions[candidate] = estimate

    # Highest predicted rating first; the sort is stable, so ties keep
    # their column order.
    ranked = list(predictions.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:num_recommendations]

# Demo: produce a top-N list for one user (User 1).
target_user_id = 1
num_recommendations = 10

print(f"\nGenerating top {num_recommendations} recommendations for User {target_user_id}...")
recommendations = generate_recommendations(
    target_user_id,
    user_movie_matrix,
    user_similarity_matrix,
    num_recommendations=num_recommendations,
)

# An empty list means nothing could be scored for this user.
if not recommendations:
    print(f"Could not generate recommendations for User {target_user_id}. Either no unrated movies or not enough data for prediction.")
else:
    print(f"Top {num_recommendations} recommendations for User {target_user_id}:")
    for movie_id, predicted_rating in recommendations:
        print(f"  Movie ID: {movie_id}, Predicted Rating: {predicted_rating:.2f}")


Generating top 10 recommendations for User 1...
Top 10 recommendations for User 1:
  Movie ID: 1200, Predicted Rating: 4.80
  Movie ID: 541, Predicted Rating: 4.80
  Movie ID: 92535, Predicted Rating: 4.80
  Movie ID: 3030, Predicted Rating: 4.71
  Movie ID: 4973, Predicted Rating: 4.71
  Movie ID: 750, Predicted Rating: 4.70
  Movie ID: 7387, Predicted Rating: 4.61
  Movie ID: 1235, Predicted Rating: 4.61
  Movie ID: 1203, Predicted Rating: 4.60
  Movie ID: 1244, Predicted Rating: 4.60


In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the filtered ratings for evaluation. The fixed
# random_state keeps the split the same from run to run.
train_df, test_df = train_test_split(
    filtered_ratings_df,
    test_size=0.2,
    random_state=42,
)

train_shape = train_df.shape
test_shape = test_df.shape
print(f'Shape of training data: {train_shape}')
print(f'Shape of test data: {test_shape}')
print('Data split into training and testing sets.')

Shape of training data: (64892, 4)
Shape of test data: (16224, 4)
Data split into training and testing sets.


In [11]:
from sklearn.metrics import mean_squared_error

# Rebuild the user x movie matrix from the training ratings only, so that
# no held-out rating leaks into the similarities or the predictions.
user_movie_matrix_train = train_df.pivot_table(index='userId', columns='movieId', values='rating')

# Zero-fill missing ratings so cosine similarity can be computed.
user_movie_matrix_train_filled = user_movie_matrix_train.fillna(0)

# User-user cosine similarity on the training data, labelled by userId.
user_similarity_matrix_train = pd.DataFrame(
    cosine_similarity(user_movie_matrix_train_filled),
    index=user_movie_matrix_train.index,
    columns=user_movie_matrix_train.index
)

print('User-Movie Matrix for Training Data (first 5 rows and a few columns):')
print(user_movie_matrix_train.iloc[:5, :10])
print('\nUser Similarity Matrix for Training Data (first 5 rows and columns):')
print(user_similarity_matrix_train.iloc[:5, :5])

# Paired lists of true and predicted ratings for the rows that could be scored.
actual_ratings = []
predicted_ratings = []
# Test rows for which no prediction was possible. Tracked so the RMSE can be
# read together with how much of the test set it actually covers.
skipped_count = 0

print('\nMaking predictions on the test set...')
# itertuples keeps each column's own dtype (iterrows upcasts the whole row to
# float) and is considerably faster on a frame of this size.
for row in test_df.itertuples(index=False):
    user_id = int(row.userId)
    movie_id = int(row.movieId)
    actual_rating = row.rating

    # A user or movie that never appears in the training split has no row or
    # column to predict from; treat it like any other unpredictable case.
    predicted = np.nan
    if user_id in user_movie_matrix_train.index and movie_id in user_movie_matrix_train.columns:
        predicted = predict_rating(user_id, movie_id, user_movie_matrix_train, user_similarity_matrix_train)

    if np.isnan(predicted):
        skipped_count += 1
        continue

    actual_ratings.append(actual_rating)
    predicted_ratings.append(predicted)

# RMSE over the scored rows only.
if actual_ratings and predicted_ratings:
    rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))
    print(f'\nRoot Mean Squared Error (RMSE): {rmse:.4f}')
else:
    print('\nNo valid predictions could be made for the test set.')

# Report coverage: rows without a prediction are excluded from the RMSE above.
print(f'Predicted {len(predicted_ratings)} of {len(test_df)} test ratings ({skipped_count} skipped).')


User-Movie Matrix for Training Data (first 5 rows and a few columns):
movieId   1   2    3   5    6   7   9   10  11  12
userId                                            
1        4.0 NaN  4.0 NaN  4.0 NaN NaN NaN NaN NaN
2        NaN NaN  NaN NaN  NaN NaN NaN NaN NaN NaN
3        NaN NaN  NaN NaN  NaN NaN NaN NaN NaN NaN
4        NaN NaN  NaN NaN  NaN NaN NaN NaN NaN NaN
5        4.0 NaN  NaN NaN  NaN NaN NaN NaN NaN NaN

User Similarity Matrix for Training Data (first 5 rows and columns):
userId         1         2         3         4         5
userId                                                  
1       1.000000  0.018668  0.118326  0.183240  0.131963
2       0.018668  1.000000  0.000000  0.000000  0.000000
3       0.118326  0.000000  1.000000  0.004937  0.012565
4       0.183240  0.000000  0.004937  1.000000  0.081994
5       0.131963  0.000000  0.012565  0.081994  1.000000

Making predictions on the test set...

Root Mean Squared Error (RMSE): 0.9778


## Final Task

### Subtask:
Summarize the basic recommendation system built, including the algorithm used and the insights gained from the process.
