In [None]:
# Download dataset directly and set up environment
!wget https://files.grouplens.org/datasets/movielens/ml-25m.zip
!unzip ml-25m.zip

# Import all necessary libraries upfront to avoid import issues
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import time

# Location of the extracted MovieLens folder, relative to the notebook
dataset_path = './ml-25m'
print(f"Using dataset path: {dataset_path}")

# Report whether the folder is there; list its contents when it is
if not os.path.exists(dataset_path):
    print(f"Dataset not found at {dataset_path}. Please check the path.")
else:
    print(f"Dataset found at {dataset_path}")
    print("Files in directory:", os.listdir(dataset_path))

In this cell, I downloaded the MovieLens 25M dataset from the source and extracted it. I imported all the necessary libraries that will be used throughout the notebook, including pandas for data handling, numpy for numerical operations, and scikit-learn for implementing SVD. I set the path to the extracted dataset folder and verified that it exists with the expected files.

In [None]:
# Test all imported libraries
import matplotlib
import sklearn

# Print library versions so a reader can see which environment produced the outputs
print("Testing imports...")
print("Pandas version:", pd.__version__)
print("NumPy version:", np.__version__)
print("Matplotlib version:", matplotlib.__version__)
print("Scikit-learn version:", sklearn.__version__)

# Test if other libraries are imported properly
print("SciPy sparse module available:", "Yes" if hasattr(sparse, "csr_matrix") else "No")
print("Seaborn available:", "Yes" if sns is not None else "No")
print("All imports loaded successfully!")

# Test a basic plot to ensure matplotlib works
plt.figure(figsize=(5,3))
plt.plot([1, 2, 3, 4], [1, 4, 9, 16])
plt.title('Test Plot')
plt.xlabel('x')
# The label text had been mis-decoded; it is meant to read "x squared"
plt.ylabel('x²')
plt.grid(True)
plt.show()

In [None]:
# Import necessary libraries for this cell
import pandas as pd
import os
import numpy as np

# Read the two CSV files used by the rest of the notebook
ratings = pd.read_csv(os.path.join(dataset_path, 'ratings.csv'))
movies = pd.read_csv(os.path.join(dataset_path, 'movies.csv'))
loaded_frames = (("Ratings", ratings), ("Movies", movies))

# Shapes of both tables
for label, frame in loaded_frames:
    print(f"{label} dataset shape: {frame.shape}")

# First rows of each table
for label, frame in loaded_frames:
    print(f"\n{label} dataset preview:")
    display(frame.head())

# Per-column count of missing values
for label, frame in loaded_frames:
    print(f"\nMissing values in {label.lower()} dataset:")
    print(frame.isnull().sum())

In this cell, I loaded the ratings and movies datasets from the MovieLens files. The ratings dataset contains user ratings for different movies on a scale of 0.5 to 5 stars, while the movies dataset contains information about each movie including titles and genres. I displayed previews of both datasets to understand their structure and checked for any missing values that might need handling.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Basic statistics for ratings
print("Ratings statistics:")
ratings_stats = ratings['rating'].describe()
display(ratings_stats)

# Visualize the rating distribution.
# Ratings are half-star values from 0.5 to 5.0 (ten levels). Nine equal-width bins
# over that range would count 4.5 and 5.0 together in the closed last bin, so use
# explicit edges that give every rating level its own bin.
rating_bin_edges = np.arange(0.25, 5.5, 0.5)
plt.figure(figsize=(10, 6))
sns.histplot(ratings['rating'], bins=rating_bin_edges, kde=True)
plt.title('Distribution of Movie Ratings')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# Check the number of ratings per user
user_ratings_count = ratings['userId'].value_counts()

# log_scale=True makes seaborn compute the 50 bins (and the KDE) in log space.
# Binning linearly and only log-scaling the axis afterwards squeezes almost all
# users into the first few bars.
plt.figure(figsize=(10, 6))
sns.histplot(user_ratings_count, bins=50, kde=True, log_scale=True)
plt.title('Distribution of Ratings per User')
plt.xlabel('Number of Ratings')
plt.ylabel('Number of Users')
plt.grid(True)
plt.show()

print(f"Median number of ratings per user: {user_ratings_count.median()}")
print(f"Mean number of ratings per user: {user_ratings_count.mean():.2f}")
print(f"Min number of ratings per user: {user_ratings_count.min()}")
print(f"Max number of ratings per user: {user_ratings_count.max()}")

I analyzed the basic statistics and distribution of the movie ratings data. The first visualization shows how ratings are distributed across the 0.5-5 star scale, helping understand user rating patterns. The second visualization shows how many ratings each user has made, revealing that most users rate relatively few movies while some power users rate thousands. This analysis helps inform how I'll filter the data in the next step.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Filter users who have rated at least 100 movies
min_ratings = 100
user_counts = ratings['userId'].value_counts()
active_users = user_counts[user_counts >= min_ratings].index

# Filter ratings for active users
filtered_ratings = ratings[ratings['userId'].isin(active_users)]

print(f"Original ratings shape: {ratings.shape}")
print(f"Filtered ratings shape: {filtered_ratings.shape}")
print(f"Kept {len(active_users)} users out of {len(user_counts)} total users")
print(f"Kept {filtered_ratings.shape[0]} ratings out of {ratings.shape[0]} total ratings")
print(f"Percentage of data kept: {100 * filtered_ratings.shape[0] / ratings.shape[0]:.2f}%")

# Plot the rating distribution after filtering.
# Ratings are half-star values from 0.5 to 5.0 (ten levels). Nine equal-width bins
# would count 4.5 and 5.0 together in the closed last bin, so give each level its own bin.
rating_bin_edges = np.arange(0.25, 5.5, 0.5)
plt.figure(figsize=(10, 6))
sns.histplot(filtered_ratings['rating'], bins=rating_bin_edges, kde=True)
plt.title('Distribution of Movie Ratings (After Filtering)')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# Further reduce dataset size for memory efficiency
print("\nFurther reducing dataset size for memory efficiency...")

# Sample users (seeded so the same users are drawn on every run)
np.random.seed(42)
max_users = 5000  # Maximum number of users to include
candidate_users = filtered_ratings['userId'].unique()  # computed once, reused for the size cap
sample_users = np.random.choice(
    candidate_users,
    size=min(max_users, len(candidate_users)),
    replace=False
)
filtered_ratings_sample = filtered_ratings[filtered_ratings['userId'].isin(sample_users)]

# Focus on popular movies: counts are taken within the sampled users only
min_movie_ratings = 50  # Movies must have at least this many ratings
movie_counts = filtered_ratings_sample['movieId'].value_counts()
popular_movies = movie_counts[movie_counts >= min_movie_ratings].index
filtered_ratings_sample = filtered_ratings_sample[filtered_ratings_sample['movieId'].isin(popular_movies)]

print(f"Working with {len(filtered_ratings_sample['userId'].unique())} users")
print(f"Working with {len(filtered_ratings_sample['movieId'].unique())} movies")
print(f"Total ratings: {len(filtered_ratings_sample)}")

I filtered the ratings down to active users, keeping only those who rated at least 100 movies, and re-plotted the rating distribution to check that filtering did not distort it. To keep memory usage manageable, I then drew a reproducible random sample of at most 5,000 of these users and kept only the movies that received at least 50 ratings from the sampled users. The resulting subset is the data used in the rest of the notebook.

In [None]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import time
from sklearn.model_selection import train_test_split
# Build the sparse user-item matrices (full sample and training split)
print("Creating sparse user-item matrix...")
start_time = time.time()

# Distinct ids in order of first appearance; their position becomes the matrix index
users = filtered_ratings_sample['userId'].unique()
movies_subset = filtered_ratings_sample['movieId'].unique()

# Forward (id -> index) and reverse (index -> id) lookups
userid_to_idx = dict(zip(users, range(len(users))))
movieid_to_idx = dict(zip(movies_subset, range(len(movies_subset))))
idx_to_userid = dict(enumerate(users))
idx_to_movieid = dict(enumerate(movies_subset))

# Rows are users, columns are movies
matrix_shape = (len(userid_to_idx), len(movieid_to_idx))

# Translate every rating's ids into matrix coordinates
row_indices = list(map(userid_to_idx.__getitem__, filtered_ratings_sample['userId']))
col_indices = list(map(movieid_to_idx.__getitem__, filtered_ratings_sample['movieId']))
ratings_data = filtered_ratings_sample['rating'].values

# Assemble in COO form, then convert to CSR
sparse_matrix = sparse.coo_matrix(
    (ratings_data, (row_indices, col_indices)),
    shape=matrix_shape
).tocsr()

print(f"Matrix creation completed in {time.time() - start_time:.2f} seconds")
print(f"Sparse matrix shape: {sparse_matrix.shape}")

# Fraction of user/movie cells that hold no rating
n_cells = sparse_matrix.shape[0] * sparse_matrix.shape[1]
sparsity = 1.0 - (sparse_matrix.count_nonzero() / n_cells)
print(f"Matrix sparsity: {sparsity:.4f} (or {sparsity*100:.2f}% empty)")

# Hold out 20% of the individual ratings for evaluation
train_data, test_data = train_test_split(filtered_ratings_sample, test_size=0.2, random_state=42)
print(f"Training set: {train_data.shape}")
print(f"Testing set: {test_data.shape}")

# Training matrix: same shape and index mappings, but only the training ratings
train_row_indices = list(map(userid_to_idx.__getitem__, train_data['userId']))
train_col_indices = list(map(movieid_to_idx.__getitem__, train_data['movieId']))
train_ratings = train_data['rating'].values

train_sparse = sparse.coo_matrix(
    (train_ratings, (train_row_indices, train_col_indices)),
    shape=matrix_shape
).tocsr()

print(f"Training sparse matrix shape: {train_sparse.shape}")

I created a memory-efficient sparse user-item matrix where rows represent users, columns represent movies, and values represent ratings. Instead of using a dense matrix that would waste memory on storing zeros, I used a sparse representation that only stores non-zero values. I also created mappings between original IDs and matrix indices for later use. The data was split into training (80%) and testing (20%) sets to evaluate the model's performance.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
import time
# Factorise the training matrix with truncated SVD
print("Applying Truncated SVD...")
n_components = 100  # Number of latent factors
start_time = time.time()

# Rows of train_sparse are users, so the transform yields one factor row per user;
# the transposed components give one factor row per movie column.
svd = TruncatedSVD(n_components=n_components, random_state=42)
user_factors = svd.fit_transform(train_sparse)
item_factors = svd.components_.T

elapsed = time.time() - start_time
print(f"SVD completed in {elapsed:.2f} seconds")
print(f"User factors shape: {user_factors.shape}")
print(f"Item factors shape: {item_factors.shape}")

# Total share of variance captured by the fitted components
variance_ratios = svd.explained_variance_ratio_
explained_variance = variance_ratios.sum()
print(f"Explained variance by {n_components} components: {explained_variance:.4f}")

# Cumulative explained variance, with an 80% reference line
plt.figure(figsize=(10, 6))
plt.plot(variance_ratios.cumsum())
plt.axhline(y=0.8, color='r', linestyle='--', label='80% Threshold')
plt.title('Explained Variance by SVD Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.legend()
plt.show()

# Contribution of each individual component
plt.figure(figsize=(12, 6))
plt.bar(range(n_components), variance_ratios)
plt.title('Individual Explained Variance by SVD Components')
plt.xlabel('Component Index')
plt.ylabel('Explained Variance Ratio')
plt.grid(True)
plt.show()

I applied Singular Value Decomposition (SVD) to extract latent factors from the user-item matrix. This technique reduces dimensionality and identifies hidden patterns that represent users' preferences and movie characteristics. The 100 components captured about 37% of the variance in the data. The first visualization shows the cumulative explained variance as components are added, while the second shows the individual contribution of each component.

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# Function to get movie recommendations for a user
def get_recommendations(user_id, n_recommendations=10):
    """Return the highest-scoring movies a user has not rated in the training data.

    Every movie is scored by the dot product of the user's row in
    ``user_factors`` with the movie rows of ``item_factors``. Movies the user
    rated in ``train_data`` are removed before ranking.

    Args:
        user_id: Original user id; must be a key of ``userid_to_idx``.
        n_recommendations: Number of top-scored movies to keep.

    Returns:
        DataFrame with columns ``movieId``, ``title``, ``genres`` and
        ``predicted_rating``, or ``None`` (after printing a message) when the
        user is not in the sampled dataset.
    """
    if user_id not in userid_to_idx:
        print(f"User {user_id} not found in the sampled dataset.")
        return None

    # Score all movies at once: (1 x k) user vector times (k x n_movies)
    latent_row = user_factors[userid_to_idx[user_id], :].reshape(1, -1)
    scores = np.dot(latent_row, item_factors.T).flatten()

    # Pair each matrix column with its original movie id
    candidates = pd.DataFrame({
        'movieId': [idx_to_movieid[pos] for pos in range(len(idx_to_movieid))],
        'predicted_rating': scores
    })

    # Movies already rated by this user in the training split
    seen = train_data.loc[train_data['userId'] == user_id, 'movieId'].unique()

    # Drop seen movies, rank, cut to N, then attach title/genres
    ranked = (
        candidates[~candidates['movieId'].isin(seen)]
        .sort_values('predicted_rating', ascending=False)
        .head(n_recommendations)
        .merge(movies, on='movieId')
    )

    return ranked[['movieId', 'title', 'genres', 'predicted_rating']]

# Function to find similar movies
def get_similar_movies(movie_id, n_similar=10):
    """Return the movies whose latent factors are closest to a given movie.

    Closeness is the cosine similarity between the movie's row in
    ``item_factors`` and every other row.

    Args:
        movie_id: Original movie id; must be a key of ``movieid_to_idx``.
        n_similar: Number of most-similar movies to keep.

    Returns:
        DataFrame with columns ``movieId``, ``title``, ``genres`` and
        ``similarity``, or ``None`` (after printing a message) when the movie
        is not in the sampled dataset.
    """
    if movie_id not in movieid_to_idx:
        print(f"Movie {movie_id} not found in the sampled dataset.")
        return None

    # Query vector as a 1 x k matrix, compared against all movie vectors
    query = item_factors[movieid_to_idx[movie_id], :].reshape(1, -1)
    scores = cosine_similarity(query, item_factors)

    # Pair each matrix row with its original movie id
    scored = pd.DataFrame({
        'movieId': [idx_to_movieid[pos] for pos in range(len(idx_to_movieid))],
        'similarity': scores[0]
    })

    # Exclude the query movie itself, rank, cut to N, then attach title/genres
    neighbours = (
        scored[scored['movieId'] != movie_id]
        .sort_values('similarity', ascending=False)
        .head(n_similar)
        .merge(movies, on='movieId')
    )

    return neighbours[['movieId', 'title', 'genres', 'similarity']]

# Function to evaluate the model
def evaluate_model(sample_size=1000, user_vecs=None, item_vecs=None):
    """Compute the RMSE of dot-product predictions on a sample of the test ratings.

    Args:
        sample_size: Maximum number of test ratings to score. When ``test_data``
            has more rows, a fixed-seed random sample of this size is used.
        user_vecs: Optional user-factor matrix to evaluate. Defaults to the
            notebook-level ``user_factors``.
        item_vecs: Optional item-factor matrix to evaluate. Defaults to the
            notebook-level ``item_factors``.

    Returns:
        The root mean squared error between actual and predicted ratings, or
        ``None`` when no sampled rating has both its user and movie in the
        index mappings.
    """
    # Explicit factors let callers compare models without swapping globals
    u_mat = user_factors if user_vecs is None else user_vecs
    v_mat = item_factors if item_vecs is None else item_vecs

    if len(test_data) > sample_size:
        test_sample = test_data.sample(sample_size, random_state=42)
    else:
        test_sample = test_data

    # Walk the three columns in parallel instead of using iterrows(), which builds
    # a Series per row and upcasts the integer ids to float.
    triples = [
        (userid_to_idx[user_id], movieid_to_idx[movie_id], rating)
        for user_id, movie_id, rating in zip(
            test_sample['userId'], test_sample['movieId'], test_sample['rating']
        )
        # Skip pairs whose user or movie is not in the mappings
        if user_id in userid_to_idx and movie_id in movieid_to_idx
    ]
    if not triples:
        return None

    user_idx, movie_idx, actual = zip(*triples)
    user_idx = np.asarray(user_idx, dtype=np.intp)
    movie_idx = np.asarray(movie_idx, dtype=np.intp)
    actual = np.asarray(actual, dtype=float)

    # Row-wise dot product of each user's factors with the matching movie's factors
    predicted = np.sum(u_mat[user_idx] * v_mat[movie_idx], axis=1)

    return np.sqrt(np.mean((actual - predicted) ** 2))

I defined three key functions that power the recommendation system. The first function gets personalized movie recommendations for a user based on their latent factors. The second function finds movies similar to a given movie using cosine similarity between latent factors. The third function evaluates the model's performance by calculating the Root Mean Square Error (RMSE) between predicted and actual ratings in the test set.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
import time
# Calculate RMSE on the test set for the model fitted above
print("Evaluating model...")
start_time = time.time()
rmse = evaluate_model(sample_size=5000)
print(f"RMSE: {rmse:.4f}")
print(f"Evaluation completed in {time.time() - start_time:.2f} seconds")

# Test different numbers of components to see the effect on RMSE
component_range = [10, 20, 50, 100, 150, 200]
rmse_values = []

for n_comp in component_range:
    print(f"Testing with {n_comp} components...")

    # Apply SVD with different number of components
    svd_test = TruncatedSVD(n_components=n_comp, random_state=42)
    user_factors_test = svd_test.fit_transform(train_sparse)
    item_factors_test = svd_test.components_.T

    # evaluate_model() reads the notebook-level user_factors / item_factors,
    # so swap the candidate factors in for the duration of the call.
    user_factors_temp, item_factors_temp = user_factors, item_factors
    user_factors, item_factors = user_factors_test, item_factors_test
    try:
        # Separate name so the baseline `rmse` computed above is not overwritten
        sweep_rmse = evaluate_model(sample_size=2000)
    finally:
        # Always restore the original factors, even if evaluation is interrupted,
        # so later cells keep using the model fitted earlier.
        user_factors, item_factors = user_factors_temp, item_factors_temp

    rmse_values.append(sweep_rmse)
    print(f"RMSE with {n_comp} components: {sweep_rmse:.4f}")

# Plot RMSE vs number of components
plt.figure(figsize=(10, 6))
plt.plot(component_range, rmse_values, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('RMSE')
plt.title('RMSE vs Number of SVD Components')
plt.grid(True)
plt.show()

I evaluated the model's performance by calculating the RMSE on the test data, which measures how close the predicted ratings are to actual ratings. I then experimented with different numbers of SVD components to find the optimal balance between model complexity and accuracy. The visualization shows how RMSE changes with different component counts, helping to identify the sweet spot where additional components no longer significantly improve performance.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Get recommendations for a sample user
sample_user_id = list(userid_to_idx.keys())[0]  # Get the first user in our sample
print(f"Getting movie recommendations for user {sample_user_id}:")
recommendations = get_recommendations(sample_user_id, n_recommendations=10)
display(recommendations)

# Visualize the predicted ratings
plt.figure(figsize=(12, 6))
plt.barh(range(len(recommendations)), recommendations['predicted_rating'], align='center')
plt.yticks(range(len(recommendations)), recommendations['title'].str.slice(0, 30))
plt.xlabel('Predicted Rating')
plt.title(f'Top 10 Movie Recommendations for User {sample_user_id}')
plt.tight_layout()
plt.show()

# Check what genres the user typically rates highly
user_ratings = train_data[train_data['userId'] == sample_user_id]
user_movies = user_ratings.merge(movies, on='movieId')

print(f"\nUser {sample_user_id} has rated {len(user_movies)} movies")
print("Average rating by genre:")

# Collect the user's ratings per genre. A movie's genres are '|'-separated, so one
# rating counts towards every genre of that movie. Iterating the two columns in
# parallel avoids building a Series per row with iterrows().
genre_ratings = {}
for genre_field, rating in zip(user_movies['genres'], user_movies['rating']):
    for genre in genre_field.split('|'):
        genre_ratings.setdefault(genre, []).append(rating)

# Calculate average rating per genre.
# The loop variable is not called `ratings`, to avoid confusion with the ratings DataFrame.
genre_avg_rating = {
    genre: np.mean(genre_scores)
    for genre, genre_scores in genre_ratings.items()
    if len(genre_scores) >= 5  # Only include genres with at least 5 movies
}

# Sort by average rating, highest first
genre_avg_rating = dict(sorted(genre_avg_rating.items(),
                               key=lambda item: item[1],
                               reverse=True))

# Display genres and average ratings
genre_df = pd.DataFrame(list(genre_avg_rating.items()),
                       columns=['Genre', 'Average Rating'])
display(genre_df)

# Visualize average ratings by genre
plt.figure(figsize=(12, 6))
plt.barh(range(len(genre_avg_rating)), list(genre_avg_rating.values()), align='center')
plt.yticks(range(len(genre_avg_rating)), list(genre_avg_rating.keys()))
plt.xlabel('Average Rating')
plt.title(f'Average Ratings by Genre for User {sample_user_id}')
plt.tight_layout()
plt.show()

I generated movie recommendations for a sample user and visualized the predicted ratings. I also analyzed the user's past rating behavior by calculating average ratings for each genre they've rated. This helps understand why certain movies are being recommended - the system identifies patterns in the user's preferences across different genres and uses them to rank movies the user has not rated yet.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Demonstrate item-to-item similarity on a well-known movie
print("\nFinding movies similar to a popular movie...")

# Prefer Toy Story (movieId 1); fall back to the first movie of the sample otherwise
toy_story_id = 1
found_toy_story = toy_story_id in movieid_to_idx
movie_to_use = toy_story_id if found_toy_story else movies_subset[0]
movie_info = movies[movies['movieId'] == movie_to_use]
seed_title = movie_info['title'].values[0]

if found_toy_story:
    print(f"Finding movies similar to: {seed_title}")
else:
    print(f"Toy Story not found in sample. Using movie: {seed_title}")

similar_movies = get_similar_movies(movie_to_use, n_similar=10)
print("Similar movies:")
display(similar_movies)

# Horizontal bar chart of the similarity scores, labelled with truncated titles
bar_positions = range(len(similar_movies))
plt.figure(figsize=(12, 6))
plt.barh(bar_positions, similar_movies['similarity'], align='center')
plt.yticks(bar_positions, similar_movies['title'].str.slice(0, 30))
plt.xlabel('Similarity Score')
plt.title(f'Top 10 Movies Similar to {seed_title}')
plt.tight_layout()
plt.show()

print("\nSVD-based Movie Recommendation System is complete!")

# Interactive function to get recommendations
def _prompt_int(prompt, minimum=None):
    """Read an integer from the keyboard; return None (after a message) if the input is invalid."""
    raw = input(prompt).strip()
    try:
        value = int(raw)
    except ValueError:
        print(f"'{raw}' is not a whole number")
        return None
    if minimum is not None and value < minimum:
        print(f"Please enter a number of at least {minimum}")
        return None
    return value


def interactive_recommendations():
    """Run a text menu for user recommendations and similar-movie lookups.

    Loops until the user picks option 3. Invalid numeric input returns to the
    menu instead of raising, and the movie-name search treats the typed text
    literally (not as a regular expression).

    Returns:
        None.
    """
    while True:
        print("\n==== Movie Recommendation System ====")
        print("1. Get movie recommendations for a user")
        print("2. Find similar movies")
        print("3. Exit")

        choice = input("Enter choice (1-3): ").strip()

        if choice == '1':
            user_id = _prompt_int("Enter user ID: ")
            if user_id is None:
                continue
            n_recs = _prompt_int("Number of recommendations: ", minimum=1)
            if n_recs is None:
                continue

            recommendations = get_recommendations(user_id, n_recommendations=n_recs)
            if recommendations is not None:
                print("\nRecommended movies:")
                for idx, row in recommendations.iterrows():
                    print(f"{row['title']} | Rating: {row['predicted_rating']:.2f}")
            else:
                print(f"User {user_id} not found")

        elif choice == '2':
            movie_name = input("Enter partial movie name: ")
            # regex=False: characters such as '(' or '*' in the typed name are matched
            # literally instead of raising a regex error; na=False: missing titles
            # count as non-matches so the mask stays boolean.
            matching_movies = movies[
                movies['title'].str.contains(movie_name, case=False, regex=False, na=False)
            ]

            if matching_movies.empty:
                print("No matching movies found")
                continue

            print("\nMatching movies:")
            for idx, row in matching_movies.head(10).iterrows():
                print(f"{row['movieId']}: {row['title']}")

            movie_id = _prompt_int("Enter movie ID: ")
            if movie_id is None:
                continue
            n_similar = _prompt_int("Number of similar movies: ", minimum=1)
            if n_similar is None:
                continue

            similar_movies = get_similar_movies(movie_id, n_similar=n_similar)
            if similar_movies is not None:
                print("\nSimilar movies:")
                for idx, row in similar_movies.iterrows():
                    print(f"{row['title']} | Similarity: {row['similarity']:.4f}")
            else:
                print(f"Movie {movie_id} not found")

        elif choice == '3':
            print("Exiting...")
            break

        else:
            print("Invalid choice")


# Start the text menu. This call blocks waiting for keyboard input and only
# returns once option 3 (Exit) is chosen.
interactive_recommendations()