In [23]:
import pandas as pd
import numpy as np

# Load the raw MovieLens 20M tables from the local ./ml-20m directory.
ratings = pd.read_csv('./ml-20m/ratings.csv')
movies = pd.read_csv('./ml-20m/movies.csv')
links = pd.read_csv('./ml-20m/links.csv')

# A notebook cell only renders its *last* expression, so a bare
# `ratings.head()` in the middle of the cell is silently discarded.
# `display` (provided by the IPython kernel) renders the preview explicitly.
display(ratings.head())

# Number of distinct users in the full dataset (rendered as the cell's output).
ratings['userId'].nunique()


138493

In [24]:
# Identify "active" users, i.e. those who have at least 50 ratings.
# value_counts() yields one entry per userId holding that user's rating count.
user_rating_counts = ratings['userId'].value_counts()
is_active = user_rating_counts >= 50
active_users = user_rating_counts.loc[is_active].index
print(f"Number of active users: {len(active_users)}")

# Total number of rating rows in the unfiltered dataset (cell output).
len(ratings)


Number of active users: 85307


20000263

In [25]:
# Draw a reproducible random sample of 10,000 active users
# (random_state is fixed so that re-runs pick the same users).
selected_users = active_users.to_series().sample(n=10000, random_state=7)

# Keep only the rating rows that belong to one of the sampled users.
belongs_to_sample = ratings['userId'].isin(selected_users)
ratings_small = ratings[belongs_to_sample]

# Number of rating rows in the subset (cell output).
len(ratings_small)


2130396

In [26]:
# Restrict the movie and link tables to movies that appear in the rating subset.
rated_movie_ids = ratings_small['movieId']
movies_small = movies.loc[movies['movieId'].isin(rated_movie_ids)]
links_small = links.loc[links['movieId'].isin(rated_movie_ids)]

# Headline sizes of the subset, shown as the cell's output.
summary = dict(
    num_users=ratings_small['userId'].nunique(),
    num_movies=ratings_small['movieId'].nunique(),
    num_ratings=len(ratings_small),
)
summary


{'num_users': 10000, 'num_movies': 17614, 'num_ratings': 2130396}

In [27]:
# Write the three subset tables next to the notebook as CSV files.
# index=False keeps the pandas row index out of the exported files.
for subset_frame, csv_path in (
    (ratings_small, './subset_ratings.csv'),
    (movies_small, './subset_movies.csv'),
    (links_small, './subset_links.csv'),
):
    subset_frame.to_csv(csv_path, index=False)

In [30]:
# Load the Tag Genome tables: per-(movieId, tagId) relevance scores and the
# lookup from tagId to its human-readable tag name.
scores = pd.read_csv('./ml-20m/genome-scores.csv')
tags = pd.read_csv('./ml-20m/genome-tags.csv')

# Attach the tag name to every score row by joining on tagId
# (default inner join), then preview the result.
tagged_scores = pd.merge(scores, tags, on='tagId')
tagged_scores.head()

Unnamed: 0,movieId,tagId,relevance,tag
0,1,1,0.025,007
1,1,2,0.025,007 (series)
2,1,3,0.05775,18th century
3,1,4,0.09675,1920s
4,1,5,0.14675,1930s


In [31]:
# Keep only the genome score rows for movies that are part of the subset.
subset_movie_ids = movies_small['movieId'].unique()
in_subset = tagged_scores['movieId'].isin(subset_movie_ids)
tagged_scores_small = tagged_scores.loc[in_subset]


In [32]:
# Pivot the long (movieId, tag, relevance) table into a wide movie x tag matrix:
# one row per movieId, one column per tag name, relevance as the cell value.
# pivot_table aggregates with its default (mean) if a (movieId, tag) pair
# repeats; combinations absent from the input are filled with 0.
movie_tag_matrix = tagged_scores_small.pivot_table(
    index='movieId', columns='tag', values='relevance', fill_value=0
)

# Backward-compatible alias for the original misspelled name, so any cell
# that still refers to `movie_tag_martix` keeps working.
movie_tag_martix = movie_tag_matrix

movie_tag_matrix.head()

tag,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,19th century,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.025,0.025,0.05775,0.09675,0.14675,0.217,0.067,0.26275,0.262,0.032,...,0.0395,0.018,0.04575,0.03275,0.125,0.0415,0.01925,0.03625,0.07775,0.023
2,0.03975,0.04375,0.03775,0.048,0.11025,0.0725,0.04775,0.10975,0.09925,0.0205,...,0.04175,0.01925,0.01725,0.02425,0.1255,0.0225,0.0155,0.01475,0.09025,0.01875
3,0.0435,0.05475,0.028,0.077,0.054,0.0685,0.056,0.185,0.04925,0.02675,...,0.0415,0.02675,0.02775,0.03425,0.1555,0.03675,0.017,0.0195,0.097,0.0185
4,0.03725,0.0395,0.03675,0.031,0.06825,0.0405,0.02325,0.087,0.05125,0.03025,...,0.0575,0.03375,0.02275,0.03975,0.18525,0.05925,0.015,0.01525,0.0645,0.013
5,0.042,0.05275,0.05925,0.03675,0.07525,0.12525,0.0285,0.085,0.0295,0.02875,...,0.0425,0.02825,0.0215,0.026,0.14275,0.02075,0.0165,0.01675,0.1075,0.01825


# 🎬 MovieLens 20M Subset Preparation & Tag Genome Feature Matrix

## ✅ Overview

This notebook prepares a **manageable, structured subset** of the MovieLens 20M dataset and enriches it with **Tag Genome-based movie features** for content-based modeling.

---

## 📁 Files Used

- `ratings.csv` — all user ratings
- `movies.csv` — movie titles + genres
- `links.csv` — IMDb and TMDb IDs
- `genome-scores.csv` — tag relevance scores (machine-learned)
- `genome-tags.csv` — readable names for tag IDs

---

## 🧼 Data Filtering & Subsetting

### Step 1: Filter Active Users
- Only keep users with **≥ 50 ratings**
- Randomly sample **10,000 users** from them
- Resulting `ratings_small` has **~2.1M ratings** (2,130,396)

### Step 2: Filter Movies & Metadata
- Keep only movies rated by the selected users
- Generate:
  - `subset_ratings.csv`
  - `subset_movies.csv`
  - `subset_links.csv`

### Summary Stats:
| Metric       | Count     |
|--------------|-----------|
| Users        | 10,000    |
| Movies       | 17,614    |
| Ratings      | ~2.1M     |

---

## 🧬 Step 3: Build Content Feature Matrix (Movie × Tags)

Using `genome-scores.csv` and `genome-tags.csv`:

- Merged tag ID with tag name
- Filtered only relevant `movieId`s
- Built a **dense movie-tag matrix** where:
  - Rows = movieId
  - Columns = tag (e.g. "dark humor", "romantic")
  - Values = tag relevance score (0 to 1)

### Code Used:
```python
tagged_scores = scores.merge(tags, on='tagId')
tagged_scores_small = tagged_scores[tagged_scores['movieId'].isin(movies_small['movieId'])]
movie_tag_matrix = tagged_scores_small.pivot_table(
    index='movieId', columns='tag', values='relevance', fill_value=0
)
