# MovieLens Preprocessing Notebook

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the movie catalogue: the unused 'genres' column is dropped and
# 'movieId' is renamed to snake_case 'movie_id'
movies = (
    pd.read_csv('data/movies.csv')
    .drop(columns='genres')
    .rename(columns={'movieId': 'movie_id'})
)

movies.head()

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [3]:
# Load the user ratings: the unused 'timestamp' column is dropped and the
# id columns are renamed to snake_case
ratings = (
    pd.read_csv('data/ratings.csv')
    .drop(columns='timestamp')
    .rename(columns={'movieId': 'movie_id', 'userId': 'user_id'})
)

ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


In [4]:
# Attach the movie title to every rating, then discard the numeric movie id
ratings = ratings.merge(movies, on='movie_id').drop(columns='movie_id')

# Rescale ratings so that the highest observed rating becomes 1
ratings = ratings.assign(rating=ratings['rating'] / ratings['rating'].max())

ratings.head()

Unnamed: 0,user_id,rating,title
0,1,1.0,Pulp Fiction (1994)
1,1,0.7,Three Colors: Red (Trois couleurs: Rouge) (1994)
2,1,1.0,Three Colors: Blue (Trois couleurs: Bleu) (1993)
3,1,1.0,Underground (1995)
4,1,0.7,Singin' in the Rain (1952)


In [5]:
rating_threshold = 0.7

def aggregate_movies(df: pd.DataFrame) -> list:
    """Return the titles in `df` whose rating is at least `rating_threshold`.

    `df` must provide 'rating' and 'title' columns; the titles are returned
    as a plain Python list in their original row order.
    """
    is_liked = df['rating'] >= rating_threshold
    return df.loc[is_liked, 'title'].tolist()

# Collect, per user, the list of titles that pass the rating threshold
liked_by_user = ratings.groupby('user_id').apply(aggregate_movies, include_groups=False)
users = liked_by_user.reset_index()
users.columns = ['user_id', 'movies']

# Keep only users with more than 10 preferred movies
has_enough_movies = users['movies'].map(len) > 10
users = users.loc[has_enough_movies]

users.head()

Unnamed: 0,user_id,movies
0,1,"[Pulp Fiction (1994), Three Colors: Red (Trois..."
1,2,"[Toy Story (1995), Braveheart (1995), Apollo 1..."
2,3,"[Toy Story (1995), City of Lost Children, The ..."
3,4,"[Star Wars: Episode IV - A New Hope (1977), Pu..."
4,5,"[Toy Story (1995), Ace Ventura: When Nature Ca..."


In [6]:
# Select a portion of movies to hide from the recommendation system
SPLIT_SEED = 42
# One shared generator: it advances on every call, so each user gets a
# different shuffle, while re-running the notebook from a fresh kernel
# reproduces exactly the same known/hidden sets.
_split_rng = np.random.RandomState(SPLIT_SEED)

def split_movies(df: pd.Series, test_size: float = 0.1, random_state=_split_rng) -> pd.Series:
    """Split the movies into known and hidden sets for a user.

    Parameters
    ----------
    df : pd.Series
        A single user row, as passed by ``DataFrame.apply(..., axis=1)``.
        Only its ``'movies'`` entry is read.
    test_size : float, default 0.1
        Share of the user's movies to hide; forwarded to ``train_test_split``.
    random_state : int, numpy RandomState or None
        Forwarded to ``train_test_split``. Defaults to the seeded generator
        defined above; pass ``None`` for an unseeded split.

    Returns
    -------
    pd.Series
        Two entries, ``'known_movies'`` and ``'hidden_movies'``.
    """
    known_movies, hidden_movies = train_test_split(
        df['movies'], test_size=test_size, random_state=random_state
    )

    return pd.Series([known_movies, hidden_movies], index=['known_movies', 'hidden_movies'])

# Split every user's movie list, then attach the two resulting columns by
# index and discard the original combined list
movie_splits = users.apply(split_movies, axis=1)
users = users.merge(movie_splits, left_index=True, right_index=True).drop(columns='movies')

users.head()

Unnamed: 0,user_id,known_movies,hidden_movies
0,1,"[Run Lola Run (Lola rennt) (1998), City of God...","[Spring, Summer, Fall, Winter... and Spring (B..."
1,2,[Star Wars: Episode V - The Empire Strikes Bac...,"[Rainmaker, The (1997), Daredevil (2003), Last..."
2,3,"[Passengers (2016), Bandits (2001), Hardcore H...","[Tron: Legacy (2010), Beautiful Mind, A (2001)..."
3,4,"[Ex Machina (2015), Incredibles 2 (2018), The ...","[Hellboy (2004), Lord of the Rings: The Fellow..."
4,5,"[Twelve Monkeys (a.k.a. 12 Monkeys) (1995), Sl...","[Silence of the Lambs, The (1991), Apollo 13 (..."


In [7]:
# Split the users into training and test sets
# A fixed seed keeps the saved train/test files identical across re-runs
USER_SPLIT_SEED = 42
train_users, test_users = train_test_split(users, test_size=0.2, random_state=USER_SPLIT_SEED)

# Save the data
# NOTE: cells holding Python lists are written as their string form, so
# whoever reads these files back has to parse those columns into lists again
train_users.to_csv('data/train_users.csv', index=False)
test_users.to_csv('data/test_users.csv', index=False)

train_users.shape, test_users.shape

((124407, 3), (31102, 3))

In [8]:
# # Explode the hidden movies into separate rows for each set
# train_movies = train_users.drop('known_movies', axis=1).explode('hidden_movies')

# test_movies = test_users.drop('known_movies', axis=1).explode('hidden_movies')