# MovieLens Preprocessing Notebook

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the movie catalogue, discard the unused 'genres' column and
# switch the identifier column to snake_case.
movies = (
    pd.read_csv('data/movies.csv')
    .drop(columns='genres')
    .rename(columns={'movieId': 'movie_id'})
)

movies.head()

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [3]:
# Load the per-user ratings, discard the unused 'timestamp' column and
# switch the identifier columns to snake_case.
ratings = (
    pd.read_csv('data/ratings.csv')
    .drop(columns='timestamp')
    .rename(columns={'movieId': 'movie_id', 'userId': 'user_id'})
)

ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


In [4]:
# Attach each rating's movie title, then discard the join key.
# NOTE: this rebinds `ratings`. Re-running this cell without first re-running
# the cell that loads `ratings` fails, because 'movie_id' is no longer present.
ratings = (
    ratings
    .merge(movies, on='movie_id')
    .drop(columns='movie_id')
)

# Rescale so that the highest rating present in the data becomes 1.0
highest_rating = ratings['rating'].max()
ratings['rating'] = ratings['rating'].div(highest_rating)

ratings.head()

Unnamed: 0,user_id,rating,title
0,1,1.0,Pulp Fiction (1994)
1,1,0.7,Three Colors: Red (Trois couleurs: Rouge) (1994)
2,1,1.0,Three Colors: Blue (Trois couleurs: Bleu) (1993)
3,1,1.0,Underground (1995)
4,1,0.7,Singin' in the Rain (1952)


In [5]:
# Default cut-off applied to the 'rating' column when selecting titles.
rating_threshold = 0.7

def aggregate_movies(df: pd.DataFrame, threshold: 'float | None' = None) -> np.ndarray:
    """Return the titles in ``df`` whose rating is at least the cut-off.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a 'rating' column and a 'title' column.
    threshold : float, optional
        Inclusive lower bound on 'rating'. When omitted (``None``), the
        module-level ``rating_threshold`` is looked up at call time, so
        existing one-argument callers behave exactly as before.

    Returns
    -------
    np.ndarray
        One-dimensional string array of the selected titles, in the row
        order of ``df``. Empty when no row meets the cut-off.
    """
    # `is None` rather than truthiness so an explicit threshold of 0 is honoured.
    cutoff = rating_threshold if threshold is None else threshold
    # Single .loc lookup instead of chained indexing (df[mask]['title']).
    selected_titles = df.loc[df['rating'] >= cutoff, 'title']
    return np.array(selected_titles, dtype=str)

# Build one row per user holding the array returned by aggregate_movies
# for that user's ratings, with the columns labelled user_id / titles.
per_user_titles = ratings.groupby('user_id').apply(aggregate_movies, include_groups=False)
preferred_movies = per_user_titles.reset_index().set_axis(['user_id', 'titles'], axis=1)

preferred_movies.head()

Unnamed: 0,user_id,titles
0,1,"[Pulp Fiction (1994), Three Colors: Red (Trois..."
1,2,"[Toy Story (1995), Braveheart (1995), Apollo 1..."
2,3,"[Toy Story (1995), City of Lost Children, The ..."
3,4,"[Star Wars: Episode IV - A New Hope (1977), Pu..."
4,5,"[Toy Story (1995), Ace Ventura: When Nature Ca..."
