In [1]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [2]:
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

In [3]:
# Read in movies file
movies = pd.read_csv('input_data/movies.dat', sep='::', header=None, engine='python',
                      names=['movie', 'title', 'genre'])

# Move genres to their own dataframe
genres = movies[['movie', 'genre']].copy()
movies = movies.drop('genre', 1)
movies

Unnamed: 0,movie,title
0,1.0,Toy Story (1995)
1,2.0,Jumanji (1995)
2,3.0,Grumpier Old Men (1995)
3,4.0,Waiting to Exhale (1995)
4,5.0,Father of the Bride Part II (1995)
...,...,...
10685,65088.0,Bedtime Stories (2008)
10686,65091.0,Manhattan Melodrama (1934)
10687,65126.0,Choke (2008)
10688,65130.0,Revolutionary Road (2008)


In [4]:
# Fix genres dataframe
genres['genre'] = genres['genre'].apply(lambda x: x.split('|'))
genres = genres.explode('genre')
genres

Unnamed: 0,movie,genre
0,1.0,Adventure
0,1.0,Animation
0,1.0,Children
0,1.0,Comedy
0,1.0,Fantasy
...,...,...
10687,65126.0,Comedy
10687,65126.0,Drama
10688,65130.0,Drama
10688,65130.0,Romance


In [5]:
# Read ratings
ratings = pd.read_csv('input_data/ratings.dat', sep='::', header=None, engine='python',
                      names=['user', 'movie', 'rating', 'timestamp'])

# Convert timestamp to datetime
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')

# Remove ratings that are not valid (assuming 1 -> 5 inclusive, 0.5 increments)
ratings = ratings[ratings['rating'].isin([1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5])]

# Remove ratings where there are null values for movie
ratings = ratings[~ratings.isnull().any(axis=1)]

# Reset index
ratings = ratings.reset_index(drop=True).copy()
ratings

Unnamed: 0,user,movie,rating,timestamp
0,1.0,122.0,5.0,1996-08-02 11:24:06
1,1.0,185.0,5.0,1996-08-02 10:58:45
2,1.0,231.0,5.0,1996-08-02 10:56:32
3,1.0,292.0,5.0,1996-08-02 10:57:01
4,1.0,316.0,5.0,1996-08-02 10:56:32
...,...,...,...,...
9905039,71567.0,2107.0,1.0,1998-12-02 06:35:53
9905040,71567.0,2126.0,2.0,1998-12-03 01:39:03
9905041,71567.0,2294.0,5.0,1998-12-02 05:52:48
9905042,71567.0,2338.0,2.0,1998-12-02 05:53:36


In [6]:
# Q1 - most popular movies, as defined by "most ranking"
# I am interpreting that as "most number of rankings", not "highest average rating"
# I am counting twice if a single user to ranks a movie twice

ratings[['movie', 'rating']] \
    .groupby('movie', as_index=False) \
    .count() \
    .sort_values('rating', ascending=False) \
    .head(5) \
    .merge(movies)

Unnamed: 0,movie,rating,title
0,296.0,34756,Pulp Fiction (1994)
1,356.0,34338,Forrest Gump (1994)
2,593.0,33571,"Silence of the Lambs, The (1991)"
3,480.0,32520,Jurassic Park (1993)
4,318.0,31098,"Shawshank Redemption, The (1994)"


In [7]:
# Q2 - "top 5 ranked movie genres"
# I am interpreting this question as "the top 5 movie genres with the highest average rating"
# I am using a single user rating as my base event instead of using the average rating of each movie in the genre

ratings[['movie', 'rating']] \
    .merge(genres)[['genre', 'rating']] \
    .groupby('genre', as_index=False) \
    .mean() \
    .sort_values('rating', ascending=False) \
    .head(5)

Unnamed: 0,genre,rating
10,Film-Noir,4.02304
7,Documentary,3.833089
12,IMAX,3.809789
18,War,3.800543
14,Mystery,3.69788


In [8]:
# Q3 - How many movies have been ranked the most consecutive days?
# Sort
df = ratings[['movie', 'timestamp']].sort_values(['movie', 'timestamp'])

# Convert timestamps to day numbers
df['date'] = df['timestamp'].dt.date

# Remove duplicates
df = df[['movie', 'date']].drop_duplicates()
df

Unnamed: 0,movie,date
4866305,1.0,1996-01-29
4988175,1.0,1996-02-01
5014666,1.0,1996-02-02
5039562,1.0,1996-02-05
5108070,1.0,1996-02-12
...,...,...
7298308,65126.0,2009-01-04
8729245,65126.0,2009-01-05
3706675,65130.0,2009-01-04
4627214,65133.0,2009-01-04


In [9]:
# Create indicator for streaks
df['next_date'] = df.groupby('movie')['date'].shift(-1)
df['streak_start_over'] = np.where((df['next_date'] - df['date']).dt.days == 1, 0, 1)
df

Unnamed: 0,movie,date,next_date,streak_start_over
4866305,1.0,1996-01-29,1996-02-01,1
4988175,1.0,1996-02-01,1996-02-02,0
5014666,1.0,1996-02-02,1996-02-05,1
5039562,1.0,1996-02-05,1996-02-12,1
5108070,1.0,1996-02-12,1996-02-22,1
...,...,...,...,...
7298308,65126.0,2009-01-04,2009-01-05,0
8729245,65126.0,2009-01-05,,1
3706675,65130.0,2009-01-04,,1
4627214,65133.0,2009-01-04,2009-01-05,0


In [10]:
# Create streak id
df['streak_id'] = df['streak_start_over'].cumsum()
df

Unnamed: 0,movie,date,next_date,streak_start_over,streak_id
4866305,1.0,1996-01-29,1996-02-01,1,1
4988175,1.0,1996-02-01,1996-02-02,0,1
5014666,1.0,1996-02-02,1996-02-05,1,2
5039562,1.0,1996-02-05,1996-02-12,1,3
5108070,1.0,1996-02-12,1996-02-22,1,4
...,...,...,...,...,...
7298308,65126.0,2009-01-04,2009-01-05,0,2157277
8729245,65126.0,2009-01-05,,1,2157278
3706675,65130.0,2009-01-04,,1,2157279
4627214,65133.0,2009-01-04,2009-01-05,0,2157279


In [11]:
# Find best streaks
df[['movie', 'streak_id', 'date']] \
    .groupby(['movie', 'streak_id'], as_index=False) \
    .count() \
    .merge(movies)[['movie', 'title', 'date']] \
    .rename(columns={'date':'consecutive_rating_days'}) \
    .sort_values('consecutive_rating_days', ascending=False) \
    .head(5)

Unnamed: 0,movie,title,consecutive_rating_days
1782718,5952.0,"Lord of the Rings: The Two Towers, The (2002)",622
1126149,2858.0,American Beauty (1999),485
1652138,4993.0,"Lord of the Rings: The Fellowship of the Ring,...",483
173708,356.0,Forrest Gump (1994),425
1830424,6377.0,Finding Nemo (2003),425


In [12]:
## ML Section - recommender system

In [13]:
# Partition users randomly into train/test
train_users, test_users = train_test_split(ratings['user'].unique(), test_size=0.2)
len(train_users), len(test_users)

(55899, 13975)

In [14]:
# Find movies with >= 50 ratings
eligible_movies = ratings[['movie', 'rating']] \
    .groupby('movie', as_index=False) \
    .count()
eligible_movies = eligible_movies[eligible_movies['rating']>=50]

In [15]:
# Transform training data into sparse matrix
user_movie = ratings[ratings['user'].isin(train_users)] \
    .merge(eligible_movies[['movie']]) \
    .pivot(index='user', columns='movie', values='rating') \
    .fillna(0)
user_movie_matrix = csr_matrix(user_movie)
user_movie_matrix

<55899x7192 sparse matrix of type '<class 'numpy.float64'>'
	with 7887520 stored elements in Compressed Sparse Row format>

In [16]:
# Train model
model_knn = NearestNeighbors(metric='minkowski', algorithm='brute', n_neighbors=5, n_jobs=-1)
model_knn.fit(user_movie_matrix)

NearestNeighbors(algorithm='brute', n_jobs=-1)

In [17]:
# Who are our test users?
test_users[:10]

array([28032.,  2368., 23557., 33271., 43871., 40025., 29165., 50552.,
       22803., 60314.])

In [18]:
# Transform testing data into sparse matrix
test = ratings[ratings['user'].isin(test_users)] \
    .merge(eligible_movies[['movie']]) \
    .pivot(index='user', columns='movie', values='rating') \
    .fillna(0)
test

movie,1.0,2.0,3.0,4.0,5.0,...,63131.0,63876.0,63992.0,64497.0,64957.0
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
12.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0
22.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
71551.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
71561.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
71562.0,0.0,3.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
71563.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0


In [19]:
# Let's test for our first test user, user 8
test_values = np.transpose(test.values[0,:].reshape(-1, 1))
test_values.shape

(1, 7192)

In [20]:
# Make predictions for our one user
neigh_dist, neigh_ind = model_knn.kneighbors(test_values, n_neighbors=5)
neigh_ind  # these are the indices of the users closest to user 8

array([[49627, 38250,  2672, 22180, 19069]], dtype=int64)

In [21]:
user_movie = user_movie.reset_index()
users = user_movie[user_movie.index.isin(neigh_ind[0])]['user'].values
users # these are the user ids of the users closest to user 8

array([ 3457., 24490., 28416., 48863., 63381.])

In [22]:
# Q4
# Running out of time, but here's one (very simple) way to pick 5 movies
# Ideally you'd do something a little more sophisticated, like taking average ratings, 
# or experiment with using more neighbors, etc.
ratings[ratings['user'].isin(users)] \
    .merge(movies)[['title', 'rating']] \
    .sort_values('rating', ascending=False) \
    .head(5)

Unnamed: 0,title,rating
85,"Lord of the Rings: The Two Towers, The (2002)",5.0
81,Planet of the Apes (1968),4.0
77,Conspiracy Theory (1997),3.0
75,Star Trek VI: The Undiscovered Country (1991),3.0
83,Fatal Attraction (1987),2.0


In [23]:
# Q5/6 no more time
# I don't want to try to rush a solution
# My next steps would be to build functions for predicting and re-factor code
# Current approach is not optimal for predictions
# Then I would be able to easily measure performance, get top recommended movies, and also deploy model at scale