In [1]:
# Imports
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

In [2]:
# Reading the MovieLens Dataset:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
# Printing some statistics about the data sets.
print('\nShape of Movie Data:', movies.shape)
print('\nShape of Ratings Data:', ratings.shape)


Shape of Movie Data: (9742, 3)

Shape of Ratings Data: (100836, 4)


In [4]:
# Printing the first 5 rows of the data sets.
print('\nFirst 5 rows of Movie Data:\n\n', movies.head())
print('\n\nFirst 5 rows of Ratings Data:\n\n', ratings.head())


First 5 rows of Movie Data:

    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


First 5 rows of Ratings Data:

    userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [5]:
# Description of the data sets.
print('\nMovie Dataset Description:\n\n', movies.describe())
print('\n\nRatings Dataset Description:\n\n', ratings.describe())


Movie Dataset Description:

              movieId
count    9742.000000
mean    42200.353623
std     52160.494854
min         1.000000
25%      3248.250000
50%      7300.000000
75%     76232.000000
max    193609.000000


Ratings Dataset Description:

               userId        movieId         rating     timestamp
count  100836.000000  100836.000000  100836.000000  1.008360e+05
mean      326.127564   19435.295718       3.501557  1.205946e+09
std       182.618491   35530.987199       1.042529  2.162610e+08
min         1.000000       1.000000       0.500000  8.281246e+08
25%       177.000000    1199.000000       3.000000  1.019124e+09
50%       325.000000    2991.000000       3.500000  1.186087e+09
75%       477.000000    8122.000000       4.000000  1.435994e+09
max       610.000000  193609.000000       5.000000  1.537799e+09


In [6]:
# Merging the two data sets.
movie_lens = pd.merge(movies, ratings, on='movieId').drop('timestamp', axis=1)

In [7]:
# Printing some statistics about the MovieLens Dataset.
print('\nFirst 5 rows of the MovieLens Dataset:\n\n', movie_lens.head())
print('\n\nMovieLens Dataset Shape:', movie_lens.shape)
print('\nDescription of the MovieLens Dataset:\n\n', movie_lens.describe())


First 5 rows of the MovieLens Dataset:

    movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
3        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
4        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   

   userId  rating  
0       1     4.0  
1       5     4.0  
2       7     4.5  
3      15     2.5  
4      17     4.5  


MovieLens Dataset Shape: (100836, 5)

Description of the MovieLens Dataset:

              movieId         userId         rating
count  100836.000000  100836.000000  100836.000000
mean    19435.295718     326.127564       3.501557
std     35530.987199     182.618491       1.042529
min         1.000000       1.000000       0.500000
25%      1199.000000     177.000000     

# Creating Movie Recommendations for New Users

In [8]:
# Creating Movie Recommendations for new users.

# Getting the unique movie IDs.
unique_movie_ids = np.unique(movie_lens['movieId'])

# Calculating the average movie rating for movies with more than 20 ratings.
average_movie_ratings = {}
for movie_id in unique_movie_ids:
    movie_ratings = movie_lens['rating'].loc[movie_lens['movieId'] == movie_id]
    if len(movie_ratings) > 20:
        average_movie_ratings[movie_id] = (np.mean(movie_ratings))

# Sorting the average movie ratings.
sorted_average_movie_ratings = sorted(average_movie_ratings.items(), key=lambda x: x[1], reverse=True)[:20]

# Getting the recommended movie titles for a new user.
recommended_movie_titles = []
for movie_id in sorted_average_movie_ratings:
    recommended_movie_titles.append(np.unique(movie_lens['title'].loc[movie_lens['movieId'] == movie_id[0]]))
    
# Printing the recommended movies for the user.
print('Recommended Movies:\n')
for movie_title in recommended_movie_titles:
    print(movie_title)

Recommended Movies:

['Shawshank Redemption, The (1994)']
['Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)']
['Philadelphia Story, The (1940)']
['In the Name of the Father (1993)']
['Lawrence of Arabia (1962)']
['Hoop Dreams (1994)']
['Godfather, The (1972)']
['Harold and Maude (1971)']
['Logan (2017)']
['Fight Club (1999)']
['Cool Hand Luke (1967)']
['Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)']
['Rear Window (1954)']
['Godfather: Part II, The (1974)']
['Departed, The (2006)']
['Goodfellas (1990)']
['Manchurian Candidate, The (1962)']
['Casablanca (1942)']
['Dark Knight, The (2008)']
['Usual Suspects, The (1995)']


For a new user, I am recommeding the top 20 movies. These will be movies which have more than 20 reviews and have the highest average rating across all those reviews. Thus this approach recommends the popular choice to the user.

# Creating Movie Recommendations for Existing Users

In [9]:
# Unique Generes from the dataset description.
unique_genres = []
for i in range(len(movies)):
    unique_genres_per_movie = np.unique(movies['genres'].iloc[i].split('|'))
    for unique_genre in unique_genres_per_movie:
        if unique_genre not in unique_genres:
            unique_genres.append(unique_genre)

# Printing the unique genres in the MovieLens Dataset.
print('Unique Genres in the MovieLens Dataset:\n')
for unique_genre in unique_genres:
    print(unique_genre)

Unique Genres in the MovieLens Dataset:

Adventure
Animation
Children
Comedy
Fantasy
Romance
Drama
Action
Crime
Thriller
Horror
Mystery
Sci-Fi
War
Musical
Documentary
IMAX
Western
Film-Noir
(no genres listed)


In [10]:
# Creating a dataframe of unique genres.
unique_genres_df = pd.DataFrame(columns=unique_genres)

# Printing the unique genres dataframe.
print('Unique Genres Dataframe:\n\n', unique_genres_df)

Unique Genres Dataframe:

 Empty DataFrame
Columns: [Adventure, Animation, Children, Comedy, Fantasy, Romance, Drama, Action, Crime, Thriller, Horror, Mystery, Sci-Fi, War, Musical, Documentary, IMAX, Western, Film-Noir, (no genres listed)]
Index: []


In [11]:
# Selecting user with userID 1 for movie recommendations. You can change this to any valid userID.
user = movie_lens.loc[movie_lens['userId'] == 1]

# Printing the user dataframe.
print('User Datframe:\n\n', user)

User Datframe:

        movieId                           title  \
0            1                Toy Story (1995)   
325          3         Grumpier Old Men (1995)   
433          6                     Heat (1995)   
2107        47     Seven (a.k.a. Se7en) (1995)   
2379        50      Usual Suspects, The (1995)   
...        ...                             ...   
56816     3744                    Shaft (2000)   
57276     3793                    X-Men (2000)   
57457     3809          What About Bob? (1991)   
59170     4006  Transformers: The Movie (1986)   
65533     5060    M*A*S*H (a.k.a. MASH) (1970)   

                                            genres  userId  rating  
0      Adventure|Animation|Children|Comedy|Fantasy       1     4.0  
325                                 Comedy|Romance       1     4.0  
433                          Action|Crime|Thriller       1     4.0  
2107                              Mystery|Thriller       1     5.0  
2379                        Crime|Mys

In [12]:
# Replacing the generes in the user dataframe with the list of genres.
for i in range(len(user['genres'])):
    genres = user['genres'].iloc[i].split('|')
    user['genres'].iloc[i] = genres

# Getting the unique genres for which the user has left a rating.
unique_genres_user = []
for i in range(len(user['genres'])):
    rated_movie_genres = np.unique(user['genres'].iloc[i])
    for rated_movie_genre in rated_movie_genres:
        if rated_movie_genre not in unique_genres_user:
            unique_genres_user.append(rated_movie_genre)
            
# Printing the unique genres for which the user has left a rating.
print('Genres for which the user has left a rating:\n')
for unique_genre_user in unique_genres_user:
    print(unique_genre_user)

Genres for which the user has left a rating:

Adventure
Animation
Children
Comedy
Fantasy
Romance
Action
Crime
Thriller
Mystery
Horror
Drama
War
Western
Sci-Fi
Musical
Film-Noir


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [13]:
# Calculating the average rating per genre for the user.
average_rating_per_genre_user = {}
for unique_genre in unique_genres_user:
    ratings = []
    for i in range(len(user['genres'])):
        if unique_genre in user['genres'].iloc[i]:
            ratings.append(user['rating'].iloc[i])
    average_rating_per_genre_user[unique_genre] = np.mean(ratings)

# Printing the average rating per genre for the user.
print('Average rating per genre for the user:\n')
for average_rating in average_rating_per_genre_user.items():
    print(f'{average_rating[0]}---> \t {average_rating[1]}')

Average rating per genre for the user:

Adventure---> 	 4.3882352941176475
Animation---> 	 4.689655172413793
Children---> 	 4.5476190476190474
Comedy---> 	 4.27710843373494
Fantasy---> 	 4.297872340425532
Romance---> 	 4.3076923076923075
Action---> 	 4.322222222222222
Crime---> 	 4.355555555555555
Thriller---> 	 4.1454545454545455
Mystery---> 	 4.166666666666667
Horror---> 	 3.4705882352941178
Drama---> 	 4.529411764705882
War---> 	 4.5
Western---> 	 4.285714285714286
Sci-Fi---> 	 4.225
Musical---> 	 4.681818181818182
Film-Noir---> 	 5.0


In [14]:
# Creating the user average rating per genre dataframe.
user_df = pd.DataFrame(average_rating_per_genre_user, index=[0])
user_df = unique_genres_df.merge(user_df, how='outer')

# Sorting all the columns alphabetically.
user_df = user_df.reindex(sorted(user_df.columns), axis=1)

# Adding the movieId column with the value 0. 
#This is just for easier comparison with the average movie rating dataframe which is created below.
user_df.insert(20, 'movieId', 0)

# Filling the missing values with 0.
user_df = user_df.fillna(0)

# Printing the user dataframe.
print('User dataframe:\n\n', user_df)

User dataframe:

    (no genres listed)    Action  Adventure  Animation  Children    Comedy  \
0                   0  4.322222   4.388235   4.689655  4.547619  4.277108   

      Crime  Documentary     Drama   Fantasy  ...    Horror  IMAX   Musical  \
0  4.355556            0  4.529412  4.297872  ...  3.470588     0  4.681818   

    Mystery   Romance  Sci-Fi  Thriller  War   Western  movieId  
0  4.166667  4.307692   4.225  4.145455  4.5  4.285714        0  

[1 rows x 21 columns]


In [15]:
# Getting the unique movie IDs.
unique_movie_ids = np.unique(movie_lens['movieId'])

# Calculating the average movie rating for movies with more than 20 ratings and are not already rated by the user.
average_movie_ratings = {}
for movie_id in unique_movie_ids:
    if movie_id not in np.unique(user['movieId']):
        movie_ratings = movie_lens['rating'].loc[movie_lens['movieId'] == movie_id]
        if len(movie_ratings) > 20:
            average_movie_ratings[movie_id] = (np.mean(movie_ratings))

In [16]:
# Creating the average movie ratings dataframe.
average_movie_ratings_df = pd.DataFrame(list(average_movie_ratings.items()), columns=['movieId', 'average_rating'])

# Adding the genre columns to the average movie ratings dataframe.
for i in range(len(unique_genres)):
    average_movie_ratings_df.insert(i+2, unique_genres[i], 0)

# Adding the average rating for the movie to it's genres.
for i in range(len(average_movie_ratings_df)):
    genres_list = (movies['genres'].loc[movies['movieId'] == average_movie_ratings_df['movieId'].iloc[i]]).str.split('|')
    for genres in genres_list:
        for genre in genres:
            average_movie_ratings_df[genre].iloc[i] = average_movie_ratings_df['average_rating'].iloc[i]
            
# Removing the average_rating column from the dataframe.
average_movie_ratings_df = average_movie_ratings_df.drop(['average_rating'], axis=1)

# Sorting the columns alphabetically.
average_movie_ratings_df = average_movie_ratings_df.reindex(sorted(average_movie_ratings_df.columns), axis=1)

# Printing the average movie ratings dataframe.
print('Average Movie Ratings Dataframe:\n\n', average_movie_ratings_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Average Movie Ratings Dataframe:

       (no genres listed)    Action  Adventure  Animation  Children    Comedy  \
0                      0  0.000000   3.431818   0.000000  3.431818  0.000000   
1                      0  0.000000   0.000000   0.000000  0.000000  3.071429   
2                      0  0.000000   0.000000   0.000000  0.000000  3.185185   
3                      0  3.496212   3.496212   0.000000  0.000000  0.000000   
4                      0  0.000000   0.000000   0.000000  0.000000  3.671429   
...                  ...       ...        ...        ...       ...       ...   
1050                   0  0.000000   0.000000   0.000000  0.000000  0.000000   
1051                   0  3.890625   3.890625   3.890625  3.890625  3.890625   
1052                   0  0.000000   0.000000   0.000000  0.000000  0.000000   
1053                   0  3.925926   3.925926   0.000000  0.000000  0.000000   
1054                   0  4.280000   0.000000   0.000000  0.000000  0.000000   

    

In [17]:
# Instantiating the nearest neighbors model from sklearn.
nearest_neighbors = NearestNeighbors(n_neighbors=10)
nearest_neighbors.fit(average_movie_ratings_df)

# Getting the closest movies to the user's preference based on average genre rating.
nearest_neighbors = nearest_neighbors.kneighbors(user_df)

# Getting the indices of the closest movies to the user's preference based on average genre rating.
nearest_neighbors_indices = [item for sublist in nearest_neighbors[1] for item in sublist]

# Getting the recommended movie Ids. 
recommended_movie_ids = []
for nearest_neighbor_index in nearest_neighbors_indices:
    recommended_movie_ids.append(average_movie_ratings_df['movieId'].iloc[nearest_neighbor_index])

# Getting the recommended movie titles.
recommended_movie_titles = []
for recommended_movie_id in recommended_movie_ids:
    recommended_movie_title_list = (movies['title'].loc[movies['movieId'] == recommended_movie_id])
    for recommended_movie_title in recommended_movie_title_list:
        recommended_movie_titles.append(recommended_movie_title)

# Printing the recommended movie titles for the user.
print('Recommended movies for the user:\n')
for recommended_movie_title in recommended_movie_titles:
    print(recommended_movie_title)

Recommended movies for the user:

Jumanji (1995)
Father of the Bride Part II (1995)
Sabrina (1995)
GoldenEye (1995)
American President, The (1995)
Casino (1995)
Sense and Sensibility (1995)
Ace Ventura: When Nature Calls (1995)
Get Shorty (1995)
Copycat (1995)


This model first checks the movies which the user has left a rating for, then gets all the genres the user has left a rating for and calculates the average rating for all the genres. Thus, from this we can identify the genres the user likes the most and which ones he doesn't. Now, we create a movie dataframe, where we select the movies which have more than 20 reviews and then calculate the average rating for the movie across all the reviews. Now we compare the two dataframe using Sklearn's nearest neighbor technique and get the movies closest to the user dataframe as the recommended movies. Thus, the recommended movies are the movies which are the most similar to the ones the user likes.