dataset link: https://www.kaggle.com/grouplens/movielens-20m-dataset

# Preprocessing movies and ratings tables

In [1]:
import numpy as np
import pandas as pd

Movies Table Preprocessing:

In [2]:
movies = pd.read_csv('data/movie.csv')
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)


In [3]:
movies['genres'] = movies['genres'].replace(['None','NaN','(no genres listed)'],'')
# Break up the big genre string into a string array
movies['genres'] = movies['genres'].str.split('|').astype('str')

# Creating new column with movie names and year
movies['title_with_year'] = movies['title']

# Dropping years from movie names
movies['title'] = movies['title'].apply(lambda x: x[:-7])

In [4]:
movies

Unnamed: 0,movieId,title,genres,title_with_year
0,1,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",Toy Story (1995)
1,2,Jumanji,"['Adventure', 'Children', 'Fantasy']",Jumanji (1995)
2,3,Grumpier Old Men,"['Comedy', 'Romance']",Grumpier Old Men (1995)
3,4,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",Waiting to Exhale (1995)
4,5,Father of the Bride Part II,['Comedy'],Father of the Bride Part II (1995)
...,...,...,...,...
27273,131254,Kein Bund für's Leben,['Comedy'],Kein Bund für's Leben (2007)
27274,131256,"Feuer, Eis & Dosenbier",['Comedy'],"Feuer, Eis & Dosenbier (2002)"
27275,131258,The Pirates,['Adventure'],The Pirates (2014)
27276,131260,Rentun Ruusu,[''],Rentun Ruusu (2001)


Ratings Table Preprocessing:

In [5]:
ratings = pd.read_csv('data/rating.csv')
print(ratings.shape)
ratings.head()

(20000263, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [6]:
# dropping timestamp column

ratings.drop(columns='timestamp', inplace=True)
print(ratings.shape)
ratings.head(2)

(20000263, 3)


Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5


# Content based

In [7]:
# Vectorize genres column's texts

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'])
tfidf_matrix

<27278x186 sparse matrix of type '<class 'numpy.float64'>'
	with 85434 stored elements in Compressed Sparse Row format>

In [8]:
# Creatiting cosine similarity matrix of vectorized_genres_texts vs vectorized_genres_texts

from sklearn.metrics.pairwise import cosine_similarity

cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cos_sim

array([[1.        , 0.3170459 , 0.06282188, ..., 0.26183304, 0.        ,
        0.19914606],
       [0.3170459 , 1.        , 0.        , ..., 0.32154162, 0.        ,
        0.24455947],
       [0.06282188, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.26183304, 0.32154162, 0.        , ..., 1.        , 0.        ,
        0.32926041],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.19914606, 0.24455947, 0.        , ..., 0.32926041, 0.        ,
        1.        ]])

In [10]:
def similar_movies(movie_name):
    
    try:
        # getting movie_table's_index from given movie_name
        movie_index = np.where(movies['title']==movie_name)[0][0]
        
        # getting new_indices(enumerate) and similarity scores of every movies, respect to the given movie_name
        sim_scores = list(enumerate(cos_sim[movie_index]))
        # sort with respect to similarity scores
        sim_scores.sort(key=lambda x: x[1:], reverse=True)
        
        # pick up only new_indices (which is same as movie_table's_index) from sim_scores
        movie_indices = [i[0] for i in sim_scores]
        
        # getting top 5 movie_names from movie_indices and return it
        return movies['title_with_year'][movie_indices].iloc[1:6]
    
    except:
        print('This movie is not present in the dataset.') 

Getting recommendations for some movies.

In [11]:
similar_movies('Toy Story')

2209                                       Antz (1998)
3027                                Toy Story 2 (1999)
3663    Adventures of Rocky and Bullwinkle, The (2000)
3922                  Emperor's New Groove, The (2000)
4790                             Monsters, Inc. (2001)
Name: title_with_year, dtype: object

In [12]:
similar_movies('3 Idiots')

10        American President, The (1995)
51               Mighty Aphrodite (1995)
57     Postman, The (Postino, Il) (1994)
92                Beautiful Girls (1996)
193       Something to Talk About (1995)
Name: title_with_year, dtype: object

In [13]:
similar_movies('Jumanji')

55                 Kids of the Round Table (1995)
59             Indian in the Cupboard, The (1995)
124             NeverEnding Story III, The (1994)
990               Escape to Witch Mountain (1975)
1959    Darby O'Gill and the Little People (1959)
Name: title_with_year, dtype: object

# Collaborative filtering

In [14]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [15]:
# num of ratings per user

num_of_ratings = ratings['userId'].value_counts()
num_of_ratings

118205    9254
8405      7515
82418     5646
121535    5520
125794    5491
          ... 
89305       20
110463      20
96990       20
134747      20
6526        20
Name: userId, Length: 138493, dtype: int64

In [16]:
# Creating dataframe with num_of_ratings information

num_ratings = pd.DataFrame()
num_ratings['userId'] = num_of_ratings.index
num_ratings['counts'] = num_of_ratings.values

print(num_ratings.shape)
num_ratings.head(2)

(138493, 2)


Unnamed: 0,userId,counts
0,118205,9254
1,8405,7515


In [17]:
# adding counts(number_of_rating_per_user) column in ratings table

movies_df = pd.merge(ratings, num_ratings, on='userId')
print(movies_df.shape)
movies_df.head(3)

(20000263, 4)


Unnamed: 0,userId,movieId,rating,counts
0,1,2,3.5,175
1,1,29,3.5,175
2,1,32,3.5,175


In [18]:
# Dropping those users who rated less than 100 movies. (to minimizing privot table's calculation complexity)

popularity_threshold = 100
movies_df = movies_df.query('counts >= @popularity_threshold')
movies_df.shape

(16042904, 4)

In [19]:
pivot_table = movies_df.pivot_table(values='rating', index='movieId', columns='userId').fillna(0)
pivot_table

userId,1,3,7,11,14,18,21,22,23,24,...,138471,138472,138474,138475,138477,138483,138484,138486,138490,138493
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,4.0,0.0,4.5,4.5,0.0,0.0,3.0,4.0,4.0,...,4.5,3.0,5.0,0.0,3.0,4.0,0.0,5.0,0.0,3.5
2,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,3.0,3.0,0.0,0.0,4.0
3,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,2.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Fitting our data in knn algorithm

from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(pivot_table)

NearestNeighbors(algorithm='brute', metric='cosine')

In [21]:
movies.head()

Unnamed: 0,movieId,title,genres,title_with_year
0,1,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",Toy Story (1995)
1,2,Jumanji,"['Adventure', 'Children', 'Fantasy']",Jumanji (1995)
2,3,Grumpier Old Men,"['Comedy', 'Romance']",Grumpier Old Men (1995)
3,4,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",Waiting to Exhale (1995)
4,5,Father of the Bride Part II,['Comedy'],Father of the Bride Part II (1995)


In [22]:
def recommend_movies_for(movie_name):
    try:
        # getting movieId from movie_name
        mov_ID = movies[movies['title']==movie_name]['movieId'].values[0]
        
        # getting pivot_table_index from movieId
        movie_pivot_idx = np.where(pivot_table.index==mov_ID)[0][0]
        
        # getting top recommended movies pivot_table's_index
        recomended_movies = model_knn.kneighbors(pivot_table.iloc[movie_pivot_idx,:].values.reshape(1, -1), n_neighbors = 6)
        
        # getting movieId from pivot_table_indexes
        recomended_movieId = [pivot_table.index[i] for i in recomended_movies[1][0][1:]]
        
        # getting movie_names from movieId
        recomended_movies_names = [movies.iloc[np.where(movies['movieId']==m_id)[0][0],:]['title_with_year'] for m_id in recomended_movieId]
        return recomended_movies_names
    
    except:
        print('This movie is not present in the dataset.')

Getting recommendations for some Hollywood movies.

In [23]:
recommend_movies_for('Fight Club')

['Matrix, The (1999)',
 'American Beauty (1999)',
 'Memento (2000)',
 'Lord of the Rings: The Fellowship of the Ring, The (2001)',
 'Pulp Fiction (1994)']

In [24]:
recommend_movies_for('Primer')

['Moon (2009)',
 'Sunshine (2007)',
 'Children of Men (2006)',
 'Machinist, The (Maquinista, El) (2004)',
 'District 9 (2009)']

In [26]:
recommend_movies_for('Ratatouille')

['WALL·E (2008)',
 'Up (2009)',
 'Dark Knight, The (2008)',
 'Iron Man (2008)',
 'Incredibles, The (2004)']

In [27]:
recommend_movies_for('Lord of the Rings: The Fellowship of the Ring, The')

['Lord of the Rings: The Two Towers, The (2002)',
 'Lord of the Rings: The Return of the King, The (2003)',
 'Shrek (2001)',
 'Matrix, The (1999)',
 'Pirates of the Caribbean: The Curse of the Black Pearl (2003)']

In [28]:
recommend_movies_for('My Octopus Teacher')

This movie is not present in the dataset.


In [29]:
recommend_movies_for('Apollo 13')

['Forrest Gump (1994)',
 'Jurassic Park (1993)',
 'Fugitive, The (1993)',
 'Shawshank Redemption, The (1994)',
 'Silence of the Lambs, The (1991)']

Getting recommendations for some Bollywood movies.

In [30]:
recommend_movies_for('Bhaag Milkha Bhaag')

This movie is not present in the dataset.


In [31]:
recommend_movies_for('3 Idiots')

['Like Stars on Earth (Taare Zameen Par) (2007)',
 'Paint It Yellow (Rang De Basanti) (2006)',
 'Kahaani (2012)',
 'Jab We Met (2007)',
 'Ghajini (2008)']

In [32]:
recommend_movies_for('Slumdog Millionaire')

['Dark Knight, The (2008)',
 'Inception (2010)',
 'Inglourious Basterds (2009)',
 'WALL·E (2008)',
 'Avatar (2009)']

In [33]:
recommend_movies_for('Life of Pi')

['Django Unchained (2012)',
 'Silver Linings Playbook (2012)',
 'Gravity (2013)',
 'Skyfall (2012)',
 'Argo (2012)']

In [34]:
recommend_movies_for('Darjeeling Limited, The')

['Life Aquatic with Steve Zissou, The (2004)',
 'Little Miss Sunshine (2006)',
 'Burn After Reading (2008)',
 'Juno (2007)',
 'There Will Be Blood (2007)']

# Collaborative Filtering method gives us more simmilar movies.