In [293]:
import os
import pandas as pd

In [294]:
# Folder holding the "ml-latest-small" CSV files. Set the ML_LATEST_SMALL_DIR
# environment variable to point at your own copy of the data; when it is unset
# the original hard-coded download location is used unchanged.
data_path = os.environ.get(
    'ML_LATEST_SMALL_DIR',
    os.path.join('C://Users/Saurabh/Downloads/ml-latest-small/', 'ml-latest-small/'))
# File names of the two tables read below, relative to data_path.
movies_filename = 'movies.csv'
ratings_filename = 'ratings.csv'

In [295]:
# Load the movies table, keeping only the id and title columns.
df_movies = pd.read_csv(
    os.path.join(data_path, movies_filename),
    dtype=dict(movieId='int32', title='str'),
    usecols=['movieId', 'title'],
)

In [296]:
# Load the ratings table: one row per (userId, movieId) with its rating.
# Narrow dtypes (int32 / float32) are requested explicitly at read time.
df_ratings = pd.read_csv(
    os.path.join(data_path, ratings_filename),
    dtype=dict(userId='int32', movieId='int32', rating='float32'),
    usecols=['userId', 'movieId', 'rating'],
)

In [297]:
# Preview the first rows of the movies table (movieId, title).
df_movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [298]:
# Preview the first rows of the ratings table (userId, movieId, rating).
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [299]:
from scipy.sparse import csr_matrix

In [300]:
from sklearn.neighbors import NearestNeighbors
# Brute-force cosine k-NN estimator with 20 neighbours.
# NOTE(review): `model_knn` is not referenced again in the cells shown here;
# the estimator that is actually fitted is `model`, configured with the same
# parameters via set_params further down. This one looks like a leftover.
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)

In [301]:
# Number of ratings per movie: one row per movieId, single column 'count'.
# NOTE(review): this exact frame is rebuilt in a later cell of this notebook,
# so this cell is redundant on a top-to-bottom run.
df_movies_cnt = pd.DataFrame(
            df_ratings.groupby('movieId').size(),
            columns=['count'])

In [302]:
# movieIds that received at least 50 ratings.
# NOTE(review): recomputed identically in a later cell of this notebook.
popular_movies = list(set(df_movies_cnt.query('count >= 50').index))  # noqa

In [303]:
# Boolean array, one entry per ratings row: True when the rated movie is in
# popular_movies. NOTE(review): recomputed identically in a later cell.
movies_filter = df_ratings.movieId.isin(popular_movies).values

In [304]:
# Estimator created with default settings; its real parameters (neighbours,
# algorithm, metric, n_jobs) are applied afterwards through set_params.
model = NearestNeighbors()

In [305]:
# Minimum number of ratings a movie / a user must have to be kept.
movie_rating_thres = user_rating_thres = 50

In [306]:
# Set the JOBLIB_TEMP_FOLDER environment variable before configuring the
# parallel neighbour search.
os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
# Brute-force cosine k-NN, 20 neighbours, n_jobs=-1. set_params returns the
# estimator, so its repr is shown as the cell output.
model.set_params(n_neighbors=20, algorithm='brute', metric='cosine', n_jobs=-1)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=-1, n_neighbors=20, p=2, radius=1.0)

In [307]:
# Show the ratings preview again before the filtering steps below.
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [308]:
df_movies_cnt = pd.DataFrame(
df_ratings.groupby('movieId').size(),
columns=['count'])

In [309]:
# movieIds with at least `movie_rating_thres` ratings. The threshold comes from
# the constant defined earlier instead of a repeated literal 50, and the
# groupby index is already unique so no set() round-trip is needed.
popular_movies = df_movies_cnt.index[
    df_movies_cnt['count'] >= movie_rating_thres
].tolist()

In [310]:
# Boolean array aligned with df_ratings rows: True where the rated movie is
# one of the popular movies.
movies_filter = df_ratings['movieId'].isin(popular_movies).values

In [311]:
# Number of ratings submitted by each user: index is userId, column 'count'.
df_users_cnt = pd.DataFrame(
    df_ratings.groupby('userId').size(),
    columns=['count'])
# userIds with at least `user_rating_thres` ratings (uses the constant defined
# earlier rather than a repeated literal 50; the groupby index is already
# unique, so no set() round-trip is needed).
active_users = df_users_cnt.index[
    df_users_cnt['count'] >= user_rating_thres
].tolist()
# Boolean array aligned with df_ratings rows: True where the rating was made
# by one of the active users.
users_filter = df_ratings.userId.isin(active_users).values

In [312]:
# Keep only ratings that satisfy both masks: popular movie AND active user.
df_ratings_filtered = df_ratings.loc[movies_filter & users_filter]

In [313]:
# Preview the ratings that survived the movie and user filters.
df_ratings_filtered.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [314]:
# Reshape the filtered ratings into a movie x user table: one row per movieId,
# one column per userId, each cell holding the rating. (movie, user) pairs
# without a rating come out as NaN and are replaced by 0.
movie_user_mat = df_ratings_filtered.pivot(index='movieId', columns='userId', values='rating').fillna(0)

In [315]:
# Map each movie title to the row position of that movie in movie_user_mat.
# Titles are looked up by movieId in the same order as the matrix index, so
# position k in the enumeration is row k of the matrix.
hashmap = {
    title: position
    for position, title in enumerate(
        df_movies.set_index('movieId').loc[movie_user_mat.index, 'title'].tolist()
    )
}

In [316]:
# transform matrix to scipy sparse matrix
# Rows stay in the order of movie_user_mat, so row k here is the same movie
# as row k of the dense table (and as position k in `hashmap`).
movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

In [317]:
# Free the intermediate frames now that the sparse matrix and the title map
# have been built; only those two are used by the cells that follow.
import gc

del (
    df_movies,
    df_movies_cnt,
    df_users_cnt,
    df_ratings,
    df_ratings_filtered,
    movie_user_mat,
)
gc.collect()

81

In [318]:
def _fuzzy_matching(hashmap, fav_movie):
        """
        return the closest match via fuzzy ratio.
        If no match found, return None
        Parameters
        ----------
        hashmap: dict, map movie title name to index of the movie in data
        fav_movie: str, name of user input movie
        Return
        ------
        index of the closest match
        """
        match_tuple = []
        # get match
        for title, idx in hashmap.items():
            ratio = fuzz.ratio(title.lower(), fav_movie.lower())
            if ratio >= 60:
                match_tuple.append((title, idx, ratio))
        # sort
        match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
        if not match_tuple:
            print('Oops! No match is found')
        else:
            print('Found possible matches in our database: '
                  '{0}\n'.format([x[0] for x in match_tuple]))
            return match_tuple[0][1]

In [319]:
# Fit the neighbour search on the sparse movie x user matrix, so each movie
# (row) is a point described by its per-user ratings. fit returns the
# estimator, whose repr is shown as the cell output.
model.fit(movie_user_mat_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=-1, n_neighbors=20, p=2, radius=1.0)

In [320]:
# Free-text title to look up; it is fuzzy-matched against the known titles.
fav_movie = 'Jumanji'

In [321]:
# Row position of the best-matching title in the movie x user matrix
# (None when nothing matched).
idx = _fuzzy_matching(hashmap=hashmap, fav_movie=fav_movie)

Found possible matches in our database: ['Jumanji (1995)']



In [322]:
# How many recommendations to request from the neighbour search.
n_recommendations = 20

In [323]:
# Query the fitted model with the matched movie's own row.
# One extra neighbour is requested (n_recommendations + 1); presumably this is
# because the query movie is itself part of the fitted data and comes back as
# its own nearest neighbour, which the next cell then drops.
# NOTE(review): assumes `idx` is a valid row position, i.e. the fuzzy match
# above did not return None.
distances, indices = model.kneighbors(
            movie_user_mat_sparse[idx],
            n_neighbors=n_recommendations+1)

In [324]:
# get list of raw idx of recommendations
raw_recommends = \
            sorted(
                list(
                    zip(
                        indices.squeeze().tolist(),
                        distances.squeeze().tolist()
                    )
                ),
                key=lambda x: x[1]
            )[:0:-1]

In [292]:
# Invert the title -> row-position map so row positions can be shown as titles.
reverse_hashmap = {v: k for k, v in hashmap.items()}
print('Recommendations for {}:'.format(fav_movie))
# The loop names deliberately avoid `idx`: that global holds the row of the
# matched query movie, and overwriting it here would make a re-run of the
# kneighbors cell query a different movie.
for rank, (rec_idx, rec_dist) in enumerate(raw_recommends, start=1):
    print('{0}: {1}, with distance of {2}'.format(rank, reverse_hashmap[rec_idx], rec_dist))

Recommendations for Jumanji:
1: Santa Clause, The (1994), with distance of 0.4976845383644104
2: Terminator 2: Judgment Day (1991), with distance of 0.4943341016769409
3: Speed (1994), with distance of 0.4915449619293213
4: Ace Ventura: When Nature Calls (1995), with distance of 0.48949891328811646
5: Batman (1989), with distance of 0.4879942536354065
6: Independence Day (a.k.a. ID4) (1996), with distance of 0.48735493421554565
7: Waterworld (1995), with distance of 0.48676472902297974
8: True Lies (1994), with distance of 0.48449409008026123
9: Forrest Gump (1994), with distance of 0.4835752844810486
10: Die Hard: With a Vengeance (1995), with distance of 0.47968143224716187
11: Casper (1995), with distance of 0.4760867953300476
12: Stargate (1994), with distance of 0.4665136933326721
13: Nightmare Before Christmas, The (1993), with distance of 0.4644862413406372
14: Home Alone (1990), with distance of 0.44241857528686523
15: Beauty and the Beast (1991), with distance of 0.43303328752