In [None]:
# Mount Google Drive at /content/drive so files stored there can be read.
# This relies on the google.colab package, so it only runs inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install thefuzz, which a later cell imports for fuzzy title matching.
!pip install thefuzz
# Make the project folder on Drive the working directory so the CSV file
# names used below resolve as relative paths.
%cd '/content/drive/MyDrive/Colab Notebooks/ML - CS576/RecSys'

/content/drive/MyDrive/Colab Notebooks/ML - CS576/RecSys


In [None]:
# Input CSV file names, read relative to the current working directory.
# Later cells use the movieId, title and genres columns of the movies file.
MOVIES_FILE = 'movies_large.csv'
# Later cells use the userId, movieId, rating and timestamp columns of the ratings file.
RATINGS_FILE = 'ratings_large.csv'

In [None]:
import pandas as pd

# Read both CSV files. The ratings' 'timestamp' column is dropped right away
# because no later cell in this notebook reads it.
df_movies = pd.read_csv(MOVIES_FILE)
df_ratings = pd.read_csv(RATINGS_FILE).drop(columns='timestamp')

In [None]:
import re
import unicodedata

# A release year is a four-digit number in parentheses at the very end of the
# title, e.g. "Toy Story (1995)". Requiring the parentheses keeps titles that
# merely end in a number (e.g. "... 2049") from being read as a year.
_YEAR_SUFFIX = re.compile(r'\(([0-9]{4})\)$')


def _extract_release_year(title):
    """Return the trailing parenthesised year of `title` as an int, or 0 if there is none."""
    if not isinstance(title, str):
        # Missing titles (e.g. NaN read from the CSV) carry no year.
        return 0
    # NFKD turns compatibility characters such as non-breaking spaces into
    # their plain equivalents, so strip() can remove trailing whitespace.
    normalized = unicodedata.normalize("NFKD", title).strip()
    match = _YEAR_SUFFIX.search(normalized)
    return int(match.group(1)) if match else 0


# One year per movie row, in row order; 0 marks "no year found".
years = [_extract_release_year(title) for title in df_movies['title']]

df_movies['year'] = years

In [None]:
# Keep movies released in 2008 or later whose genre string contains neither
# 'Documentary' nor 'Music'. The test is a substring match, so 'Music' also
# excludes genre strings containing 'Musical'.
is_documentary = df_movies['genres'].str.contains('Documentary')
is_music = df_movies['genres'].str.contains('Music')
is_recent = df_movies['year'] >= 2008

df_movies_new = df_movies[~is_documentary & ~is_music & is_recent]

In [None]:
# Keep only the ratings whose movie survived the genre/year filter.
df_ratings_new = df_ratings[
    df_ratings['movieId'].isin(df_movies_new['movieId'].values)
]

In [None]:
# Drop movies that received fewer than `popularity_threshold` ratings.
popularity_threshold = 50

# One row per movie: its id and the number of ratings it received.
df_movies_popularity = (
    df_ratings_new
    .groupby('movieId')
    .size()
    .reset_index(name='count')
)

is_popular = df_movies_popularity['count'] >= popularity_threshold
popular_movies = list(set(df_movies_popularity.loc[is_popular, 'movieId']))

df_ratings_top_movies = df_ratings_new[
    df_ratings_new['movieId'].isin(popular_movies)
]

# Before/after summary of the filter.
print('shape of original ratings data: ', df_ratings_new.shape)
print(
    f'shape of ratings data after dropping movies with ratings less than {popularity_threshold} ratings: ',
    df_ratings_top_movies.shape,
)
print('relevant user count: ', len(df_ratings_top_movies['userId'].unique()))
print('relevant movie count: ', len(df_ratings_top_movies['movieId'].unique()))

shape of original ratings data:  (6144408, 3)
shape of ratings data after dropping movies with ratings less than 50 ratings:  (5931374, 3)
relevant user count:  139049
relevant movie count:  5181


In [None]:
# Drop ratings made by users with too few or too many ratings.
# A user is kept when their rating count lies in [lower, upper], both inclusive.
lower = 15
upper = 200

# One row per user: their id and how many ratings they made. The count is taken
# over df_ratings_new, i.e. before the popular-movie filter was applied.
df_rater_frequency = (
    df_ratings_new
    .groupby('userId')
    .size()
    .reset_index(name='count')
)

in_activity_range = df_rater_frequency['count'].between(lower, upper)
top_raters = list(set(df_rater_frequency.loc[in_activity_range, 'userId']))

df_ratings_final = df_ratings_top_movies[
    df_ratings_top_movies['userId'].isin(top_raters)
]

# Before/after summary of the filter.
print('shape of original ratings data: ', df_ratings_new.shape)
print(
    'shape of ratings data after using df_ratings_top_movies and dropping ratings with associated low-activity-raters or very-high-activity-raters: ',
    df_ratings_final.shape,
)
print('relevant user count: ', len(df_ratings_final['userId'].unique()))
print('relevant movie count: ', len(df_ratings_final['movieId'].unique()))

shape of original ratings data:  (6144408, 3)
shape of ratings data after using df_ratings_top_movies and dropping ratings with associated low-activity-raters or very-high-activity-raters:  (3150110, 3)
relevant user count:  57033
relevant movie count:  5181


In [None]:
# Build a rating matrix: one row per movieId, one column per userId, and each
# cell holding that user's rating of that movie. Movie/user pairs without a
# rating are filled with 0.
# NOTE(review): pivot() raises if a (movieId, userId) pair occurs more than
# once; presumably the ratings file holds one rating per pair — confirm.
# NOTE(review): this is a dense frame (movies x users) that a later cell
# converts to sparse form; it may be memory-hungry on large inputs.
rating_matrix = df_ratings_final.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)

# Preview the first rows of the matrix.
rating_matrix.head()

userId,3,10,13,17,22,27,33,37,40,44,...,330927,330928,330931,330933,330937,330948,330949,330961,330970,330974
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
53207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55830,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.5
57326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Titles listed in the same order as the rows of rating_matrix, so a title's
# position in this list is also its row position in the matrix.
titles_by_row = (
    df_movies_new
    .set_index('movieId')
    .loc[rating_matrix.index, 'title']
    .tolist()
)

# mapper from movie title to index
# (if a title occurs more than once, the last occurrence's index wins)
movie_to_idx = {title: position for position, title in enumerate(titles_by_row)}

# mapper from index to movie title
idx_to_movie = dict(enumerate(titles_by_row))

In [None]:
from scipy.sparse import csr_matrix

# Convert the dense movie x user matrix to a SciPy CSR sparse matrix.
# Unrated cells were filled with 0 when the matrix was built, so the sparse
# form only has to store the actual ratings. Row order is unchanged, so row i
# still corresponds to rating_matrix.index[i].
rating_matrix_sparse = csr_matrix(rating_matrix.values)

In [None]:
# Movie metadata restricted to the movies that still have ratings after all
# rating filters were applied.
df_movies_final = df_movies_new[
    df_movies_new['movieId'].isin(df_ratings_final['movieId'].values)
]

In [None]:
import sklearn.neighbors

# Nearest-neighbour searcher using cosine distance with a brute-force search.
# n_neighbors=5 is only the default; kneighbors() calls may pass their own value.
knn = sklearn.neighbors.NearestNeighbors(
    n_neighbors=5,
    metric='cosine',
    algorithm='brute',
)

In [None]:
# Fit the neighbour searcher on the sparse rating matrix: each row (one movie's
# ratings across all users) becomes one point to search among.
knn.fit(rating_matrix_sparse)

In [None]:
from thefuzz import fuzz

def movie_fetcher(mapper, movie, release_year=2020, genres=('Action',)):
    """
    Bare-bones search engine: return movies with a similar title, and movies
    matching (year AND genres).

    The hits are meant to be fed to the recommender to get recommendations.
    Every row of the notebook-level ``df_movies_final`` frame is scanned.

    Params
    ----------
    mapper: maps a movie title to its index; looked up for every scanned row,
        so a title missing from it raises KeyError
    movie: title query, compared case-insensitively via fuzz.partial_ratio
    release_year: value a row's 'year' must equal for the year/genre search
    genres: iterable of genre names that must ALL occur in the row's genre
        string (case-insensitive substring test); an empty iterable matches
        on year alone

    Returns
    ----------
    (title_matches, year_genre_matches): two lists of
    (title, idx, lower-cased genre string) tuples. A movie can be in both.
    """
    title_matches = []
    year_genre_matches = []

    # Loop-invariant: normalise the query and the wanted genres only once.
    query = movie.lower()
    wanted_genres = [genre.lower() for genre in genres]

    for _, row in df_movies_final.iterrows():
        title = row['title']
        idx = mapper[title]
        row_genres = row['genres'].lower()

        # A partial-ratio score of 75 or more counts as a title hit.
        if fuzz.partial_ratio(title.lower(), query) >= 75:
            title_matches.append((title, idx, row_genres))

        # Year must match exactly and every wanted genre must be present.
        if row['year'] == release_year and all(
            genre in row_genres for genre in wanted_genres
        ):
            year_genre_matches.append((title, idx, row_genres))

    return title_matches, year_genre_matches

In [None]:
# Example search: fuzzy title query plus a year/genre query.
search_movie = 'Harry Potter'
search_year = 2017
search_genres = ['sci-fi', 'Horror']

title_matches, year_genre_matches = movie_fetcher(
    movie_to_idx,
    search_movie,
    search_year,
    search_genres,
)

# Each hit is a (title, idx, genres) tuple; idx is what the recommender takes.
for hit_title, hit_idx, hit_genres in title_matches:
    print('Matching by title (title, idx): ', hit_title, "|||  ", hit_idx)
for hit_title, hit_idx, hit_genres in year_genre_matches:
    print('Matching by year and genre (title, idx, genre): ', hit_title, "   |||  ", hit_idx, "   |||   ", hit_genres)

Matching by title (title, idx):  Harry Potter and the Half-Blood Prince (2009) |||   377
Matching by title (title, idx):  Harry Potter and the Deathly Hallows: Part 1 (2010) |||   851
Matching by title (title, idx):  Harry Potter and the Deathly Hallows: Part 2 (2011) |||   1052
Matching by year and genre (title, idx, genre):  Resident Evil: The Final Chapter (2017)    |||   3441    |||    action|horror|sci-fi
Matching by year and genre (title, idx, genre):  Life (2017)    |||   3476    |||    horror|sci-fi|thriller
Matching by year and genre (title, idx, genre):  Alien: Covenant (2017)    |||   3483    |||    action|horror|sci-fi|thriller
Matching by year and genre (title, idx, genre):  Blame! (2017)    |||   3574    |||    action|animation|drama|horror|sci-fi
Matching by year and genre (title, idx, genre):  The Dark Tower (2017)    |||   3624    |||    fantasy|horror|sci-fi|western
Matching by year and genre (title, idx, genre):  Zygote (2017)    |||   3636    |||    horror|sci-fi
Ma

In [None]:
def recommend_n_movies(mapper, model, data, ref, movie_id, n=5):
    """
    Print the n movies most similar to the given one, most similar first.

    Params
    ----------
    mapper: maps a row index of `data` to a movie title
    model: fitted knn model exposing kneighbors(row, n_neighbors=...)
    data: matrix the model was fitted on; data[movie_id] is the query row
    ref: reference DataFrame with 'title' and 'genres' columns, used to fetch
        the genres of each recommended title
    movie_id: row index of the input movie in `data` (also its key in `mapper`)
    n: number of recommendations to fetch

    Returns
    ----------
    None; the input title and the ranked recommendations are printed.
    """
    # Ask for one extra neighbour because the query movie is normally returned
    # as its own nearest neighbour.
    distances, indices = model.kneighbors(data[movie_id], n_neighbors=n + 1)

    # ravel() keeps the arrays 1-D even for a single neighbour, where
    # squeeze() would collapse them to a scalar.
    neighbours = zip(indices.ravel().tolist(), distances.ravel().tolist())

    # Drop the query movie by index rather than by position (it need not come
    # first when other movies tie at the same distance), then rank by
    # ascending distance so that recommendation 1 is the closest movie.
    raw_recommends = sorted(
        (pair for pair in neighbours if pair[0] != movie_id),
        key=lambda pair: pair[1],
    )[:n]

    print('Input movie: ', mapper[movie_id])
    print()
    print('Recommendations: -')
    for rank, (idx, dist) in enumerate(raw_recommends, start=1):
        rec_movie = mapper[idx]
        # First metadata row carrying this title supplies the genres.
        rec_genres = ref[ref.title == rec_movie].genres.values[0]
        print('{0}: {1} ({2}), with distance of {3}'.format(rank, rec_movie, rec_genres, dist))

In [None]:
# Five recommendations for matrix row 3441, which the saved output below shows
# as "Resident Evil: The Final Chapter (2017)". Row numbers depend on the
# loaded data, so re-check them with movie_fetcher if the CSV files change.
recommend_n_movies(idx_to_movie, knn, rating_matrix_sparse, df_movies_final, 3441, 5)

Input movie:  Resident Evil: The Final Chapter (2017)

Recommendations: -
1: Alien: Covenant (2017) (Action|Horror|Sci-Fi|Thriller), with distance of 0.8561754938422456
2: Underworld: Blood Wars (2016) (Action|Horror), with distance of 0.803730508825629
3: Resident Evil: Vendetta (2017) (Animation|Horror), with distance of 0.7686318847291652
4: Resident Evil: Afterlife (2010) (Action|Horror|Sci-Fi|Thriller|IMAX), with distance of 0.703574765817222
5: Resident Evil: Retribution (2012) (Action|Horror|Sci-Fi|IMAX), with distance of 0.6219301770268075


In [None]:
# Recommendations (default n=5) for matrix row 1052, shown in the saved output
# below as "Harry Potter and the Deathly Hallows: Part 2 (2011)".
recommend_n_movies(idx_to_movie, knn, rating_matrix_sparse, df_movies_final, 1052)

Input movie:  Harry Potter and the Deathly Hallows: Part 2 (2011)

Recommendations: -
1: Hobbit: An Unexpected Journey, The (2012) (Adventure|Fantasy|IMAX), with distance of 0.5243109073349307
2: The Hunger Games (2012) (Action|Adventure|Drama|Sci-Fi|Thriller), with distance of 0.5220391772207156
3: Avengers, The (2012) (Action|Adventure|Sci-Fi|IMAX), with distance of 0.5098254035434151
4: Harry Potter and the Half-Blood Prince (2009) (Adventure|Fantasy|Mystery|Romance|IMAX), with distance of 0.2556997894605544
5: Harry Potter and the Deathly Hallows: Part 1 (2010) (Action|Adventure|Fantasy|IMAX), with distance of 0.20480816412262204


In [None]:
# Ten recommendations for matrix row 2390, shown in the saved output below as
# "Avengers: Infinity War - Part I (2018)".
recommend_n_movies(idx_to_movie, knn, rating_matrix_sparse, df_movies_final, 2390, 10)

Input movie:  Avengers: Infinity War - Part I (2018)

Recommendations: -
1: Guardians of the Galaxy (2014) (Action|Adventure|Sci-Fi), with distance of 0.492871172297363
2: Logan (2017) (Action|Sci-Fi), with distance of 0.48817560501521395
3: Doctor Strange (2016) (Action|Adventure|Sci-Fi), with distance of 0.4591113096357654
4: Spider-Man: Into the Spider-Verse (2018) (Action|Adventure|Animation|Sci-Fi), with distance of 0.454664261628006
5: Black Panther (2017) (Action|Adventure|Sci-Fi), with distance of 0.4516474167411404
6: Untitled Spider-Man Reboot (2017) (Action|Adventure|Fantasy), with distance of 0.41806631577208986
7: Deadpool 2 (2018) (Action|Comedy|Sci-Fi), with distance of 0.4179017205125143
8: Guardians of the Galaxy 2 (2017) (Action|Adventure|Sci-Fi), with distance of 0.39649256003852795
9: Avengers: Infinity War - Part II (2019) (Action|Adventure|Sci-Fi), with distance of 0.2878181823164676
10: Thor: Ragnarok (2017) (Action|Adventure|Sci-Fi), with distance of 0.277413629

In [None]:
# Recommendations (default n=5) for matrix row 1269, shown in the saved output
# below as "Journey 2: The Mysterious Island (2012)".
recommend_n_movies(idx_to_movie, knn, rating_matrix_sparse, df_movies_final, 1269)

Input movie:  Journey 2: The Mysterious Island (2012)

Recommendations: -
1: Spy Next Door, The (2010) (Action|Children|Comedy), with distance of 0.867059196288859
2: Spy Kids: All the Time in the World in 4D (2011) (Action|Adventure|Children|Comedy|Sci-Fi), with distance of 0.864881071784694
3: Percy Jackson: Sea of Monsters (2013) (Adventure|Children|Fantasy), with distance of 0.8470148251234224
4: Race to Witch Mountain (2009) (Adventure|Children|Fantasy|Sci-Fi|Thriller), with distance of 0.8262119240229784
5: Journey to the Center of the Earth (2008) (Action|Adventure|Sci-Fi), with distance of 0.76905596419453


In [None]:
# Recommendations (default n=5) for matrix row 836, shown in the saved output
# below as "Black Swan (2010)".
recommend_n_movies(idx_to_movie, knn, rating_matrix_sparse, df_movies_final, 836)

Input movie:  Black Swan (2010)

Recommendations: -
1: Dark Knight, The (2008) (Action|Crime|Drama|IMAX), with distance of 0.5803895145450594
2: Social Network, The (2010) (Drama), with distance of 0.5760113517915533
3: Inglourious Basterds (2009) (Action|Drama|War), with distance of 0.5620035717705465
4: Shutter Island (2010) (Drama|Mystery|Thriller), with distance of 0.5501503540753012
5: Inception (2010) (Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX), with distance of 0.5279057102365062


In [None]:
# Recommendations (default n=5) for matrix row 2099, shown in the saved output
# below as "Hercules (2014)".
recommend_n_movies(idx_to_movie, knn, rating_matrix_sparse, df_movies_final, 2099)

Input movie:  Hercules (2014)

Recommendations: -
1: Teenage Mutant Ninja Turtles (2014) (Action|Adventure|Comedy), with distance of 0.8604323452425476
2: Dracula Untold (2014) (Action|Drama|Fantasy), with distance of 0.8542483580694488
3: Transformers: Age of Extinction (2014) (Action|Adventure|Sci-Fi), with distance of 0.8539516784135355
4: The Expendables 3 (2014) (Action|Adventure), with distance of 0.8493063695034774
5: Wrath of the Titans (2012) (Action|Adventure|Fantasy|IMAX), with distance of 0.8318200355375754


In [None]:
# Recommendations (default n=5) for matrix row 361, shown in the saved output
# below as "Transformers: Revenge of the Fallen (2009)". The row number was
# presumably looked up with: movie_fetcher(movie_to_idx, 'Transformers')[0]
recommend_n_movies(idx_to_movie, knn, rating_matrix_sparse, df_movies_final, 361)

Input movie:  Transformers: Revenge of the Fallen (2009)

Recommendations: -
1: Transformers: Age of Extinction (2014) (Action|Adventure|Sci-Fi), with distance of 0.6962458478642812
2: X-Men Origins: Wolverine (2009) (Action|Sci-Fi|Thriller), with distance of 0.6784653421120357
3: G.I. Joe: The Rise of Cobra (2009) (Action|Adventure|Sci-Fi|Thriller), with distance of 0.666497145465516
4: Terminator Salvation (2009) (Action|Adventure|Sci-Fi|Thriller), with distance of 0.6499924807915685
5: Transformers: Dark of the Moon (2011) (Action|Adventure|Sci-Fi|War|IMAX), with distance of 0.5127623579268963
