In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import re

In [2]:
# Directory (relative to the notebook) that holds the MovieLens CSV files.
PATH = "movielens_latest"
# List the directory contents to confirm which files are available.
print(os.listdir(PATH))

['genome-tags.csv', 'genome-scores.csv', 'tags.csv', 'links.csv', 'ratings.csv', 'movies.csv', 'README.txt']


In [3]:
# Load the movie catalogue and the user ratings from the dataset directory.
movie, rating = (
    pd.read_csv(os.path.join(PATH, csv_name))
    for csv_name in ("movies.csv", "ratings.csv")
)

In [4]:
# Distinct titles in the movie table as loaded from movies.csv.
# NOTE(review): not referenced again in the visible cells — confirm it is still needed.
movie_titles = movie["title"].unique()

In [5]:
def year_from_title(title):
    """Extract the release year from a title such as "Eye for an Eye (1996)".

    The year is the LAST parenthesised four-digit number in the title. Taking
    the last group (rather than the first) keeps titles that contain other
    parenthesised numbers correct, e.g. "(500) Days of Summer (2009)" gives
    2009 instead of 500.

    Args:
        title: The movie title. Non-string values (e.g. None or NaN) are
            tolerated.

    Returns:
        int: The release year, or 0 when ``title`` is not a string or does
        not contain a parenthesised four-digit number.
    """
    # Missing titles arrive as None/NaN; report "no year" instead of raising.
    if not isinstance(title, str):
        return 0
    years = re.findall(r"\(\s*(\d{4})\s*\)", title)
    return int(years[-1]) if years else 0

In [6]:
# Derive the release year from each title and keep only movies released after
# 1995. The frame is built with assign() and filtered in the same expression,
# so re-running this cell never writes a column into a filtered slice of the
# previous `movie` (which would trigger SettingWithCopyWarning).
movie = (
    movie
    .assign(year=movie["title"].apply(year_from_title))
    .loc[lambda frame: frame["year"] > 1995]
)

In [7]:
# Preview the first three movies remaining in `movie`.
movie.head(3)

Unnamed: 0,movieId,title,genres,year
60,61,Eye for an Eye (1996),Drama|Thriller,1996
62,63,Don't Be a Menace to South Central While Drink...,Comedy|Crime,1996
63,64,Two if by Sea (1996),Comedy|Romance,1996


In [8]:
# Preview the first three rows of the ratings table.
rating.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471


In [9]:
# Keep only the columns the recommender needs.
movie = movie[["movieId", "title"]]
rating = rating[["userId", "movieId", "rating"]]

# Inner join on movieId, the only column the two trimmed frames share.
data = movie.merge(rating, on="movieId", how="inner")

In [10]:
# Size of the merged movie/rating table, followed by a three-row preview.
print(data.shape)
data.head(3)

(14676024, 4)


Unnamed: 0,movieId,title,userId,rating
0,61,Eye for an Eye (1996),4,3.0
1,61,Eye for an Eye (1996),114,4.0
2,61,Eye for an Eye (1996),164,3.0


In [1]:
# Thin out the ratings of the 500 most-rated titles so they dominate less.
# The title at popularity rank i loses a fraction 0.30 * exp(-i / 120) of its
# ratings: 30% for the most-rated title, decaying for less popular ones.
# NOTE: this rebinds `data`, so re-running the cell removes further rows.
rng = np.random.default_rng(42)  # fixed seed: a fresh run reproduces the same sample

top_rated_movies = data["title"].value_counts().head(500)
# Positional row numbers of every title, gathered in one pass instead of
# re-scanning the whole frame for each of the 500 titles.
rows_by_title = data.groupby("title").indices

keep_row = np.ones(len(data), dtype=bool)
for i, (key, value) in enumerate(tqdm(top_rated_movies.items(), total=len(top_rated_movies))):
    n_remove = int(value * 0.30 * np.exp(-i / 120))
    keep_row[rng.choice(rows_by_title[key], size=n_remove, replace=False)] = False

# Remove every sampled row in a single step rather than calling drop() 500 times.
data = data[keep_row]

In [None]:
# Restrict the ratings to the 1300 titles that have the most ratings.
most_rated_movies = data["title"].value_counts().index[:1300]
data_subset = data.loc[data["title"].isin(most_rated_movies)]

In [None]:
# Shape of the most-rated-titles subset, followed by its first rows.
print(data_subset.shape)
data_subset.head()

In [None]:
# User x title rating matrix: one row per userId, one column per title.
# Cells hold the rating (averaged by pivot_table's default aggregation if a
# user/title pair occurs more than once) and NaN where the user has not rated
# the title.
pivot_table = pd.pivot_table(
    data_subset,
    values="rating",
    index=["userId"],
    columns=["title"],
)

In [14]:
# Shape of the user x title matrix, followed by its first five users.
# NOTE(review): the saved output reports 1000 title columns, while the
# selection cell keeps the 1300 most-rated titles — the saved output
# presumably predates that change; re-run to refresh and confirm.
print(pivot_table.shape)
pivot_table.head(5)

(258832, 1000)


title,10 Things I Hate About You (1999),101 Dalmatians (1996),12 Years a Slave (2013),127 Hours (2010),13 Going on 30 (2004),"13th Warrior, The (1999)",1408 (2007),"2 Fast 2 Furious (Fast and the Furious 2, The) (2003)",2012 (2009),21 (2008),...,Yes Man (2008),You've Got Mail (1998),Zack and Miri Make a Porno (2008),Zero Dark Thirty (2012),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,1.0,,,2.0,,,...,,2.5,,,,,,,3.5,3.5
5,,,,,,,,,,,...,,,,,,,,,,


In [15]:
def recommended_movies(movie_title, num=11):
    """Return the titles whose ratings correlate most with ``movie_title``.

    Each column of the module-level ``pivot_table`` (users x titles) is
    correlated with the column of ``movie_title`` via ``DataFrame.corrwith``.

    Args:
        movie_title: Exact column name of the reference movie in
            ``pivot_table``.
        num: Result size plus one. The historical contract counts the
            reference movie itself, so at most ``num - 1`` rows are returned
            (10 with the default of 11).

    Returns:
        pandas.DataFrame: Columns ``"title"`` and ``"similarity"``, sorted by
        descending similarity. The reference movie is never included, and
        titles whose correlation is undefined (NaN) are left out.

    Raises:
        KeyError: If ``movie_title`` is not a column of ``pivot_table``.
    """
    if movie_title not in pivot_table.columns:
        raise KeyError(f"{movie_title!r} is not a column of pivot_table")

    movie_watched = pivot_table[movie_title]
    similarity_with_other_movies = (
        pivot_table.corrwith(movie_watched)
        # Remove the reference movie by name: relying on it sorting first
        # breaks when another title ties with it at a correlation of 1.0.
        .drop(labels=movie_title)
        .dropna()
        .sort_values(ascending=False)
        .head(max(num - 1, 0))
    )

    return pd.DataFrame({"title": similarity_with_other_movies.index,
                         "similarity": similarity_with_other_movies.values})

In [18]:
# This title was released in 1974, but only movies with year > 1995 are kept
# when `movie` is filtered, so it has no column in pivot_table. Guard the
# lookup so a full run does not stop here with a KeyError.
godfather_title = "Godfather: Part II, The (1974)"
recommended_movies(godfather_title) if godfather_title in pivot_table.columns else print(f"{godfather_title!r} is not in pivot_table")

In [16]:
# Titles whose ratings correlate most with The Fellowship of the Ring
# (default num=11).
recommended_movies("Lord of the Rings: The Fellowship of the Ring, The (2001)")

Unnamed: 0,title,similarity
0,"Lord of the Rings: The Two Towers, The (2002)",0.893852
1,"Lord of the Rings: The Return of the King, The...",0.892909
2,"Hobbit: An Unexpected Journey, The (2012)",0.548251
3,"Hobbit: The Desolation of Smaug, The (2013)",0.52074
4,The Hobbit: The Battle of the Five Armies (2014),0.50737
5,Pirates of the Caribbean: The Curse of the Bla...,0.361502
6,Harry Potter and the Prisoner of Azkaban (2004),0.344534
7,X-Men (2000),0.343988
8,Star Wars: Episode VII - The Force Awakens (2015),0.343452
9,Star Wars: Episode III - Revenge of the Sith (...,0.33761


In [None]:
# This title was released in 1994, but only movies with year > 1995 are kept
# when `movie` is filtered, so it has no column in pivot_table. Guard the
# lookup so a full run does not stop here with a KeyError.
pulp_fiction_title = "Pulp Fiction (1994)"
recommended_movies(pulp_fiction_title) if pulp_fiction_title in pivot_table.columns else print(f"{pulp_fiction_title!r} is not in pivot_table")

In [17]:
# Titles whose ratings correlate most with The Avengers (default num=11).
recommended_movies("Avengers, The (2012)")

Unnamed: 0,title,similarity
0,Avengers: Age of Ultron (2015),0.76911
1,Captain America: Civil War (2016),0.734993
2,Captain America: The Winter Soldier (2014),0.724375
3,Iron Man (2008),0.698434
4,Captain America: The First Avenger (2011),0.689643
5,Thor (2011),0.681642
6,Thor: The Dark World (2013),0.674295
7,Iron Man 2 (2010),0.665133
8,Iron Man 3 (2013),0.660984
9,Thor: Ragnarok (2017),0.595301


In [None]:
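# Scratch helper (left disabled): list the pivot_table columns whose title
# contains a substring, to find the exact title string that
# recommended_movies expects.
# matching = [s for s in pivot_table.columns if "Intouchables" in s]
# matching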