In [125]:
import pandas as pd

# Load the movie metadata table (the path suggests the MovieLens 25M dataset);
# the frame displayed in the next cell has movieId, title and genres columns.
movies = pd.read_csv("ml-25m/movies.csv")


In [126]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [127]:
import re

def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed.

    Removed characters are deleted outright rather than replaced by a space,
    so "Spider-Man" becomes "SpiderMan".
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [128]:
# Store a punctuation-free copy of every title (see clean_title) in a new
# "clean_title" column; this column is the text the search index is built from.
movies["clean_title"] = movies["title"].apply(clean_title)

In [129]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [130]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1, 2): build features from single words and from adjacent word pairs.
vectorizer = TfidfVectorizer(ngram_range = (1, 2))

# Fit the vectorizer on the cleaned titles and keep the resulting TF-IDF matrix,
# which search() compares query vectors against.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [131]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the rows of ``movies`` whose cleaned titles best match ``title``.

    The query is normalized with ``clean_title``, vectorized with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered from most to least similar,
        so ``.iloc[0]`` is the best match. No minimum similarity is enforced:
        rows with a similarity of zero can be returned when nothing matches.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])

    # Similarity between the query and every indexed title, as a 1-D array.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises otherwise.
    k = min(top_n, similarity.shape[0])
    if k <= 0:
        return movies.iloc[:0]

    # argpartition only guarantees that the k largest values land in the last
    # k slots; their relative order is unspecified, so sort them explicitly.
    top = np.argpartition(similarity, -k)[-k:]
    top = top[np.argsort(similarity[top])[::-1]]

    return movies.iloc[top]

In [132]:
import ipywidgets as widgets
from IPython.display import display
# Text box for the user's movie title, pre-filled with a sample title.
movie_input = widgets.Text(
    value = "In The Mood for Love",
    description = "Movie Title",
    disabled = False
)

# Output area that on_type clears and fills with the search results.
movie_list = widgets.Output()

def on_type(data):
    """Widget observer: redraw the search results for the newly typed title.

    ``data`` is the change notification passed by the widget; its "new" entry
    holds the current text. Titles of five characters or fewer are ignored,
    which leaves the output area empty.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        # Look the title up with search() and render the matching rows.
        display(search(typed))

# Call on_type every time the text box's value changes.
movie_input.observe(on_type, names = 'value')

# Render the text box followed by the results area.
display(movie_input, movie_list)

Text(value='In The Mood for Love', description='Movie Title')

Output()

In [133]:
# Load the user ratings table (the path suggests the MovieLens 25M dataset);
# the frame displayed in the next cell has userId, movieId, rating and timestamp columns.
ratings = pd.read_csv("ml-25m/ratings.csv")

In [134]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [135]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [136]:
# Example movie used for the step-by-step walkthrough below;
# movieId 1 is "Toy Story (1995)" in the movies table.
movie_id = 1

In [137]:
# Here we find the users who liked the given movie, where "liked" means
# they rated it 4.5 or higher. The result is an array of distinct userIds.
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 4.5)]["userId"].unique()

In [138]:
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] >= 4)]["movieId"]
# This gets the movieIds of every movie these specific users rated 4 or higher.
# NOTE(review): the threshold here is >= 4, whereas similar_users uses >= 4.5 and
# the all_users filter further down uses > 4 -- confirm the differences are intended.

In [139]:
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
# This counts the number of 'likes' each movie received among the similar users;
# we divide by the number of similar users to get a fraction.
# NOTE: this cell reassigns similar_user_recs from itself, so it is not safe to
# re-run on its own -- re-run the previous cell first.


# Keep only the movies liked by more than 15% of the similar users.
similar_user_recs = similar_user_recs[similar_user_recs > 0.15]
similar_user_recs

1        1.000000
318      0.555243
356      0.537775
260      0.536236
296      0.488559
           ...   
1923     0.154181
4011     0.153278
4878     0.150889
48780    0.150677
44191    0.150305
Name: movieId, Length: 146, dtype: float64

In [140]:
# Next, we want to find how much all users like a given movie.
# This will help determine if a certain movie is actually part
# of a 'niche', or if it is just a generally well-liked movie
# that everyone enjoys.
# NOTE(review): this filter uses rating > 4, while the similar-user filter
# above uses rating >= 4 -- confirm the difference is intentional.

all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [141]:
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
# This finds, for each movie in our list of 'similar people' recommendations, the
# fraction of users who liked it. The denominator is the number of distinct users in
# all_users (users with at least one qualifying rating for one of these movies), not
# every user in ratings. What we want (in this model) are movies that have a large
# differential between general-public and selected-user ratings -- essentially,
# movies that fit within the "niche".

all_user_recs

318     0.338541
296     0.281613
2571    0.241410
356     0.232737
593     0.223480
          ...   
648     0.029126
1393    0.027560
2987    0.025405
2797    0.025385
2355    0.024822
Name: movieId, Length: 146, dtype: float64

In [142]:
# Now we compare the 'niche' scores against the general-public scores.
# Concatenating along axis=1 places the two Series side by side as columns,
# aligned on their shared movieId index.
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis = 1)
rec_percentages.columns = ["similar", "all"]

rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.123388
318,0.555243,0.338541
356,0.537775,0.232737
260,0.536236,0.219818
296,0.488559,0.281613
...,...,...
1923,0.154181,0.033836
4011,0.153278,0.071406
4878,0.150889,0.073476
48780,0.150677,0.067580


In [143]:
# Here we create a "score", which we define as the ratio between how much
# similar users like the movie and how much the general public likes it.
# A higher score means the movie is liked disproportionately by the similar users.

rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
rec_percentages = rec_percentages.sort_values("score", ascending = False)

rec_percentages

Unnamed: 0,similar,all,score
2355,0.241625,0.024822,9.734431
1,1.000000,0.123388,8.104539
3114,0.415397,0.053128,7.818732
648,0.200743,0.029126,6.892322
2797,0.169684,0.025385,6.684415
...,...,...,...
79132,0.229520,0.129971,1.765924
296,0.488559,0.281613,1.734856
858,0.346907,0.207797,1.669453
318,0.555243,0.338541,1.640104


In [154]:
# Display the top 10 recommendations, joined to the movies table on movieId
# so that each row shows its title and genres.

rec_percentages.head(10).merge(movies, left_index = True, right_on = "movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
2264,0.241625,0.024822,9.734431,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
0,1.0,0.123388,8.104539,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.415397,0.053128,7.818732,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
637,0.200743,0.029126,6.892322,648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller,Mission Impossible 1996
2705,0.169684,0.025385,6.684415,2797,Big (1988),Comedy|Drama|Fantasy|Romance,Big 1988
2895,0.169047,0.025405,6.654165,2987,Who Framed Roger Rabbit? (1988),Adventure|Animation|Children|Comedy|Crime|Fant...,Who Framed Roger Rabbit 1988
584,0.218689,0.033122,6.602571,592,Batman (1989),Action|Crime|Thriller,Batman 1989
14813,0.207858,0.034753,5.981013,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
5241,0.194054,0.032853,5.906698,5349,Spider-Man (2002),Action|Adventure|Sci-Fi|Thriller,SpiderMan 2002
580,0.393629,0.066787,5.893777,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992


In [197]:
def find_similar_movies(movie_id, nicheness=4, like_threshold=4.1, min_share=0.12, top_n=8):
    """Recommend movies liked disproportionately by fans of ``movie_id``.

    Steps:
      1. Find the users who rated ``movie_id`` above ``like_threshold``.
      2. For every movie, compute the share of those users who also rated it
         above ``like_threshold``; keep movies whose share exceeds ``min_share``.
      3. For the kept movies, compute the share of users who rated them at
         least ``nicheness``. The denominator is the number of distinct users
         with at least one such rating among the kept movies.
      4. Score each movie as (share among similar users) / (share from step 3)
         and return the highest-scoring ones.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base recommendations on.
    nicheness : float, default 4
        Minimum rating (inclusive) that counts as a "like" for the general
        population in step 3.
    like_threshold : float, default 4.1
        Rating that must be strictly exceeded to count as a "like" for the
        similar users in steps 1 and 2.
    min_share : float, default 0.12
        A movie is kept only if more than this fraction of similar users liked it.
    top_n : int, default 8
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, sorted by descending score.
        The queried movie itself is not excluded from the result.
    """
    # The same "liked it" mask is used both to pick similar users and to
    # collect what they liked.
    liked = ratings["rating"] > like_threshold

    similar_users = ratings[(ratings["movieId"] == movie_id) & liked]["userId"].unique()
    similar_user_recs = ratings[ratings["userId"].isin(similar_users) & liked]["movieId"]

    # Share of similar users who liked each movie, filtered to the common ones.
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # How the wider population rates the same candidate movies.
    all_users = ratings[
        ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] >= nicheness)
    ]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Side-by-side comparison, aligned on movieId.
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]


In [203]:
# Text box for the title to base recommendations on, pre-filled with a sample title.
movie_name_input = widgets.Text(
    value='In The Mood for Love',
    description='Movie Title:',
    disabled=False
)
# Output area that on_type clears and fills with the recommendation table.
recommendation_list = widgets.Output()
    


def add_niche(nicheness=4):
    """Return the given nicheness value unchanged; with no argument, return 4."""
    return nicheness



def on_type(data):
    """Widget observer: show recommendations for the newly typed title.

    ``data`` is the change notification passed by the widget; its "new" entry
    holds the current text. Titles of four characters or fewer are ignored,
    which leaves the output area empty.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data["new"]
        if len(typed) <= 4:
            return
        # The first row returned by search() is taken as the movie the user meant.
        matches = search(typed)
        best_id = matches.iloc[0]["movieId"]
        # An interactive nicheness slider (widgets.interact on add_niche) was
        # sketched here but is disabled; add_niche() supplies its default value.
        display(find_similar_movies(best_id, add_niche()))
            
            






# Call on_type every time the text box's value changes.
movie_name_input.observe(on_type, names='value')

# Render the text box followed by the recommendations area.
display(movie_name_input, recommendation_list)


Text(value='In The Mood for Love', description='Movie Title:')

Output()