In [17]:
import pandas as pd

# Movie catalogue; later cells read its movieId, title and genres columns.
# NOTE(review): the path points at the filesystem root -- confirm this is intended.
movies = pd.read_csv(filepath_or_buffer="/movies.csv")

In [18]:
import re

def clean_title(title):
  """Return ``title`` with everything except ASCII letters, digits and spaces removed.

  Characters are deleted, not replaced, so removing a symbol that sat between
  two spaces leaves both spaces in place.
  """
  disallowed = re.compile("[^a-zA-Z0-9 ]")
  return disallowed.sub("", title)

In [19]:
# Store a normalised copy of every title; the TF-IDF vectoriser is fitted on this column.
movies["clean_title"] = movies["title"].map(clean_title)

In [20]:
# TF-IDF matrix of the cleaned titles, built from single words and word pairs
from sklearn.feature_extraction.text import TfidfVectorizer

vectorize = TfidfVectorizer(ngram_range=(1, 2))

# Fit the vocabulary on the cleaned titles and encode them in one step.
tfidf = vectorize.fit_transform(raw_documents=movies["clean_title"])

In [21]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
  """Return the movies whose titles best match a free-text query.

  The query is normalised with ``clean_title``, encoded with the fitted
  ``vectorize`` object, and compared against every row of ``tfidf`` by
  cosine similarity.

  Args:
    title: Raw search text.
    top_n: Maximum number of matches to return (default 5).

  Returns:
    Rows of ``movies`` (at most ``top_n``), ordered from the most similar
    title to the least similar one, so ``.iloc[0]`` is the best match.
  """
  title = clean_title(title)
  query_vec = vectorize.transform([title])
  similarity = cosine_similarity(query_vec, tfidf).flatten()

  # np.argpartition raises if asked for more elements than exist, so clamp.
  top_n = max(0, min(top_n, similarity.size))
  if top_n == 0:
    return movies.iloc[[]]

  # argpartition only guarantees that the last top_n positions hold the
  # largest scores; their relative order is unspecified.
  candidates = np.argpartition(similarity, -top_n)[-top_n:]

  # Rank the candidates explicitly, highest similarity first.
  ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]
  return movies.iloc[ranked]

In [69]:
#Building a search box
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value = 'Toy Story',
    description = 'Movie Title: ',
    disabled = False
)

movie_list = widgets.Output()

def on_type(data):
  """Refresh the search results whenever the text box value changes.

  ``data`` is the change notification passed by ``observe``; its "new" entry
  holds the current text. Results are rendered inside ``movie_list``.
  """
  with movie_list:
    movie_list.clear_output()
    title = data["new"]
    # Only search once more than five characters have been typed.
    if len(title) > 5:
      display(search(title))

movie_input.observe(on_type, names='value')

# Render the text box and its result area; without this the widget is wired
# up but never shown in the notebook.
display(movie_input, movie_list)

In [70]:
# User ratings; later cells read its userId, movieId and rating columns.
# NOTE(review): the path points at the filesystem root -- confirm this is intended.
ratings = pd.read_csv(filepath_or_buffer="/ratings.csv")

In [27]:
# Users who liked the reference movie, i.e. rated it 4 or higher
movie_id = 1
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] >= 4),
    "userId",
].unique()

In [53]:
# Every movie those similar users rated 4 or higher (one entry per rating)
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] >= 4),
    "movieId",
]

In [54]:
# Turn the liked-movie list into the fraction of similar users who liked each
# movie, keeping only movies liked by more than 10% of them.
# NOTE: this rebinds its own input, so running the cell twice in a row
# re-counts the already-normalised values.
similar_user_recs = (
    similar_user_recs.value_counts()
    .div(len(similar_users))
    .loc[lambda share: share > .10]
)

In [56]:
# Ratings of 4 or higher, from any user, for the candidate movies
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] >= 4)
]

In [57]:
# Fraction of the users in all_users who liked each candidate movie
all_user_recs = all_users["movieId"].value_counts().div(
    all_users["userId"].nunique(dropna=False)
)

In [59]:
# Line up both fractions per movie: "similar" (among similar users) and
# "all" (among every user who liked a candidate movie)
rec_percentages = pd.concat(
    [similar_user_recs.rename("similar"), all_user_recs.rename("all")],
    axis=1,
)

In [71]:
# Score = how much more often similar users liked a movie than users overall;
# the highest scores come first.
rec_percentages = rec_percentages.assign(
    score=lambda shares: shares["similar"] / shares["all"]
).sort_values(by="score", ascending=False)

In [72]:
# Attach title and genres to the ten best-scoring movies (index holds the movieId)
pd.merge(rec_percentages.head(10), movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.235424,4.24766,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.327658,0.102244,3.204655,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.189412,0.068981,2.745867,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.158375,0.057712,2.744244,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
1992,0.122816,0.04713,2.605923,2081,"Little Mermaid, The (1989)",Animation|Children|Comedy|Musical|Romance,Little Mermaid The 1989
2669,0.102776,0.039856,2.578646,2761,"Iron Giant, The (1999)",Adventure|Animation|Children|Drama|Sci-Fi,Iron Giant The 1999
1047,0.231179,0.099117,2.33239,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
1005,0.127339,0.054721,2.327039,1028,Mary Poppins (1964),Children|Comedy|Fantasy|Musical,Mary Poppins 1964
2705,0.152257,0.065915,2.309882,2797,Big (1988),Comedy|Drama|Fantasy|Romance,Big 1988
3650,0.127428,0.055521,2.295142,3751,Chicken Run (2000),Animation|Children|Comedy,Chicken Run 2000


In [73]:
#Recommendation Functions
def find_similar_movies(movie_id, min_rating=4, min_share=.10, top_n=10):
  """Recommend movies liked by the users who also liked ``movie_id``.

  A rating counts as a "like" when it is at least ``min_rating``. Users who
  liked ``movie_id`` are the "similar" users. For each movie they liked, the
  score is the fraction of similar users who liked it divided by the fraction
  of all users (among those who liked any candidate) who liked it.

  Args:
    movie_id: ``movieId`` value of the reference movie.
    min_rating: Lowest rating treated as a like (default 4).
    min_share: A candidate is kept only when more than this fraction of the
      similar users liked it (default 0.10).
    top_n: Number of best-scoring candidates joined with ``movies``
      (default 10).

  Returns:
    DataFrame with the columns ``score``, ``title`` and ``genres``, taken from
    the ``top_n`` candidates with the highest score.
  """
  # Filter to likes once; every step below only looks at positive ratings.
  liked = ratings[ratings["rating"] >= min_rating]

  similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
  similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]

  # Fraction of similar users who liked each movie, thresholded by min_share.
  similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
  similar_user_recs = similar_user_recs[similar_user_recs > min_share]

  # Same fraction, measured over every user who liked a candidate movie.
  all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
  all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

  rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
  rec_percentages.columns = ["similar", "all"]

  rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
  rec_percentages = rec_percentages.sort_values("score", ascending=False)

  # The index holds movieId values, so join on it to pick up title and genres.
  return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [76]:
# Text box whose content drives the recommendation list rendered below it.
movie_input_name = widgets.Text(
    value="Enter a movie name", description="Movie Title:", disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
  """Show recommendations for the best title match of the typed text.

  ``data`` is the change notification passed by ``observe``; its "new" entry
  holds the current text. The first row returned by ``search`` is taken as
  the matched movie and passed to ``find_similar_movies``.
  """
  with recommendation_list:
    recommendation_list.clear_output()
    typed = data["new"]
    # Wait until more than five characters have been entered.
    if len(typed) <= 5:
      return
    best_match = search(typed).iloc[0]
    display(find_similar_movies(best_match["movieId"]))

movie_input_name.observe(on_type, names="value")
display(movie_input_name, recommendation_list)

Text(value='Enter a movie name', description='Movie Title:')

Output()