## Extracting the zip file [data]

In [None]:
import zipfile
# Archive to unpack, given relative to the current working directory.
path = "ml-25m.zip"

# Open the archive read-only and extract every member into the current
# working directory; the context manager closes the file afterwards.
with zipfile.ZipFile(path, mode="r") as archive:
    archive.extractall()

print("Successfully Unzipped")

## Explore the data

In [1]:
import pandas as pd
movies = pd.read_csv("ml-25m/movies.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [2]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


## Search Engine

In [3]:
import re
import unicodedata

def clean_title(title):
    """Normalize a movie title to plain ASCII letters, digits and spaces.

    Accented letters are first decomposed (Unicode NFD) so that their base
    letter survives the filter: "Amélie" becomes "Amelie" instead of "Amlie".
    Every remaining character outside ``[a-zA-Z0-9 ]`` (punctuation,
    parentheses, hyphens, combining accents, other symbols) is then removed.

    Args:
        title: Raw title string, e.g. ``"Toy Story (1995)"``.

    Returns:
        The cleaned string, e.g. ``"Toy Story 1995"``. Pure-ASCII input gives
        exactly what the regex filter alone would give.
    """
    # NFD rather than NFKD on purpose: only canonical decompositions are
    # applied, so compatibility characters such as "½" are still dropped.
    decomposed = unicodedata.normalize("NFD", title)
    return re.sub("[^a-zA-Z0-9 ]", "", decomposed)

In [4]:
# Add a "clean_title" column holding every title passed through clean_title,
# i.e. the titles without parentheses, hyphens or other punctuation.
movies["clean_title"] = movies["title"].map(clean_title)

In [5]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


## Logic for searching the movies

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF model over single words and adjacent word pairs (ngram_range=(1,2)).
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Learn the vocabulary from the cleaned titles and encode each title;
# `search` compares query vectors against this matrix.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose titles best match ``title``, best match first.

    The query is cleaned with ``clean_title``, encoded with the fitted
    ``vectorizer`` and compared with every row of ``tfidf`` by cosine
    similarity.

    Args:
        title: Free-text title typed by the user.
        top_n: Maximum number of matches to return (default 5).

    Returns:
        Up to ``top_n`` rows of ``movies`` ordered by decreasing similarity,
        so ``.iloc[0]`` is the closest match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Clamp so a catalogue smaller than top_n (or a non-positive top_n)
    # cannot raise or slice from the wrong end.
    top_n = max(0, min(top_n, similarity.size))

    # np.argpartition only separates the k largest values from the rest; the
    # order inside that group is undefined, so reversing it is not a ranking.
    # A stable sort on the negated scores ranks best-first and keeps the
    # catalogue order among equally similar titles.
    indices = np.argsort(-similarity, kind="stable")[:top_n]
    results = movies.iloc[indices]
    return results

## Search Engine: interactive widget

In [8]:
import ipywidgets as widgets
from IPython.display import display

# Text box the user types a movie title into.
movie_input = widgets.Text(value="Toy Story", description="Movie Title:", disabled=False)

# Output area that shows the current search results.
movie_list = widgets.Output()


def on_type(data):
    """Refresh ``movie_list`` with search results when the text box value changes.

    ``data`` is the change notification delivered by ``observe``; its
    ``"new"`` entry holds the current text.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        # Queries of 3 characters or fewer only clear the output.
        if len(query) <= 3:
            return
        display(search(query))


# Run on_type on every change of the widget's value, then render both widgets.
movie_input.observe(on_type, names="value")
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [9]:
# Load the user ratings table.
# NOTE(review): the dtypes output below reports movieId and timestamp as float64,
# which presumably means the file contains missing values (e.g. a truncated row)
# — confirm the CSV is complete.
ratings = pd.read_csv("ml-25m/ratings.csv")

In [10]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296.0,5.0,1.147880e+09
1,1,306.0,3.5,1.147869e+09
2,1,307.0,5.0,1.147869e+09
3,1,665.0,5.0,1.147879e+09
4,1,899.0,3.5,1.147869e+09
...,...,...,...,...
22516406,146386,3147.0,3.5,1.072882e+09
22516407,146386,3176.0,4.0,1.072882e+09
22516408,146386,3527.0,2.5,1.072882e+09
22516409,146386,4306.0,3.5,1.072882e+09


In [11]:
ratings.dtypes

userId         int64
movieId      float64
rating       float64
timestamp    float64
dtype: object

## Recommendation system

In [12]:
movie_id = 1

In [21]:
# Ids of the users who rated the chosen movie 4 or higher.
# NOTE(review): this cell uses `>= 4`, whereas the following steps and
# find_similar_movies use `> 4` — confirm which threshold is intended.
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 4)]["userId"].unique()
similar_users

array([     3,      5,      8, ..., 146359, 146369, 146371], dtype=int64)

In [27]:
# Movie ids of every rating above 4 given by those similar users
# (one entry per rating, so ids repeat).
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [28]:
similar_user_recs

255           29.0
256           32.0
257           50.0
261          214.0
263          293.0
             ...  
22514914    1391.0
22514915    1393.0
22514916    1429.0
22514917    1485.0
22514918    1544.0
Name: movieId, Length: 2089790, dtype: float64

In [29]:
similar_user_recs.value_counts()

movieId
1.0         16963
318.0       14327
260.0       12443
296.0       12014
356.0       10976
            ...  
125942.0        1
80476.0         1
167700.0        1
74582.0         1
138966.0        1
Name: count, Length: 21840, dtype: int64

In [36]:
# Turn the per-movie counts into the share of similar users who liked each movie.
# NOTE: this rebinds similar_user_recs from movie ids to shares, so the cell is
# not idempotent — re-running it without first rebuilding similar_user_recs
# gives wrong values.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

In [37]:
# Keep only the movies liked by more than 10% of the similar users.
similar_user_recs = similar_user_recs[similar_user_recs > .10]
similar_user_recs

movieId
1.0        0.499338
318.0      0.421742
260.0      0.366283
296.0      0.353655
356.0      0.323099
             ...   
1527.0     0.102823
778.0      0.102764
4995.0     0.102411
78499.0    0.100203
34.0       0.100115
Name: count, Length: 91, dtype: float64

In [40]:
# Ratings above 4, from any user, for the shortlisted movies in similar_user_recs.
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [41]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296.0,5.0,1.147880e+09
29,1,4973.0,4.5,1.147869e+09
48,1,7361.0,5.0,1.147880e+09
72,2,110.0,5.0,1.141417e+09
76,2,260.0,5.0,1.141417e+09
...,...,...,...,...
22516384,146386,260.0,4.5,1.072883e+09
22516385,146386,356.0,4.5,1.072884e+09
22516391,146386,1196.0,4.5,1.072884e+09
22516392,146386,1210.0,5.0,1.072884e+09


In [43]:
# Per-movie count divided by the number of distinct users in all_users:
# the share of those users who liked each shortlisted movie.
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [44]:
all_users_recs

movieId
318.0      0.345604
296.0      0.287720
2571.0     0.246364
356.0      0.237053
593.0      0.227906
             ...   
2716.0     0.053771
34.0       0.052724
1073.0     0.049271
1148.0     0.047920
78499.0    0.035565
Name: count, Length: 91, dtype: float64

In [47]:
# Put the two shares side by side, aligned on movie id.
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis = 1)
rec_percentages.columns = ["similar", "all"]

In [48]:
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,0.499338,0.125948
318.0,0.421742,0.345604
260.0,0.366283,0.223413
296.0,0.353655,0.287720
356.0,0.323099,0.237053
...,...,...
1527.0,0.102823,0.066772
778.0,0.102764,0.075555
4995.0,0.102411,0.076261
78499.0,0.100203,0.035565


In [50]:
# Ratio of the two shares: above 1 means the movie is liked more often among
# the similar users than among the users in all_users.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [53]:
# Highest score first.
rec_percentages = rec_percentages.sort_values("score", ascending = False)
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,0.499338,0.125948,3.964646
3114.0,0.170616,0.054298,3.142225
78499.0,0.100203,0.035565,2.817465
4886.0,0.167555,0.071620,2.339495
6377.0,0.166730,0.072905,2.286970
...,...,...,...
58559.0,0.181419,0.147888,1.226735
318.0,0.421742,0.345604,1.220304
4973.0,0.136617,0.113979,1.198614
2959.0,0.253569,0.219129,1.157167


In [55]:
# Attach title and genres to the ten best-scoring movies by matching the
# index (movie id) against movies["movieId"].
rec_percentages.head(10).merge(movies, left_index = True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,0.499338,0.125948,3.964646,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.170616,0.054298,3.142225,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
14813,0.100203,0.035565,2.817465,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.167555,0.07162,2.339495,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
6258,0.16673,0.072905,2.28697,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
1047,0.112007,0.049271,2.273279,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
8246,0.154338,0.069066,2.234642,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
580,0.151482,0.068167,2.222206,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
1120,0.103588,0.04792,2.161697,1148,Wallace & Gromit: The Wrong Trousers (1993),Animation|Children|Comedy|Crime,Wallace Gromit The Wrong Trousers 1993
359,0.184805,0.086678,2.132093,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


## Final Function 

In [58]:
def find_similar_movies(movie_id, min_rating=4, min_share=.10, top_n=10):
    """Recommend movies that fans of ``movie_id`` like unusually often.

    Steps, all computed from the module-level ``ratings`` table:

    1. "Similar users" are those who rated ``movie_id`` above ``min_rating``.
    2. For each movie, compute the share of similar users who also rated it
       above ``min_rating``; keep movies whose share exceeds ``min_share``.
    3. For the kept movies, compute the same share among all users who rated
       at least one kept movie above ``min_rating``.
    4. Score each movie as ``similar share / overall share`` and return the
       best ``top_n``, joined with ``movies`` for title and genres.

    Args:
        movie_id: Id of the movie to base the recommendations on.
        min_rating: A rating counts as a "like" only when strictly greater
            than this value (default 4).
        min_share: Minimum fraction of similar users, exclusive, that must
            like a movie for it to be considered (default 0.10).
        top_n: Number of recommendations to return (default 10).

    Returns:
        DataFrame with columns ``score``, ``title`` and ``genres``, ordered by
        decreasing score. The defaults reproduce the previous hard-coded
        behaviour.
    """
    # Every step below only looks at "liked" ratings, so filter the large
    # ratings table once instead of repeating the comparison three times.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]

    # Share of similar users who liked each movie; drop the rarely liked ones.
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Same share, measured over everyone who liked at least one candidate.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # Above 1 means the movie is liked more often by similar users than overall.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    top = rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]

In [64]:
# Text box for the title whose recommendations should be shown.
movie_input_name = widgets.Text(value="Toy Story", description="Movie Title:", disabled=False)

# Output area that shows the recommendation table.
recommendation_list = widgets.Output()


def on_type(data):
    """Refresh ``recommendation_list`` when the text box value changes.

    ``data`` is the change notification delivered by ``observe``; its
    ``"new"`` entry holds the current text. The first row returned by
    ``search`` is taken as the movie to recommend for.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        # Queries of 3 characters or fewer only clear the output.
        if len(query) <= 3:
            return
        matches = search(query)
        first_match_id = matches.iloc[0]["movieId"]
        display(find_similar_movies(first_match_id))


# Run on_type on every change of the widget's value, then render both widgets.
movie_input_name.observe(on_type, names="value")
display(movie_input_name, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()