In [9]:
import pandas as pd
import zipfile
with zipfile.ZipFile('C:\\Users\\NIKITA\\Downloads\\ml-25m.zip' , 'r')as zip_ref:
    zip_ref.extractall()


In [14]:
movies = pd.read_csv("movies.csv")

##### Cleaning movie title with REGEX


In [15]:
import re
def clean_title(title):
    return re.sub("[a^zA-Z0-9]" , " ",title)
movies["clean_title"] = movies["title"].apply(clean_title)

##### CREATING A TFIDF MATRIX

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range = (1,2))
tfidf = vectorizer.fit_transform(movies["clean_title"])

##### CREATING A SEARCH FUNCTION

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec , tfidf).flatten()
    indices = np.argpartition(similarity , -5)[-5: ]
    results = movies.iloc[indices][::-1]
    return results

##### BUIDING AN INTERACTIVE SEARCH BOX WITH JUPYTER

In [24]:
import ipywidgets as widgets
from IPython.display import display
movie_input = widgets.Text(
    value = " ",
    description = "Movie Title:",
    disabled = False
)
movie_list = widgets.Output()
def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title)>5:
            display(search(title))
movie_input.observe(on_type , names='value')
display(movie_input , movie_list)

Text(value=' ', description='Movie Title:')

Output()

##### READING IN MOVIE RATINGS DATA

In [30]:
ratings = pd.read_csv("ratings.csv")

##### FINDING USERS WHO LIKED SAME MOVIE

In [33]:
movie_id = 1
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"]>4)]["userId"].unique()
similar_users
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"]>4)]["movieId"]
similar_user_recs
similar_user_recs = similar_user_recs.value_counts()/ len(similar_users)
similar_user_recs = similar_user_recs[similar_user_recs >.1]
similar_user_recs

movieId
1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: count, Length: 113, dtype: float64

##### FINDING HOW MUCH ALL USERS LIKE MOVIES

In [36]:
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"]>4)]
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
all_users_recs

movieId
318      0.342220
296      0.284674
2571     0.244033
356      0.235266
593      0.225909
           ...   
551      0.040918
50872    0.039111
745      0.037031
78499    0.035131
2355     0.025091
Name: count, Length: 113, dtype: float64

##### CREATING A RECOMMENDATION SCORE

In [37]:
rec_percentages = pd.concat([similar_user_recs , all_users_recs] , axis = 1)
rec_percentages.columns = ["similar" , "all"]
rec_percentages
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
rec_percentages = rec_percentages.sort_values("score" , ascending = False)
rec_percentages
rec_percentages.head(10).merge(movies , left_index = True , right_on = "movieId")


Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,oy tory ( )
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,oy tory ( )
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,"ug's ife, ( )"
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,oy tory ( )
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,"onsters, nc. ( )"
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,l ddin ( )
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,inding emo ( )
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,e uty nd the e st ( )
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,"ncredibles, he ( )"
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,"ion ing, he ( )"


##### BUILDING A RECOMMENDATION FUNCTION

In [38]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"]>4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"]>4)]["movieId"]
    
    similar_user_recs = similar_user_recs.value_counts()/ len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs >.1]

    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"]>4)]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs , all_users_recs] , axis = 1)
    rec_percentages.columns = ["similar" , "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score" , ascending = False)

    return rec_percentages.head(10).merge(movies , left_index = True , right_on = "movieId")

##### CREATING AN INTERACTIVE RECOMMENDATION WIDGET

In [41]:
movie_name_input =  widgets.Text(
    value = "Toy Story",
    description = "movie Title",
    disabled = False
)
recommendation_list = widgets.Output()
def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))
movie_name_input.observe(on_type , names="value")
display(movie_name_input , recommendation_list)

Text(value='Toy Story', description='movie Title')

Output()