<a href="https://colab.research.google.com/github/Roshani-Dhule/DSBDA/blob/main/mini_project_of_dsbda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

# MovieLens 25M dataset: https://files.grouplens.org/datasets/movielens/ml-25m.zip
# Reads movies.csv from /content (presumably extracted from that archive and uploaded — TODO confirm).
movies = pd.read_csv("/content/movies.csv")

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
import re

def clean_title(title):
    """Strip every character except ASCII letters, digits and spaces from ``title``.

    Surrounding characters are not collapsed, so removing a symbol that sat
    between two spaces leaves both spaces in place.

    Example: ``"Toy Story (1995)"`` -> ``"Toy Story 1995"``.
    """
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [5]:
# Store a punctuation-free copy of every title (see clean_title) in a new "clean_title" column.
movies["clean_title"] = movies["title"].apply(clean_title)


In [6]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,2): build features from single terms and from pairs of adjacent terms.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Learn the vocabulary from the cleaned titles and encode them: one row of `tfidf` per row of `movies`.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose cleaned titles best match ``title``, best match first.

    The query is normalized with ``clean_title``, encoded with the already
    fitted ``vectorizer`` and compared with every row of ``tfidf`` using
    cosine similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies`` ordered by decreasing similarity,
        so ``.iloc[0]`` is the closest match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # argpartition rejects a k larger than the array, so clamp it to the
    # number of indexed titles.
    k = min(top_n, similarity.shape[0])
    if k <= 0:
        return movies.iloc[0:0]

    # argpartition only separates the k highest scores from the rest; it does
    # not order them. Sort that small candidate set explicitly so the first
    # row really is the best match.
    candidates = np.argpartition(similarity, -k)[-k:]
    indices = candidates[np.argsort(similarity[candidates])[::-1]]

    # Rows of `tfidf` were built from `movies` in order, so positions line up.
    return movies.iloc[indices]

In [9]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the title query, pre-filled with 'Jumanji'.
movie_input = widgets.Text(
    value='Jumanji',
    description='Movie Title:',
    disabled=False
)
# Output area that on_type clears and refills with the current search results.
movie_list = widgets.Output()

def on_type(data):
    """Refresh ``movie_list`` with search results for the newly typed title.

    ``data`` is the change notification passed by the widget; its ``"new"``
    entry holds the current text. The output area is always cleared, and
    results are shown only when the text is longer than 5 characters.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            # Too short to search: leave the output area empty.
            return
        display(search(query))

# Call on_type for changes to the text box's `value` trait.
# NOTE(review): the pre-filled text presumably does not trigger a search until it is edited — confirm.
movie_input.observe(on_type, names='value')


# Render the text box with the results area beneath it.
display(movie_input, movie_list)

Text(value='Jumanji', description='Movie Title:')

Output()

In [10]:
# Scratch lookup of a single movie row by id. Neither `movie` nor this id is used by the
# later cells shown here: movie_id is reassigned to 1 before the walkthrough starts.
movie_id = 89745

# Leftover stub; the real find_similar_movies definition appears further down.
#def find_similar_movies(movie_id):
movie = movies[movies["movieId"] == movie_id]

In [11]:
# Load the user ratings table (columns: userId, movieId, rating, timestamp) from /content.
ratings = pd.read_csv("/content/ratings.csv")

In [12]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp    float64
dtype: object

In [13]:
# Movie used for the step-by-step walkthrough below: movieId 1 is "Toy Story (1995)".
movie_id = 1

In [14]:
# Distinct users who rated the chosen movie above 4; they are treated as having similar taste.
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

In [15]:
similar_users

array([ 36,  75,  86,  90,  93,  95,  96,  98, 111, 120, 127, 143, 152,
       158, 160, 162, 171, 186, 188, 211, 217, 229, 230, 235, 249, 257,
       259, 297, 298, 302, 323, 329, 355, 359, 369, 371, 381, 392, 402,
       411, 428, 435, 439, 447, 449, 468, 469, 477, 484, 513, 519, 537,
       540, 541, 548, 551, 553, 561, 567, 573, 582, 593, 607, 609, 611,
       623, 624, 626, 628, 631, 644, 653, 654, 670, 683, 686, 694, 697,
       702, 709, 727, 733, 741, 749, 752, 765, 768, 773, 785, 791, 793,
       796, 803, 805, 807, 811, 830, 834, 839, 848, 856, 896, 904, 905])

In [16]:
# movieId of every rating above 4 given by those similar users (one entry per rating row).
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [17]:
similar_user_recs

5101         1
5105        34
5111       110
5114       150
5127       260
          ... 
127424    3671
127431    3751
127432    3760
127434    3812
127436    5060
Name: movieId, Length: 7345, dtype: int64

In [18]:
# Turn the list of liked movie ids into, per movie, the share of similar users who liked it
# (assumes at most one rating row per user/movie pair — TODO confirm).
# NOTE: this cell overwrites its own input. Re-running it without first re-running the cell
# that builds similar_user_recs from `ratings` would count the shares instead of movie ids.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

# Keep only movies liked by more than 10% of the similar users.
similar_user_recs = similar_user_recs[similar_user_recs > .1]

In [19]:
similar_user_recs

1       1.000000
318     0.423077
593     0.326923
260     0.317308
2571    0.298077
          ...   
1220    0.105769
1721    0.105769
1234    0.105769
1732    0.105769
2997    0.105769
Name: movieId, Length: 100, dtype: float64

In [20]:
# Every rating above 4, from any user, for the shortlisted movies (the index of similar_user_recs).
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [21]:
# Per shortlisted movie: number of >4 ratings divided by the number of distinct users who gave
# at least one >4 rating to any shortlisted movie (a share, assuming one rating row per
# user/movie pair — TODO confirm).
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [22]:
# Put both shares side by side, aligned on the movieId index.
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
# "similar" = share among similar users, "all" = share among the wider user population.
rec_percentages.columns = ["similar", "all"]

In [23]:
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.123077
318,0.423077,0.328994
593,0.326923,0.231953
260,0.317308,0.209467
2571,0.298077,0.214201
...,...,...
1220,0.105769,0.036686
1721,0.105769,0.059172
1234,0.105769,0.043787
1732,0.105769,0.072189


In [24]:
# A score above 1 means the movie is liked more often by the similar users than by users overall.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [25]:
# Highest score first.
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [26]:
# Attach title/genres to the ten highest-scoring movies. The frame's index holds the movieId
# values, hence left_index=True joined against the movieId column of `movies`.
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.123077,8.125,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
2264,0.134615,0.026036,5.170455,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.125,0.024852,5.029762,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
3021,0.230769,0.046154,5.0,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
729,0.125,0.029586,4.225,745,Wallace & Gromit: A Close Shave (1995),Animation|Children|Comedy,Wallace Gromit A Close Shave 1995
1005,0.125,0.029586,4.225,1028,Mary Poppins (1964),Children|Comedy|Fantasy|Musical,Mary Poppins 1964
4780,0.278846,0.073373,3.800403,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
4201,0.269231,0.081657,3.297101,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...,Shrek 2001
33,0.173077,0.054438,3.179348,34,Babe (1995),Children|Drama,Babe 1995
2025,0.144231,0.047337,3.046875,2115,Indiana Jones and the Temple of Doom (1984),Action|Adventure|Fantasy,Indiana Jones and the Temple of Doom 1984


In [27]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies that are disproportionately liked by fans of ``movie_id``.

    A rating counts as a "like" when it is strictly greater than
    ``min_rating``. Users who liked ``movie_id`` form the similar group; a
    movie is shortlisted when more than ``min_share`` of that group liked it.
    Each shortlisted movie is scored as (share of the similar group who liked
    it) / (share of all users who liked any shortlisted movie and also liked
    it).

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base recommendations on.
    min_rating : float, default 4
        Ratings must be strictly above this value to count as a like.
    min_share : float, default 0.10
        Minimum share of the similar group that must like a movie.
    top_n : int, default 10
        Number of highest-scoring movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
        Empty when nobody liked ``movie_id``.
    """
    # Filter once: every step below only considers ratings above the threshold.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked[liked["movieId"] == movie_id]["userId"].unique()
    if len(similar_users) == 0:
        # No fans means nothing to recommend; this also avoids dividing by zero below.
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Share of the similar group that liked each movie, limited to popular ones.
    similar_user_recs = liked[liked["userId"].isin(similar_users)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Same share computed over every user who liked at least one shortlisted movie.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # The index holds movieId values, so join it against the movies table.
    top = rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]

In [28]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the title to base recommendations on, pre-filled with 'Toy Story'.
movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
# Output area that on_type clears and refills with the current recommendations.
recommendation_list = widgets.Output()

def on_type(data):
    """Refresh ``recommendation_list`` with recommendations for the typed title.

    ``data`` is the change notification passed by the widget; its ``"new"``
    entry holds the current text. The output area is always cleared. When the
    text is longer than 5 characters, the first row returned by ``search`` is
    used as the movie to recommend from.

    Note: this rebinds the name ``on_type`` that the earlier search-widget
    cell also defines.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            # Too short to search: leave the output area empty.
            return
        first_match = search(query).iloc[0]
        display(find_similar_movies(first_match["movieId"]))

# Call on_type for changes to the text box's `value` trait.
movie_name_input.observe(on_type, names='value')

# Render the text box with the recommendations area beneath it.
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()