In [None]:
import pandas as pd

In [None]:
from google.colab import files
# Ask Colab for a file upload and keep whatever files.upload() returns.
# NOTE(review): the recorded output shows the file being saved as
# "movies (1).csv", so a cell that reads "movies.csv" from disk may pick up
# an older copy rather than this upload.
uploaded = files.upload()



Saving movies.csv to movies (1).csv


In [None]:
import io

# files.upload() returns a dict keyed by file name whose values are the
# uploaded bytes. Parse those bytes directly instead of re-reading from disk:
# the recorded upload output shows Colab saving a repeat upload under a new
# name ("movies (1).csv"), so "movies.csv" on disk can be a stale copy.
# globals().get keeps this cell working when the upload cell was skipped.
movies_uploads = [
    payload
    for name, payload in globals().get("uploaded", {}).items()
    if name.startswith("movies")
]
if movies_uploads:
    movies = pd.read_csv(io.BytesIO(movies_uploads[-1]))
else:
    # Nothing uploaded in this session: fall back to the file already on disk.
    movies = pd.read_csv("movies.csv")
movies.head()  # optional: show first few rows

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
import re

def clean_title(title):
    """Strip every character except ASCII letters, digits and spaces.

    Args:
        title: Raw title text. ``None`` and float NaN (what pandas stores for
            a missing cell) are treated as an empty title; any other
            non-string value is converted with ``str`` first.

    Returns:
        str: The title with punctuation and other symbols removed, e.g.
        ``"Toy Story (1995)"`` becomes ``"Toy Story 1995"``.
    """
    # NaN is the only float that is not equal to itself.
    if title is None or (isinstance(title, float) and title != title):
        return ""
    return re.sub(r"[^a-zA-Z0-9 ]", "", str(title))

In [None]:
# Normalise every raw title once and keep the result next to the original.
movies["clean_title"] = movies["title"].map(clean_title)


In [None]:
# Title text followed by the genre string. A missing genre is replaced by ""
# so it cannot turn the whole concatenation into NaN (the same rule the
# "combined" column uses).
movies["combined_features"] = movies["clean_title"] + " " + movies["genres"].fillna("")

In [None]:
# Text that gets vectorised: cleaned title, one space, then the genre string
# (an empty string when the genre is missing).
movies["combined"] = movies["clean_title"].str.cat(movies["genres"].fillna(""), sep=" ")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1, 2): index single words and adjacent word pairs.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
# Fit on the combined title + genre text; `tfidf` holds one row per movie and
# is what search() compares queries against.
tfidf = vectorizer.fit_transform(movies["combined"])


In [None]:
# Display the frame to check the clean_title / combined_features / combined
# columns that the cells above added.
movies

Unnamed: 0,movieId,title,genres,clean_title,combined_features,combined
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995,Toy Story 1995 Adventure|Animation|Children|Co...,Toy Story 1995 Adventure|Animation|Children|Co...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995,Jumanji 1995 Adventure|Children|Fantasy,Jumanji 1995 Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995,Grumpier Old Men 1995 Comedy|Romance,Grumpier Old Men 1995 Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995,Waiting to Exhale 1995 Comedy|Drama|Romance,Waiting to Exhale 1995 Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995,Father of the Bride Part II 1995 Comedy,Father of the Bride Part II 1995 Comedy
...,...,...,...,...,...,...
62418,209157,We (2018),Drama,We 2018,We 2018 Drama,We 2018 Drama
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001,Window of the Soul 2001 Documentary,Window of the Soul 2001 Documentary
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018,Bad Poems 2018 Comedy|Drama,Bad Poems 2018 Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001,A Girl Thing 2001 (no genres listed),A Girl Thing 2001 (no genres listed)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_k=5):
    """Return the movies whose indexed text best matches ``title``.

    The query is normalised with ``clean_title``, vectorised with the fitted
    ``vectorizer`` and compared with every row of ``tfidf`` by cosine
    similarity.

    Args:
        title: Free-text query typed by the user.
        top_k: Maximum number of rows to return (default 5).

    Returns:
        Rows of ``movies`` ordered from most to least similar, so row 0 is
        the best match. Fewer than ``top_k`` rows are returned when the
        catalogue is smaller than that.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Clamp k: argpartition raises when asked for more items than exist.
    k = min(top_k, similarity.shape[0])
    if k <= 0:
        return movies.iloc[:0]

    # argpartition only guarantees that the k largest scores sit in the last
    # k slots, in arbitrary order; sort that slice so the best match is first.
    top = np.argpartition(similarity, -k)[-k:]
    top = top[np.argsort(similarity[top])[::-1]]
    return movies.iloc[top]

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the title query, pre-filled with an example title.
movie_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
# Output area that the search results are rendered into.
movie_list = widgets.Output()

def on_type(data):
    """Refresh the search results whenever the text box value changes.

    Args:
        data: Change notification from the widget; ``data["new"]`` is the
            text currently in the box.

    The output area is always cleared first. A search is run and displayed
    only when the text is longer than five characters.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        display(search(query))

# Call on_type every time the text box's `value` trait changes.
movie_input.observe(on_type, names='value')


# Show the text box followed by the results area.
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [None]:
# Example movieId used by the exploratory cells below.
movie_id = 89745

# The exploratory steps that follow are later wrapped up as find_similar_movies(movie_id).
# Row(s) of `movies` for the example id.
movie = movies[movies["movieId"] == movie_id]

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the ratings file is read from under this path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the ratings table from the mounted Drive folder.
ratings = pd.read_csv("/content/drive/MyDrive/DATASETAI/ratings.csv")
# Show the column dtypes as a quick schema check.
ratings.dtypes

Unnamed: 0,0
userId,int64
movieId,int64
rating,float64
timestamp,int64


In [None]:
# Users who rated the example movie above 4.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4), "userId"
].unique()
# Every movie those users rated above 4 (one entry per rating row).
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4), "movieId"
]

In [None]:
# Share of the similar users who liked each movie, keeping only movies liked
# by more than 10% of them.
# NOTE: this rebinds similar_user_recs from its own previous value, so the
# cell that builds the raw movieId series must be re-run before re-running
# this one.
similar_user_recs = (
    similar_user_recs.value_counts()
    .div(len(similar_users))
    .loc[lambda share: share > .10]
)
# All ratings above 4 (from any user) for the movies that survived the filter.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]

In [None]:
# Share of all users in `all_users` who liked each candidate movie.
all_user_recs = all_users["movieId"].value_counts().div(len(all_users["userId"].unique()))
# Put both shares side by side, one row per movieId.
rec_percentages = pd.concat(
    [similar_user_recs, all_user_recs], axis=1, keys=["similar", "all"]
)
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
89745,1.000000,0.040459
58559,0.573393,0.148256
59315,0.530649,0.054931
79132,0.519715,0.132987
2571,0.496687,0.247010
...,...,...
47610,0.103545,0.022770
780,0.103380,0.054723
88744,0.103048,0.010383
1258,0.101226,0.083887


In [None]:
# score = how much more often the similar users liked a movie than users
# overall did; highest ratio first.
rec_percentages = rec_percentages.assign(
    score=rec_percentages["similar"] / rec_percentages["all"]
).sort_values("score", ascending=False)
# Attach title/genre columns to the ten highest-scoring movies.
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title,combined_features,combined
17067,1.0,0.040459,24.716368,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,Avengers The 2012,Avengers The 2012 Action|Adventure|Sci-Fi|IMAX,Avengers The 2012 Action|Adventure|Sci-Fi|IMAX
20513,0.103711,0.005289,19.610199,106072,Thor: The Dark World (2013),Action|Adventure|Fantasy|IMAX,Thor The Dark World 2013,Thor The Dark World 2013 Action|Adventure|Fant...,Thor The Dark World 2013 Action|Adventure|Fant...
25058,0.241054,0.012367,19.49177,122892,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi,Avengers Age of Ultron 2015,Avengers Age of Ultron 2015 Action|Adventure|S...,Avengers Age of Ultron 2015 Action|Adventure|S...
19678,0.216534,0.012119,17.867419,102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX,Iron Man 3 2013,Iron Man 3 2013 Action|Sci-Fi|Thriller|IMAX,Iron Man 3 2013 Action|Sci-Fi|Thriller|IMAX
16725,0.215043,0.012052,17.843074,88140,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War,Captain America The First Avenger 2011,Captain America The First Avenger 2011 Action|...,Captain America The First Avenger 2011 Action|...
16312,0.175447,0.010142,17.299824,86332,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX,Thor 2011,Thor 2011 Action|Adventure|Drama|Fantasy|IMAX,Thor 2011 Action|Adventure|Drama|Fantasy|IMAX
21348,0.287608,0.016737,17.183667,110102,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX,Captain America The Winter Soldier 2014,Captain America The Winter Soldier 2014 Action...,Captain America The Winter Soldier 2014 Action...
25071,0.214049,0.012856,16.649399,122920,Captain America: Civil War (2016),Action|Sci-Fi|Thriller,Captain America Civil War 2016,Captain America Civil War 2016 Action|Sci-Fi|T...,Captain America Civil War 2016 Action|Sci-Fi|T...
25061,0.136017,0.008573,15.865628,122900,Ant-Man (2015),Action|Adventure|Sci-Fi,AntMan 2015,AntMan 2015 Action|Adventure|Sci-Fi,AntMan 2015 Action|Adventure|Sci-Fi
14628,0.242876,0.015517,15.651921,77561,Iron Man 2 (2010),Action|Adventure|Sci-Fi|Thriller|IMAX,Iron Man 2 2010,Iron Man 2 2010 Action|Adventure|Sci-Fi|Thrill...,Iron Man 2 2010 Action|Adventure|Sci-Fi|Thrill...


In [None]:
def find_similar_movies(movie_id, rating_threshold=4, min_share=0.10, top_n=10):
    """Recommend movies liked by the users who liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than
    ``rating_threshold``. Each candidate movie is scored as

        share of similar users who liked it / share of all users who liked it

    so movies that are popular with everyone are ranked below movies that are
    specifically popular with this audience.

    Args:
        movie_id: ``movieId`` of the reference movie.
        rating_threshold: Ratings strictly above this value are likes
            (default 4).
        min_share: Keep only candidates liked by more than this fraction of
            the similar users (default 0.10).
        top_n: Number of rows to return (default 10).

    Returns:
        DataFrame with columns ``score``, ``title`` and ``genres``, highest
        score first. The reference movie is itself a candidate. The frame is
        empty when nobody liked ``movie_id``.
    """
    # Filter the ratings table once; every step below works on likes only.
    liked = ratings[ratings["rating"] > rating_threshold]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # No audience to learn from; also avoids dividing by zero below.
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Fraction of the similar users who liked each movie.
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Fraction of all users (who liked any candidate) who liked each movie.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    top = rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the title whose audience drives the recommendations.
movie_name_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
# Output area that the recommendation table is rendered into.
recommendation_list = widgets.Output()

def on_type(data):
    """Refresh the recommendations whenever the text box value changes.

    Args:
        data: Change notification from the widget; ``data["new"]`` is the
            text currently in the box.

    The output area is always cleared first. When the text is longer than
    five characters, the first row returned by ``search`` is taken as the
    chosen movie and its ``find_similar_movies`` table is displayed.
    This definition reuses the name of the earlier search-only handler.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        best_match = search(query).iloc[0]
        display(find_similar_movies(best_match["movieId"]))

# Call on_type every time the text box's `value` trait changes.
movie_name_input.observe(on_type, names='value')

# Show the text box followed by the recommendations area.
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [None]:
import pickle

In [None]:
# Bundle the three objects the title search depends on into one dict.
model_data = dict(vectorizer=vectorizer, tfidf=tfidf, movies=movies)

# Serialise the bundle to a single file in the working directory.
with open('movie_recommender_model.pkl', mode='wb') as f:
    pickle.dump(model_data, f)

In [None]:
# Also write each object to its own pickle file, in this order:
# the fitted vectorizer, the TF-IDF matrix, then the movie dataframe.
for filename, artifact in (
    ('vectorizer3.pkl', vectorizer),
    ('tfidf3.pkl', tfidf),
    ('movies3.pkl', movies),
):
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)

In [None]:
import os
# Print the current working directory, against which the relative .pkl paths used by open() resolve.
print(os.getcwd())


/content
