In [None]:
!pip install opendatasets

In [None]:
import opendatasets as od
od.download("https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata?select=tmdb_5000_movies.csv")

In [None]:
od.download("https://www.kaggle.com/datasets/aayushsoni4/tmdb-6000-movie-dataset-with-ratings")

In [None]:
import pandas as pd
import numpy as np
import ast
import scipy.sparse as sp
from scipy.sparse import csr_matrix

In [None]:
# Load the movie metadata and the per-user ratings downloaded by opendatasets.
movies = pd.read_csv(
    "/content/tmdb-6000-movie-dataset-with-ratings/tmdb_6000_movie_dataset.csv"
)
ratings = pd.read_csv(
    "/content/tmdb-6000-movie-dataset-with-ratings/tmdb_6000_movie_ratings.csv"
)

In [None]:
movies.columns

Index(['Unnamed: 0', 'budget', 'genres', 'homepage', 'tmdbId', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count'],
      dtype='object')

In [None]:
# Only the id and the display title are needed for collaborative filtering.
movies = movies.loc[:, ["tmdbId", "title"]]

In [None]:
movies.head(5)

Unnamed: 0,tmdbId,title
0,19995,Avatar
1,285,Pirates of the Caribbean: At World's End
2,206647,Spectre
3,49026,The Dark Knight Rises
4,49529,John Carter


In [None]:
movies.isnull().sum()

Unnamed: 0,0
tmdbId,0
title,0


In [None]:
movies.shape

(5798, 2)

In [None]:
ratings.columns

Index(['tmdbId', 'userId', 'rating'], dtype='object')

In [None]:
# Keep the (movie, user, rating) triple in a fixed column order.
ratings = ratings.loc[:, ["tmdbId", "userId", "rating"]]

In [None]:
ratings.head(1)

Unnamed: 0,tmdbId,userId,rating
0,19995,10.0,3.5


In [None]:
ratings.isnull().sum()

Unnamed: 0,0
tmdbId,0
userId,0
rating,0


In [None]:
ratings.shape

(24537619, 3)

In [None]:
# Downcast the ratings columns to narrower dtypes to reduce the memory
# footprint of the ratings frame.
ratings = ratings.astype(
    {"tmdbId": "int32", "userId": "int32", "rating": "float16"}
)

In [None]:
from scipy.sparse import coo_matrix
import numpy as np
import pandas as pd


# Sorted unique ids define the matrix axes: row i <-> user_ids[i],
# column j <-> movie_ids[j].
movie_ids = np.sort(movies['tmdbId'].unique())
user_ids = np.sort(ratings['userId'].unique())

# id -> positional index lookups along each axis.
movie_id_to_idx = dict(zip(movie_ids, range(len(movie_ids))))
user_id_to_idx = dict(zip(user_ids, range(len(user_ids))))

# Translate ids to matrix coordinates one slice at a time so the temporary
# mapped Series stay small.
chunk_size = 1_000_000
rows, cols, data = [], [], []

for start in range(0, len(ratings), chunk_size):
    batch = ratings.iloc[start:start + chunk_size]

    row_pos = batch['userId'].map(user_id_to_idx)
    col_pos = batch['tmdbId'].map(movie_id_to_idx)

    # Ratings whose user or movie has no index map to NaN and are dropped.
    keep = row_pos.notna() & col_pos.notna()

    rows.append(row_pos[keep].astype(np.uint32).values)
    cols.append(col_pos[keep].astype(np.uint32).values)
    data.append(batch.loc[keep, 'rating'].astype(np.float32).values)

# Stitch the per-chunk pieces into three flat coordinate arrays.
rows, cols, data = (np.concatenate(parts) for parts in (rows, cols, data))

# Assemble in COO form, then convert to CSR (scipy sums duplicate (row, col)
# entries during this conversion).
user_movie_matrix = coo_matrix(
    (data, (rows, cols)),
    shape=(len(user_ids), len(movie_ids)),
    dtype=np.float32,
).tocsr()



In [None]:
# BM25-weight the user x movie ratings matrix.
# The imported name must be `bm25_weight`; the previous `bm25_weigh` typo
# raised ImportError before the weighting could run.
from implicit.nearest_neighbours import bm25_weight
weighted_matrix = bm25_weight(user_movie_matrix, K1=100, B=0.8)

In [None]:
from sklearn.neighbors import NearestNeighbors

# Brute-force cosine nearest-neighbour index over the rows (users) of the
# ratings matrix, using every available core (n_jobs=-1).
# NOTE(review): this is fitted on the raw user_movie_matrix, not on
# weighted_matrix — confirm that is intended.
cf_knn_model = NearestNeighbors(
    n_neighbors=10,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)
cf_knn_model.fit(user_movie_matrix)

In [None]:
!pip install fuzzywuzzy python-Levenshtein

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)


In [None]:
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process

def movie_recommender_engine(movie_name, movies, user_movie_matrix, n_recs=10):
    """Recommend movies whose rating pattern is closest to a fuzzy-matched title.

    Parameters
    ----------
    movie_name : str
        Free-text title; the closest entry in ``movies['title']`` is used.
    movies : pandas.DataFrame
        Must contain ``tmdbId`` and ``title`` columns.
    user_movie_matrix : sparse matrix, shape (n_users, n_movies)
        Ratings matrix whose columns are ordered by ascending unique
        ``tmdbId`` of ``movies`` (how the notebook builds it).
    n_recs : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title``, ``Distance`` (cosine distance), ``Original_Match``
        and ``Match_Score``; one row per recommendation, nearest first.

    Raises
    ------
    ValueError
        If the number of unique ``tmdbId`` values in ``movies`` differs from
        the number of columns in ``user_movie_matrix``.
    """
    result_columns = ['Title', 'Distance', 'Original_Match', 'Match_Score']

    # Column j of the matrix is the j-th smallest unique tmdbId, so titles must
    # be laid out in that order. Using the row order of `movies` instead looks
    # up the wrong movie vector and mislabels every neighbour.
    titles_by_id = movies.drop_duplicates(subset='tmdbId').set_index('tmdbId')['title']
    sorted_ids = np.sort(titles_by_id.index.to_numpy())
    movie_titles = titles_by_id.reindex(sorted_ids).tolist()

    if len(movie_titles) != user_movie_matrix.shape[1]:
        raise ValueError(
            "movies has %d unique tmdbId values but user_movie_matrix has %d columns"
            % (len(movie_titles), user_movie_matrix.shape[1])
        )

    match = process.extractOne(movie_name, movie_titles)
    if match is None or n_recs <= 0:
        # Nothing to match against, or nothing requested.
        return pd.DataFrame(columns=result_columns)
    matched_movie, match_score = match[0], match[1]
    movie_idx = movie_titles.index(matched_movie)

    # One row per movie; CSR makes pulling a single movie's row cheap.
    movie_movie_matrix = csr_matrix(user_movie_matrix.T)

    # Ask for one extra neighbour because the query movie itself is normally
    # returned, but never more neighbours than there are movies.
    n_neighbors = min(n_recs + 1, movie_movie_matrix.shape[0])
    cf_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_neighbors)
    cf_model.fit(movie_movie_matrix)

    distances, indices = cf_model.kneighbors(movie_movie_matrix[movie_idx])

    # Drop the query movie by index rather than assuming it is in slot 0:
    # with tied distances it is not guaranteed to come first.
    recommendations = [
        {
            'Title': movie_titles[rec_idx],
            'Distance': dist,
            'Original_Match': matched_movie,
            'Match_Score': match_score,
        }
        for dist, rec_idx in zip(distances.ravel(), indices.ravel())
        if rec_idx != movie_idx
    ][:n_recs]

    return pd.DataFrame(recommendations, columns=result_columns)

In [None]:
# Number of recommendations to request. It was previously assigned but never
# passed, so the call silently used the function's default of 10.
recs = 5
recommendations = movie_recommender_engine("batman", movies, user_movie_matrix, n_recs=recs)

In [None]:
print(recommendations)

                          Title  Distance Original_Match  Match_Score
0         Kissing Jessica Stein  0.734505         Batman          100
1                 Deep Blue Sea  0.759735         Batman          100
2                      Mooz-lum  0.762405         Batman          100
3    You Can't Take It With You  0.763682         Batman          100
4                     Pontypool  0.769349         Batman          100
5   Four Weddings and a Funeral  0.771891         Batman          100
6                 Dum Maaro Dum  0.772833         Batman          100
7                  Wicked Blood  0.773378         Batman          100
8              Chicago Overcoat  0.774316         Batman          100
9  I Served the King of England  0.774569         Batman          100


<h4>Pickle Dump</h4>

In [None]:
import pickle

# `movie_titles` used to exist only inside movie_recommender_engine, so this
# cell raised NameError on a fresh kernel. Build it here, aligned with
# movie_ids, so movie_titles[j] is the title of matrix column j.
movie_titles = (
    movies.drop_duplicates(subset='tmdbId')
    .set_index('tmdbId')['title']
    .reindex(movie_ids)
    .tolist()
)

# Everything needed to serve recommendations without rebuilding the matrix.
cf_model={
    'cf_knn_model':cf_knn_model,
    'user_movie_matrix':user_movie_matrix,
    'movie_id_to_idx':movie_id_to_idx,
    'user_id_to_idx':user_id_to_idx,
    'movie_ids':movie_ids,
    'user_ids':user_ids,
    'movie_titles':movie_titles
}
with open('cfmodel.pkl', 'wb') as f:
    pickle.dump(cf_model, f)

In [None]:
# Hand the pickled bundle written above to the browser for download
# (the google.colab module is only available inside Colab).
from google.colab import files
files.download('cfmodel.pkl')