# Importing Libraries

In [49]:
# Basic Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import warnings
warnings.filterwarnings('ignore')
from joblib import load, dump, Parallel, delayed

# Visualization
import plotly.express as px
import plotly.graph_objects as go  # for 3D plot visualization
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

from langdetect import detect
from datetime import datetime


# Data Preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

# Model Training
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import faiss
from sklearn.neighbors import NearestNeighbors

'''
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Necessary modules for collaborative filtering
from keras.layers import Input, Embedding, Dot, Flatten, Dense
from keras.models import Model
from keras.optimizers import Adam
from wordcloud import WordCloud
from collections import defaultdict
from collections import Counter
'''
# Necessary modules for content-based filtering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


# Reading Datasets

In [50]:
# Show up to 50 columns when a DataFrame is rendered
pd.options.display.max_columns = 50

In [51]:
# Importing movie details dataframe
df_movie = pd.read_csv('../AllData/Movie/movie.csv')
print("Shape of the Dataset:",df_movie.shape)
df_movie.head(3)

Shape of the Dataset: (1138845, 24)


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,160000000,https://www.warnerbros.com/movies/inception,tt1375666,en,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,165000000,http://www.interstellarmovie.net/,tt0816692,en,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,185000000,https://www.warnerbros.com/movies/dark-knight/,tt0468569,en,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."


In [53]:
# Importing user score dataframe
# NOTE(review): the shape reported by this cell is 1,048,575 rows, which is exactly
# the xlsx worksheet limit (1,048,576 rows) minus the header row. The ratings were
# presumably truncated when they were exported to Excel - confirm against the
# original source and consider loading from CSV/Parquet instead.
df_score = pd.read_excel('../AllData/Movie/movie_user_ratings.xlsx')
print("Shape of the dataset:",df_score.shape)
df_score.head()

Shape of the dataset: (1048575, 3)


Unnamed: 0,tmdbId,userId,rating
0,19995,10,3.5
1,19995,14,3.5
2,19995,22,2.5
3,19995,29,4.0
4,19995,40,3.0


# Data Preparation

In [54]:
df_movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1138845 entries, 0 to 1138844
Data columns (total 24 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1138845 non-null  int64  
 1   title                 1138832 non-null  object 
 2   vote_average          1138845 non-null  float64
 3   vote_count            1138845 non-null  int64  
 4   status                1138845 non-null  object 
 5   release_date          951037 non-null   object 
 6   revenue               1138845 non-null  int64  
 7   runtime               1138845 non-null  int64  
 8   adult                 1138845 non-null  bool   
 9   backdrop_path         302107 non-null   object 
 10  budget                1138845 non-null  int64  
 11  homepage              120573 non-null   object 
 12  imdb_id               603229 non-null   object 
 13  original_language     1138845 non-null  object 
 14  original_title        1138832 non-

In [55]:
# Discard the metadata columns that none of the recommenders below read
df_movie = df_movie.drop(
    columns=[
        'status',
        'revenue',
        'runtime',
        'adult',
        'backdrop_path',
        'budget',
        'homepage',
        'imdb_id',
        'original_language',
        'original_title',
        'overview',
        'tagline',
        'production_companies',
        'production_countries',
        'spoken_languages',
    ]
)

In [56]:
df_movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1138845 entries, 0 to 1138844
Data columns (total 9 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   id            1138845 non-null  int64  
 1   title         1138832 non-null  object 
 2   vote_average  1138845 non-null  float64
 3   vote_count    1138845 non-null  int64  
 4   release_date  951037 non-null   object 
 5   popularity    1138845 non-null  float64
 6   poster_path   779136 non-null   object 
 7   genres        682084 non-null   object 
 8   keywords      308298 non-null   object 
dtypes: float64(2), int64(2), object(5)
memory usage: 78.2+ MB


In [57]:
# Preprocessing Score column
df_movie['vote_average'].value_counts()

vote_average
0.000     788571
6.000      30625
5.000      29880
10.000     25017
7.000      24272
           ...  
8.449          1
8.608          1
8.127          1
3.657          1
0.750          1
Name: count, Length: 5024, dtype: int64

In [58]:
# Filter out movies with 'vote_average' of 0 (the bulk of the table according to the
# value counts above) and rebuild a contiguous 0..N-1 index.
# The TF-IDF / Faiss matrices built later are addressed by row *position*, while the
# recommenders look a title up through `df.index`; without the reset the surviving
# rows keep their old labels and the lookup points at the wrong matrix row.
df_movie = df_movie[df_movie['vote_average'] != 0].reset_index(drop=True)

In [59]:
# Double the ratings and store them as integers (e.g. 3.5 becomes 7).
# This cell overwrites the column it reads, so running it twice doubles the values again.
df_score['rating'] = (df_score['rating'] * 2).astype('int64')
df_score.head()

Unnamed: 0,tmdbId,userId,rating
0,19995,10,7
1,19995,14,7
2,19995,22,5
3,19995,29,8
4,19995,40,6


In [60]:
df_score['userId'] = df_score['userId'].astype('str')
df_score.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   tmdbId  1048575 non-null  int64 
 1   userId  1048575 non-null  object
 2   rating  1048575 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 24.0+ MB


In [61]:
# Keep only the rows that have a user id.
# NOTE(review): 'userId' was already cast with astype('str') in the previous cell, and
# that cast turns a missing value into the string 'nan', so this notnull() filter can
# no longer remove anything. The info() output above reports no nulls, so the result
# is unaffected, but the filter belongs before the cast if it is meant to do work.
df_score = df_score[df_score['userId'].notnull()]

In [62]:
df_score.isnull().sum()

tmdbId    0
userId    0
rating    0
dtype: int64

# Model Training (Content-Based V1)

In [13]:
# Create a TF-IDF vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Vectorize the genre strings. Missing genres are replaced with an empty string first:
# casting NaN to text would yield the literal token "nan", which the vectorizer would
# learn as a genre and which would make every genre-less movie look alike.
tfidf_matrix = tfidf.fit_transform(df_movie['genres'].fillna('').astype(str))

# Make sure the matrix is stored in CSR format (efficient row slicing)
tfidf_matrix_sparse = csr_matrix(tfidf_matrix)

In [14]:
# Function to compute cosine similarity for a single movie
def compute_cosine_similarity_for_movie(matrix, idx):
    """Return the cosine similarity between row ``idx`` and every row of ``matrix``.

    Parameters
    ----------
    matrix :
        Feature matrix; row ``idx`` is used as the query.
    idx : int
        Row of ``matrix`` to compare against all rows.

    Returns
    -------
    Flattened (1-D) array of similarity scores, one per row of ``matrix``.
    """
    query_row = matrix[idx]
    pairwise = cosine_similarity(query_row, matrix)
    return pairwise.flatten()

# Content-Based Recommendation V1

In [15]:
# Function to get recommendations based on cosine similarity, genre, and ratings based on score
def get_recommendations_by_title_v1(title, matrix, df, n=10):
    """Recommend the ``n`` movies most similar to ``title``.

    Candidates are ranked by cosine similarity to the query row of ``matrix`` and,
    for equal similarity, by ``vote_average`` (both descending).

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['title']``; the first match is used.
    matrix :
        Feature matrix whose i-th row describes the i-th row (by position) of ``df``.
    df : pandas.DataFrame
        Movie table with 'title', 'vote_average' and the returned columns.
    n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The recommended rows with columns
        'id', 'title', 'genres', 'vote_average', 'release_date', 'poster_path'.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested title.
    """
    # Locate the movie by *position*: the matrix is row-positional, whereas df.index
    # may hold arbitrary labels (e.g. after rows were filtered out).
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f'No movie titled {title!r} found in the dataframe')
    idx = int(matches[0])

    # Similarity of the query movie against every movie
    sim_scores = np.asarray(compute_cosine_similarity_for_movie(matrix, idx), dtype=float)
    votes = df['vote_average'].to_numpy()

    # Candidates: every movie with a known score (-1 marks "unknown"), except the query
    is_candidate = votes != -1
    is_candidate[idx] = False
    candidates = np.flatnonzero(is_candidate)

    # Stable sort, descending by similarity and then by vote_average.
    # np.lexsort treats the LAST key as the primary one, hence the ordering of the tuple.
    order = np.lexsort((-votes[candidates], -sim_scores[candidates]))
    recommended_indices = candidates[order[:max(n, 0)]]

    return df.iloc[recommended_indices][['id', 'title', 'genres', 'vote_average', 'release_date', 'poster_path']]

In [None]:
movie_title = 'Interstellar'
print(f'Recommendations for "{movie_title}":')
get_recommendations_by_title_v1(movie_title, tfidf_matrix_sparse, df_movie)

# Model Training (Content-Based V2)

In [17]:
# Compute the TF-IDF matrix. Missing genres become an empty string first: casting NaN
# to text would yield the literal token "nan", which would then act as a shared genre.
tfidf_matrix_v2 = tfidf.fit_transform(df_movie['genres'].fillna('').astype(str)).astype(np.float32)

# Convert the sparse matrix to a dense matrix
tfidf_matrix_dense = tfidf_matrix_v2.toarray()

# L2-normalize every row so that the inner product equals the cosine similarity.
# Rows without any known term have norm 0; dividing by 1 instead keeps them all-zero
# rather than turning them into NaN.
row_norms = np.linalg.norm(tfidf_matrix_dense, axis=1, keepdims=True)
row_norms[row_norms == 0] = 1.0
tfidf_matrix_dense = np.ascontiguousarray(tfidf_matrix_dense / row_norms, dtype=np.float32)

# Build the Faiss index (exact inner-product search)
index = faiss.IndexFlatIP(tfidf_matrix_dense.shape[1])
index.add(tfidf_matrix_dense)

# Content-Based Recommendation V2

In [18]:
# Function to get recommendations based on cosine similarity, genre, and ratings based on score
def get_recommendations_by_title_v2(title, index, matrix, df, n=10):
    """Recommend up to ``n`` movies similar to ``title`` using a prebuilt search index.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['title']``; the first match is used.
    index :
        Search index exposing ``search(query, k) -> (scores, ids)`` (e.g. a Faiss
        index) built from the rows of ``matrix``.
    matrix : numpy.ndarray
        Dense feature matrix whose i-th row describes the i-th row (by position) of ``df``.
    df : pandas.DataFrame
        Movie table with 'title', 'vote_average' and the returned columns.
    n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The recommended rows with columns
        'id', 'title', 'genres', 'vote_average', 'release_date', 'poster_path',
        ordered by similarity and then ``vote_average`` (both descending).

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested title.
    """
    # Locate the movie by *position*: matrix and index are row-positional, whereas
    # df.index may hold arbitrary labels (e.g. after rows were filtered out).
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f'No movie titled {title!r} found in the dataframe')
    idx = int(matches[0])

    # Query the index; n + 1 because the movie itself is usually among the hits
    movie_vector = matrix[idx].reshape(1, -1).astype(np.float32)
    distances, indices = index.search(movie_vector, n + 1)

    votes = df['vote_average'].to_numpy()

    # Drop the query movie, padding ids (< 0, returned when fewer than k hits exist)
    # and movies with an unknown score (-1) BEFORE truncating to n.
    candidates = [
        (int(i), float(d))
        for i, d in zip(indices[0], distances[0])
        if i >= 0 and i != idx and votes[i] != -1
    ]

    # Sort by similarity, then by vote_average, both descending
    candidates.sort(key=lambda hit: (hit[1], votes[hit[0]]), reverse=True)

    recommended_indices = [i for i, _ in candidates[:max(n, 0)]]
    return df.iloc[recommended_indices][['id', 'title', 'genres', 'vote_average', 'release_date', 'poster_path']]

In [None]:
print(f'Recommendations for "{movie_title}":')
get_recommendations_by_title_v2(movie_title, index, tfidf_matrix_dense, df_movie)

# Model Training (SVD : User-Based Collaborative)

In [63]:
# # Load the dataset into Surprise's format
# reader = Reader(rating_scale=(df_score['rating'].min(), df_score['rating'].max()))
# data = Dataset.load_from_df(df_score[['userId', 'tmdbId', 'rating']], reader)

In [64]:
# # Define the SVD model
# svd_model = SVD()

# # Evaluate the model using cross-validation
# cross_validate(svd_model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# # Train the SVD model on the entire dataset
# svd_trainset = data.build_full_trainset()
# svd_model.fit(svd_trainset)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.6843  1.6765  1.6908  1.6774  1.6827  1.6823  0.0052  
MAE (testset)     1.2500  1.2450  1.2550  1.2442  1.2486  1.2485  0.0039  
Fit time          12.10   12.24   12.68   12.51   13.45   12.60   0.47    
Test time         2.03    2.13    1.96    1.93    1.94    2.00    0.07    


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x217b4ec9d10>

In [65]:
# dump(svd_model, '../AllModel/movie_svd_model.joblib')

['../AllModel/movie_svd_model.joblib']

# User-Based Collaborative Recommendation

In [75]:
# User x movie rating table: one row per user, one column per movie, 0 where no rating exists
user_movie_matrix = df_score.pivot(index='userId', columns='tmdbId', values='rating')
user_movie_matrix = user_movie_matrix.fillna(0)

# Sparse (CSR) copy of the table's values
user_movie_sparse_matrix = csr_matrix(user_movie_matrix.to_numpy())

# Brute-force cosine nearest-neighbour model over the users
knn_model = NearestNeighbors(algorithm='brute', metric='cosine')
knn_model.fit(user_movie_sparse_matrix)

In [80]:
# Function to get real-time recommendations using user-based collaborative filtering
def get_recommendations_by_user(user_id, knn, user_movie_matrix, n=10):
    """Recommend movies for ``user_id`` from the ratings of the most similar users.

    Each movie the user has not rated is scored with the similarity-weighted sum of
    the ratings given by the user's nearest neighbours.

    Parameters
    ----------
    user_id :
        Label of the user in ``user_movie_matrix.index``.
    knn :
        Fitted neighbour model exposing ``kneighbors(X, n_neighbors=...)`` and
        returning cosine distances; it must have been fitted on the rows of
        ``user_movie_matrix`` in the same order.
    user_movie_matrix : pandas.DataFrame
        Users x movies rating table in which 0 means "not rated".
    n : int, default 10
        Number of neighbours to use and maximum number of movies to return.

    Returns
    -------
    list of (tmdbId, score)
        At most ``n`` pairs, sorted by score in descending order. Only movies rated
        by at least one neighbour are returned.

    Raises
    ------
    KeyError
        If ``user_id`` is not in ``user_movie_matrix.index``.
    """
    user_index = user_movie_matrix.index.get_loc(user_id)

    # Everything is derived from the matrix that was passed in (not from a notebook
    # global), so the query always matches the data the caller handed over.
    n_users = user_movie_matrix.shape[0]
    n_neighbors = min(n + 1, n_users)  # +1 because the user is normally its own nearest neighbour
    user_ratings = user_movie_matrix.iloc[user_index].to_numpy()
    query = csr_matrix(user_ratings.reshape(1, -1))
    distances, indices = knn.kneighbors(query, n_neighbors=n_neighbors)
    distances = distances.ravel()
    indices = indices.ravel()

    # Exclude the user itself wherever it appears (with tied distances it is not
    # guaranteed to come first), then keep at most n neighbours.
    is_other_user = indices != user_index
    neighbor_indices = indices[is_other_user][:n]
    similarities = 1 - distances[is_other_user][:n]
    if neighbor_indices.size == 0:
        return []

    # Similarity-weighted sum of the neighbours' ratings, one value per movie
    neighbor_ratings = user_movie_matrix.iloc[neighbor_indices].to_numpy()
    scores = similarities @ neighbor_ratings

    # Candidates: not rated by the user, but rated by at least one neighbour
    is_candidate = (user_ratings == 0) & (neighbor_ratings != 0).any(axis=0)
    candidate_columns = np.flatnonzero(is_candidate)

    # Stable descending sort keeps the column order for equal scores
    order = np.argsort(-scores[candidate_columns], kind='stable')[:max(n, 0)]
    movie_ids = user_movie_matrix.columns.tolist()
    return [(movie_ids[col], scores[col]) for col in candidate_columns[order]]

def _min_max_scale(recommendations):
    """Turn (tmdbId, score) pairs into (tmdbId, normalized_score, score) triples.

    Scores are min-max scaled to [0, 1]. When all scores are equal (including the
    single-item case) the range is zero, so every item gets 1.0 instead of a
    division by zero.
    """
    if not recommendations:
        return []
    raw_scores = [score for _, score in recommendations]
    lowest, highest = min(raw_scores), max(raw_scores)
    span = highest - lowest
    if span == 0:
        return [(tmdbId, 1.0, score) for tmdbId, score in recommendations]
    return [(tmdbId, (score - lowest) / span, score) for tmdbId, score in recommendations]


def hybrid_recommendations(user_id, svd, knn, user_movie_matrix, n=10):
    """Combine SVD predictions with user-based neighbour recommendations.

    Parameters
    ----------
    user_id :
        Label of the user in ``user_movie_matrix.index``.
    svd :
        Model exposing ``predict(user_id, item_id)`` whose result has an ``est``
        attribute, or ``None`` to skip the SVD part.
    knn :
        Fitted neighbour model forwarded to ``get_recommendations_by_user``.
    user_movie_matrix : pandas.DataFrame
        Users x movies rating table in which 0 means "not rated".
    n : int, default 10
        Number of candidates taken from each source and maximum size of the result.

    Returns
    -------
    list
        Up to ``n`` distinct tmdbIds, ordered by normalized score and then by the
        original (un-normalized) score, both descending.
    """
    top_svd_recommendations = []
    if svd is not None:
        # Predict a rating for every movie the user has not rated yet
        user_rated_movies = user_movie_matrix.loc[user_id]
        unrated_ids = user_rated_movies.index[(user_rated_movies == 0).to_numpy()]
        svd_scores = [(tmdbId, svd.predict(user_id, tmdbId).est) for tmdbId in unrated_ids]

        # Normalize, rank and keep the n best predictions
        svd_ranked = sorted(_min_max_scale(svd_scores), key=lambda rec: rec[1], reverse=True)
        top_svd_recommendations = svd_ranked[:n]

    # Candidates from user-based collaborative filtering, normalized the same way
    user_based_recommendations = _min_max_scale(
        get_recommendations_by_user(user_id, knn, user_movie_matrix, n)
    )

    # Merge both sources; a movie proposed by both keeps its better score instead of
    # being silently overwritten by whichever source was listed last.
    best_by_movie = {}
    for tmdbId, score, original_score in top_svd_recommendations + user_based_recommendations:
        candidate = (score, original_score)
        if tmdbId not in best_by_movie or candidate > best_by_movie[tmdbId]:
            best_by_movie[tmdbId] = candidate

    # Sort by normalized score and then by original score
    ranked = sorted(best_by_movie.items(), key=lambda item: item[1], reverse=True)
    return [tmdbId for tmdbId, _ in ranked[:n]]

def get_movie_details_by_ids(movie_ids):
    """Return the display columns of ``df_movie`` for the given movie ids.

    Rows come back in the order of ``movie_ids`` (an id that occurs twice yields its
    rows twice; an id that is not in ``df_movie`` contributes no row).

    Parameters
    ----------
    movie_ids : iterable
        Values to look up in ``df_movie['id']``.

    Returns
    -------
    pandas.DataFrame
        Columns 'id', 'title', 'genres', 'vote_average', 'release_date',
        'poster_path'; empty (but with these columns) when ``movie_ids`` is empty.
    """
    detail_columns = ['id', 'title', 'genres', 'vote_average', 'release_date', 'poster_path']
    matches = [df_movie[df_movie['id'] == movie_id] for movie_id in movie_ids]
    if not matches:
        # pd.concat raises on an empty list, so hand back an empty frame instead
        return df_movie.iloc[0:0][detail_columns]
    return pd.concat(matches)[detail_columns]

In [22]:
svd_model = load('../AllModel/movie_svd_model.joblib')

In [85]:
user_id = '9'
recommended_movies_ids = hybrid_recommendations(user_id, svd_model, knn_model, user_movie_matrix)
print(f'Top 10 recommended movies using hybrid approach for user {user_id}: {recommended_movies_ids}')
get_movie_details_by_ids(recommended_movies_ids)

Top 10 recommended movies using hybrid approach for user 9: [254, 14160, 558, 411, 155, 41154, 534, 177572, 155, 254]


Unnamed: 0,id,title,genres,vote_average,release_date,poster_path
472,254,King Kong,"Adventure, Drama, Action",6.869,2005-12-12,/6a2HY6UmD7XiDD3NokgaBAXEsD2.jpg
51,14160,Up,"Animation, Comedy, Family, Adventure",7.949,2009-05-28,/vpbaStTMt8qqXaEgnOR2EE4DNJk.jpg
132,558,Spider-Man 2,"Action, Adventure, Fantasy",7.258,2004-06-25,/olxpyq9kJAZ2NU1siLshhhXEPR7.jpg
283,411,"The Chronicles of Narnia: The Lion, the Witch ...","Adventure, Family, Fantasy",7.126,2005-12-07,/iREd0rNCjYdf5Ar0vfaW32yrkm.jpg
2,155,The Dark Knight,"Drama, Action, Crime, Thriller",8.512,2008-07-16,/qJ2tW6WMUDux911r6m7haRef0WH.jpg
292,41154,Men in Black 3,"Action, Comedy, Science Fiction",6.51,2012-05-23,/90DdoEStzeObs96fsYf4GG544iN.jpg
641,534,Terminator Salvation,"Action, Science Fiction, Thriller",6.0,2009-05-20,/gw6JhlekZgtKUFlDTezq3j5JEPK.jpg
110,177572,Big Hero 6,"Adventure, Family, Animation, Action, Comedy",7.738,2014-10-24,/2mxS4wUimwlLmI1xp6QW6NSU361.jpg
2,155,The Dark Knight,"Drama, Action, Crime, Thriller",8.512,2008-07-16,/qJ2tW6WMUDux911r6m7haRef0WH.jpg
472,254,King Kong,"Adventure, Drama, Action",6.869,2005-12-12,/6a2HY6UmD7XiDD3NokgaBAXEsD2.jpg


# Integrating Real-Time Data

In [86]:
from pymongo import MongoClient

# Function to fetch real-time data from the database
def fetch_real_time_data(uri, db_name, collection_name, data_type='movie'):
    """Fetch the user ratings of one media type from MongoDB.

    Parameters
    ----------
    uri : str
        MongoDB connection string.
    db_name : str
        Name of the database to read from.
    collection_name : str
        Name of the collection holding the ratings.
    data_type : str, default 'movie'
        Value the documents' 'mediaType' field must have.

    Returns
    -------
    pandas.DataFrame
        Columns 'userId' (str), 'tmdbId' (int64) and 'rating', one row per document
        with ``point >= 1``; empty (but with these columns) when nothing matches.
    """
    # Connect to the MongoDB database
    client = MongoClient(uri)
    try:
        collection = client[db_name][collection_name]

        # Fetch the ratings of the requested media type
        cursor = collection.find(
            {'mediaType': data_type, 'point': {'$gte': 1}},
            {'_id': 0, 'userId': 1, 'mediaId': 1, 'point': 1},
        )
        records = list(cursor)
    finally:
        # Close the connection even when the query fails
        client.close()

    # Naming the columns up front keeps them present when no document matched
    df_real_time = pd.DataFrame(records, columns=['userId', 'mediaId', 'point'])
    df_real_time = df_real_time.rename(columns={'mediaId': 'tmdbId', 'point': 'rating'})

    # Convert the data types
    df_real_time['userId'] = df_real_time['userId'].astype('str')
    df_real_time['tmdbId'] = df_real_time['tmdbId'].astype('int64')

    return df_real_time

# Function to update the user-item matrix and recompute nearest neighbors
def update_user_item_matrix(df_score, df_real_time):
    """Merge stored and real-time ratings into a fresh user x movie matrix.

    When the same (userId, tmdbId) pair occurs more than once, the last occurrence
    wins, so a real-time rating replaces the stored one.

    Parameters
    ----------
    df_score : pandas.DataFrame
        Existing ratings with columns 'userId', 'tmdbId', 'rating'.
    df_real_time : pandas.DataFrame
        Newly fetched ratings with the same columns.

    Returns
    -------
    tuple
        ``(user_movie_matrix, user_movie_sparse_matrix)``: the pivoted DataFrame
        (users as rows, movies as columns, 0 where no rating exists) and a CSR
        matrix built from its values.
    """
    # Stack both sources and keep the most recent rating of every user/movie pair
    all_ratings = pd.concat([df_score, df_real_time])
    latest_ratings = all_ratings.drop_duplicates(subset=['userId', 'tmdbId'], keep='last')

    # Users become rows, movies become columns; missing ratings are filled with 0
    ratings_table = latest_ratings.pivot(index='userId', columns='tmdbId', values='rating')
    ratings_table = ratings_table.fillna(0)

    return ratings_table, csr_matrix(ratings_table.values)

def update_knn_model(user_movie_sparse_matrix):
    """Fit a new brute-force, cosine-metric NearestNeighbors model.

    Parameters
    ----------
    user_movie_sparse_matrix :
        Rating matrix to fit the model on.

    Returns
    -------
    The fitted NearestNeighbors instance.
    """
    neighbour_settings = {'metric': 'cosine', 'algorithm': 'brute'}
    fitted_model = NearestNeighbors(**neighbour_settings)
    fitted_model.fit(user_movie_sparse_matrix)
    return fitted_model

In [90]:
import os
from urllib.parse import quote_plus

# Credentials are read from the environment so they never end up in the notebook;
# both default to an empty string when the variables are not set. They are
# percent-escaped because reserved characters (e.g. '@', ':', '/') in a user name
# or password would otherwise break the connection URI.
mongouser = quote_plus(os.environ.get('MONGO_USER', ''))
mongopass = quote_plus(os.environ.get('MONGO_PASS', ''))

uri = f'mongodb+srv://{mongouser}:{mongopass}@cluster0.ls3onag.mongodb.net/next-auth-prisma?retryWrites=true&w=majority&appName=Cluster0'

# Pull the latest ratings and rebuild the user x movie matrix and the neighbour model
df_real_time = fetch_real_time_data(uri, 'next-auth-prisma', 'Media')
user_movie_matrix, user_movie_sparse_matrix = update_user_item_matrix(df_score, df_real_time)
knn = update_knn_model(user_movie_sparse_matrix)

user_id = '66ab7c9293c726da200f6fe2'
n_recommendations = 5  # also used in the message below so the reported count is correct
recommended_movies_ids = hybrid_recommendations(user_id, svd_model, knn, user_movie_matrix, n_recommendations)
print(f'Top {n_recommendations} recommended movies using hybrid approach for user {user_id}: {recommended_movies_ids}')
get_movie_details_by_ids(recommended_movies_ids)

Top 10 recommended movies using hybrid approach for user 66ab7c9293c726da200f6fe2: [254, 155, 10681, 155, 58]


Unnamed: 0,id,title,genres,vote_average,release_date,poster_path
472,254,King Kong,"Adventure, Drama, Action",6.869,2005-12-12,/6a2HY6UmD7XiDD3NokgaBAXEsD2.jpg
2,155,The Dark Knight,"Drama, Action, Crime, Thriller",8.512,2008-07-16,/qJ2tW6WMUDux911r6m7haRef0WH.jpg
69,10681,WALL·E,"Animation, Family, Science Fiction",8.078,2008-06-22,/hbhFnRzzg6ZDmm8YAmxBnQpQIPh.jpg
2,155,The Dark Knight,"Drama, Action, Crime, Thriller",8.512,2008-07-16,/qJ2tW6WMUDux911r6m7haRef0WH.jpg
109,58,Pirates of the Caribbean: Dead Man's Chest,"Adventure, Fantasy, Action",7.346,2006-07-06,/uXEqmloGyP7UXAiphJUu2v2pcuE.jpg
