# Importing Libraries

In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import warnings
warnings.filterwarnings('ignore')
from joblib import load, dump, Parallel, delayed

# Visualization
import plotly.express as px
import plotly.graph_objects as go  # for 3D plot visualization
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

from langdetect import detect
from datetime import datetime


# Data Preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

# Model Training
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import faiss
from sklearn.neighbors import NearestNeighbors

'''
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Necessary modules for collaborative filtering
from keras.layers import Input, Embedding, Dot, Flatten, Dense
from keras.models import Model
from keras.optimizers import Adam
from wordcloud import WordCloud
from collections import defaultdict
from collections import Counter
'''
# Necessary modules for content-based filtering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


# Reading Datasets

In [2]:
# Let pandas render up to 50 columns so wide frames are not truncated in cell output
pd.set_option('display.max_columns', 50)

In [8]:
# Load the tv details dataset (path is relative to the notebook's working directory)
df_tv = pd.read_csv('../AllData/TV/tv.csv')
# Report (rows, columns) and preview the first three rows
print("Shape of the Dataset:",df_tv.shape)
df_tv.head(3)

Shape of the Dataset: (168639, 29)


Unnamed: 0,id,name,number_of_seasons,number_of_episodes,original_language,vote_count,vote_average,overview,adult,backdrop_path,first_air_date,last_air_date,homepage,in_production,original_name,popularity,poster_path,type,status,tagline,genres,created_by,languages,networks,origin_country,spoken_languages,production_companies,production_countries,episode_run_time
0,1399,Game of Thrones,8,73,en,21857,8.442,Seven noble families fight for control of the ...,False,/2OMB0ynKlyIenMJWI2Dy9IWT4c.jpg,2011-04-17,2019-05-19,http://www.hbo.com/game-of-thrones,False,Game of Thrones,1083.917,/1XS1oqL89opfnbLl8WnZY1O1uJx.jpg,Scripted,Ended,Winter Is Coming,"Sci-Fi & Fantasy, Drama, Action & Adventure","David Benioff, D.B. Weiss",en,HBO,US,English,"Revolution Sun Studios, Television 360, Genera...","United Kingdom, United States of America",0
1,71446,Money Heist,3,41,es,17836,8.257,"To carry out the biggest heist in history, a m...",False,/gFZriCkpJYsApPZEF3jhxL4yLzG.jpg,2017-05-02,2021-12-03,https://www.netflix.com/title/80192098,False,La Casa de Papel,96.354,/reEMJA1uzscCbkpeRJeTT2bjqUp.jpg,Scripted,Ended,The perfect robbery.,"Crime, Drama",Álex Pina,es,"Netflix, Antena 3",ES,Español,Vancouver Media,Spain,70
2,66732,Stranger Things,4,34,en,16161,8.624,"When a young boy vanishes, a small town uncove...",False,/2MaumbgBlW1NoPo3ZJO38A6v7OS.jpg,2016-07-15,2022-07-01,https://www.netflix.com/title/80057281,True,Stranger Things,185.711,/49WJfeN0moxb9IPfGn8AIqMGskD.jpg,Scripted,Returning Series,Every ending has a beginning.,"Drama, Sci-Fi & Fantasy, Mystery","Matt Duffer, Ross Duffer",en,Netflix,US,English,"21 Laps Entertainment, Monkey Massacre Product...",United States of America,0


In [4]:
# Load the user ratings dataset (one row per user/tv rating; path is relative to the notebook)
df_score = pd.read_csv('../AllData/TV/tv_user_ratings.csv')
# Report (rows, columns) and preview the first rows
print("Shape of the dataset:",df_score.shape)
df_score.head()

Shape of the dataset: (20, 3)


Unnamed: 0,userId,tmdbId,rating
0,66333506c89e65d331bc7653,76479,8
1,66333506c89e65d331bc7653,112470,9
2,66333506c89e65d331bc7653,218145,8
3,66333506c89e65d331bc7653,37854,8
4,66ab7c9293c726da200f6fe2,94605,9


# Data Preparation

In [9]:
# Inspect column dtypes and non-null counts before dropping unused columns
df_tv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168639 entries, 0 to 168638
Data columns (total 29 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    168639 non-null  int64  
 1   name                  168634 non-null  object 
 2   number_of_seasons     168639 non-null  int64  
 3   number_of_episodes    168639 non-null  int64  
 4   original_language     168639 non-null  object 
 5   vote_count            168639 non-null  int64  
 6   vote_average          168639 non-null  float64
 7   overview              93333 non-null   object 
 8   adult                 168639 non-null  bool   
 9   backdrop_path         77780 non-null   object 
 10  first_air_date        136903 non-null  object 
 11  last_air_date         138735 non-null  object 
 12  homepage              50998 non-null   object 
 13  in_production         168639 non-null  bool   
 14  original_name         168634 non-null  object 
 15  

In [10]:
# Drop 19 metadata columns; the next info() call lists the columns that remain
df_tv = df_tv.drop(
    columns=[
        'number_of_seasons',
        'number_of_episodes',
        'original_language',
        'overview',
        'adult',
        'backdrop_path',
        'last_air_date',
        'homepage',
        'original_name',
        'popularity',
        'status',
        'tagline',
        'created_by',
        'languages',
        'networks',
        'origin_country',
        'spoken_languages',
        'production_countries',
        'episode_run_time',
    ]
)

In [11]:
# Confirm which columns remain after the drop
df_tv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168639 entries, 0 to 168638
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    168639 non-null  int64  
 1   name                  168634 non-null  object 
 2   vote_count            168639 non-null  int64  
 3   vote_average          168639 non-null  float64
 4   first_air_date        136903 non-null  object 
 5   in_production         168639 non-null  bool   
 6   poster_path           108737 non-null  object 
 7   type                  168639 non-null  object 
 8   genres                99713 non-null   object 
 9   production_companies  59342 non-null   object 
dtypes: bool(1), float64(1), int64(2), object(6)
memory usage: 11.7+ MB


In [8]:
# Inspect the distribution of the score column ('vote_average') before filtering it
df_tv['vote_average'].value_counts()

vote_average
0.000     111135
10.000      5433
8.000       5310
7.000       5134
6.000       4487
           ...  
6.511          1
6.915          1
7.170          1
6.777          1
8.638          1
Name: count, Length: 2603, dtype: int64

In [9]:
# Filter out tvs with 'vote_average' of 0 and renumber the rows so that index labels
# equal row positions (the TF-IDF / Faiss matrices built below are addressed by position)
df_tv = df_tv[df_tv['vote_average'] != 0].reset_index(drop=True)

In [10]:
# Drop rows without a userId *before* casting: astype('str') would otherwise turn
# missing values into the literal string 'nan', which a later notnull() cannot detect
df_score = df_score[df_score['userId'].notnull()].copy()
df_score['userId'] = df_score['userId'].astype('str')
df_score.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   userId  20 non-null     object
 1   tmdbId  20 non-null     int64 
 2   rating  20 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 612.0+ bytes


In [11]:
# Keep only the rating rows that have a userId
df_score = df_score[df_score['userId'].notnull()]

In [12]:
# Count remaining missing values per column
df_score.isnull().sum()

userId    0
tmdbId    0
rating    0
dtype: int64

# Model Training (Content-Based V1)

In [12]:
# Create a TF-IDF vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Vectorize the genre strings. Missing genres are replaced with an empty document first;
# casting NaN straight to text would yield the token 'nan' and make every tv without
# genres look like a perfect match for every other tv without genres.
tfidf_matrix = tfidf.fit_transform(df_tv['genres'].fillna('').values.astype('U'))

# Ensure CSR format so that single rows can be sliced efficiently
tfidf_matrix_sparse = csr_matrix(tfidf_matrix)

In [14]:
# Function to compute cosine similarity for a single tv
def compute_cosine_similarity_for_tv(matrix, idx):
    """Return the cosine similarity between row ``idx`` of ``matrix`` and every row.

    Args:
        matrix: Feature matrix passed to ``cosine_similarity`` (the notebook
            passes the sparse TF-IDF genre matrix).
        idx: Row selector of the query tv; ``matrix[idx]`` is used as the query.

    Returns:
        Flattened array with one similarity score per row of ``matrix``.
    """
    return cosine_similarity(matrix[idx], matrix).flatten()

# Content-Based Recommendation V1

In [17]:
# Function to get recommendations based on cosine similarity, genre, and ratings based on score
def get_recommendations_by_title_v1(title, matrix, df, n=10):
    """Recommend the ``n`` tvs most similar to the tv named ``title``.

    Candidates are ranked by cosine similarity to the query tv (computed by
    ``compute_cosine_similarity_for_tv``), ties broken by ``vote_average``,
    both in descending order.

    Args:
        title: Exact value to look up in ``df['name']``; the first match is used.
        matrix: Feature matrix whose i-th row describes the i-th row (by
            position, not by index label) of ``df``.
        df: DataFrame with at least the columns ``id``, ``name``, ``genres``,
            ``vote_average``, ``first_air_date`` and ``poster_path``.
        n: Number of recommendations to return.

    Returns:
        DataFrame with the recommended rows and the six columns listed above.

    Raises:
        IndexError: If no row of ``df`` is named ``title``.
    """
    # Work with row positions throughout: ``matrix`` is positional, and the
    # index labels of ``df`` stop matching positions once ``df`` has been filtered.
    matches = np.flatnonzero((df['name'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f'No tv named {title!r} found in the dataframe')
    idx = int(matches[0])

    # Similarity of the query tv to every tv
    sim_scores = compute_cosine_similarity_for_tv(matrix, idx)

    # Pull the scores out once instead of one ``df.iloc`` lookup per row
    votes = df['vote_average'].to_numpy()

    # Drop the query tv itself and tvs with an unknown (-1) score
    candidates = [i for i in range(len(sim_scores)) if i != idx and votes[i] != -1]

    # Rank by similarity, then by score (stable sort keeps the original order on full ties)
    candidates.sort(key=lambda i: (sim_scores[i], votes[i]), reverse=True)

    recommended_indices = candidates[:n]
    return df.iloc[recommended_indices][['id', 'name', 'genres', 'vote_average', 'first_air_date', 'poster_path']]

In [18]:
# Example: genre-based recommendations (V1, exact cosine similarity) for one title
tv_title = 'Stranger Things'
print(f'Recommendations for "{tv_title}":')
get_recommendations_by_title_v1(tv_title, tfidf_matrix_sparse, df_tv)

Recommendations for "Stranger Things":


Unnamed: 0,name,genres,vote_average
42404,In Absentia,"Drama, Sci-Fi & Fantasy, Mystery",10.0
44337,La Flor,"Drama, Mystery, Sci-Fi & Fantasy",10.0
49984,Abrigados,"Drama, Mystery, Sci-Fi & Fantasy",10.0
54343,Memory Eclipse,"Sci-Fi & Fantasy, Drama, Mystery",10.0
15165,Wu Xin: The Monster Killer,"Mystery, Drama, Sci-Fi & Fantasy",9.5
20151,La Pluie,"Drama, Sci-Fi & Fantasy, Mystery",9.2
5641,Someday or One Day,"Mystery, Sci-Fi & Fantasy, Drama",9.029
29298,Dedalo,"Mystery, Sci-Fi & Fantasy, Drama",9.0
19564,Resurrection,"Mystery, Drama, Sci-Fi & Fantasy",8.8
12048,13 Klinicheskaya,"Drama, Sci-Fi & Fantasy, Mystery",8.778


# Model Training (Content-Based V2)

In [13]:
# Compute the TF-IDF matrix. Missing genres become empty documents; casting NaN
# straight to text would yield the token 'nan' and cluster all tvs without genres.
tfidf_matrix_v2 = tfidf.fit_transform(df_tv['genres'].fillna('').values.astype('U')).astype(np.float32)

# Convert the sparse matrix to a dense matrix
tfidf_matrix_dense = tfidf_matrix_v2.toarray()

# Normalize each row to unit length so that inner product == cosine similarity.
# All-zero rows (no usable genre token) keep norm 1 to avoid 0/0 -> NaN vectors.
row_norms = np.linalg.norm(tfidf_matrix_dense, axis=1, keepdims=True)
row_norms[row_norms == 0] = 1
tfidf_matrix_dense = tfidf_matrix_dense / row_norms

# Build the Faiss inner-product index over the normalized rows
index = faiss.IndexFlatIP(tfidf_matrix_dense.shape[1])
index.add(tfidf_matrix_dense)

# Content-Based Recommendation V2

In [14]:
# Function to get recommendations based on cosine similarity, genre, and ratings based on score
def get_recommendations_by_title_v2(title, index, matrix, df, n=10):
    """Recommend up to ``n`` tvs similar to ``title`` using a nearest-neighbour index.

    Args:
        title: Exact value to look up in ``df['name']``; the first match is used.
        index: Object exposing ``search(query, k)`` that returns
            ``(distances, indices)`` arrays of shape ``(1, k)`` (the notebook
            passes a Faiss inner-product index built from ``matrix``).
        matrix: Dense feature matrix whose i-th row describes the i-th row (by
            position, not by index label) of ``df``.
        df: DataFrame with at least the columns ``id``, ``name``, ``genres``,
            ``vote_average``, ``first_air_date`` and ``poster_path``.
        n: Maximum number of recommendations to return.

    Returns:
        DataFrame with the recommended rows and the six columns listed above,
        ordered by similarity and then by ``vote_average`` (both descending).

    Raises:
        IndexError: If no row of ``df`` is named ``title``.
    """
    # Work with row positions throughout: ``matrix`` and ``index`` are positional,
    # and the index labels of ``df`` stop matching positions once ``df`` is filtered.
    matches = np.flatnonzero((df['name'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f'No tv named {title!r} found in the dataframe')
    idx = int(matches[0])

    # Ask for n + 1 neighbours because the query tv itself is normally among them
    tv_vector = matrix[idx].reshape(1, -1).astype(np.float32)
    distances, indices = index.search(tv_vector, n + 1)

    votes = df['vote_average'].to_numpy()

    # Discard padding ids (-1 is returned when fewer than k neighbours exist),
    # the query tv itself, and tvs with an unknown (-1) score
    candidates = [
        (int(i), float(d))
        for i, d in zip(indices[0], distances[0])
        if i >= 0 and i != idx and votes[i] != -1
    ]

    # Rank by similarity, then by score
    candidates.sort(key=lambda c: (c[1], votes[c[0]]), reverse=True)

    recommended_indices = [i for i, _ in candidates[:n]]
    return df.iloc[recommended_indices][['id', 'name', 'genres', 'vote_average', 'first_air_date', 'poster_path']]

In [16]:
# Example: genre-based recommendations (V2, Faiss index search) for one title
tv_title = 'Stranger Things'
print(f'Recommendations for "{tv_title}":')
get_recommendations_by_title_v2(tv_title, index, tfidf_matrix_dense, df_tv)

Recommendations for "Stranger Things":


Unnamed: 0,name,genres,vote_average
94,The Originals,"Sci-Fi & Fantasy, Drama, Mystery",8.6
31,Dark,"Drama, Sci-Fi & Fantasy, Mystery",8.432
52,Black Mirror,"Sci-Fi & Fantasy, Drama, Mystery",8.306
28,Supernatural,"Drama, Mystery, Sci-Fi & Fantasy",8.304
9,WandaVision,"Sci-Fi & Fantasy, Mystery, Drama",8.3
101,Grimm,"Drama, Mystery, Sci-Fi & Fantasy",8.283
87,Chilling Adventures of Sabrina,"Mystery, Sci-Fi & Fantasy, Drama",8.242
38,American Horror Story,"Drama, Mystery, Sci-Fi & Fantasy",8.145
151,Fringe,"Sci-Fi & Fantasy, Drama, Mystery",8.119
83,Under the Dome,"Drama, Mystery, Sci-Fi & Fantasy",7.214


# Model Training (SVD : User-Based Collaborative)

In [13]:
# Wrap the (userId, tmdbId, rating) triples as a Surprise dataset; the rating scale
# handed to the Reader spans the smallest and largest rating observed in df_score
lowest_rating = df_score['rating'].min()
highest_rating = df_score['rating'].max()
reader = Reader(rating_scale=(lowest_rating, highest_rating))
data = Dataset.load_from_df(df_score[['userId', 'tmdbId', 'rating']], reader)

In [14]:
# Seed numpy's global RNG so that the cross-validation folds and the SVD
# initialisation (both drawn from it when no random_state is given) are reproducible
np.random.seed(42)

# Define the SVD model
svd_model = SVD()

# Evaluate the model using cross-validation
cross_validate(svd_model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Train the SVD model on the entire dataset
svd_trainset = data.build_full_trainset()
svd_model.fit(svd_trainset)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7954  1.1303  2.3436  1.1511  1.3075  1.3456  0.5262  
MAE (testset)     0.6821  0.9418  1.5283  0.9556  1.1085  1.0432  0.2785  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a8994857d0>

In [15]:
# Persist the fitted SVD model with joblib (path is relative to the notebook)
dump(svd_model, '../AllModel/tv_svd_model.joblib')

['../AllModel/tv_svd_model.joblib']

# User-Based Collaborative Recommendation

In [16]:
# Keep one rating per (user, tv) pair: pivot() raises on duplicate pairs, and
# update_user_item_matrix() applies the same rule to the combined data
ratings_unique = df_score.drop_duplicates(subset=['userId', 'tmdbId'], keep='last')

# Create a pivot table with users as rows and tvs as columns (0 = not rated)
user_tv_matrix = ratings_unique.pivot(index='userId', columns='tmdbId', values='rating').fillna(0)

# Convert to sparse matrix
user_tv_sparse_matrix = csr_matrix(user_tv_matrix.values)

# Fit the NearestNeighbors model
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(user_tv_sparse_matrix)

In [29]:
# Function to get real-time recommendations using user-based collaborative filtering
def get_recommendations_by_user(user_id, knn, user_tv_matrix, n=10):
    """Score unseen tvs for ``user_id`` from the ratings of the most similar users.

    Each tv the user has not rated (value 0 in the user's row) receives the sum,
    over the neighbouring users, of ``(1 - cosine distance) * neighbour rating``.

    Args:
        user_id: Label of the user in ``user_tv_matrix.index``.
        knn: Fitted neighbour model exposing ``kneighbors(X, n_neighbors=...)``.
            Assumed to have been fitted on the rows of ``user_tv_matrix`` in the
            same order, so that returned indices are row positions of that matrix.
        user_tv_matrix: DataFrame of ratings, users as rows and tmdbIds as
            columns, with 0 meaning "not rated".
        n: Number of neighbours to consult and maximum number of tvs to return.

    Returns:
        List of ``(tmdbId, weighted_score)`` tuples, best first, at most ``n`` long.

    Raises:
        KeyError: If ``user_id`` is not in ``user_tv_matrix.index``.
    """
    user_index = user_tv_matrix.index.get_loc(user_id)

    # Build the query from the matrix that was passed in rather than from a notebook
    # global, so the function stays correct when it is called with an updated matrix.
    user_vector = csr_matrix(user_tv_matrix.iloc[[user_index]].to_numpy())
    n_neighbors = min(n + 1, user_tv_matrix.shape[0])  # cannot ask for more neighbours than users
    distances, indices = knn.kneighbors(user_vector, n_neighbors=n_neighbors)

    # Exclude the user by index instead of assuming it is the first hit
    # (another user with an identical rating vector can tie at distance 0)
    neighbours = [
        (neighbour_index, distance)
        for neighbour_index, distance in zip(indices.flatten(), distances.flatten())
        if neighbour_index != user_index
    ][:n]

    # Ratings of the target user; 0 marks a tv the user has not rated
    user_rated_tvs = user_tv_matrix.iloc[user_index]

    # tmdbId -> similarity-weighted sum of neighbour ratings
    weighted_ratings = {}
    for neighbour_index, distance in neighbours:
        similarity_score = 1 - distance
        neighbour_ratings = user_tv_matrix.iloc[neighbour_index]

        for tmdbId, rating in neighbour_ratings.items():
            if user_rated_tvs[tmdbId] == 0:  # Only consider tvs not rated by the user
                weighted_ratings[tmdbId] = weighted_ratings.get(tmdbId, 0) + similarity_score * rating

    # Highest weighted score first, truncated to the top n
    sorted_tvs = sorted(weighted_ratings.items(), key=lambda x: x[1], reverse=True)
    return sorted_tvs[:n]

def _min_max_normalize(recommendations):
    """Turn ``(tmdbId, score)`` pairs into ``(tmdbId, normalized, score)`` triples.

    ``normalized`` is the min-max rescaled score in [0, 1]. When every score is
    equal (including the single-item case) the span is 0; all items then get
    1.0 instead of triggering a division by zero.
    """
    if not recommendations:
        return []
    scores = [score for _, score in recommendations]
    lowest, highest = min(scores), max(scores)
    span = highest - lowest
    if span == 0:
        return [(tmdbId, 1.0, score) for tmdbId, score in recommendations]
    return [(tmdbId, (score - lowest) / span, score) for tmdbId, score in recommendations]


def hybrid_recommendations(user_id, svd, knn, user_tv_matrix, n=10):
    """Combine SVD predictions with user-based neighbour scores into one ranking.

    Args:
        user_id: Label of the user in ``user_tv_matrix.index``.
        svd: Fitted model exposing ``predict(user_id, tmdbId).est``, or ``None``
            to use the user-based scores only.
        knn: Fitted neighbour model forwarded to ``get_recommendations_by_user``.
        user_tv_matrix: DataFrame of ratings, users as rows and tmdbIds as
            columns, with 0 meaning "not rated".
        n: Maximum number of tmdbIds to return.

    Returns:
        List of at most ``n`` tmdbIds, ordered by normalized score and then by
        raw score, both descending.

    Raises:
        KeyError: If ``user_id`` is not in ``user_tv_matrix.index``.
    """
    top_svd_recommendations = []
    if svd is not None:
        # Predict a rating for every tv the user has not rated yet
        user_rated_tvs = user_tv_matrix.loc[user_id]
        svd_scores = [
            (tmdbId, svd.predict(user_id, tmdbId).est)
            for tmdbId, rating in user_rated_tvs.items()
            if rating == 0
        ]

        # Normalize, rank and keep the top n SVD recommendations
        svd_ranked = sorted(_min_max_normalize(svd_scores), key=lambda x: x[1], reverse=True)
        top_svd_recommendations = svd_ranked[:n]

    # Scores from the ratings of similar users, normalized to the same [0, 1] range
    user_based_recommendations = _min_max_normalize(
        get_recommendations_by_user(user_id, knn, user_tv_matrix, n)
    )

    # Merge both lists; when a tmdbId appears in both, the user-based entry
    # (inserted last) replaces the SVD entry
    combined = {}
    for tmdbId, score, original_score in top_svd_recommendations + user_based_recommendations:
        combined[tmdbId] = (score, original_score)

    # Sort by normalized score and then by raw score
    ranked = sorted(combined.items(), key=lambda x: (x[1][0], x[1][1]), reverse=True)

    return [tmdbId for tmdbId, _ in ranked[:n]]

def get_tv_details_by_ids(tv_ids, df=None):
    """Look up display details for a list of tv ids, preserving the order of ``tv_ids``.

    Args:
        tv_ids: Iterable of values to match against the ``id`` column.
        df: Catalogue to search; defaults to the notebook-level ``df_tv``.

    Returns:
        DataFrame with the columns ``id``, ``name``, ``genres``,
        ``vote_average``, ``first_air_date`` and ``poster_path``; one block of
        rows per matching id, in the order given. Empty (but with those
        columns) when ``tv_ids`` is empty or nothing matches.
    """
    if df is None:
        df = df_tv
    columns = ['id', 'name', 'genres', 'vote_average', 'first_air_date', 'poster_path']

    # One lookup per id keeps the caller's ranking order; ids without a match are skipped
    frames = [df[df['id'] == tv_id] for tv_id in tv_ids]
    frames = [frame for frame in frames if not frame.empty]

    # pd.concat raises on an empty list, so return an empty frame explicitly
    if not frames:
        return df.iloc[0:0][columns]
    return pd.concat(frames)[columns]

In [135]:
# Reload the persisted SVD model from disk (joblib load; only use on trusted files)
svd_model = load('../AllModel/tv_svd_model.joblib')

In [33]:
# Example: hybrid (SVD + user-based) recommendations for one user, using the
# ratings loaded from the CSV file
user_id = '66ab7c9293c726da200f6fe2'
recommended_tvs_ids = hybrid_recommendations(user_id, svd_model, knn_model, user_tv_matrix)
print(f'Top 10 recommended tvs using hybrid approach for user {user_id}: {recommended_tvs_ids}')
# Resolve the recommended ids to displayable tv details
get_tv_details_by_ids(recommended_tvs_ids)

Top 10 recommended tvs using hybrid approach for user 66ab7c9293c726da200f6fe2: [112470, 37854, 218145]


Unnamed: 0,id,name,genres,vote_average,first_air_date,poster_path
6520,112470,Here it all begins,Soap,6.911,2020-11-02,/60cqjI590JKXCAABqCStVmSBGET.jpg
60,37854,One Piece,"Action & Adventure, Comedy, Animation",8.725,1999-10-20,/fcXdJlbSdUEeMSJFsXKsznGwwok.jpg
16446,218145,Mama na prenájom,"Family, Comedy",6.3,2023-01-09,/fH7PP2Rkdlo414IHvZABBHhtoqd.jpg


# Integrating Real-Time Data

In [34]:
from pymongo import MongoClient

# Function to fetch real-time data from the database
def fetch_real_time_data(uri, db_name, collection_name, data_type='serie'):
    """Fetch the current user ratings for one media type from MongoDB.

    Args:
        uri: MongoDB connection URI.
        db_name: Name of the database to read from.
        collection_name: Name of the collection holding the rating documents.
        data_type: Value the documents' ``mediaType`` field must equal.

    Returns:
        DataFrame with the columns ``userId`` (str), ``tmdbId`` (int64) and
        ``rating``, built from the documents' ``userId``, ``mediaId`` and
        ``point`` fields. Only documents with ``point >= 1`` are returned. The
        frame is empty, but has these columns, when nothing matches.
    """
    client = MongoClient(uri)
    try:
        collection = client[db_name][collection_name]

        # Fetch the latest user ratings for the requested media type
        cursor = collection.find(
            {'mediaType': data_type, 'point': {'$gte': 1}},
            {'_id': 0, 'userId': 1, 'mediaId': 1, 'point': 1},
        )
        records = list(cursor)
    finally:
        # Release the connection even when the query fails
        client.close()

    # Naming the columns up front keeps them present when no document matched
    df_real_time = pd.DataFrame(records, columns=['userId', 'mediaId', 'point'])
    df_real_time = df_real_time.rename(columns={'mediaId': 'tmdbId', 'point': 'rating'})

    # Convert the data types
    df_real_time['userId'] = df_real_time['userId'].astype('str')
    df_real_time['tmdbId'] = df_real_time['tmdbId'].astype('int64')

    return df_real_time

# Function to update the user-item matrix and recompute nearest neighbors
def update_user_item_matrix(df_score, df_real_time):
    """Merge stored and real-time ratings into a user x tv rating matrix.

    When both inputs contain the same (userId, tmdbId) pair, the row that
    comes last (real-time data is appended after the stored data) is kept.

    Args:
        df_score: Ratings with the columns ``userId``, ``tmdbId`` and ``rating``.
        df_real_time: Newer ratings with the same three columns.

    Returns:
        Tuple ``(user_tv_matrix, user_tv_sparse_matrix)``: the pivoted
        DataFrame (users as rows, tmdbIds as columns, 0 where there is no
        rating) and the same values as a CSR sparse matrix.
    """
    stacked = pd.concat([df_score, df_real_time])
    latest = stacked.drop_duplicates(subset=['userId', 'tmdbId'], keep='last')

    # Users as rows, tvs as columns; pairs without a rating become 0
    ratings_wide = latest.pivot(index='userId', columns='tmdbId', values='rating')
    dense_matrix = ratings_wide.fillna(0)

    return dense_matrix, csr_matrix(dense_matrix.values)

def update_knn_model(user_tv_sparse_matrix):
    """Fit and return a brute-force, cosine-distance NearestNeighbors model.

    Args:
        user_tv_sparse_matrix: User x tv rating matrix to fit on.

    Returns:
        The fitted ``NearestNeighbors`` instance.
    """
    # fit() returns the estimator itself, so the fitted model can be returned directly
    return NearestNeighbors(metric='cosine', algorithm='brute').fit(user_tv_sparse_matrix)

In [35]:
import os
from urllib.parse import quote_plus

# Read the credentials from the environment instead of hardcoding them in the notebook
# (variable names MONGO_USER / MONGO_PASS are a convention introduced here), and
# percent-escape them so reserved characters such as '@' or ':' cannot break the URI
mongouser = quote_plus(os.environ.get('MONGO_USER', ''))
mongopass = quote_plus(os.environ.get('MONGO_PASS', ''))

uri = f'mongodb+srv://{mongouser}:{mongopass}@cluster0.ls3onag.mongodb.net/next-auth-prisma?retryWrites=true&w=majority&appName=Cluster0'

# Pull the live ratings, merge them with the stored ones and refit the neighbour model
df_real_time = fetch_real_time_data(uri, 'next-auth-prisma', 'Media')
user_tv_matrix, user_tv_sparse_matrix = update_user_item_matrix(df_score, df_real_time)
knn = update_knn_model(user_tv_sparse_matrix)

# Recommend for one user from the refreshed matrix and model
user_id = '66ab7c9293c726da200f6fe2'
recommended_tvs_ids = hybrid_recommendations(user_id, svd_model, knn, user_tv_matrix)
print(f'Top 10 recommended tvs using hybrid approach for user {user_id}: {recommended_tvs_ids}')
get_tv_details_by_ids(recommended_tvs_ids)

Top 10 recommended tvs using hybrid approach for user 66ab7c9293c726da200f6fe2: [112470, 37854, 218145]


Unnamed: 0,id,name,genres,vote_average,first_air_date,poster_path
6520,112470,Here it all begins,Soap,6.911,2020-11-02,/60cqjI590JKXCAABqCStVmSBGET.jpg
60,37854,One Piece,"Action & Adventure, Comedy, Animation",8.725,1999-10-20,/fcXdJlbSdUEeMSJFsXKsznGwwok.jpg
16446,218145,Mama na prenájom,"Family, Comedy",6.3,2023-01-09,/fH7PP2Rkdlo414IHvZABBHhtoqd.jpg
