**Project Goal:** Develop a hybrid recommender system that improves recommendation accuracy and effectiveness by mitigating the impact of homonyms and the cold-start problem.

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances_argmin_min

In [2]:
# Load the four source tables from CSV files in the working directory.
# (A second read of movies.csv into `data` was removed: that value was never
# used and `data` is rebuilt from the merged tables further down.)
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
# Inspect the column names of the links table.
links.columns

Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')

In [4]:
# Inspect the column names of the movies table.
movies.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [5]:
# Inspect the column names of the ratings table.
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [6]:
# Inspect the column names of the tags table.
tags.columns

Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object')

In [7]:
# Combine the four tables into a single frame, joining on movieId each time.
# The default join is inner, so only movies present in every table survive.
# NOTE(review): tags is joined on movieId alone, so every rating of a movie is
# paired with every tag of that movie (hence userId_x / userId_y downstream) —
# confirm this row multiplication is intended.
df1 = ratings.merge(movies, on='movieId')
df2 = df1.merge(tags, on='movieId')
data = df2.merge(links, on='movieId')

In [8]:
# Preview the ratings joined with each movie's title and genres.
df1.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [9]:
# Preview the ratings+movies frame after joining tags. The join key is movieId
# only, so a rating row is repeated once per tag on that movie; the _x suffix
# marks columns from the ratings side and _y marks columns from tags.
df2.head()

Unnamed: 0,userId_x,movieId,rating,timestamp_x,title,genres,userId_y,tag,timestamp_y
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764
1,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825
2,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013
3,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764
4,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825


In [10]:
# Preview the fully merged frame (ratings, movies, tags and links).
data.head()

Unnamed: 0,userId_x,movieId,rating,timestamp_x,title,genres,userId_y,tag,timestamp_y,imdbId,tmdbId
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764,114709,862.0
1,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825,114709,862.0
2,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013,114709,862.0
3,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764,114709,862.0
4,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825,114709,862.0


In [11]:
# Summarize column dtypes, non-null counts and memory use of the merged frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233213 entries, 0 to 233212
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   userId_x     233213 non-null  int64  
 1   movieId      233213 non-null  int64  
 2   rating       233213 non-null  float64
 3   timestamp_x  233213 non-null  int64  
 4   title        233213 non-null  object 
 5   genres       233213 non-null  object 
 6   userId_y     233213 non-null  int64  
 7   tag          233213 non-null  object 
 8   timestamp_y  233213 non-null  int64  
 9   imdbId       233213 non-null  int64  
 10  tmdbId       233213 non-null  float64
dtypes: float64(2), int64(6), object(3)
memory usage: 19.6+ MB


# Method 1
1. Load dataset into DataFrame.
2. Combine metadata into a single text feature.
3. Vectorize metadata using TF-IDF.
4. Add temporal features to the feature set.
5. Cluster items using DBSCAN.
6. For new item query:
   a. Vectorize the query.
   b. Add temporal feature.
   c. Predict cluster.
   d. Recommend items from the same cluster.
7. For new user profile:
   a. Filter items based on user preferences.
   b. Recommend items from the largest cluster in the filtered dataset.
8. Output clustered items and recommendations.

In [12]:
# Vectorize metadata using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN

# Combine metadata into a single text feature:
# "<title> <genres> <tag>", space-separated, one string per row.
text_columns = ['title', 'genres', 'tag']
data['metadata'] = data[text_columns[0]].str.cat(
    [data[column] for column in text_columns[1:]],
    sep=' ',
)

In [13]:
# Check the newly added `metadata` text column.
data.head()

Unnamed: 0,userId_x,movieId,rating,timestamp_x,title,genres,userId_y,tag,timestamp_y,imdbId,tmdbId,metadata
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...
1,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...
2,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...
3,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...
4,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...


In [14]:
# Remove timestamp_y (the timestamp column that came from the tags table).
# Rebinding with errors='ignore' keeps this cell safe to re-run: the in-place
# drop raised KeyError on a second execution because the column was already gone.
data = data.drop(columns='timestamp_y', errors='ignore')

In [15]:
# Add temporal features to the feature set.
# timestamp_x is parsed as epoch seconds once; the parsed series is stored back
# and its calendar parts are exposed as separate columns.
rating_time = pd.to_datetime(data['timestamp_x'], unit='s')
data['timestamp_x'] = rating_time
for part in ('year', 'month', 'day'):
    data[part] = getattr(rating_time.dt, part)

In [16]:
# Confirm the year / month / day columns derived from timestamp_x.
data.head()

Unnamed: 0,userId_x,movieId,rating,timestamp_x,title,genres,userId_y,tag,imdbId,tmdbId,metadata,year,month,day
0,1,1,4.0,2000-07-30 18:45:03,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...,2000,7,30
1,1,1,4.0,2000-07-30 18:45:03,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...,2000,7,30
2,1,1,4.0,2000-07-30 18:45:03,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...,2000,7,30
3,5,1,4.0,1996-11-08 06:36:02,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...,1996,11,8
4,5,1,4.0,1996-11-08 06:36:02,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...,1996,11,8


In [17]:
# Dataset de-duplication: keep the first occurrence of each fully identical row.
# The original index labels are retained (no reset).
data = data.drop_duplicates(keep='first')

In [18]:
# Preview the frame after dropping duplicate rows.
data.head()

Unnamed: 0,userId_x,movieId,rating,timestamp_x,title,genres,userId_y,tag,imdbId,tmdbId,metadata,year,month,day
0,1,1,4.0,2000-07-30 18:45:03,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...,2000,7,30
1,1,1,4.0,2000-07-30 18:45:03,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...,2000,7,30
2,1,1,4.0,2000-07-30 18:45:03,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...,2000,7,30
3,5,1,4.0,1996-11-08 06:36:02,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...,1996,11,8
4,5,1,4.0,1996-11-08 06:36:02,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...,1996,11,8


In [19]:
# Cluster items using DBSCAN
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['metadata'])  # TF-IDF matrix, one row per row of `data`

# The estimator used to be constructed here but never fitted, so no clustering
# actually happened. `data` repeats the same metadata string on many rows (one
# per rating/tag pairing), so fit on the distinct strings and pass how often
# each occurs as sample_weight instead of clustering every duplicate row.
metadata_codes, unique_metadata = pd.factorize(data['metadata'])
X_unique = vectorizer.transform(unique_metadata)
metadata_counts = np.bincount(metadata_codes, minlength=len(unique_metadata))

dbscan = DBSCAN(eps=0.5, min_samples=5)
unique_labels = dbscan.fit_predict(X_unique, sample_weight=metadata_counts)

# Map the label of each distinct string back onto every row; -1 marks noise.
data['cluster'] = unique_labels[metadata_codes]

In [None]:
# For new item query:
# a. Vectorize the query.
# b. Add temporal feature.
# c. Predict cluster.
# d. Recommend items from the same cluster

query = 'Toy Story'
query_vector = vectorizer.transform([query])

# DBSCAN has no predict(): the previous dbscan.fit_predict(query_vector)
# re-fitted the model on the single query row, which can only be labelled
# noise (-1) because one sample never reaches min_samples=5.
# Fit on the catalogue instead. Rows of `data` repeat the same metadata string
# many times, so cluster the distinct strings and pass the repeat counts as
# sample_weight rather than clustering every duplicate row.
metadata_codes, unique_metadata = pd.factorize(data['metadata'])
catalogue_vectors = vectorizer.transform(unique_metadata)
dbscan.fit(catalogue_vectors, sample_weight=np.bincount(metadata_codes))

# Assign the query to the cluster of its nearest core sample when that core
# sample lies within eps; otherwise treat the query as noise (-1).
query_cluster = np.array([-1])  # 1-element array, like fit_predict on one row
if dbscan.components_.shape[0] > 0:
    nearest_core, core_distance = pairwise_distances_argmin_min(
        query_vector, dbscan.components_
    )
    if core_distance[0] <= dbscan.eps:
        core_row = dbscan.core_sample_indices_[nearest_core[0]]
        query_cluster = dbscan.labels_[[core_row]]