### Step 1: Clean and Optimize Data Handling

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [2]:
# Load datasets
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
# Combine datasets
data = ratings.merge(movies, on='movieId', how='inner')\
              .merge(tags, on='movieId', how='left')\
              .merge(links, on='movieId', how='left')

In [4]:
# Clean data
data.dropna(subset=['title', 'genres'], inplace=True)
data.drop_duplicates(inplace=True)

In [5]:
# Combine metadata into a single text feature
data['metadata'] = data['title'] + ' ' + \
    data['genres'] + ' ' + data['tag'].fillna('')

In [6]:
# combine and clean the timestamp to covert single colomn timestamp_x and timestamp_y to timestamp
data['timestamp'] = data.apply(lambda row: pd.to_datetime(row['timestamp_x']) if row['timestamp_x'] != 0 else pd.to_datetime(row['timestamp_y']), axis=1)
data.drop(['timestamp_x', 'timestamp_y'], axis=1, inplace=True)
data

Unnamed: 0,userId_x,movieId,rating,title,genres,userId_y,tag,imdbId,tmdbId,metadata,timestamp
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...,1970-01-01 00:00:00.964982703
1,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...,1970-01-01 00:00:00.964982703
2,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...,1970-01-01 00:00:00.964982703
3,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...,1970-01-01 00:00:00.847434962
4,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,114709,862.0,Toy Story (1995) Adventure|Animation|Children|...,1970-01-01 00:00:00.847434962
...,...,...,...,...,...,...,...,...,...,...,...
285757,610,160341,2.5,Bloodmoon (1997),Action|Thriller,,,118745,30948.0,Bloodmoon (1997) Action|Thriller,1970-01-01 00:00:01.479545749
285758,610,160527,4.5,Sympathy for the Underdog (1971),Action|Crime|Drama,,,66806,90351.0,Sympathy for the Underdog (1971) Action|Crime|...,1970-01-01 00:00:01.479544998
285759,610,160836,3.0,Hazard (2005),Action|Drama|Thriller,,,798722,70193.0,Hazard (2005) Action|Drama|Thriller,1970-01-01 00:00:01.493844794
285760,610,163937,3.5,Blair Witch (2016),Horror|Thriller,,,1540011,351211.0,Blair Witch (2016) Horror|Thriller,1970-01-01 00:00:01.493848789


In [7]:
# Convert timestamp to datetime and extract temporal features
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')
data['year'] = data['timestamp'].dt.year
data['month'] = data['timestamp'].dt.month
data['day'] = data['timestamp'].dt.day

In [8]:
# Drop irrelevant columns
data.drop(['userId_y', 'userId_x','tmdbId',
          'timestamp'], axis=1, inplace=True)

In [9]:
# save the cleaned data
data.to_csv('data.csv', index=False)

In [10]:
# Display processed data
print(data.head())

   movieId  rating             title  \
0        1     4.0  Toy Story (1995)   
1        1     4.0  Toy Story (1995)   
2        1     4.0  Toy Story (1995)   
3        1     4.0  Toy Story (1995)   
4        1     4.0  Toy Story (1995)   

                                        genres    tag  imdbId  \
0  Adventure|Animation|Children|Comedy|Fantasy  pixar  114709   
1  Adventure|Animation|Children|Comedy|Fantasy  pixar  114709   
2  Adventure|Animation|Children|Comedy|Fantasy    fun  114709   
3  Adventure|Animation|Children|Comedy|Fantasy  pixar  114709   
4  Adventure|Animation|Children|Comedy|Fantasy  pixar  114709   

                                            metadata  year  month  day  
0  Toy Story (1995) Adventure|Animation|Children|...  1970      1    1  
1  Toy Story (1995) Adventure|Animation|Children|...  1970      1    1  
2  Toy Story (1995) Adventure|Animation|Children|...  1970      1    1  
3  Toy Story (1995) Adventure|Animation|Children|...  1970      1    1  
4  

### Step 2: Vectorize Metadata and Cluster Using DBSCAN

TfidfVectorizer with max_features=5000 on a large dataset can be slow because it processes a large number of terms. 
🔥 Optimization Strategies:
✅ Use HashingVectorizer instead of TfidfVectorizer — It reduces memory and computation time by hashing features (no need to store large vocabulary).
✅ Reduce max_features based on experimentation — Values between 1000–3000 often provide good balance.
✅ Use n-grams carefully — unigram or bigram only to capture meaningful patterns.
✅ Preprocess text — Remove stopwords, punctuation, and apply stemming to reduce feature space.

In [10]:
import pandas as pd
data = pd.read_csv('data.csv')

In [12]:
data

Unnamed: 0,movieId,rating,title,genres,tag,imdbId,metadata,year,month,day
0,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar,114709,Toy Story (1995) Adventure|Animation|Children|...,1970,1,1
1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar,114709,Toy Story (1995) Adventure|Animation|Children|...,1970,1,1
2,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,fun,114709,Toy Story (1995) Adventure|Animation|Children|...,1970,1,1
3,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar,114709,Toy Story (1995) Adventure|Animation|Children|...,1970,1,1
4,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar,114709,Toy Story (1995) Adventure|Animation|Children|...,1970,1,1
...,...,...,...,...,...,...,...,...,...,...
285757,160341,2.5,Bloodmoon (1997),Action|Thriller,,118745,Bloodmoon (1997) Action|Thriller,1970,1,1
285758,160527,4.5,Sympathy for the Underdog (1971),Action|Crime|Drama,,66806,Sympathy for the Underdog (1971) Action|Crime|...,1970,1,1
285759,160836,3.0,Hazard (2005),Action|Drama|Thriller,,798722,Hazard (2005) Action|Drama|Thriller,1970,1,1
285760,163937,3.5,Blair Witch (2016),Horror|Thriller,,1540011,Blair Witch (2016) Horror|Thriller,1970,1,1


In [2]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import normalize

# Use HashingVectorizer for better speed and memory efficiency
vectorizer = HashingVectorizer(
    n_features=2000, ngram_range=(1, 2), alternate_sign=False)
X = vectorizer.fit_transform(data['metadata'])

# Normalize to improve clustering performance
X = normalize(X)

print(f"Feature matrix shape: {X.shape}")

Feature matrix shape: (285762, 2000)


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

# Direct KMeans clustering
kmeans = KMeans(n_clusters=10, random_state=42, n_init='auto')
data['cluster'] = kmeans.fit_predict(X)

# Function to recommend items


def recommend(query):
    query_vector = vectorizer.transform([query])
    cluster = kmeans.predict(query_vector)[0]

    # Get all items in the same cluster
    cluster_items = data[data['cluster'] == cluster]

    # Compute similarity within the cluster
    similarity = cosine_similarity(query_vector, X[cluster_items.index])

    # Rank by similarity
    cluster_items['similarity'] = similarity[0]
    recommendations = cluster_items.sort_values(
        by='similarity', ascending=False).drop_duplicates('title').head(10)

    return recommendations[['title', 'genres', 'similarity']]

In [None]:
data.co

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import normalize
import joblib

# Reduce dimensions with TruncatedSVD (100 components for speed)
svd = TruncatedSVD(n_components=10)
X_reduced = svd.fit_transform(X)

# Normalize data for better clustering
X_reduced = normalize(X_reduced)

# Faster DBSCAN clustering
dbscan = DBSCAN(eps=0.3, min_samples=2, n_jobs=-1)
data['cluster'] = dbscan.fit_predict(X_reduced)

# Save model
joblib.dump(dbscan, 'dbscan_model.pkl')

# Evaluate clustering if valid clusters exist
valid_clusters = data[data['cluster'] != -1]['cluster']
if len(valid_clusters) > 1:
    silhouette = silhouette_score(
        X_reduced[valid_clusters.index], valid_clusters)
    db_score = davies_bouldin_score(
        X_reduced[valid_clusters.index], valid_clusters)
    print(f'Silhouette Score: {silhouette:.3f}')
    print(f'Davies-Bouldin Index: {db_score:.3f}')
else:
    print("Not enough clusters for evaluation.")

Update

In [4]:
from sklearn.cluster import DBSCAN, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import normalize

import joblib
import numpy as np

In [5]:
# Reduce dimensions with TruncatedSVD (faster than PCA for sparse data)
svd = TruncatedSVD(n_components=100)
X_reduced = svd.fit_transform(X)

In [6]:
# Normalize for better clustering performance
X_reduced = normalize(X_reduced)

In [7]:
# Pre-cluster with MiniBatchKMeans to simplify DBSCAN input
kmeans = MiniBatchKMeans(n_clusters=50, random_state=42, batch_size=256)
pre_clusters = kmeans.fit_predict(X_reduced)

  super()._check_params_vs_input(X, default_n_init=3)


In [None]:
# Fine-tune DBSCAN parameters
dbscan = DBSCAN(eps=0.3, min_samples=10, n_jobs=-1)
data['cluster'] = dbscan.fit_predict(X_reduced)

In [None]:
# Save model
joblib.dump(dbscan, 'dbscan_model.pkl')

# Evaluate clustering if valid clusters exist
valid_clusters = data[data['cluster'] != -1]['cluster']

if len(valid_clusters) > 1:
    silhouette = silhouette_score(
        X_reduced[valid_clusters.index], valid_clusters)
    db_score = davies_bouldin_score(
        X_reduced[valid_clusters.index], valid_clusters)
    print(f'Silhouette Score: {silhouette:.3f}')
    print(f'Davies-Bouldin Index: {db_score:.3f}')
else:
    print("Not enough clusters for evaluation.")

  super()._check_params_vs_input(X, default_n_init=3)


In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Fine-tune DBSCAN parameters
dbscan = DBSCAN(eps=0.2, min_samples=5)
data['cluster'] = dbscan.fit_predict(X)

# Save model
joblib.dump(dbscan, 'dbscan_model.pkl')

# Evaluate clustering if valid clusters exist
valid_clusters = data[data['cluster'] != -1]['cluster']

if len(valid_clusters) > 1:
    silhouette = silhouette_score(X[valid_clusters.index], valid_clusters)
    db_score = davies_bouldin_score(
        X[valid_clusters.index].toarray(), valid_clusters)
    print(f'Silhouette Score: {silhouette:.3f}')
    print(f'Davies-Bouldin Index: {db_score:.3f}')
else:
    print("Not enough clusters for evaluation.")

In [None]:
# Save model and vectorizer
joblib.dump(dbscan, 'dbscan_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')