In [1]:
import pandas as pd
import numpy as np
import ast
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [2]:
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

# Join the two tables.
# A title is not a unique key: when several films share a title, a join on
# "title" pairs every movie row with every credits row of that title, which
# duplicates rows and attaches the wrong cast/crew to some of them.
# Prefer the numeric identifier whenever both frames expose one.
# NOTE(review): assumes movies["id"] and credits["movie_id"] hold the same
# identifier -- confirm against the CSV headers.
if "id" in movies.columns and "movie_id" in credits.columns:
    movies = movies.merge(
        credits.drop(columns="title"),  # keep one unsuffixed "title" (from movies)
        left_on="id",
        right_on="movie_id",
    )
else:
    # No usable id pair: fall back to the title join.
    movies = movies.merge(credits, on="title")

# Keep only the columns used by the later feature-building cells.
movies = movies[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]]

In [3]:
def convert(obj):
    """Extract the ``"name"`` value from every entry of a stringified list.

    Parameters
    ----------
    obj : str
        Python-literal text describing an iterable of dicts, each carrying a
        ``"name"`` key. Anything else is treated as unparseable.

    Returns
    -------
    list
        The ``"name"`` values in their original order. An empty list is
        returned when ``obj`` cannot be parsed, is not iterable, or any entry
        lacks a ``"name"`` key (best-effort: one bad entry discards the row).
    """
    try:
        return [entry["name"] for entry in ast.literal_eval(obj)]
    # Catch only parse/shape failures. A bare ``except:`` would also swallow
    # KeyboardInterrupt/SystemExit, so interrupting a long ``.apply`` would
    # silently produce empty lists instead of stopping.
    except (ValueError, SyntaxError, TypeError, KeyError, MemoryError, RecursionError):
        return []

def get_director(obj):
    """Return the names of all entries whose ``"job"`` equals ``"Director"``.

    Parameters
    ----------
    obj : str
        Python-literal text describing an iterable of dicts, each carrying
        ``"job"`` and ``"name"`` keys. Anything else is treated as unparseable.

    Returns
    -------
    list
        Names of the matching entries in their original order. An empty list
        is returned when nothing matches, when ``obj`` cannot be parsed or is
        not iterable, or when an entry lacks a required key.
    """
    try:
        return [
            member["name"]
            for member in ast.literal_eval(obj)
            if member["job"] == "Director"
        ]
    # Catch only parse/shape failures. A bare ``except:`` would also swallow
    # KeyboardInterrupt/SystemExit, so interrupting a long ``.apply`` would
    # silently produce empty lists instead of stopping.
    except (ValueError, SyntaxError, TypeError, KeyError, MemoryError, RecursionError):
        return []

In [4]:
# Turn the stringified list columns into plain Python lists of names.
for name_list_col in ("genres", "keywords"):
    movies[name_list_col] = movies[name_list_col].apply(convert)
movies["cast"] = movies["cast"].apply(lambda raw: convert(raw)[:3])  # first three entries only
movies["crew"] = movies["crew"].apply(get_director)  # director names only
movies["overview"] = movies["overview"].fillna("")  # missing overview -> empty text

# Build the "tags" text: overview followed by each list column joined with
# spaces, every part separated by a single space, then lower-cased.
tag_text = movies["overview"]
for list_col in ("genres", "keywords", "cast", "crew"):
    tag_text = tag_text + " " + movies[list_col].apply(" ".join)

movies["tags"] = tag_text.str.lower()

In [5]:
# TF-IDF vectorization of the "tags" text.
TFIDF_MAX_FEATURES = 1000  # cap on the vocabulary size passed to the vectorizer

tfidf = TfidfVectorizer(stop_words="english", max_features=TFIDF_MAX_FEATURES)
tag_matrix = tfidf.fit_transform(movies["tags"])  # fit the vocabulary and transform in one pass
vectors = tag_matrix.toarray()  # dense array, one row per movie


In [6]:
# KMeans clustering of the TF-IDF vectors.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# "auto" in version 1.4 (with a FutureWarning on 1.2-1.3), so leaving it unset
# makes the cluster assignments depend on the installed library version.
kmeans = KMeans(n_clusters=20, n_init=10, random_state=42)

# Fit the model and store each movie's cluster label alongside its row.
movies["cluster"] = kmeans.fit_predict(vectors)


In [7]:
# Persist the fitted objects so they can be reloaded without re-training.
# Written in this order: vectorizer, clustering model, then the movie table.
pickled_artifacts = {
    "tfidf_vectorizer.pkl": tfidf,
    "kmeans_model.pkl": kmeans,
}
for artifact_path, artifact in pickled_artifacts.items():
    with open(artifact_path, "wb") as handle:
        pickle.dump(artifact, handle)

# The DataFrame (including the "cluster" column) uses pandas' own pickle writer.
movies.to_pickle("movies_with_clusters.pkl")

print("✅ Model and vectorizer saved successfully.")


✅ Model and vectorizer saved successfully.
