In [None]:
#import block
import ast

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity



In [None]:
import pandas as pd
def _names_to_text(value):
    """Flatten one ``directors`` / ``top_cast`` cell into a space-separated string.

    Parameters
    ----------
    value : str, iterable of str, None or NaN
        A cell as read from the CSV. String cells are parsed as Python
        literals (presumably stringified lists such as ``"['A', 'B']"`` --
        confirm against the CSV); iterables are joined directly.

    Returns
    -------
    str
        The names joined with single spaces, or ``""`` for a missing or
        blank cell.

    Raises
    ------
    ValueError, SyntaxError
        If a non-blank string cell is not a valid Python literal.
    """
    if isinstance(value, str):
        if not value.strip():
            return ""
        # literal_eval only accepts literals, so text coming from the CSV
        # can never execute code the way eval() would.
        value = ast.literal_eval(value)
        if isinstance(value, str):
            # A quoted single name: joining it would space out its characters.
            return value
    elif value is None or (isinstance(value, float) and pd.isna(value)):
        # read_csv represents empty cells as float NaN, which cannot be joined.
        return ""
    return " ".join(value)


# Load the movie table and discard the `text_blob` column.
df_movies = pd.read_csv("tmdb_movie_data.csv").drop(columns=['text_blob'])

# One text field per movie: `directors` names followed by `top_cast` names.
df_movies['crew_blob'] = (
    df_movies['directors'].apply(_names_to_text) + " " +
    df_movies['top_cast'].apply(_names_to_text)
)

df_movies.head()



In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # small, fast, accurate


def _encode_column(column):
    """Embed one text column of ``df_movies``.

    Missing values are replaced by empty strings and the column is handed to
    the model as a plain list of ``str``, so the result does not depend on
    how the DataFrame is indexed.

    Parameters
    ----------
    column : str
        Name of the ``df_movies`` column to embed.

    Returns
    -------
    numpy.ndarray
        Whatever ``model.encode(..., convert_to_numpy=True)`` returns for the
        list of texts (one embedding per row).
    """
    texts = df_movies[column].fillna("").astype(str).tolist()
    return model.encode(texts, convert_to_numpy=True)


overview_embeddings = _encode_column('overview')
crew_embeddings = _encode_column('crew_blob')
genre_embeddings = _encode_column('genres')

# Relative contribution of each field to the combined vector (sums to 1.0).
w_overview = 0.5
w_crew = 0.4
w_genre = 0.1

# Combine all embeddings into one vector per movie
combined_embeddings = (
    w_overview * overview_embeddings +
    w_crew * crew_embeddings +
    w_genre * genre_embeddings
)


In [None]:
def recommend_movies(title, top_n=5):
    """Return the movies whose combined embedding is closest to ``title``.

    Similarity is the cosine similarity between the query movie's row of the
    module-level ``combined_embeddings`` array and every other row. Row ``i``
    of ``combined_embeddings`` is taken to describe row ``i`` (by position)
    of ``df_movies``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df_movies['title']``. If several rows
        share the title, the first one is used as the query.
    top_n : int, default 5
        Maximum number of recommendations to return. Values below 1 yield an
        empty result.

    Returns
    -------
    pandas.DataFrame or str
        A frame with columns ``title``, ``genres``, ``vote_average`` and
        ``similarity``, ordered from most to least similar and never
        containing the query row itself. If ``title`` is not present, the
        message ``"'<title>' not found in dataset."`` is returned instead.
    """
    # Work with row positions, not index labels: the embedding matrix is a
    # plain array, so a label is only a valid row number when the frame
    # happens to carry a default 0..n-1 index.
    matches = np.flatnonzero((df_movies['title'] == title).to_numpy())
    if matches.size == 0:
        return f"'{title}' not found in dataset."

    query_pos = int(matches[0])
    query_vec = combined_embeddings[query_pos].reshape(1, -1)
    sims = cosine_similarity(query_vec, combined_embeddings)[0]

    # Drop the query row explicitly rather than assuming it sorts first;
    # a row with a tied score could otherwise push it into the results.
    ranked = np.argsort(sims)[::-1]
    ranked = ranked[ranked != query_pos]
    top_indices = ranked[:max(int(top_n), 0)]

    results = df_movies.iloc[top_indices][['title', 'genres', 'vote_average']].copy()
    results['similarity'] = sims[top_indices]

    return results

In [None]:
# Example query: ask for five recommendations for the title "Dune".
recommend_movies("Dune", top_n=5)


In [None]:
#