In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

In [2]:
# Load the movie table and derive the two text columns used later on:
#   * 'genres'      - missing values become '', the '|' separators become
#                     spaces, and the result is lower-cased and stripped
#   * 'title_lower' - lower-cased copy of 'title' for case-insensitive lookup
df = pd.read_csv('movies.csv').assign(
    genres=lambda frame: (
        frame['genres']
        .fillna('')
        .str.replace('|', ' ', regex=False)
        .str.lower()
        .str.strip()
    ),
    title_lower=lambda frame: frame['title'].str.lower(),
)

In [3]:
# Turn each movie's space-separated genre string into a TF-IDF vector.
# X is the resulting document-term matrix built from df['genres'].
# NOTE(review): with its default settings TfidfVectorizer keeps only runs of
# two or more word characters as tokens, so a hyphenated genre name would be
# split into separate terms - confirm that this is intended for this data.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['genres'])

In [4]:
def find_best_k(data, max_k=15):
    """Pick the cluster count with the best silhouette score and plot the sweep.

    A KMeans model (random_state=42, n_init=10) is fitted for every k from 2 up
    to ``max_k`` and each labelling is scored with ``silhouette_score``.  The
    upper bound is capped at ``n_samples - 1`` because the silhouette score is
    only defined for 2 <= n_clusters <= n_samples - 1; without the cap a small
    input would fail part-way through the sweep.

    Parameters
    ----------
    data : array-like or sparse matrix
        Feature matrix with one row per sample.
    max_k : int, default 15
        Largest number of clusters to try (inclusive).

    Returns
    -------
    int
        The k with the highest silhouette score (the smallest such k on ties).
        Returns 2 when ``max_k`` is below 2, i.e. when nothing was evaluated.

    Raises
    ------
    ValueError
        If ``max_k`` >= 2 but ``data`` has fewer than 3 samples, so that no
        value of k can be scored.
    """
    # Sparse matrices do not support len(); plain sequences have no .shape.
    n_samples = data.shape[0] if hasattr(data, 'shape') else len(data)
    candidate_ks = range(2, min(max_k, n_samples - 1) + 1)
    if max_k >= 2 and not candidate_ks:
        raise ValueError(
            f"find_best_k needs at least 3 samples to score any k, got {n_samples}."
        )

    best_k = 2
    best_score = -1
    scores = []
    for k in candidate_ks:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(data)
        score = silhouette_score(data, labels)
        scores.append(score)
        # Strict comparison: on equal scores the smaller k is kept.
        if score > best_score:
            best_k = k
            best_score = score

    # Plot only the k values that were actually evaluated.
    fig, ax = plt.subplots()
    ax.plot(list(candidate_ks), scores, marker='o')
    ax.set_xlabel('Number of Clusters (k)')
    ax.set_ylabel('Silhouette Score')
    ax.set_title('Silhouette Score vs k')
    plt.show()
    return best_k

In [5]:
# Number of clusters is fixed by hand here; the value returned by
# find_best_k(X) could be used instead.
k = 10
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
# Fit on the genre TF-IDF matrix, then store each movie's cluster label.
kmeans.fit(X)
df['cluster'] = kmeans.labels_

In [8]:
def recommend(movie_title, top_n=5):
    """Return the titles most similar to ``movie_title`` within its cluster.

    The title is matched case-insensitively (after stripping surrounding
    whitespace) against ``df['title_lower']``; the first matching row is used.
    Candidates are the other rows with the same ``df['cluster']`` value, ranked
    by cosine similarity of their rows in ``X`` to the query movie's row.

    Parameters
    ----------
    movie_title : str
        Title to look up.
    top_n : int, default 5
        Maximum number of titles to return.

    Returns
    -------
    str
        A "'<title>' not found." message when no row matches the title.
    list of str
        A one-element list holding a message when the movie is alone in its
        cluster; otherwise up to ``top_n`` titles, most similar first.
    """
    movie_title = movie_title.strip().lower()

    # Work with row *positions* throughout: X is a plain matrix and knows
    # nothing about df's index labels, so labels are only safe as row numbers
    # while df still has its default 0..n-1 index.
    # Missing titles are treated as non-matching.
    title_hits = (
        (df['title_lower'] == movie_title)
        .to_numpy(dtype=bool, na_value=False)
        .nonzero()[0]
    )
    if title_hits.size == 0:
        return f"'{movie_title}' not found."
    pos = title_hits[0]

    cluster_id = df['cluster'].iloc[pos]
    cluster_positions = (df['cluster'] == cluster_id).to_numpy().nonzero()[0]
    # Drop the query movie itself from the candidate set.
    neighbour_positions = cluster_positions[cluster_positions != pos]
    if neighbour_positions.size == 0:
        return ["No similar movies found in the same cluster."]

    similarities = cosine_similarity(X[pos], X[neighbour_positions]).flatten()

    # Highest similarity first.
    # NOTE(review): the order among candidates with exactly equal similarity
    # is whatever argsort yields - confirm whether a deterministic tie-break
    # is wanted.
    top_indices = similarities.argsort()[::-1][:top_n]
    recommendations = df['title'].iloc[neighbour_positions[top_indices]].tolist()
    return recommendations
    

In [11]:
# Example run: print a header, then whatever recommend() returns for one title.
movie = "Matrix, The (1999)"
print(f"\n Recommendations for '{movie}':")
print(recommend(movie))


 Recommendations for 'Matrix, The (1999)':
['Boy 7 (2015)', 'Universal Soldier: Day of Reckoning (2012)', 'Twisted Pair (2018)', 'Meteor Apocalypse (2010)', 'Arrival, The (1996)']
