In [7]:
import pandas as pd
import math
from collections import Counter

# Load and Prepare Data

df = pd.read_csv("TMDB_movie_dataset_v11.csv")
df = df.iloc[:8000].copy()

# Fill missing
df['overview'] = df['overview'].fillna('')
df['keywords'] = df['keywords'].fillna('')
df['genres'] = df['genres'].fillna('')
df['genres'] = df['genres'].str.replace(',', ' ')

# combine content
df['content'] = df['overview'] + ' ' + df['keywords'] + ' ' + df['genres']

#  Build Vocabulary and IDF 

def tokenize(text):
    return text.lower().split()

# tokenized docs
docs = [tokenize(text) for text in df['content']]

# build vocabulary
vocab = set()
for doc in docs:
    vocab.update(doc)
vocab = list(vocab)

# compute IDF
def compute_idf(docs):
    import math
    N = len(docs)
    idf_dict = {}
    for term in vocab:
        df_count = sum(1 for doc in docs if term in doc)
        idf_dict[term] = math.log((N + 1) / (df_count + 1)) + 1  # smoothed
    return idf_dict

idf = compute_idf(docs)

# compute TF-IDF vectors
def compute_tf(doc):
    tf = Counter(doc)
    total = len(doc)
    return {term: tf[term] / total for term in tf}

def compute_tfidf(doc, idf):
    tf = compute_tf(doc)
    return {term: tf[term] * idf[term] for term in tf}

tfidf_vectors = [compute_tfidf(doc, idf) for doc in docs]

#  Cosine Similarity Function 

def cosine_sim(vec1, vec2):
    dot_product = 0.0
    norm1 = 0.0
    norm2 = 0.0

    for term in vec1:
        if term in vec2:
            dot_product += vec1[term] * vec2[term]

    norm1 = math.sqrt(sum(val**2 for val in vec1.values()))
    norm2 = math.sqrt(sum(val**2 for val in vec2.values()))

    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot_product / (norm1 * norm2)

#  Master Recommender 

indices = pd.Series(df.index, index=df['title']).drop_duplicates()

def smart_recommender(query, top_n=10):
    if not isinstance(query, str):
        return "❌ Error: Input must be a string."

    query = query.strip()

    #  If it's a title
    if query in indices:
        return get_recommendations(query, top_n)

    #  If it's a genre
    genre_match = df[df['genres'].str.lower().str.contains(query.lower())]
    if not genre_match.empty:
        sample_title = genre_match.sample(1).iloc[0]['title']
        print(f"\n🎞️ Using sample movie from genre '{query}': {sample_title}")
        return get_recommendations(sample_title, top_n)

    #  Otherwise treat it as free text
    return recommend_by_keywords(query, top_n)


#  Title-Based Recommender 

def get_recommendations(title, top_n=10):
    idx = indices.get(title)
    if idx is None:
        return f"❌ Error: '{title}' not found in dataset."

    query_vec = tfidf_vectors[idx]
    scores = []

    for i, vec in enumerate(tfidf_vectors):
        if i != idx:
            sim = cosine_sim(query_vec, vec)
            scores.append((i, sim))

    scores.sort(key=lambda x: x[1], reverse=True)

    print(f"\n🎬 Recommendations for '{title}':")
    for i, (movie_idx, sim) in enumerate(scores[:top_n], 1):
        print(f"{i}. {df['title'].iloc[movie_idx]} | Similarity: {sim:.4f}")

# Keyword Recommender

def recommend_by_keywords(text, top_n=10):
    query_doc = tokenize(text)
    query_vec = compute_tfidf(query_doc, idf)

    scores = []
    for i, vec in enumerate(tfidf_vectors):
        sim = cosine_sim(query_vec, vec)
        scores.append((i, sim))

    scores.sort(key=lambda x: x[1], reverse=True)

    print(f"\n🧠 Recommendations for keyword input: '{text}'")
    for i, (movie_idx, sim) in enumerate(scores[:top_n], 1):
        print(f"{i}. {df['title'].iloc[movie_idx]} | Similarity: {sim:.4f}")


In [8]:
smart_recommender("Inception")                
smart_recommender("action")                  
smart_recommender("robot time travel ai")     
smart_recommender(1234)                      



🎬 Recommendations for 'Inception':
1. The Cell | Similarity: 0.1615
2. The Thirteenth Floor | Similarity: 0.1302
3. The Matrix | Similarity: 0.1276
4. The Matrix Revolutions | Similarity: 0.1232
5. The Matrix Resurrections | Similarity: 0.1111
6. Enemy | Similarity: 0.1107
7. The Lawnmower Man | Similarity: 0.1089
8. Strange Days | Similarity: 0.1075
9. Kiss Kiss Bang Bang | Similarity: 0.1057
10. eXistenZ | Similarity: 0.1001

🎞️ Using sample movie from genre 'action': El Topo

🎬 Recommendations for 'El Topo':
1. Lucky | Similarity: 0.1117
2. Miracle in Cell No. 7 | Similarity: 0.0952
3. About Schmidt | Similarity: 0.0923
4. Thirteen Lives | Similarity: 0.0866
5. Innocent Voices | Similarity: 0.0857
6. My Name Is Nobody | Similarity: 0.0803
7. Flight of the Phoenix | Similarity: 0.0790
8. Life in a Year | Similarity: 0.0785
9. Tracks | Similarity: 0.0776
10. Ace in the Hole | Similarity: 0.0771

🧠 Recommendations for keyword input: 'robot time travel ai'
1. I, Robot | Similarity: 0.1

'❌ Error: Input must be a string.'