# Cell 1: Download TMDB Dataset

In [None]:
!gdown --folder https://drive.google.com/drive/folders/1A2io9k2MXnjXEBrsOJTZGpi6mTs1ZjWb

Retrieving folder contents
Processing file 1oOQrIqgHe1BLZDXggmlGrynwCu1L4BBl tmdb_5000_credits.csv
Processing file 1uL6ziMoMH0rhOaJJeFD_dYYMm6URJ_7m tmdb_5000_movies.csv
Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1oOQrIqgHe1BLZDXggmlGrynwCu1L4BBl
To: /content/Movie Dataset/tmdb_5000_credits.csv
100% 40.0M/40.0M [00:00<00:00, 73.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1uL6ziMoMH0rhOaJJeFD_dYYMm6URJ_7m
To: /content/Movie Dataset/tmdb_5000_movies.csv
100% 5.70M/5.70M [00:00<00:00, 40.1MB/s]
Download completed


In [None]:
!pip install -q gdown
print("✅ gdown installed")


✅ gdown installed


# Cell 2: Install Required Dependencies

In [None]:
# Install all required libraries
!pip install -q sentence-transformers chromadb streamlit pyngrok pandas numpy scikit-learn
print("✅ All dependencies installed successfully!")
print("\nInstalled packages:")
print("- sentence-transformers (for embeddings)")
print("- chromadb (vector database with HNSW)")
print("- streamlit (web UI)")
print("- pyngrok (public URL tunneling)")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.7/21.7 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[

# Cell 3: Import Libraries

In [None]:
import pandas as pd
import numpy as np
import json
import ast
import warnings
warnings.filterwarnings('ignore')

from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import time

print("✅ All libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")



✅ All libraries imported successfully!
Pandas version: 2.2.2
NumPy version: 2.0.2


# Cell 4: Load TMDB Dataset

In [None]:
# Load the two CSV files from Google Drive

import gdown

# Read both CSVs from the folder the download cell wrote to.
try:
    movies_df = pd.read_csv('/content/Movie Dataset/tmdb_5000_movies.csv')
    credits_df = pd.read_csv('/content/Movie Dataset/tmdb_5000_credits.csv')
    print("✅ Files loaded ")
except FileNotFoundError:
    # Tell the user what is missing, then stop: the summary below (and every
    # later cell) needs both frames, and carrying on would either fail with a
    # NameError or silently reuse frames left over from an earlier run.
    print("⚠️ Files not found in Drive. Please upload:")
    print("1. tmdb_5000_movies.csv")
    print("2. tmdb_5000_credits.csv")
    print("\nTo your Google Drive root or MyDrive folder")
    raise

print(f"\n Movies Dataset Shape: {movies_df.shape}")
print(f" Credits Dataset Shape: {credits_df.shape}")
print(f"\n🎬 Total Movies: {len(movies_df)}")

✅ Files loaded 

 Movies Dataset Shape: (4803, 20)
 Credits Dataset Shape: (4803, 4)

🎬 Total Movies: 4803


# Cell 5: Explore Dataset Structure

In [None]:
# Every section is printed as: rule, heading, rule, payload.
# All sections after the first are separated from the previous one by a blank line.
_RULE = "=" * 80
_sections = [
    ("MOVIES DATASET - First 3 rows", movies_df.head(3)),
    ("CREDITS DATASET - First 3 rows", credits_df.head(3)),
    ("MOVIES COLUMNS:", movies_df.columns.tolist()),
    ("CREDITS COLUMNS:", credits_df.columns.tolist()),
    ("DATA TYPES:", movies_df.dtypes),
]

for _position, (_heading, _payload) in enumerate(_sections):
    print(_RULE if _position == 0 else "\n" + _RULE)
    print(_heading)
    print(_RULE)
    print(_payload)

MOVIES DATASET - First 3 rows
      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "name": "spy"}, {"id": 818, "name...                en   

                             original_title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                         

# Cell 6: Merge Datasets

In [None]:
# Merge movies and credits datasets
# The key column is 'id' in movies_df and 'movie_id' in credits_df

# First, let's check the common column names
print("Checking merge keys...")

# Pick the join keys from whichever id columns the two frames expose.
_credits_has_movie_id = 'movie_id' in credits_df.columns
if _credits_has_movie_id and 'id' in movies_df.columns:
    _merge_keys = {'left_on': 'id', 'right_on': 'movie_id'}
elif _credits_has_movie_id and 'movie_id' in movies_df.columns:
    _merge_keys = {'on': 'movie_id'}
else:
    # Last resort: assume both frames carry a plain 'id' column
    _merge_keys = {'on': 'id'}

df = movies_df.merge(credits_df, how='inner', **_merge_keys)

print("✅ Datasets merged successfully!")
print(f" Merged Dataset Shape: {df.shape}")
print(f"🎬 Total Movies After Merge: {len(df)}")
print("\n Columns in Merged Dataset:")
print(df.columns.tolist())

Checking merge keys...
✅ Datasets merged successfully!
 Merged Dataset Shape: (4803, 24)
🎬 Total Movies After Merge: 4803

 Columns in Merged Dataset:
['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language', 'original_title', 'overview', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average', 'vote_count', 'movie_id', 'title_y', 'cast', 'crew']


# Cell 7: Data Preprocessing - Extract Features

In [None]:
def safe_parse_json(x):
    """Parse a JSON-encoded cell value, falling back to [] when it cannot be parsed.

    Args:
        x: Raw cell value. A string is parsed; a list is assumed to be already
            parsed and is returned unchanged (so re-running the extraction cell
            is harmless); anything else (NaN, None, numbers) yields [].

    Returns:
        The parsed value, or [] when no parser accepts the text.
    """
    if isinstance(x, list):
        return x
    if not isinstance(x, str):
        return []

    # Order matters: the text is tried as-is first, so apostrophes inside valid
    # JSON strings (e.g. "O'Brien") are never rewritten. The quote-swapping
    # attempt and the Python-literal attempt only run for text that is not
    # valid JSON, such as single-quoted Python-style lists.
    attempts = (
        (json.loads, x),
        (json.loads, x.replace("'", '"')),
        (ast.literal_eval, x),
    )
    for parse, text in attempts:
        try:
            return parse(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue
    return []

def extract_names(obj_list, key='name', limit=5):
    """Join the `key` values of the first `limit` entries into a comma-separated string.

    Args:
        obj_list: Sequence of dictionaries (e.g. parsed genres or cast).
        key: Dictionary key whose value is collected.
        limit: Number of leading entries inspected; the cut is applied before
            entries lacking `key` are filtered out.

    Returns:
        The collected values joined with ', ', or '' when `obj_list` cannot be
        sliced/iterated or the values cannot be joined.
    """
    try:
        names = [
            item[key]
            for item in obj_list[:limit]
            # Entries that are not dictionaries are skipped instead of
            # discarding the names found in every other entry.
            if isinstance(item, dict) and key in item
        ]
        return ', '.join(names)
    except Exception:
        # Best effort: malformed input yields an empty string. Exception (not a
        # bare except) keeps KeyboardInterrupt/SystemExit propagating.
        return ''

def extract_director(crew_list):
    """Return the name of the first crew entry whose job is 'Director'.

    Args:
        crew_list: Sequence of crew dictionaries with 'job' and 'name' keys.

    Returns:
        The director's name, or '' when there is no director entry, the entry
        has no 'name', or `crew_list` cannot be iterated.
    """
    try:
        return next(
            (
                member.get('name', '')
                for member in crew_list
                # Non-dictionary entries are skipped rather than aborting the search
                if isinstance(member, dict) and member.get('job') == 'Director'
            ),
            '',
        )
    except Exception:
        # Best effort: malformed input yields an empty string. Exception (not a
        # bare except) keeps KeyboardInterrupt/SystemExit propagating.
        return ''

print(" Starting feature extraction...")

# Expose the 'title_x' column produced by the merge under the plain name 'title'
df['title'] = df['title_x']

# Genres, keywords and cast share one recipe: parse the raw cell, then join
# the names of the leading entries (extract_names defaults to the first 5).
for _source in ('genres', 'keywords', 'cast'):
    df[f'{_source}_parsed'] = df[_source].apply(safe_parse_json)
    df[f'{_source}_str'] = df[f'{_source}_parsed'].apply(extract_names)

# Crew is parsed the same way, but only the director's name is kept
df['crew_parsed'] = df['crew'].apply(safe_parse_json)
df['director'] = df['crew_parsed'].apply(extract_director)

print(" Feature extraction completed!")
print("\n Sample extracted features:")
_preview_columns = ['title', 'genres_str', 'keywords_str', 'cast_str', 'director']
print(df[_preview_columns].head(3))

 Starting feature extraction...
 Feature extraction completed!

 Sample extracted features:
                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   

                                    genres_str  \
0  Action, Adventure, Fantasy, Science Fiction   
1                   Adventure, Fantasy, Action   
2                     Action, Adventure, Crime   

                                        keywords_str  \
0  culture clash, future, space war, space colony...   
1  ocean, drug abuse, exotic island, east india t...   
2     spy, based on novel, secret agent, sequel, mi6   

                                            cast_str        director  
0  Sam Worthington, Zoe Saldana, Sigourney Weaver...   James Cameron  
1  Johnny Depp, Orlando Bloom, Keira Knightley, S...  Gore Verbinski  
2  Daniel Craig, Christoph Waltz, Léa Seydoux, Ra...      Sam Mendes  


# Cell 8: Clean and Handle Missing Values

In [None]:
# Handle missing values
print(" Cleaning data and handling missing values...")

# Free-text columns get an empty-string placeholder; a missing title gets a label
for _column in ('overview', 'genres_str', 'keywords_str', 'cast_str', 'director'):
    df[_column] = df[_column].fillna('')
df['title'] = df['title'].fillna('Unknown Title')

# Keep only the first row seen for each title
initial_count = len(df)
df = df.drop_duplicates(subset=['title'], keep='first')
final_count = len(df)

print(" Data cleaning completed!")
print(f" Removed {initial_count - final_count} duplicate movies")
print(f"🎬 Final dataset size: {final_count} movies")

# Confirm that none of the text columns still contains nulls
print("\n Null values check:")
_checked_columns = ['title', 'overview', 'genres_str', 'keywords_str', 'cast_str', 'director']
print(df[_checked_columns].isnull().sum())

 Cleaning data and handling missing values...
 Data cleaning completed!
 Removed 3 duplicate movies
🎬 Final dataset size: 4800 movies

 Null values check:
title           0
overview        0
genres_str      0
keywords_str    0
cast_str        0
director        0
dtype: int64


# Cell 9: Create Combined Search Text

In [None]:
"""
Create a combined text representation for each movie.
This text will be used to generate embeddings for semantic search.

Format: title + overview + genres + keywords + cast + director
"""

def create_search_text(row):
    """Build the single labelled text string that represents one movie.

    Args:
        row: Mapping-like record providing 'title', 'overview', 'genres_str',
            'keywords_str', 'cast_str' and 'director'.

    Returns:
        The six fields rendered as "Label: value" and joined by single spaces.
    """
    labelled_fields = (
        ('Title', 'title'),
        ('Overview', 'overview'),
        ('Genres', 'genres_str'),
        ('Keywords', 'keywords_str'),
        ('Cast', 'cast_str'),
        ('Director', 'director'),
    )
    return ' '.join(f"{label}: {row[column]}" for label, column in labelled_fields)

print(" Creating combined search text for embeddings...")

# One combined text per row, built by create_search_text
df['search_text'] = df.apply(create_search_text, axis=1)

print("✅ Search text created successfully!")
print("\n Sample search text (first 500 characters):")
_first_text = df['search_text'].iloc[0]
print(_first_text[:500])
print("\n...")
_mean_length = df['search_text'].str.len().mean()
print(f"\n Average text length: {_mean_length:.0f} characters")

 Creating combined search text for embeddings...
✅ Search text created successfully!

 Sample search text (first 500 characters):
Title: Avatar Overview: In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Genres: Action, Adventure, Fantasy, Science Fiction Keywords: culture clash, future, space war, space colony, society Cast: Sam Worthington, Zoe Saldana, Sigourney Weaver, Stephen Lang, Michelle Rodriguez Director: James Cameron

...

 Average text length: 526 characters


# Cell 10: Initialize Sentence Transformer Model

In [None]:
"""
Initialize SentenceTransformer for generating embeddings.
We use 'all-MiniLM-L6-v2' - a lightweight but powerful model.

Model specs:
- 384 dimensions
- Fast inference
- Good for semantic similarity tasks
"""

# Name of the SentenceTransformer checkpoint used for every embedding below
MODEL_NAME = 'all-MiniLM-L6-v2'

print(" Loading SentenceTransformer model...")
print(f"Model: {MODEL_NAME} (384 dimensions)")

model = SentenceTransformer(MODEL_NAME)

print("✅ Model loaded successfully!")
# Report the values the loaded model itself exposes
_embedding_dim = model.get_sentence_embedding_dimension()
print(f" Embedding dimension: {_embedding_dim}")
print(f" Max sequence length: {model.max_seq_length}")

 Loading SentenceTransformer model...
Model: all-MiniLM-L6-v2 (384 dimensions)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Model loaded successfully!
 Embedding dimension: 384
 Max sequence length: 256


# Cell 11: Generate Embeddings

In [None]:
"""
Generate embeddings for all movies.
This is the most computationally intensive step.

Process:
1. Convert search text to embeddings using SentenceTransformer
2. Each movie gets a 384-dimensional vector
3. These vectors capture semantic meaning
"""

print(" Generating embeddings for all movies...")
print(" This may take 2-5 minutes depending on dataset size...")

start_time = time.time()

# Encode every search text, 32 texts per batch, returning a NumPy array
_texts = df['search_text'].tolist()
embeddings = model.encode(
    _texts,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
)

end_time = time.time()
elapsed_time = end_time - start_time

print("\n✅ Embeddings generated successfully!")
print(f"⏱ Time taken: {elapsed_time:.2f} seconds")
print(f" Embeddings shape: {embeddings.shape}")
print(f" Movies encoded: {len(embeddings)}")
print(f" Embedding dimensions: {embeddings.shape[1]}")

# Keep each movie's vector next to its row (stored as a plain Python list)
df['embedding'] = embeddings.tolist()

 Generating embeddings for all movies...
 This may take 2-5 minutes depending on dataset size...


Batches:   0%|          | 0/150 [00:00<?, ?it/s]


✅ Embeddings generated successfully!
⏱ Time taken: 303.61 seconds
 Embeddings shape: (4800, 384)
 Movies encoded: 4800
 Embedding dimensions: 384


# Cell 12: Initialize ChromaDB with HNSW Index

In [None]:
from chromadb.config import Settings
import chromadb

# --- Initialize persistent Chroma client ---
chroma_client = chromadb.PersistentClient(
    path="./chroma_db",           # <- folder where chroma.sqlite3 will be stored
    settings=Settings(
        anonymized_telemetry=False,
        allow_reset=True
    )
)

collection_name = "movie_embeddings"

# Start from an empty collection: drop any collection left by a previous run.
# The delete is best effort, but the reason it was skipped is reported instead
# of being swallowed, and KeyboardInterrupt/SystemExit are no longer caught.
try:
    chroma_client.delete_collection(name=collection_name)
    print(f"🗑️ Deleted existing collection: {collection_name}")
except Exception as exc:
    print(f"ℹ️ No existing collection removed ({type(exc).__name__}: {exc})")

# Create new collection
collection = chroma_client.create_collection(
    name=collection_name,
    metadata={"hnsw:space": "cosine"}  # Use cosine similarity
)


# Cell 13: Add Data to ChromaDB

In [None]:
"""
Add movie embeddings to ChromaDB collection.

Each document contains:
- id: unique movie identifier
- embedding: 384-dim vector
- metadata: title, overview, cast, director, etc.
"""

print(" Adding movie embeddings to ChromaDB...")
print(" This may take 1-3 minutes...")

# Row position (as a string) is the document id; vectors go in as plain lists
ids = list(map(str, range(len(df))))
embeddings_list = embeddings.tolist()

# One metadata record per movie; the overview is capped at 500 characters
metadatas = [
    {
        'title': record['title'],
        'overview': record['overview'][:500],
        'genres': record['genres_str'],
        'keywords': record['keywords_str'],
        'cast': record['cast_str'],
        'director': record['director'],
    }
    for _, record in df.iterrows()
]

# Insert in fixed-size batches
batch_size = 500
total_batches = -(-len(ids) // batch_size)  # ceiling division

for batch_number, start in enumerate(range(0, len(ids), batch_size), start=1):
    stop = start + batch_size  # slicing clamps to the list length on the last batch

    collection.add(
        ids=ids[start:stop],
        embeddings=embeddings_list[start:stop],
        metadatas=metadatas[start:stop],
    )

    print(f"✓ Added batch {batch_number}/{total_batches}")

print(f"\n✅ All {len(ids)} movies added to ChromaDB!")
print(f" Collection count: {collection.count()}")

 Adding movie embeddings to ChromaDB...
 This may take 1-3 minutes...
✓ Added batch 1/10
✓ Added batch 2/10
✓ Added batch 3/10
✓ Added batch 4/10
✓ Added batch 5/10
✓ Added batch 6/10
✓ Added batch 7/10
✓ Added batch 8/10
✓ Added batch 9/10
✓ Added batch 10/10

✅ All 4800 movies added to ChromaDB!
 Collection count: 4800


# Cell 14: Implement Semantic Search Function

In [None]:
"""
Semantic Search using Vector Database (ChromaDB + HNSW)

Process:
1. Convert user query to embedding
2. Use ChromaDB's HNSW index for fast ANN search
3. Return top-K similar movies with similarity scores
"""

def semantic_search(query, top_k=10):
    """
    Look up the movies whose embeddings are closest to a text query.

    Args:
        query: User's search text
        top_k: Number of results requested from the collection

    Returns:
        Tuple (recommendations, query_time): a list of dicts with 'title',
        'overview', 'genres', 'cast', 'director' and 'similarity', and the
        time spent encoding and querying, in milliseconds.
    """
    started = time.time()

    # Encode the query with the same model used for the stored movie vectors
    query_vector = model.encode([query])[0].tolist()
    response = collection.query(query_embeddings=[query_vector], n_results=top_k)

    # Timing covers encoding + lookup only, not the formatting below
    query_time = (time.time() - started) * 1000

    recommendations = []
    for rank, _ in enumerate(response['ids'][0]):
        meta = response['metadatas'][0][rank]
        recommendations.append({
            'title': meta['title'],
            'overview': meta['overview'],
            'genres': meta['genres'],
            'cast': meta['cast'],
            'director': meta['director'],
            # The collection reports a distance; flip it into a similarity
            'similarity': 1 - response['distances'][0][rank],
        })

    return recommendations, query_time

# Test semantic search
print(" Testing semantic search...")
test_query = "space adventure with aliens"
results, query_time = semantic_search(test_query, top_k=5)

print("\n✅ Semantic search working!")
print(f" Query: '{test_query}'")
print(f"⏱ Query time: {query_time:.2f} ms")
print("\n🎬 Top 5 Results:")
# Similarities are shown as percentages
for rank, hit in enumerate(results, start=1):
    percent = hit['similarity'] * 100
    print(f"{rank}. {hit['title']} - Similarity: {percent:.1f}%")

 Testing semantic search...

✅ Semantic search working!
 Query: 'space adventure with aliens'
⏱ Query time: 26.98 ms

🎬 Top 5 Results:
1. Prometheus - Similarity: 61.3%
2. U.F.O. - Similarity: 59.4%
3. Galaxy Quest - Similarity: 57.4%
4. Aliens in the Attic - Similarity: 57.3%
5. Home - Similarity: 56.1%


# Cell 15: Build Item-Item Similarity Matrix (TF-IDF Proxy for Collaborative Filtering)

In [None]:
"""
Collaborative Filtering (Item-Based)

Note: TMDB dataset doesn't have user ratings, so we use content similarity
as a proxy for collaborative filtering.

Approach:
1. Use TF-IDF on combined text features
2. Compute item-item similarity matrix
3. Recommend movies similar to a given movie
"""

print(" Building collaborative filtering similarity matrix...")
print(" Computing TF-IDF and similarity matrix...")

start_time = time.time()

# TF-IDF over the combined search text: unigrams and bigrams, English stop
# words removed, vocabulary capped at 5000 features
_tfidf_options = {
    'max_features': 5000,
    'stop_words': 'english',
    'ngram_range': (1, 2),
}
tfidf = TfidfVectorizer(**_tfidf_options)
tfidf_matrix = tfidf.fit_transform(df['search_text'])

print(f" TF-IDF matrix shape: {tfidf_matrix.shape}")

# Dense movie-by-movie cosine similarity; memory grows with the square of the
# number of movies
cf_similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

end_time = time.time()
print("\n✅ Collaborative filtering matrix built!")
print(f" Time taken: {end_time - start_time:.2f} seconds")
print(f" Similarity matrix shape: {cf_similarity_matrix.shape}")

# Lookups between a title and its row position, in both directions
title_to_idx = {}
for _position, _title in enumerate(df['title']):
    title_to_idx[_title] = _position
idx_to_title = {_position: _title for _title, _position in title_to_idx.items()}

 Building collaborative filtering similarity matrix...
 Computing TF-IDF and similarity matrix...
 TF-IDF matrix shape: (4800, 5000)

✅ Collaborative filtering matrix built!
 Time taken: 3.47 seconds
 Similarity matrix shape: (4800, 4800)


# Cell 16: Implement Collaborative Filtering Function

In [None]:
"""
Collaborative Filtering Recommendation Function
"""

def collaborative_filtering(movie_title=None, query=None, top_k=10):
    """
    Recommend the movies most similar to an anchor movie.

    The anchor is `movie_title` when given; otherwise the top semantic-search
    hit for `query` is used as the anchor.

    Args:
        movie_title: Specific movie to find similar movies for
        query: Text query (used to pick the anchor when no title is given)
        top_k: Number of recommendations

    Returns:
        Tuple (recommendations, query_time): a list of dicts with 'title',
        'overview', 'genres', 'cast', 'director' and 'similarity', and the
        lookup time in milliseconds. Returns ([], 0) when the anchor title is
        not present in `title_to_idx`.
    """
    start_time = time.time()

    # If only a query is provided, first find the most similar movie
    if query and not movie_title:
        semantic_results, _ = semantic_search(query, top_k=1)
        if semantic_results:
            movie_title = semantic_results[0]['title']

    if movie_title not in title_to_idx:
        return [], 0

    movie_idx = title_to_idx[movie_title]

    # Row of the similarity matrix holding this movie's score against every movie
    similarity_scores = cf_similarity_matrix[movie_idx]

    # Rank best-first, then drop the anchor by index. Dropping "whatever sorted
    # first" is not safe: with tied scores, or an all-zero row, the anchor is
    # not guaranteed to be first, so it could be returned as its own
    # recommendation while a real neighbour is discarded.
    ranked_indices = similarity_scores.argsort()[::-1]
    similar_indices = ranked_indices[ranked_indices != movie_idx][:max(top_k, 0)]

    end_time = time.time()
    query_time = (end_time - start_time) * 1000

    # Format results
    recommendations = []
    for idx in similar_indices:
        movie_data = df.iloc[idx]
        recommendations.append({
            'title': movie_data['title'],
            'overview': movie_data['overview'][:500],
            'genres': movie_data['genres_str'],
            'cast': movie_data['cast_str'],
            'director': movie_data['director'],
            'similarity': similarity_scores[idx]
        })

    return recommendations, query_time

# Test collaborative filtering
print(" Testing collaborative filtering...")
# Use the first movie in the frame as the anchor
test_movie = df['title'].iloc[0]
results, query_time = collaborative_filtering(movie_title=test_movie, top_k=5)

print("\n✅ Collaborative filtering working!")
print(f" Base movie: '{test_movie}'")
print(f" Query time: {query_time:.2f} ms")
print("\n🎬 Top 5 Similar Movies:")
for rank, hit in enumerate(results, start=1):
    percent = hit['similarity'] * 100
    print(f"{rank}. {hit['title']} - Similarity: {percent:.1f}%")

 Testing collaborative filtering...

✅ Collaborative filtering working!
 Base movie: 'Avatar'
 Query time: 0.54 ms

🎬 Top 5 Similar Movies:
1. Aliens - Similarity: 24.3%
2. Guardians of the Galaxy - Similarity: 23.9%
3. Moonraker - Similarity: 23.5%
4. Lost in Space - Similarity: 22.8%
5. Star Trek Beyond - Similarity: 22.7%


# Cell 17: Implement Hybrid Recommendation

In [None]:
"""
Hybrid Recommendation System

Combines semantic search and collaborative filtering:
Hybrid Score = α × Semantic Similarity + (1 - α) × CF Similarity

where α controls the weight (default: 0.7 for semantic, 0.3 for CF)
"""

def hybrid_recommendation(query, top_k=10, alpha=0.7):
    """
    Blend semantic-search and collaborative-filtering scores for a query.

    Each candidate's score is alpha * semantic + (1 - alpha) * CF; a candidate
    found by only one of the two methods scores 0.0 for the other.

    Args:
        query: User search query
        top_k: Number of recommendations
        alpha: Weight for semantic similarity (0-1)

    Returns:
        Tuple (recommendations, query_time): movie dicts whose 'similarity' is
        the hybrid score, with 'semantic_score' and 'cf_score' added, and the
        elapsed time in milliseconds.
    """
    started = time.time()

    # Ask both recommenders for twice as many candidates as will be returned
    pool_size = top_k * 2
    semantic_hits, _ = semantic_search(query, top_k=pool_size)
    cf_hits, _ = collaborative_filtering(query=query, top_k=pool_size)

    # Seed the candidate table with the semantic hits, keyed by title
    candidates = {
        hit['title']: {
            'info': hit,
            'semantic_score': hit['similarity'],
            'cf_score': 0.0,
        }
        for hit in semantic_hits
    }

    # Fold in the CF hits: update a known title, or register a CF-only one
    for hit in cf_hits:
        entry = candidates.setdefault(
            hit['title'],
            {'info': hit, 'semantic_score': 0.0, 'cf_score': hit['similarity']},
        )
        entry['cf_score'] = hit['similarity']

    # Weighted blend of the two scores
    for entry in candidates.values():
        entry['hybrid_score'] = (
            alpha * entry['semantic_score'] + (1 - alpha) * entry['cf_score']
        )

    # Best-first; the sort is stable, so ties keep their insertion order
    best = sorted(
        candidates.values(),
        key=lambda entry: entry['hybrid_score'],
        reverse=True,
    )[:top_k]

    query_time = (time.time() - started) * 1000

    # Copy each movie's info so the source dicts are left untouched
    recommendations = [
        {
            **entry['info'],
            'similarity': entry['hybrid_score'],
            'semantic_score': entry['semantic_score'],
            'cf_score': entry['cf_score'],
        }
        for entry in best
    ]

    return recommendations, query_time

# Test hybrid recommendation
print("🧪 Testing hybrid recommendation...")
test_query = "romantic comedy"
results, query_time = hybrid_recommendation(test_query, top_k=5, alpha=0.7)

print("\n✅ Hybrid recommendation working!")
print(f" Query: '{test_query}'")
print(f" Query time: {query_time:.2f} ms")
print(" Alpha (semantic weight): 0.7")
print("\n🎬 Top 5 Hybrid Results:")
# Each hit shows the blended score followed by its two components
for rank, hit in enumerate(results, start=1):
    hybrid_pct = hit['similarity'] * 100
    semantic_pct = hit['semantic_score'] * 100
    cf_pct = hit['cf_score'] * 100
    print(f"{rank}. {hit['title']}")
    print(f"   Hybrid: {hybrid_pct:.1f}% | Semantic: {semantic_pct:.1f}% | CF: {cf_pct:.1f}%")

🧪 Testing hybrid recommendation...

✅ Hybrid recommendation working!
 Query: 'romantic comedy'
 Query time: 47.01 ms
 Alpha (semantic weight): 0.7

🎬 Top 5 Hybrid Results:
1. 16 to Life
   Hybrid: 46.2% | Semantic: 66.1% | CF: 0.0%
2. Accidental Love
   Hybrid: 43.4% | Semantic: 53.8% | CF: 19.2%
3. About Last Night
   Hybrid: 42.7% | Semantic: 60.9% | CF: 0.0%
4. Date Movie
   Hybrid: 42.6% | Semantic: 60.9% | CF: 0.0%
5. Closer
   Hybrid: 39.0% | Semantic: 55.8% | CF: 0.0%


# Cell 18: Save Processed Data

In [None]:
"""
Save processed data and models for Streamlit app
"""

print(" Saving processed data...")

# Export only the text columns, without the index
_export_columns = ['title', 'overview', 'genres_str', 'keywords_str', 'cast_str', 'director']
df_save = df[_export_columns].copy()
df_save.to_csv('movies_processed.csv', index=False)

# Pickle both title/index lookups
import pickle
_lookups = (
    ('title_to_idx.pkl', title_to_idx),
    ('idx_to_title.pkl', idx_to_title),
)
for _filename, _mapping in _lookups:
    with open(_filename, 'wb') as f:
        pickle.dump(_mapping, f)

# Store the dense similarity matrix in NumPy's .npy format
np.save('cf_similarity_matrix.npy', cf_similarity_matrix)

print("✅ Data saved successfully!")
print(" Files created:")
for _created in ('movies_processed.csv', 'title_to_idx.pkl', 'idx_to_title.pkl', 'cf_similarity_matrix.npy'):
    print("  - " + _created)

 Saving processed data...
✅ Data saved successfully!
 Files created:
  - movies_processed.csv
  - title_to_idx.pkl
  - idx_to_title.pkl
  - cf_similarity_matrix.npy


# Cell 19: Create Streamlit App

In [None]:
# movie_details.py
import streamlit as st
import requests

import os

# The TMDB API read token comes from the TMDB_BEARER environment variable so
# that no credential is stored in the notebook. Set it before running, e.g.
#   os.environ["TMDB_BEARER"] = getpass.getpass("TMDB token: ")
# NOTE(review): the token previously hardcoded on this line should be treated
# as exposed and rotated.
TMDB_BEARER = os.environ.get("TMDB_BEARER", "")
TMDB_BASE = "https://api.themoviedb.org/3"
TMDB_IMAGE_BASE = "https://image.tmdb.org/t/p/w500"
HEADERS = {"Authorization": f"Bearer {TMDB_BEARER}"}

def get_movie_details(title):
    """Search TMDB for a movie by title and fetch its details including credits.

    The first search result is taken as the match.

    Args:
        title: Movie title sent as the search query.

    Returns:
        The decoded details response (with credits appended), or None when the
        search has no results, a request fails or times out, the server
        answers with an HTTP error status, or a response is not valid JSON.
    """
    try:
        search = requests.get(
            f"{TMDB_BASE}/search/movie",
            headers=HEADERS,
            params={"query": title},
            timeout=10,  # seconds; without a timeout a stalled request blocks the app
        )
        search.raise_for_status()
        matches = search.json().get("results")
        if not matches:
            return None

        movie_id = matches[0]["id"]
        response = requests.get(
            f"{TMDB_BASE}/movie/{movie_id}",
            headers=HEADERS,
            params={"append_to_response": "credits"},
            timeout=10,
        )
        # Turn an HTTP error into None so an error payload is never returned as details
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError, KeyError):
        # Network failure, HTTP error status, undecodable body or a result
        # without an id: the caller already treats None as "failed to load".
        return None

def render_movie_details(movie, recommend_fn, display_fn):
    """Render the detail card for one movie followed by similar-movie recommendations.

    Args:
        movie: Dict with at least a 'title' key; the title is used for the
            TMDB lookup and to exclude the movie from its own recommendations.
        recommend_fn: Callable invoked as recommend_fn(overview_text, top_k=6)
            that returns (recommendations, query_time).
        display_fn: Callable invoked as display_fn(recommendation, key) for
            each recommendation.

    Returns:
        None. Shows an error message and stops when the details cannot be loaded.
    """
    import html  # function-scope import: the surrounding cell does not import html

    def esc(value):
        """Return value as text that is safe to embed in HTML markup."""
        return html.escape(str(value))

    st.markdown("<div style='padding-top:0 !important;'></div>", unsafe_allow_html=True)  # remove top padding
    details = get_movie_details(movie["title"])
    if not details:
        st.error("Failed to load movie details")
        return

    # `or` fallbacks cover keys that are present but null in the response
    credit_block = details.get("credits") or {}
    poster = f"{TMDB_IMAGE_BASE}{details['poster_path']}" if details.get("poster_path") else ""
    genres = ", ".join(g["name"] for g in (details.get("genres") or []))
    cast = ", ".join(c["name"] for c in (credit_block.get("cast") or [])[:5])
    director = next(
        (c["name"] for c in (credit_block.get("crew") or []) if c.get("job") == "Director"),
        "N/A"
    )
    title = details.get("title") or movie["title"]
    rating = details.get("vote_average", "N/A")
    runtime = details.get("runtime") or "N/A"
    overview = details.get("overview") or ""

    # ───── Movie Main Card ─────
    # Every value comes from an external API and is rendered with
    # unsafe_allow_html=True, so each one is HTML-escaped before interpolation.
    st.markdown(f"""
    <div style="
        display:flex;
        gap:2rem;
        background:rgba(30,41,59,0.85);
        padding:1.6rem;
        border-radius:18px;
    ">
        <img src="{esc(poster)}" style="width:260px;border-radius:14px;">
        <div style="color:#e5e7eb;">
            <h2>{esc(title)}</h2>
            <p><strong>Genres:</strong> {esc(genres)}</p>
            <p><strong>Rating:</strong> ⭐ {esc(rating)}</p>
            <p><strong>Runtime:</strong> {esc(runtime)} mins</p>
            <p><strong>Director:</strong> {esc(director)}</p>
            <p><strong>Cast:</strong> {esc(cast)}</p>
            <p style="color:#cbd5e1;line-height:1.6;">
                {esc(overview or 'No overview')}
            </p>
        </div>
    </div>
    """, unsafe_allow_html=True)

    # ───── Recommendations ─────
    st.markdown("### 🔁 Recommended Similar Movies")
    # The overview text is the query; the movie itself is filtered out by title
    recs, _ = recommend_fn(overview, top_k=6)
    recs = [r for r in recs if r['title'].lower() != movie["title"].lower()]

    for i, r in enumerate(recs, 1):
        display_fn(r, f"detail_rec_{i}")


In [None]:
%%writefile app.py

import html
import os
import pickle
import time

import chromadb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import streamlit as st
from sentence_transformers import SentenceTransformer

# ────────────────────────────────────────────────
# Page Config
# ────────────────────────────────────────────────
st.set_page_config(
    page_title=" Movie Recommender ",
    page_icon="🎬",
    layout="wide",
    initial_sidebar_state="collapsed"
)

# ────────────────────────────────────────────────
# Page-specific CSS override
# ────────────────────────────────────────────────
# "page" may not be initialised yet on the very first run, hence .get().
page_attr = "details" if st.session_state.get("page") == "details" else "home"

# A <script> tag passed to st.markdown is inserted as inert HTML and never
# executes, so tagging <body> with a data-page attribute from here has no
# effect. Emit the details-page padding override directly instead. The
# `body` prefix raises specificity so this rule wins over the generic
# `.block-container` rule in the theme stylesheet regardless of order.
if page_attr == "details":
    st.markdown(
        "<style>body .block-container { padding-top: 0 !important; }</style>",
        unsafe_allow_html=True,
    )

# ────────────────────────────────────────────────
# Modern Soft-Glam Dark Theme
# ────────────────────────────────────────────────
# Inject the app-wide stylesheet: background image with a gradient overlay,
# container width/padding, heading colours, the .movie-card and
# .score-highlight components, button styling, and radio/slider accents.
# NOTE(review): the body[data-page="details"] rule only matches when <body>
# actually carries that attribute — confirm something sets it.
# NOTE(review): the .stSlider* selectors are assumed to match Streamlit's
# generated markup — verify against the installed Streamlit version.
st.markdown("""
    <style>
    .stApp {
        background: linear-gradient(rgba(15,23,42,0.85), rgba(30,41,59,0.85)),
                    url("https://raw.githubusercontent.com/shaymarin78/Movie-Recommender/main/hero_banner4.jpg");
        background-size: cover;
        background-position: center;
        background-attachment: fixed;
    }

    .block-container { max-width: 1100px; padding-top: 1.5rem !important; }
    body[data-page="details"] .block-container { padding-top: 0 !important; }

    h1, h2, h3 { color: #c7d2fe !important; font-family: 'Segoe UI', sans-serif; }

    .movie-card {
        background: rgba(30, 41, 59, 0.75);
        backdrop-filter: blur(8px);
        border-radius: 16px;
        padding: 1.2rem;
        margin: 0.9rem 0;
        border: 1px solid rgba(99, 102, 241, 0.18);
        transition: all 0.28s ease;
        box-shadow: 0 6px 20px rgba(0,0,0,0.35);
        cursor: pointer;
    }
    .movie-card:hover {
        transform: translateY(-6px);
        border-color: #818cf8;
        box-shadow: 0 12px 32px rgba(99, 102, 241, 0.25);
    }
    .score-highlight {
        font-size: 1.35rem;
        font-weight: 700;
        color: #a5b4fc;
        background: rgba(99, 102, 241, 0.12);
        padding: 0.35rem 0.8rem;
        border-radius: 12px;
        display: inline-block;
    }
    .stButton > button {
        background: linear-gradient(90deg, #6366f1, #818cf8) !important;
        color: white !important;
        border: none !important;
        border-radius: 12px !important;
        padding: 0.6rem 1.4rem !important;
        font-weight: 600 !important;
        transition: all 0.2s;
    }
    .stButton > button:hover { transform: scale(1.04); box-shadow: 0 4px 15px rgba(99,102,241,0.4) !important; }
    .metric-box { background: rgba(30,41,59,0.6); border-radius: 14px; padding: 1.1rem; text-align: center; border: 1px solid #334155; }
    hr { border-color: #334155 !important; }
    div[data-testid="stRadio"] [role="radio"][aria-checked="true"] > div > div { background-color: #6366f1 !important; border-color: #818cf8 !important; }
    div[data-testid="stSlider"] .stSliderTrack { background: linear-gradient(to right, #4f46e5, #818cf8) !important; }
    div[data-testid="stSlider"] .stSliderThumb { background-color: #6366f1 !important; border-color: #a5b4fc !important; }
    div[data-testid="stSlider"] .stSliderValue { color: #c7d2fe !important; }
    .recommendations-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(240px, 1fr)); gap: 1rem; }
    </style>
""", unsafe_allow_html=True)

# ────────────────────────────────────────────────
# TMDB API Config
# ────────────────────────────────────────────────
# SECURITY: an API credential must not live in source control. Prefer setting
# the TMDB_BEARER environment variable before launching the app. The literal
# below is kept only as a backward-compatible fallback; it is already exposed
# in this notebook and should be revoked and rotated.
TMDB_BEARER = os.environ.get("TMDB_BEARER") or (
    "eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiIxNTI0NjY1ZGY4YzI5NWU3YzFlZDg1YjQwMDQ2MTg1YyIsIm5iZiI6MTc0NjA0NTgzMC4xMDYsInN1YiI6IjY4MTI4Yjg2MTE1YjkyYTczMmEwZWJhZCIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.iuBlFIRD2TRTPWN1BF7MiJopk3IaAe51zo6mX8q52oM"
)
TMDB_BASE = "https://api.themoviedb.org/3"
TMDB_IMAGE_BASE = "https://image.tmdb.org/t/p/w500"

def get_tmdb_poster(title, timeout=5):
    """Search TMDB for ``title`` and return the poster URL of the first hit.

    This is a best-effort lookup: any network, HTTP or decoding problem yields
    ``None`` so that a missing poster never breaks page rendering.

    Args:
        title: Movie title sent as the TMDB search query.
        timeout: Seconds to wait for TMDB before giving up. Without a timeout
            a stalled connection would block the whole script run.

    Returns:
        The full image URL (``TMDB_IMAGE_BASE`` + poster path), or ``None``
        when the request fails, there are no results, or the first result
        has no poster.
    """
    url = f"{TMDB_BASE}/search/movie"
    headers = {"Authorization": f"Bearer {TMDB_BEARER}"}
    params = {"query": title, "language": "en-US", "include_adult": False, "page": 1}
    try:
        res = requests.get(url, headers=headers, params=params, timeout=timeout)
        res.raise_for_status()  # treat 4xx/5xx (bad token, rate limit) as a failed lookup
        data = res.json()
    except (requests.RequestException, ValueError):
        # Only expected request/JSON failures are absorbed; a bare `except`
        # would also hide KeyboardInterrupt and genuine programming errors.
        return None

    results = data.get("results") if isinstance(data, dict) else None
    if not isinstance(results, list) or not results:
        return None

    first_hit = results[0]
    poster_path = first_hit.get("poster_path") if isinstance(first_hit, dict) else None
    return f"{TMDB_IMAGE_BASE}{poster_path}" if poster_path else None

# ────────────────────────────────────────────────
# Session state defaults
# ────────────────────────────────────────────────
# Seed every session key the app reads with its initial value, leaving any
# value already stored by an earlier run untouched.
_SESSION_DEFAULTS = {
    'results': None,
    'query_time': None,
    'search_performed': False,
    'method_used': None,
    'page': "home",
    'selected_movie': None,
}
for _state_key, _initial_value in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

# ────────────────────────────────────────────────
# Load resources
# ────────────────────────────────────────────────
@st.cache_resource(show_spinner="Loading intelligence...")
def load_resources():
    """Load the model, data and indexes the recommenders need.

    Decorated with ``st.cache_resource`` so the work is not repeated on every
    script rerun.

    Returns:
        A tuple ``(model, df, title_to_idx, cf_sim, collection)``:
        the sentence-embedding model, the processed movies DataFrame, the
        title → row-index mapping, the collaborative-filtering similarity
        matrix, and the ChromaDB collection named ``movie_embeddings``.
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')
    df = pd.read_csv('movies_processed.csv')
    # Fill missing values BEFORE casting: astype(str) would first turn NaN
    # into the literal string 'nan', leaving fillna('') with nothing to fill.
    df['overview'] = df['overview'].fillna('').astype(str)
    # NOTE: pickle.load can execute arbitrary code; only load artifacts
    # produced by this notebook, never files from an untrusted source.
    with open('title_to_idx.pkl', 'rb') as f:
        title_to_idx = pickle.load(f)
    # idx_to_title.pkl is intentionally no longer read: its contents were
    # loaded and immediately discarded.
    cf_sim = np.load('cf_similarity_matrix.npy')
    client = chromadb.PersistentClient(path="/content/chroma_db")
    collection = client.get_collection("movie_embeddings")
    return model, df, title_to_idx, cf_sim, collection

model, df, title_to_idx, cf_similarity_matrix, collection = load_resources()

# ────────────────────────────────────────────────
# Recommendation Functions (semantic · collaborative · hybrid)
# ────────────────────────────────────────────────
def semantic_search(query, top_k=10):
    """Embed ``query`` and fetch its nearest movies from the vector collection.

    Args:
        query: Free-text description of what the user wants to watch.
        top_k: Number of neighbours requested from the collection.

    Returns:
        ``(recs, elapsed_ms)`` where ``recs`` is a list of dicts with keys
        ``title``, ``overview``, ``genres``, ``cast``, ``director`` and
        ``similarity`` (computed as ``1 - distance``), and ``elapsed_ms`` is
        the wall-clock duration of the call in milliseconds.
    """
    started_at = time.time()
    query_vector = model.encode([query])[0].tolist()
    hits = collection.query(query_embeddings=[query_vector], n_results=top_k)

    def _as_record(position, meta):
        # Missing metadata fields fall back to human-readable placeholders.
        return {
            'title': meta.get('title', 'Unknown'),
            'overview': meta.get('overview', 'No overview available'),
            'genres': meta.get('genres', 'N/A'),
            'cast': meta.get('cast', 'N/A'),
            'director': meta.get('director', 'N/A'),
            'similarity': 1 - hits['distances'][0][position],
        }

    records = [_as_record(pos, meta) for pos, meta in enumerate(hits['metadatas'][0])]
    elapsed_ms = (time.time() - started_at) * 1000
    return records, elapsed_ms

def collaborative_filtering(query, top_k=10):
    """Recommend movies similar to the best semantic match for ``query``.

    The query is first resolved to a single seed movie via ``semantic_search``;
    the seed's row of the collaborative-filtering similarity matrix is then
    ranked to pick the neighbours.

    Args:
        query: Free-text description used to find the seed movie.
        top_k: Maximum number of neighbours to return.

    Returns:
        ``(recs, elapsed_ms)``. ``recs`` is a list of dicts with keys
        ``title``, ``overview``, ``genres``, ``cast``, ``director`` and
        ``similarity``; it never contains the seed movie itself. Returns
        ``([], 0)`` when no seed is found or the seed title is not indexed.
    """
    # Time the whole call (seed lookup included) so the figure is comparable
    # with the other methods' end-to-end timings.
    start = time.time()
    sem_res, _ = semantic_search(query, 1)
    if not sem_res:
        return [], 0
    title = sem_res[0]['title']
    if title not in title_to_idx:
        return [], 0

    idx = title_to_idx[title]
    scores = cf_similarity_matrix[idx]
    ranked = scores.argsort()[::-1]
    # Drop the seed by index rather than assuming it sorts first: when another
    # movie ties with the seed's self-similarity, slicing off position 0 could
    # discard a real neighbour and return the seed itself.
    sim_idx = ranked[ranked != idx][:top_k]

    recs = []
    for i in sim_idx:
        row = df.iloc[i]
        recs.append({
            'title': row['title'],
            'overview': row['overview'],
            'genres': row.get('genres', 'N/A'),
            'cast': row.get('cast', 'N/A'),
            'director': row.get('director', 'N/A'),
            'similarity': float(scores[i])  # plain float, like the other methods
        })
    return recs, (time.time() - start) * 1000

def hybrid_recommendation(query, top_k=10, alpha=0.7):
    """Blend semantic and collaborative scores into one ranked list.

    Both recommenders are asked for ``top_k * 4`` candidates; each candidate's
    final score is ``alpha * semantic + (1 - alpha) * collaborative``, where a
    score missing from one of the two lists counts as 0.

    Args:
        query: Free-text description of what the user wants to watch.
        top_k: Number of recommendations to return.
        alpha: Weight of the semantic score; the collaborative score gets
            ``1 - alpha``.

    Returns:
        ``(recs, elapsed_ms)``. Each rec is the candidate's info dict with
        ``similarity`` replaced by the blended score. ``elapsed_ms`` covers the
        whole call, including both underlying searches.
    """
    # Start the clock before retrieval: timing only the score fusion reported
    # a near-zero latency that was not comparable with the other methods.
    start = time.time()
    sem, _ = semantic_search(query, top_k*4)
    cf, _  = collaborative_filtering(query, top_k*4)

    scores = {}
    for m in sem:
        scores[m['title']] = {'sem': m['similarity'], 'cf': 0.0, 'info': m}
    for m in cf:
        # Candidates found only by CF keep a semantic score of 0.
        entry = scores.setdefault(m['title'], {'sem': 0.0, 'cf': 0.0, 'info': m})
        entry['cf'] = m['similarity']

    for entry in scores.values():
        entry['hybrid'] = alpha * entry['sem'] + (1 - alpha) * entry['cf']

    # sorted() is stable, so ties keep first-seen (semantic-first) order.
    ranked = sorted(scores.values(), key=lambda e: e['hybrid'], reverse=True)[:top_k]
    # Build fresh dicts so the source records are never mutated.
    recs = [{**e['info'], 'similarity': e['hybrid']} for e in ranked]
    return recs, (time.time() - start) * 1000
# ────────────────────────────────────────────────
# Display Movie Card
# ────────────────────────────────────────────────
def display_movie_card(movie, rank, context="home"):
    """Render one recommendation card followed by a "View details" button.

    Args:
        movie: Recommendation dict; must contain ``title`` and ``similarity``
            and may contain ``genres``, ``cast``, ``director``, ``overview``.
        rank: 1-based position shown in the heading and used in the widget key.
        context: Prefix for the button key so the same movie can be listed on
            more than one page without a key clash.
    """
    def _safe(value):
        # Catalogue text is untrusted for HTML purposes: escape it before it
        # is placed into markup rendered with unsafe_allow_html=True.
        return html.escape(str(value))

    overview = str(movie.get('overview', 'No overview available'))
    snippet = overview[:240] + ('...' if len(overview) > 240 else '')

    poster_url = get_tmdb_poster(movie['title'])
    poster_tag = f'<img src="{html.escape(poster_url, quote=True)}" width="120">' if poster_url else ''

    # The card is emitted as a single line. In the multi-line version a missing
    # poster left a whitespace-only line, which ends a markdown HTML block and
    # causes the rest of the card to be rendered as literal text.
    card_markup = "".join([
        '<div class="movie-card">',
        f'<h3 style="margin:0 0 0.6rem 0;">{rank}. {_safe(movie["title"])}</h3>',
        f'<div class="score-highlight">{movie["similarity"]:.3f}</div>',
        '<div style="display:flex; gap:1rem; margin-top:0.6rem;">',
        poster_tag,
        '<div>',
        '<p style="margin:0.8rem 0 0.4rem 0; color:#cbd5e1;">',
        f'<strong>Genres:</strong> {_safe(movie.get("genres", "N/A"))}<br>',
        f'<strong>Cast:</strong> {_safe(movie.get("cast", "N/A"))}<br>',
        f'<strong>Director:</strong> {_safe(movie.get("director", "N/A"))}',
        '</p>',
        f'<p style="color:#94a3b8; line-height:1.45;">{_safe(snippet)}</p>',
        '</div>',
        '</div>',
        '</div>',
    ])
    with st.container():
        st.markdown(card_markup, unsafe_allow_html=True)

    # Rank is part of the key: two results can share a title, and duplicate
    # widget keys make Streamlit raise.
    btn_key = f"{context}_{rank}_{movie['title']}"
    if st.button("View details", key=btn_key):
        st.session_state.selected_movie = movie
        st.session_state.page = "details"
        # Rerun so the script restarts and draws the details page at once;
        # st.stop() only halted this run and left a truncated home page.
        st.rerun()


# ────────────────────────────────────────────────
# HOME PAGE LOGIC
# ────────────────────────────────────────────────
if st.session_state.page == "home":

    st.title("🎬 Movie Recommender")
    st.markdown("**Discover your next favorite movie** — Semantic · Collaborative · Hybrid")

    # ── Control row: method · result count · hybrid weight · search button ──
    with st.container():
        method_col, count_col, weight_col, button_col = st.columns([2.2, 1.1, 1.3, 1.1])

        with method_col:
            method = st.radio("Method", ["Semantic", "Collaborative", "Hybrid"], horizontal=True, label_visibility="collapsed")

        with count_col:
            top_k_options = [4, 6, 8, 10, 12, 14, 16, 20, 30]
            top_k = st.selectbox("Results count", top_k_options, index=2, label_visibility="collapsed")

        with weight_col:
            # The weight slider is only offered for the hybrid method;
            # otherwise the default weight is used.
            if method == "Hybrid":
                alpha = st.slider("α (Semantic weight)", 0.0, 1.0, 0.7, 0.05, label_visibility="collapsed")
            else:
                alpha = 0.7

        with button_col:
            st.write("")
            search_trigger = st.button("Find Movies", type="primary", use_container_width=True)

    query = st.text_input(
        "What are you in the mood for?",
        placeholder="space adventure with aliens • dark thriller like Inception • feel-good romance",
        key="main_query",
        label_visibility="collapsed",
    )

    # ── One-click example queries ──
    examples = [
        "space adventure with aliens",
        "dark  thriller about dreams",
        "romantic comedy Leonardo DiCaprio",
        "Marvel style superhero action",
        "historical war drama"
    ]
    st.markdown("**Quick tries:**")
    ex_cols = st.columns(5)
    for slot, (example_col, sample) in enumerate(zip(ex_cols, examples)):
        if example_col.button(sample, key=f"ex_{slot}", use_container_width=True):
            # Clicking an example behaves like typing it and pressing search.
            query = sample
            search_trigger = True

    # ── Run all three recommenders and remember their output ──
    if search_trigger and query.strip():
        with st.spinner("Computing recommendations for all methods..."):
            sem_results, sem_time = semantic_search(query, top_k)
            cf_results, cf_time = collaborative_filtering(query, top_k)
            hyb_results, hyb_time = hybrid_recommendation(query, top_k, alpha)

            # Every method's output is stored for the comparison view.
            runs_by_method = {
                "Semantic": {"results": sem_results, "time": sem_time},
                "Collaborative": {"results": cf_results, "time": cf_time},
                "Hybrid": {"results": hyb_results, "time": hyb_time},
            }
            st.session_state.results_all = runs_by_method

            # The method chosen in the radio decides what is listed below.
            chosen_run = runs_by_method[method]
            st.session_state.results = chosen_run["results"]
            st.session_state.query_time = chosen_run["time"]
            st.session_state.method_used = method
            st.session_state.search_performed = True
            st.session_state.last_query = query

    # ── Result list ──
    if st.session_state.search_performed:
        if st.session_state.results:
            st.markdown("---")
            st.subheader(f"Results for: **{st.session_state.last_query}**  ·  {st.session_state.method_used}")
            m1, m2, m3 = st.columns(3)
            m1.metric("Speed", f"{st.session_state.query_time:.1f} ms")
            m2.metric("Found", len(st.session_state.results))
            avg = np.mean([m['similarity'] for m in st.session_state.results])
            m3.metric("Avg Match", f"{avg:.3f}")

            for i, mov in enumerate(st.session_state.results, 1):
                display_movie_card(mov, i, context="home")
        else:
            st.info("No strong matches. Try rephrasing or switch method.")

    st.markdown("<br><small style='color:#64748b;'>Semantic Search (ChromaDB + Sentence Transformers) · Collaborative Filtering · Hybrid Approach<br>Built with Streamlit |2025</small>", unsafe_allow_html=True)




# ────────────────────────────────────────────────
# Evaluation & Comparison Tab (Dynamic)
# ────────────────────────────────────────────────
# Minimum similarity for a result to count as "relevant" in the proxy
# precision/recall figures computed below.
RELEVANCE_THRESHOLD = 0.25


def _genre_names(raw_genres):
    """Return the individual genre names held in one result's ``genres`` field.

    Accepts either a collection of names or a delimited string. Comma and pipe
    are treated as separators — NOTE(review): the exact format stored in the
    metadata / processed CSV is not visible here; adjust if another separator
    is used.
    """
    if isinstance(raw_genres, (list, tuple, set)):
        parts = raw_genres
    else:
        parts = str(raw_genres).replace("|", ",").split(",")
    names = (str(part).strip(" []'\"") for part in parts)
    return [name for name in names if name and name.lower() not in ("n/a", "nan")]


# The comparison section belongs to the home view; without this guard it was
# also drawn above the details page.
if st.session_state.page == "home":
    st.markdown("---")

    tab_results, tab_eval = st.tabs(["Results", "Evaluation & Comparison"])

    with tab_eval:
        st.subheader("Method Comparison & Metrics")

        if st.session_state.search_performed and "results_all" in st.session_state:
            data = {
                "Method": [], "Avg Similarity": [], "Query Time (ms)": [],
                "Precision@10": [], "Recall@10": [], "Diversity": [], "Cold-start handling": []
            }

            for m_name, m_data in st.session_state.results_all.items():
                res = m_data['results']
                qtime = m_data['time']

                if res:
                    sim_scores = np.array([r['similarity'] for r in res], dtype=float)
                    avg_sim = float(sim_scores.mean())

                    K = min(10, len(res))
                    top_k_scores = np.sort(sim_scores)[-K:]

                    # Proxy precision@K: share of the K best scores that reach
                    # RELEVANCE_THRESHOLD (there is no ground-truth relevance).
                    precision_at_k = float((top_k_scores >= RELEVANCE_THRESHOLD).sum() / K)

                    # Proxy recall@K: share of ALL returned scores that reach
                    # RELEVANCE_THRESHOLD.
                    recall_at_k = float((sim_scores >= RELEVANCE_THRESHOLD).sum() / len(sim_scores))

                    # Diversity: number of distinct genre names across results.
                    # Iterating the raw field directly would walk a string
                    # character by character and count letters, not genres.
                    all_genres = [g for r in res for g in _genre_names(r.get('genres', ''))]
                    unique_genres = len(set(all_genres))
                    if unique_genres > 4:
                        diversity = "High"
                    elif unique_genres >= 2:
                        diversity = "Medium"
                    else:
                        diversity = "Low"

                    # Cold-start label is derived from the proxy precision.
                    if precision_at_k >= 0.7:
                        cold_start = "Good"
                    elif precision_at_k >= 0.5:
                        cold_start = "Medium"
                    else:
                        cold_start = "Poor"
                else:
                    avg_sim = precision_at_k = recall_at_k = 0
                    diversity = "Low"
                    cold_start = "Poor"

                data["Method"].append(m_name)
                data["Avg Similarity"].append(avg_sim)
                data["Query Time (ms)"].append(qtime)
                data["Precision@10"].append(precision_at_k)
                data["Recall@10"].append(recall_at_k)
                data["Diversity"].append(diversity)
                data["Cold-start handling"].append(cold_start)

            # Summary table with the best value per metric highlighted
            df_metrics = pd.DataFrame(data)
            highlight_cols = ["Avg Similarity", "Precision@10", "Recall@10"]
            st.dataframe(df_metrics.style.highlight_max(subset=highlight_cols, color="#6366f140"),
                         use_container_width=True)

            # Grouped bar chart: one group per method, one bar per metric
            fig, ax = plt.subplots(figsize=(8,4))
            x = np.arange(len(df_metrics))
            width = 0.25
            ax.bar(x - width, df_metrics["Precision@10"], width, label="Precision@10", color="#6366f1")
            ax.bar(x, df_metrics["Recall@10"], width, label="Recall@10", color="#a5b4fc")
            ax.bar(x + width, df_metrics["Avg Similarity"], width, label="Avg Similarity", color="#c7d2fe")
            ax.set_xticks(x)
            ax.set_xticklabels(df_metrics["Method"])
            ax.set_ylim(0, 1)
            ax.set_ylabel("Score")
            ax.set_title("Evaluation Metrics for Last Search")
            ax.legend()
            ax.grid(axis='y', alpha=0.3)
            st.pyplot(fig)
            # The script reruns on every interaction; close the figure so
            # pyplot does not accumulate open figures across reruns.
            plt.close(fig)

            st.caption("""
Precision & Recall are calculated **method-wise using each method's own similarity scores**:
- Semantic → similarity from semantic search
- Collaborative → similarity from CF results
- Hybrid → weighted similarity (hybrid score)
This avoids all-zero or all-one issues and reflects actual performance per method.
""")
        else:
            st.info("Metrics will appear here after performing a search.")

#st.markdown("<br><small style='color:#64748b;'>Semantic Search (ChromaDB + Sentence Transformers) · Collaborative Filtering · Hybrid Approach<br>Built with Streamlit |2025</small>", unsafe_allow_html=True)

# ────────────────────────────────────────────────
# DETAILS PAGE LOGIC
# ────────────────────────────────────────────────
if st.session_state.page == "details":
    st.markdown("<div style='margin-top:20px;'>", unsafe_allow_html=True)  # Add top margin
    if st.button("← Back"):
        st.session_state.page = "home"
        st.session_state.selected_movie = None
        # Rerun so the home page is drawn immediately; st.stop() only halted
        # this run and left the page blank below the button.
        st.rerun()
    st.markdown("</div>", unsafe_allow_html=True)

    movie = st.session_state.selected_movie
    if movie is None:
        # Nothing selected (stale or inconsistent session state): fall back to
        # the home page instead of failing on movie["title"].
        st.session_state.page = "home"
        st.rerun()

    st.subheader(movie["title"])
    poster = get_tmdb_poster(movie["title"])
    if poster:
        st.image(poster, width=260)

    st.markdown(f"""
    **Genres:** {movie.get("genres","N/A")}
    **Cast:** {movie.get("cast","N/A")}
    **Director:** {movie.get("director","N/A")}
    **Overview:**
    {movie.get("overview","")}
    """)

    st.markdown("---")
    st.markdown("### 🔁 Recommended based on this movie")

    # The selected movie's own overview is used as the query text.
    recs, _ = hybrid_recommendation(movie.get("overview",""), top_k=6)
    recs = [r for r in recs if r['title'] != movie['title']]  # remove current movie

    # Wrap recommendations in a grid container
    st.markdown('<div class="recommendations-grid">', unsafe_allow_html=True)
    for i, r in enumerate(recs, 1):
        display_movie_card(r, i, context="details")
    st.markdown('</div>', unsafe_allow_html=True)



Overwriting app.py


In [None]:
# Kill any process whose command line matches "streamlit" or "cloudflared",
# i.e. a server/tunnel left over from an earlier launch of the next cell.
!pkill -f streamlit
!pkill -f cloudflared

# Cell 20: Launch Streamlit App

In [None]:
# Cell 20: Launch Streamlit App with Public URL (Cloudflared - NO PASSWORD)

print("🚀 Launching Streamlit App with Cloudflared (No Password)...")

import os, time, subprocess, re
from IPython.display import display, HTML

# Check app.py
if not os.path.exists("app.py"):
    raise FileNotFoundError("❌ app.py not found. Please run the previous cell.")

print("✅ app.py found")

# Install cloudflared
print(" Installing cloudflared...")
!wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared
!chmod +x cloudflared
!mv cloudflared /usr/local/bin/

# Start Streamlit
print("🌟 Starting Streamlit server...")
get_ipython().system_raw("streamlit run app.py --server.port 8501 &")

time.sleep(10)

# Start Cloudflared tunnel
print(" Creating public URL...")
get_ipython().system_raw("cloudflared tunnel --url http://localhost:8501 --no-autoupdate > cloudflared.log 2>&1 &")

time.sleep(8)

# Extract URL
with open("cloudflared.log") as f:
    log = f.read()

match = re.search(r"https://[-\w]+\.trycloudflare\.com", log)

if match:
    public_url = match.group(0)
    print("="*80)
    print(" STREAMLIT APP IS LIVE (NO PASSWORD REQUIRED)")
    print("="*80)
    print(f" PUBLIC URL: {public_url}")
    print("="*80)

    display(HTML(
        f'<h2>🎬 <a href="{public_url}" target="_blank">'
        'Click Here to Open Movie Recommendation App</a></h2>'
    ))
else:
    print("❌ Failed to generate URL. Check cloudflared.log")


🚀 Launching Streamlit App with Cloudflared (No Password)...
✅ app.py found
 Installing cloudflared...
🌟 Starting Streamlit server...
 Creating public URL...
 STREAMLIT APP IS LIVE (NO PASSWORD REQUIRED)
 PUBLIC URL: https://answer-tion-cooked-trio.trycloudflare.com
