In [59]:
# The '!' allows you to run terminal commands inside the notebook
# The '--user' ensures it bypasses permission issues
!pip install -U sentence-transformers spacytextblob scikit-learn



In [2]:
# import spacy
# import pandas as pd
# from spacytextblob.spacytextblob import SpacyTextBlob
# import ast
# import pickle
import pandas as pd
import numpy as np
import ast
import pickle
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


In [104]:
# 1. Initialize NLP Tools
print("Initializing spaCy...")
try:
    nlp = spacy.load("en_core_web_sm")
except:
    import os
    os.system("python -m spacy download en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

nlp.add_pipe('spacytextblob')

Initializing spaCy...


<spacytextblob.spacytextblob.SpacyTextBlob at 0x171355850>

In [105]:
# 2. Data Cleaning
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')
df = movies.merge(credits, on='title')

In [106]:
def convert_list(obj):
    return [i['name'] for i in ast.literal_eval(obj)]

In [107]:
df['genres'] = df['genres'].apply(convert_list)

In [108]:
df['keywords'] = df['keywords'].apply(convert_list)

In [109]:
df['cast'] = df['cast'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)[:3]])

In [110]:
# 3. NLP Feature Extraction
def process_text(text):
    if pd.isna(text): return "", 0.0
    doc = nlp(text)
    # Get lemmas and remove noise
    clean = " ".join([t.lemma_.lower() for t in doc if not t.is_stop and not t.is_punct])
    return clean, doc._.blob.polarity

In [111]:
print("Analyzing movie tones and cleaning text...")
nlp_data = df['overview'].apply(process_text)
df['cleaned_overview'] = [x[0] for x in nlp_data]
df['tone_score'] = [x[1] for x in nlp_data]

Analyzing movie tones and cleaning text...


In [112]:
df.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,status,tagline,title,vote_average,vote_count,movie_id,cast,crew,cleaned_overview,tone_score
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",http://www.avatarmovie.com/,19995,"[culture clash, future, space war, space colon...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",22nd century paraplegic marine dispatch moon p...,0.041667


In [113]:
# Create the search soup
df['soup'] = df['cleaned_overview'] + " " + \
             df['genres'].apply(lambda x: " ".join(x)) + " " + \
             df['keywords'].apply(lambda x: " ".join(x))

In [114]:
# 4. Generate Semantic Vectors
print("Generating movie embeddings (this powers the 'smart' search)...")
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['soup'].tolist(), show_progress_bar=True)

Generating movie embeddings (this powers the 'smart' search)...


Batches:   0%|          | 0/151 [00:00<?, ?it/s]

In [124]:
def semantic_search_test(query):
    query_vec = model.encode([query])
    similarity = cosine_similarity(query_vec, embeddings).flatten()
    
    test_df = df[['title', 'tone_score']].copy()
    test_df['similarity'] = similarity
    
    # Custom Logic: If 'sad' or 'dark' is in query, prioritize lower tone_scores
    if any(word in query.lower() for word in ['sad', 'dark', 'past']):
        test_df['similarity'] = test_df['similarity'] * (1 - test_df['tone_score'])
        
    return test_df.sort_values(by='similarity', ascending=False).head(10)

In [146]:
def smart_search(query, top_n=10):
    # 1. Get Semantic Similarity (Using YOUR variable name 'model')
    query_vec = model.encode([query])
    similarity = cosine_similarity(query_vec, embeddings).flatten()
    
    # 2. Prepare Results DataFrame (Using YOUR variable name 'df')
    # We create a copy to avoid modifying the original data
    test_df = df[['title', 'tone_score', 'popularity']].copy()
    test_df['similarity'] = similarity
    
    # --- STEP A: KILL THE NOISE (The Threshold) ---
    # This ensures irrelevant movies like 'Minions' don't win just by popularity
    test_df = test_df[test_df['similarity'] > 0.30] 
    
    # --- STEP B: TONE & VIBE LOGIC ---
    # We combine the "Happy/Sad" and "Animated/Kids" logic here
    query_lower = query.lower()
    
    if any(word in query_lower for word in ['happy', 'fun', 'cheerful', 'animated', 'kids']):
        # Boost movies with POSITIVE sentiment
        test_df['similarity'] = test_df['similarity'] * (1 + test_df['tone_score'])
    elif any(word in query_lower for word in ['sad', 'dark', 'depressing']):
        # Boost movies with NEGATIVE sentiment
        test_df['similarity'] = test_df['similarity'] * (1 - test_df['tone_score'])

    # --- STEP C: NORMALIZATION & FINAL SCORE ---
    # Normalize popularity to a 0-1 scale
    test_df['pop_score'] = test_df['popularity'] / test_df['popularity'].max()
    
    # Final Calculation: 90% text match, 10% popularity
    # This matches the 'Favor Text Match heavily' logic from your top function
    test_df['final_score'] = (test_df['similarity'] * 0.9) + (test_df['pop_score'] * 0.1)
    
    return test_df.sort_values(by='final_score', ascending=False).head(top_n)

In [125]:
print(semantic_search_test("animated movies"))

                                                  title  tone_score  \
4475  Aqua Teen Hunger Force Colon Movie Film for Th...    0.175000   
4407                                The Helix... Loaded    0.000000   
2687                         Jonah: A VeggieTales Movie    0.247222   
3453                                       Mary Poppins    0.325902   
1432                                            Valiant    0.025000   
2200                                     Disaster Movie    0.015000   
152                                     Kung Fu Panda 3    0.159028   
1057                                      Scary Movie 2    0.011667   
348                      Ice Age: Dawn of the Dinosaurs    0.312121   
1863                                    Rugrats Go Wild    0.132359   

      similarity  
4475    0.566932  
4407    0.537386  
2687    0.530996  
3453    0.527979  
1432    0.524043  
2200    0.516951  
152     0.516328  
1057    0.501517  
348     0.495737  
1863    0.491520  


In [1]:
# Try the new search
print(smart_search("animated"))

NameError: name 'smart_search' is not defined

In [119]:
import pickle

# 1. Ensure you are using the dataframe that has the NLP columns
# (title, tone_score, popularity, and soup)
processed_movies = df[['movie_id', 'title', 'tone_score', 'popularity', 'vote_average', 'soup']]

# 2. Save it as a tuple: (Dataframe, Embeddings)
# This matches your previous 'movie_data.pkl' structure
with open('smart_movie_models.pkl', 'wb') as file:
    pickle.dump((processed_movies, embeddings), file)

print("Saved successfully! The .pkl contains my movie metadata and NLP vectors.")

Saved successfully! The .pkl contains your movie metadata and NLP vectors.
