# 🎬 Movie Subtitle Semantic Search Engine

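This notebook demonstrates a search engine over movie subtitles: each subtitle file is converted to a TF-IDF vector, and queries are ranked against those vectors by cosine similarity. Because TF-IDF matches on shared terms, the results approximate semantic search through word overlap rather than through learned meaning.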

## Features:
- Load movie subtitle data from SQLite database
- Text preprocessing and cleaning
- TF-IDF vectorization (single words and word pairs) to represent each subtitle file as a weighted term vector
- Interactive search functionality
- Results ranking by similarity scores

## 1. Import Required Libraries

In [None]:
import os
import warnings
import sqlite3
import zipfile
import io
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings for cleaner output
# NOTE(review): called without a category, this filter hides *every* warning for
# the rest of the session, including deprecation warnings raised by the libraries
# used in later cells. Consider narrowing it if a cell misbehaves silently.
warnings.filterwarnings('ignore')

# Status line confirming the import cell ran to completion
print("✅ All libraries imported successfully!")

## 2. Database Connection and Data Loading

In [None]:
# Database path - uses local file in current directory
database_path = "eng_subtitles_database.db"

# Start from an empty, correctly-shaped frame so that a failed load leaves `df`
# defined (and never silently reuses a stale `df` from an earlier run of this cell).
df = pd.DataFrame(columns=['subtitle_id', 'subtitle_name', 'subtitle_content'])

if not os.path.isfile(database_path):
    # Do not fall through to sqlite3.connect(): it would create an empty database
    # file at this path and then fail with a misleading "no such table" error.
    print("❌ Database not found. Make sure 'eng_subtitles_database.db' is in the current directory.")
else:
    print(f"✅ Database found: {database_path}")

    # Connect to database and load data
    conn = None
    try:
        conn = sqlite3.connect(database_path)
        cursor = conn.cursor()

        # Each row of `zipfiles` holds an id, a file name and the zipped subtitle blob
        cursor.execute("SELECT num, name, content FROM zipfiles")
        rows = cursor.fetchall()

        # Rename the database columns to the names used by the rest of the notebook
        df = pd.DataFrame(rows, columns=['subtitle_id', 'subtitle_name', 'subtitle_content'])

        print(f"✅ Loaded {len(df)} subtitle files from database")

    except Exception as e:
        print(f"❌ Error loading data: {e}")

    finally:
        # Close the connection even when the query fails, so the file handle is released
        if conn is not None:
            conn.close()

In [None]:
# Summarize the loaded frame (dimensions and column names), then preview its first rows
dataset_overview = (
    "Dataset Information:",
    f"Shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    "\nFirst few entries:",
)
print(*dataset_overview, sep="\n")

# Bare last expression so the notebook renders the preview as a rich table
df.head()

## 3. Data Preprocessing

In [None]:
def extract_zip_content(content):
    """Return the decoded text of the first file stored in a zipped subtitle blob.

    Args:
        content: Raw bytes of a zip archive.

    Returns:
        The first non-directory member of the archive decoded as latin-1,
        or "" when the input is not a readable zip archive or the archive
        contains no files.
    """
    try:
        with zipfile.ZipFile(io.BytesIO(content), "r") as archive:
            for member in archive.infolist():
                # Directory entries carry no text; keep looking for a real file
                if member.is_dir():
                    continue
                # latin-1 maps every byte to a character, so decoding never fails
                return archive.read(member).decode("latin-1")
    except Exception:
        # Best-effort extraction: a corrupt or unsupported blob yields no text.
        # `Exception` (not a bare except) so Ctrl-C can still interrupt a long run.
        return ""

    # Archive opened fine but held no files
    return ""

def clean_text(text):
    """Normalize raw subtitle text into lowercase words separated by single spaces.

    Timestamp lines, line breaks, HTML-style tags and every character that is
    neither an ASCII letter nor whitespace are stripped. Falsy input gives "".
    """
    if not text:
        return ""

    # Ordered (pattern, replacement) passes. Order matters: timestamp lines are
    # matched together with their trailing newline, so they must be removed
    # before newlines are turned into spaces.
    substitutions = (
        (r'\d{1,2}:\d{2}:\d{2},\d{3} --> \d{1,2}:\d{2}:\d{2},\d{3}\r?\n', ''),  # timestamps
        (r'\r?\n', ' '),            # line breaks
        (r'<[^>]+>', ''),           # HTML tags
        (r'[^a-zA-Z\s]', ''),       # anything but letters and whitespace
        (r'\s+', ' '),              # runs of whitespace
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.lower().strip()

print("✅ Text processing functions defined")

In [None]:
# Process the subtitle content
print("Processing subtitle content...")

# Extract zip content
print("  Extracting zip files...")
# Only raw binary blobs are unzipped. If this cell is re-run, the column already
# holds cleaned strings; passing those to extract_zip_content would turn every
# row into "" and empty the dataset.
df['subtitle_content'] = df['subtitle_content'].apply(
    lambda blob: extract_zip_content(blob)
    if isinstance(blob, (bytes, bytearray, memoryview))
    else blob
)

# Clean text
print("  Cleaning text...")
df['subtitle_content'] = df['subtitle_content'].apply(clean_text)

# Remove empty content: keep only rows with more than 50 characters of text.
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the original frame.
df = df[df['subtitle_content'].str.len() > 50].copy()

# Clean movie names: dots become spaces, the 'eng 1cd' marker is dropped
df['subtitle_name'] = (
    df['subtitle_name']
    .str.replace('.', ' ', regex=False)
    .str.replace('eng 1cd', '', regex=False)
    .str.strip()
)

# Reset index so positional lookups (df.iloc) line up with the TF-IDF matrix rows
df = df.reset_index(drop=True)

print(f"✅ Processed {len(df)} movies with meaningful content")

In [None]:
# Preview up to three processed rows: title-cased name plus the start of the cleaned text
print("Processed Data Sample:")
for position, (_, row) in enumerate(df.head(3).iterrows(), start=1):
    print(f"\n{position}. Movie: {row['subtitle_name'].title()}")
    print(f"   Content Preview: {row['subtitle_content'][:150]}...")

## 4. Create Search Index with TF-IDF

In [None]:
# Build the TF-IDF index over the cleaned subtitle text
print("Creating TF-IDF search index...")

# One document per movie, in DataFrame row order
documents = list(df['subtitle_content'])

# Vectorizer settings gathered in one place so they are easy to tune
tfidf_settings = {
    'max_features': 1000,     # cap on vocabulary size
    'stop_words': 'english',  # drop common English words
    'ngram_range': (1, 2),    # single words and word pairs
    'min_df': 1,              # minimum document frequency
    'max_df': 0.8,            # maximum document frequency
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Learn the vocabulary and vectorize every document in a single pass
tfidf_matrix = vectorizer.fit_transform(documents)

matrix_shape = tfidf_matrix.shape
vocabulary_size = len(vectorizer.vocabulary_)
print(f"✅ TF-IDF matrix created with shape: {matrix_shape}")
print(f"   Vocabulary size: {vocabulary_size}")

## 5. Search Function

In [None]:
def search_movies(query, top_k=5):
    """Rank movies by TF-IDF cosine similarity between a query and each subtitle file.

    Args:
        query: Free-text search string.
        top_k: Maximum number of results to return. Values <= 0 yield no results.

    Returns:
        A list of dicts, best match first, with the keys 'rank', 'movie',
        'similarity' (rounded to 3 decimals) and 'content_preview' (first 200
        characters of the cleaned text followed by "..."). Movies with a
        similarity of 0 are omitted, so the list may be shorter than ``top_k``.
    """
    if top_k <= 0:
        # Without this guard the slice [-0:] would select every document
        return []

    # Normalize the query the same way the indexed documents were normalized,
    # so that e.g. "don't" becomes "dont" and can match the indexed token.
    query_vector = vectorizer.transform([clean_text(query)])

    # One similarity score per document, in DataFrame row order
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Document positions sorted by descending similarity, truncated to top_k
    top_indices = similarities.argsort()[::-1][:top_k]

    results = []
    for idx in top_indices:
        score = float(similarities[idx])
        if score <= 0:
            # Scores are in descending order, so no later entry can be relevant
            break
        row = df.iloc[idx]
        results.append({
            'rank': len(results) + 1,
            'movie': row['subtitle_name'].title(),
            'similarity': round(score, 3),
            'content_preview': row['subtitle_content'][:200] + "..."
        })

    return results

def display_results(results, query):
    """Print the ranked matches for ``query``, or a notice when there are none.

    Each entry in ``results`` is a dict with the keys 'rank', 'movie',
    'similarity' and 'content_preview'.
    """
    divider = "-" * 50

    print(f"\n🔍 Search Results for: '{query}'")
    print("=" * 50)

    if results:
        for entry in results:
            # One print call per entry; sep="\n" puts each field on its own line
            print(
                f"\n{entry['rank']}. {entry['movie']}",
                f"   Similarity Score: {entry['similarity']}",
                f"   Content: \"{entry['content_preview']}\"",
                divider,
                sep="\n",
            )
    else:
        print("No relevant results found.")

print("✅ Search functions defined")

## 6. Test Searches

In [None]:
# Sample queries covering different themes and well-known lines
test_queries = [
    "betrayal by friend",
    "life is like chocolates",
    "drunk party night",
    "force with you",
    "never let go"
]

section_rule = "=" * 60

print("🎬 Testing Semantic Search with Different Queries")
print(section_rule)

# Show the three best matches per query, followed by a separator line
for sample in test_queries:
    display_results(search_movies(sample, top_k=3), sample)
    print("\n" + section_rule)

## 7. Interactive Search

In [None]:
# Interactive search function
def interactive_search():
    """Run a read-search-print loop on standard input until the user quits.

    The session ends when the user types 'quit', 'exit' or 'q' (any case),
    presses Ctrl-C, or the input stream reaches end-of-file. Errors raised
    while searching or displaying are printed and the loop continues.

    Raises:
        Any exception other than KeyboardInterrupt/EOFError raised by
        ``input()`` itself, since retrying a broken prompt would never end.
    """
    print("\n🎯 Interactive Movie Search")
    print("Enter your search queries. Type 'quit' to exit.")
    print("\nExample queries:")
    print("- 'betrayal by friend'")
    print("- 'life wisdom advice'")
    print("- 'love scene romantic'")
    print("- 'drunk party night'")

    while True:
        # Read the query outside the catch-all handler below: if input() keeps
        # failing (end of input, no stdin available), reporting the error and
        # looping again would spin forever.
        try:
            query = input("\n🔍 Enter search query: ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\nSearch session ended.")
            break

        if query.lower() in ('quit', 'exit', 'q'):
            print("Thanks for testing! 👋")
            break

        if not query:
            print("Please enter a search query.")
            continue

        try:
            results = search_movies(query, top_k=3)
            display_results(results, query)
        except KeyboardInterrupt:
            print("\nSearch session ended.")
            break
        except Exception as e:
            # A failed search should not end the session; report and prompt again
            print(f"Error: {e}")

# Uncomment the line below to start interactive search
# interactive_search()

## 8. Analysis and Visualization

In [None]:
# Rank vocabulary terms by their TF-IDF weight summed over all documents
feature_names = vectorizer.get_feature_names_out()
tfidf_scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()

# Positions of the 20 highest-scoring terms, best first
top_terms_idx = np.argsort(tfidf_scores)[::-1][:20]
top_terms = list(zip(feature_names[top_terms_idx], tfidf_scores[top_terms_idx]))

print("📊 Top 20 Terms by TF-IDF Score:")
for rank, (term, score) in enumerate(top_terms, start=1):
    print(f"{rank:2d}. {term:15} (score: {score:.3f})")

In [None]:
# Histogram of how similar every movie is to a single sample query
sample_query = "love and friendship"
query_vector = vectorizer.transform([sample_query.lower()])
similarities = cosine_similarity(query_vector, tfidf_matrix).ravel()

# Explicit figure/axes objects instead of the implicit pyplot state
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(similarities, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
ax.set_title(f'Distribution of Similarity Scores for Query: "{sample_query}"')
ax.set_xlabel('Cosine Similarity Score')
ax.set_ylabel('Number of Movies')
ax.grid(True, alpha=0.3)
plt.show()

# Numeric summary of the same score distribution
similarity_stats = (
    f"Statistics for query '{sample_query}':",
    f"Mean similarity: {similarities.mean():.3f}",
    f"Max similarity: {similarities.max():.3f}",
    f"Movies with similarity > 0: {(similarities > 0).sum()}",
)
print(*similarity_stats, sep="\n")

## 9. Summary and Conclusion

In [None]:
# Closing recap: headline, what was accomplished, then three bulleted sections
print("🎉 Movie Subtitle Semantic Search Engine - Summary")
print("=" * 55)

# Lines that report on this run (two of them read live values from the notebook state)
accomplishments = [
    f"✅ Successfully processed {len(df)} movie subtitle files",
    f"✅ Created TF-IDF search index with {tfidf_matrix.shape[1]} features",
    "✅ Implemented semantic search using cosine similarity",
    "✅ Tested with multiple query types",
]
for line in accomplishments:
    print(line)

# (heading, bullet lines) pairs, printed in order
summary_sections = [
    ("\n🚀 Key Features:", [
        "   • Semantic understanding (finds meaning, not just keywords)",
        "   • Fast TF-IDF based similarity search",
        "   • Robust text preprocessing",
        "   • Interactive search capability",
        "   • Similarity scoring and ranking",
    ]),
    ("\n🎯 Perfect for:", [
        "   • Finding movies by theme or emotion",
        "   • Content discovery based on dialogue",
        "   • Semantic similarity analysis",
        "   • Natural language movie search",
    ]),
    ("\n📋 Technical Stack:", [
        "   • Python + pandas for data processing",
        "   • scikit-learn for TF-IDF vectorization",
        "   • SQLite for data storage",
        "   • Regular expressions for text cleaning",
        "   • Cosine similarity for semantic matching",
    ]),
]
for heading, bullets in summary_sections:
    print(heading)
    for bullet in bullets:
        print(bullet)