In [7]:
# ============================================================
# PROJECT: Movie Recommendation System
# TYPE   : Hybrid (Content-Based + Popularity)
# DATA   : TMDB Top Rated Movies (10K)
# AUTHOR : Tirth Desai
# ============================================================


# =========================
# STEP 0: INSTALL & IMPORT
# =========================
!pip install -q kagglehub

import kagglehub
import pandas as pd
import numpy as np
import os
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
import seaborn as sns


# ============================================================
# 1. PROBLEM DEFINITION & OBJECTIVE
# ============================================================
print("""
--- 1. PROBLEM DEFINITION & OBJECTIVE ---

Project Track:
AI / ML – Recommendation Systems

Problem Statement:
Users often face difficulty in selecting movies from large catalogs.
This project aims to reduce choice paralysis by recommending relevant movies.

Objective:
To build a hybrid movie recommendation system using content-based
filtering and popularity-based ranking.

Real-World Relevance:
Used by platforms like Netflix, IMDb, Prime Video.
""")


# ============================================================
# 2. DATA UNDERSTANDING & PREPARATION
# ============================================================
print("""
--- 2. DATA UNDERSTANDING & PREPARATION ---

Dataset Source:
Public Kaggle Dataset – Top Rated TMDB Movies (10K)

Type:
Public, real-world movie metadata
""")


# ------------------------------------------
# Data Loading
# ------------------------------------------
path = kagglehub.dataset_download("ahsanaseer/top-rated-tmdb-movies-10k")
files = os.listdir(path)

df = pd.read_csv(os.path.join(path, files[0]))
print("Dataset Loaded | Shape:", df.shape)
df.head()


# ------------------------------------------
# Data Cleaning & Column Selection
# ------------------------------------------
df = df[
    ['id', 'title', 'genre', 'original_language',
     'overview', 'popularity', 'vote_average', 'vote_count']
]

df.rename(columns={
    'overview': 'description',
    'vote_average': 'rating',
    'vote_count': 'review_count',
    'original_language': 'language'
}, inplace=True)

df.fillna('', inplace=True)
print("After Cleaning | Shape:", df.shape)


# ------------------------------------------
# Feature Engineering
# ------------------------------------------
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z ]', ' ', text)
    return text

df['tags'] = (
    df['genre'] + ' ' +
    df['description'] + ' ' +
    df['language']
).apply(clean_text)

print("Feature engineering completed (tags created).")


# ============================================================
# 3. MODEL / SYSTEM DESIGN
# ============================================================
print("""
--- 3. MODEL / SYSTEM DESIGN ---

Techniques Used:
1. Content-Based Filtering (TF-IDF + Cosine Similarity)
2. Popularity-Based Recommendation
3. Hybrid Recommendation System

Justification:
• Content-based handles personalization
• Popularity-based handles cold-start
• Hybrid improves robustness and relevance
""")


# ============================================================
# 4. CORE IMPLEMENTATION
# ============================================================

# ------------------------------------------
# Popularity-Based Recommendation
# ------------------------------------------
def popular_movies(top_n=10):
    return df.sort_values(
        by=['rating', 'review_count', 'popularity'],
        ascending=False
    ).head(top_n)[['title', 'rating', 'review_count']]


# ------------------------------------------
# Content-Based Recommendation (TF-IDF)
# ------------------------------------------
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=600
)

tfidf_matrix = tfidf.fit_transform(df['tags'])
cosine_sim = cosine_similarity(tfidf_matrix)

indices = pd.Series(df.index, index=df['title']).drop_duplicates()

def content_recommend(movie_title, top_n=10):
    if movie_title not in indices:
        return pd.DataFrame()

    idx = indices[movie_title]
    sim_scores = sorted(
        list(enumerate(cosine_sim[idx])),
        key=lambda x: x[1],
        reverse=True
    )

    movie_indices = [i[0] for i in sim_scores[1:top_n+1]]
    return df[['title', 'rating', 'review_count']].iloc[movie_indices]


# ------------------------------------------
# Hybrid Recommendation
# ------------------------------------------
def hybrid_recommend(movie_title, top_n=10):
    content_rec = content_recommend(movie_title, top_n=5)
    popular_rec = popular_movies(top_n=5)
    hybrid = pd.concat([content_rec, popular_rec]).drop_duplicates()
    return hybrid.head(top_n)


# ============================================================
# 5. EVALUATION & ANALYSIS
# ============================================================
print("\n--- 5. EVALUATION & ANALYSIS ---")

# Test Case 1: Content-Based
input_movie = "The Dark Knight"
print(f"\nUser Input: '{input_movie}' (Content-Based)")
print(content_recommend(input_movie, 5).to_string(index=False))

# Test Case 2: Popularity-Based Recommendation
input_movie = "Bahubali"
print(f"\nUser Input: '{input_movie}' (Popularity-Based)")
print("System Recommendations:")
print(popular_movies(5).to_string(index=False))

# Test Case 3: Hybrid
input_movie = "Inception"
print(f"\nUser Input: '{input_movie}' (Hybrid)")
print(hybrid_recommend(input_movie, 5).to_string(index=False))

# Test Case 4: Genre Consistency
input_movie = "The Lord of the Rings: The Return of the King"
print(f"\nUser Input: '{input_movie}' (Genre Consistency)")
print(content_recommend(input_movie, 5).to_string(index=False))

print("""
Analysis:
• Content-based system clusters movies by genre and theme.
• Popularity-based system highlights globally well-rated movies.
• Hybrid system provides balanced and robust recommendations.

The system performs as expected across multiple test cases.
""")


# ============================================================
# 6. ETHICAL CONSIDERATIONS & RESPONSIBLE AI
# ============================================================
print("""
--- 6. ETHICAL CONSIDERATIONS & RESPONSIBLE AI ---

• Dataset is public and contains no personal user data
• Recommendations are explainable (TF-IDF based)
• Potential bias toward English-language content
""")


# ============================================================
# 7. CONCLUSION & FUTURE SCOPE
# ============================================================
print("""
--- 7. CONCLUSION & FUTURE SCOPE ---

Conclusion:
A hybrid movie recommendation system was successfully built
using real-world TMDB data.

Future Scope:
• Add collaborative filtering
• Add deep learning embeddings (BERT)
• Deploy as a web application using Streamlit or Flask
""")



--- 1. PROBLEM DEFINITION & OBJECTIVE ---

Project Track:
AI / ML – Recommendation Systems

Problem Statement:
Users often face difficulty in selecting movies from large catalogs.
This project aims to reduce choice paralysis by recommending relevant movies.

Objective:
To build a hybrid movie recommendation system using content-based
filtering and popularity-based ranking.

Real-World Relevance:
Used by platforms like Netflix, IMDb, Prime Video.


--- 2. DATA UNDERSTANDING & PREPARATION ---

Dataset Source:
Public Kaggle Dataset – Top Rated TMDB Movies (10K)

Type:
Public, real-world movie metadata

Using Colab cache for faster access to the 'top-rated-tmdb-movies-10k' dataset.
Dataset Loaded | Shape: (10000, 9)
After Cleaning | Shape: (10000, 8)
Feature engineering completed (tags created).

--- 3. MODEL / SYSTEM DESIGN ---

Techniques Used:
1. Content-Based Filtering (TF-IDF + Cosine Similarity)
2. Popularity-Based Recommendation
3. Hybrid Recommendation System

Justification:
• Con