# TMDB Data Engineering Project: Modular Pipeline

## Project Structure
- **extraction.api**: Handles API requests.
- **transformation.cleaning**: Handles data cleaning.
- **analysis.analysis**: Contains logic for KPIs, ranking, filtering, and franchise/director performance.
- **visualization.plots**: Contains plotting functions.
- **config.settings**: Configuration (API keys, movie IDs).

In [None]:
import sys
import os

# Expose the parent of the current working directory to the import system so
# the project packages imported below can be resolved from there (assumes the
# notebook is started from a sub-directory of the project root).
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once in the same kernel session.
_PROJECT_ROOT = os.path.abspath('..')
if _PROJECT_ROOT not in sys.path:
    sys.path.append(_PROJECT_ROOT)

import pandas as pd
from config.settings import MOVIE_IDS
from extraction.api import fetch_movie_data
from transformation.cleaning import clean_movie_data
from analysis.analysis import (
    calculate_kpis, rank_movies, get_franchise_performance, 
    get_director_performance, filter_bruce_willis_scifi, 
    filter_uma_tarantino, compare_franchise_vs_standalone
)
from visualization.plots import (
    set_style, plot_revenue_vs_budget, plot_roi_by_genre, 
    plot_popularity_vs_rating, plot_franchise_comparison, plot_yearly_trends
)

# Call the style hook from visualization.plots once, before any plotting cell
# runs (presumably it configures global plot styling -- confirm in that module).
set_style()

# Step 1: Fetch & Clean Data

In [None]:
# Pipeline stage 1: pull the raw records for every configured movie ID, clean
# them, then add the KPI columns.  `raw_df` keeps the untouched API result and
# `df` is the working table used by the cells that follow.
movie_count = len(MOVIE_IDS)
print(f"Fetching data for {movie_count} movies...")

raw_df = fetch_movie_data(MOVIE_IDS)
df = clean_movie_data(raw_df)
df = calculate_kpis(df)

# Columns shown in the preview of the cleaned table.
preview_columns = [
    'id',
    'title',
    'release_date',
    'revenue_musd',
    'budget_musd',
    'profit_musd',
    'roi',
    'vote_average',
]

# Last expression of the cell: the notebook renders the first rows.
df[preview_columns].head()

# Step 2: KPI Analysis

## 2.1 Best & Worst Performing Movies

In [None]:
# Table of ranking reports, processed in order.  Each entry is:
#   (heading, column to rank by, ascending?, extra rank_movies filters,
#    columns to show)
# Every heading after the first carries its own leading newline so the
# printed output has a blank line between consecutive reports.
ranking_reports = [
    ("TOP 5 Highest Revenue",
     'revenue_musd', False, {}, ['title', 'revenue_musd']),
    ("\nTOP 5 Highest Budget",
     'budget_musd', False, {}, ['title', 'budget_musd']),
    ("\nTOP 5 Highest Profit",
     'profit_musd', False, {}, ['title', 'profit_musd']),
    ("\nBOTTOM 5 Lowest Profit",
     'profit_musd', True, {}, ['title', 'profit_musd']),
    ("\nTOP 5 ROI (Budget >= 10M)",
     'roi', False, {'min_budget': 10}, ['title', 'roi']),
    ("\nBOTTOM 5 ROI (Budget >= 10M)",
     'roi', True, {'min_budget': 10}, ['title', 'roi']),
    ("\nMost Voted Movies",
     'vote_count', False, {}, ['title', 'vote_count']),
    ("\nHighest Rated Movies (>= 10 votes)",
     'vote_average', False, {'min_votes': 10},
     ['title', 'vote_average', 'vote_count']),
    ("\nLowest Rated Movies (>= 10 votes)",
     'vote_average', True, {'min_votes': 10},
     ['title', 'vote_average', 'vote_count']),
    ("\nMost Popular Movies",
     'popularity', False, {}, ['title', 'popularity']),
]

for heading, metric, ascending, filters, shown_columns in ranking_reports:
    print(heading)
    ranked = rank_movies(df, metric, top_n=5, ascending=ascending, **filters)
    display(ranked[shown_columns])

## 2.2 Advanced Filtering

In [None]:
# Run the two predefined searches from analysis.analysis against df, printing
# a heading before each result table.
searches = (
    ("Search 1: Sci-Fi Action starring Bruce Willis", filter_bruce_willis_scifi),
    ("\nSearch 2: Uma Thurman directed by Quentin Tarantino", filter_uma_tarantino),
)

for heading, run_search in searches:
    print(heading)
    display(run_search(df))

## 2.3 Franchise vs Standalone Analysis

In [None]:
# Print the heading first, then compute and render the comparison table
# returned by compare_franchise_vs_standalone.
print("Franchise vs Standalone Performance Comparison")
comparison = compare_franchise_vs_standalone(df)
display(comparison)

## 2.4 Most Successful Franchises & Directors

In [None]:
# Show only the first rows (.head()) of the franchise and director
# performance tables, each under its own heading.
print("Top Franchises")
franchise_table = get_franchise_performance(df)
display(franchise_table.head())

print("\nTop Directors")
director_table = get_director_performance(df)
display(director_table.head())

# Step 3: Visualization

In [None]:
# Pass the KPI-enriched table to visualization.plots.plot_revenue_vs_budget
# (presumably renders a revenue-vs-budget chart -- confirm in that module).
plot_revenue_vs_budget(df)

In [None]:
# Pass the KPI-enriched table to visualization.plots.plot_roi_by_genre
# (presumably renders ROI grouped by genre -- confirm in that module).
plot_roi_by_genre(df)

In [None]:
# Pass the KPI-enriched table to visualization.plots.plot_popularity_vs_rating
# (presumably renders popularity against rating -- confirm in that module).
plot_popularity_vs_rating(df)

In [None]:
# Pass the KPI-enriched table to visualization.plots.plot_yearly_trends
# (presumably renders per-year trends -- confirm in that module).
plot_yearly_trends(df)

In [None]:
# Build the complete franchise performance table (not truncated with .head())
# and hand it to the franchise comparison plot.  The result is kept in the
# notebook-level name `franchises`.
franchises = get_franchise_performance(df)
plot_franchise_comparison(franchises)