In [1]:
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as mp
import seaborn as sb

from pathlib import Path
from IPython.display import display
from prophet import Prophet
from scipy.stats import f_oneway

db = sqlite3.connect('../data/movies_project.db')

movies_df = pd.read_sql_query("SELECT * FROM movies", db)
starring_df = pd.read_sql_query("SELECT * FROM starring", db)
genres_df = pd.read_sql_query("SELECT * FROM genres", db)
directors_df = pd.read_sql_query("SELECT * FROM directors", db)
platforms_df = pd.read_sql_query("SELECT * FROM platforms", db)

### 3.1 Platform Genre Economics
**Objective:** Identify top-performing genres per platform by ratings, revenue, and ROI.

Question: *What are the top 3 genres per platform based on revenue and ratings?*


In [2]:
query = """

WITH joined AS (
    SELECT 
        p.platform AS platform,
        g.genre,
        m.ratings,
        m.revenue,
        m.budget,
        m.roi
    FROM movies m
    JOIN platforms p ON p.title = m.title
    JOIN genres g ON g.movie_id = m.id
    WHERE 
        p.title = m.title 
        AND m.revenue >= 560471.5
        AND m.budget >= 900000
),
best AS (
    SELECT 
        platform,
        genre,
        ROUND(AVG(ratings), 1) AS avg_rating,
        ROUND(AVG(revenue)) AS avg_revenue,
        ROUND(AVG(roi), 2) AS avg_roi
    FROM joined
    GROUP BY platform, genre
)
SELECT 
    platform,
    genre,
    avg_rating,
    avg_revenue,
    avg_roi
FROM (
    SELECT 
        platform,
        genre,
        avg_rating,
        avg_revenue,
        avg_roi,
        DENSE_RANK() OVER (PARTITION BY platform ORDER BY avg_rating DESC) AS rn
    FROM best
) AS ranked
WHERE rn <= 3
ORDER BY platform, avg_rating DESC, avg_roi DESC;


"""

q6=pd.read_sql_query(query,db)
display(q6)

Unnamed: 0,platform,genre,avg_rating,avg_revenue,avg_roi
0,Disney+,Horror,7.6,6877900.0,282.11
1,Disney+,Animation,7.1,338846357.0,729.54
2,Disney+,Drama,6.9,155213120.0,342.98
3,Disney+,History,6.9,98889795.0,147.52
4,Disney+,Crime,6.9,93614301.0,144.29
5,Hulu,Documentary,7.4,6616815.0,514.23
6,Hulu,Western,7.2,21495278.0,93.8
7,Hulu,History,7.1,54148307.0,101.85
8,Hulu,War,7.1,51160792.0,17.48
9,Netflix,Western,7.3,140263374.0,378.48


#### Core Insights

- Disney+ thrives on family-friendly Animation for both critical and commercial success, while Horror delivers niche acclaim.

- Hulu, Netflix, and Prime skew toward mature audiences with Documentaries, Western, and History driving profitability even at lower budgets.

- Ratings ≠ Revenue: High-rated genres like Documentary can deliver strong ROI despite modest absolute revenue.

>Having established which genres drive platform success, we next examine >whether these trends differ by audience age group and budget allocation.

### 3.2 Audience Demographics & Budget Allocation
**Objective:** Explore how age groups correlate with ROI and whether budgets skew toward certain audiences.

Frequency of top age groups that deliver highest roi, dominating genres

In [3]:
query = """

SELECT
    age,
    count(*)
    
FROM
(SELECT 
    g.genre, 
    p.age, 
    ROUND(AVG(m.roi),0) AS avg_roi,
    ROW_NUMBER() OVER (PARTITION BY g.genre ORDER BY AVG(m.roi) DESC) AS rn
FROM movies m
JOIN genres g on g.movie_id = m.id
JOIN platforms p ON p.title = m.title
WHERE 
    age != 'Unknown' AND 
    m.budget > 0 AND
    m.revenue > 0
GROUP BY genre, age) AS x
WHERE rn = 1
GROUP BY age
ORDER BY count(*) DESC;

"""
q16 = pd.read_sql_query(query,db)
display(q16)

Unnamed: 0,age,count(*)
0,18+,9
1,all,5
2,7+,2
3,13+,2
4,16+,1


#### Budget Allocation by Age
Are budgets higher for certain audiences?

In [4]:
query = """

SELECT
    age,
    ROUND(AVG(budget)) as avg_budget
FROM movies m
JOIN platforms p ON p.title = m.title
WHERE 
    age != 'Unknown' AND 
    m.budget > 900000 AND
    m.revenue > 0
GROUP BY age
ORDER BY AVG(budget) DESC;

"""
q17 = pd.read_sql_query(query,db)
display(q17)
#Top 3 ages lie in close range 7+,13+, and all

Unnamed: 0,age,avg_budget
0,7+,49581658.0
1,13+,46994055.0
2,all,45550996.0
3,16+,22506625.0
4,18+,20947271.0


#### Findings: 
18+ dominates ROI frequency; famility and kid friendly titles attract highest budgets.

These patterns suggest that streaming profitability is concentrated in adult-oriented titles, even as studios continue to invest heavily in family-rated blockbusters.

### 3.3 Historical Prestige Content
**Objective:** Evaluate how golden-era films (e.g., 1925 Silent Film Era) are distributed across platforms.

#### Question: *Among the golden-era movies identified earlier, which platforms host the most of these films?*

Earlier we identified 1925 as a standout year, with films averaging a 7.2/10 rating and generating an estimated $120.4 million (2025 dollars) in revenue.

This year marked the beginning of the Silent Film Era.
Notable figures such as Charlie Chaplin and Alfred Hitchcock rose to prominence, creating works where actors conveyed profound or exaggerated emotions while live or recorded instrumental music—often piano—synchronized to set the mood.

Fact:
Approximately 70% of the 10,000+ silent films produced between 1912 and 1930 have been lost due to poor preservation.
Most were filmed on highly flammable nitrate stock, which was prone to fires and natural deterioration, [Vast Number of Silent Films Lost to History](https://www.wnycstudios.org/podcasts/takeaway/segments/vast-record-silent-films-lost-history).

In [5]:
query = """
SELECT 
    p.platform AS platform,
    count(*) AS num_films,
    AVG(m.ratings) AS avg_rating,
    AVG(m.revenue) AS avg_revenue
FROM movies m 
JOIN platforms p on p.title = m.title
WHERE p.title = m.title AND m.year = 1925
GROUP BY platform
ORDER BY num_films DESC;
"""

q7 = pd.read_sql_query(query,db)
display(q7)


Unnamed: 0,platform,num_films,avg_rating,avg_revenue
0,Prime Video,14,5.335714,276303.571429
1,Netflix,10,1.58,0.0
2,Disney+,4,3.0,0.0
3,Hulu,1,6.0,0.0


There currently isn't many ratings nor revenue documented for this year for 75% of the platforms so going based on the profit won't be condusive in this case.
Based on quantity Prime and Neflix hold majority of these titles.

In [6]:
query = """
SELECT 
    m.title,
    m.ratings,
    m.revenue
FROM movies m 
JOIN platforms p on p.title = m.title
WHERE p.title = m.title AND m.year = 1925 AND platform = 'Prime Video'
ORDER BY m.ratings DESC;
"""

q7 = pd.read_sql_query(query,db)
display(q7)

Unnamed: 0,title,ratings,revenue
0,Battleship Potemkin,7.8,45100
1,The Phantom Of The Opera,7.3,2000000
2,Stella Dallas,7.2,1500000
3,Little Annie Rooney,6.8,0
4,Don Q Son Of Zorro,6.7,0
5,Les Misérables,6.6,0
6,Santa Claus,6.5,0
7,The Eagle,6.4,323150
8,Cyrano De Bergerac,6.2,0
9,An Eye For An Eye,5.2,0


#### 3.4 Iconic Film Hosting & Platform Strategy
**Objective:** Determine if hosting critically acclaimed films correlates with revenue efficiency.

**Question:** *Does hosting more iconic movies make a platform more attractive to invest in?*

Because budget data for 1925 films is limited, we shift focus to the <u>Best Movies of All Time</u> to identify truly iconic titles.
To qualify as iconic, a film must:
- Have a rating of at least 8/10
- Receive at least 7,120 votes (the dataset’s median) to ensure broad audience support


By analyzing the proportion of these iconic films hosted by each platform, we can assess whether platforms that feature more critically acclaimed, high-volume titles also achieve higher ratings and stronger revenue returns—a key factor in platform investment potential.

In [7]:
query = """
SELECT 
    year,
    title,
    minutes,
    budget,
    revenue,
    CONCAT(roi, '%') AS roi,
    ratings,
    count
FROM movies m 
WHERE count >= 7120
  AND ratings >= 8
  AND revenue > 0
  AND budget > 0
ORDER BY ratings DESC, count DESC;

"""

q8 = pd.read_sql_query(query,db)
display(q8)


Unnamed: 0,year,title,minutes,budget,revenue,roi,ratings,count
0,1994,The Shawshank Redemption,142,25000000,28341469,13.37%,9.0,1557426
1,1972,The Godfather,175,6000000,245066411,3984.44%,8.9,1086864
2,2008,The Dark Knight,152,185000000,1004558444,443.0%,8.8,1547754
3,1993,Schindler'S List,195,22000000,321365567,1360.75%,8.8,778744
4,1974,The Godfather Part Ii,202,13000000,102600000,689.23%,8.8,730094
...,...,...,...,...,...,...,...,...
291,2024,How To Make Millions Before Grandma Dies,126,1000000,73800000,7280.0%,8.0,9358
292,1949,The Heiress,115,2600000,2300000,-11.54%,8.0,9332
293,2018,Mahanati,176,4346100,11589600,166.67%,8.0,8050
294,2023,2018,150,3009565,24076520,700.0%,8.0,7659


Since not all of the 296 films are included in the platforms, we will base the proportion of those that are available about what proportion each platform occupies
So far we see that Netflix is a leading contender (as expected from my own experience).

In [8]:
#how many films are actually matched-- 72 OUT OF 296
query = """
SELECT 
    COUNT(DISTINCT m.title) AS total_films
FROM movies m 
JOIN platforms p ON p.title = m.title
WHERE m.title IN (
      SELECT title
      FROM movies
      WHERE count >= 7120 
        AND ratings >= 8 
        AND revenue > 0 
        AND budget > 0
  );
"""

q8 = pd.read_sql_query(query,db)
display(q8)



Unnamed: 0,total_films
0,72


In [9]:
#24.3% actually being streamed according to filtered joined data


query1 = """
WITH iconic AS (
    SELECT title
    FROM movies
    WHERE count >= 7120
      AND ratings >= 8
      AND revenue > 0 
      AND budget > 0
)
SELECT 
    p.platform,
    COUNT(DISTINCT m.title) AS total_films,
    CONCAT(ROUND(COUNT(*) * 100.0 / 296, 2), '%') AS percentage,
    ROUND(AVG(m.revenue), 2) AS avg_revenue,
    CONCAT(m.roi,'%') AS ROI
FROM movies m 
JOIN platforms p ON p.title = m.title
WHERE m.title IN (SELECT DISTINCT title FROM iconic)
GROUP BY p.platform
ORDER BY total_films DESC;

"""
q81 = pd.read_sql_query(query1, db)
display(q81)


Unnamed: 0,platform,total_films,percentage,avg_revenue,ROI
0,Netflix,23,28.72%,50548190.0,622.55%
1,Prime Video,20,38.85%,22572970.0,735.58%
2,Hulu,16,24.66%,40591430.0,759.65%
3,Disney+,16,25.0%,209170400.0,900.36%


This analysis shows that prestigious films are fairly distributed across platforms. Although Netflix hosts the largest collection of iconic titles, Disney+ dominates in terms of revenue and has shown the most budget effeciency with 900.36% ROI, making its titles the most profitable. Some Netflix titles rank near the lower end of revenue, which may be influenced by incomplete data. Based on the available information, this is the trend observed across the platforms.

#### Key Takeaways:

- Genre Strategy: Disney+ focuses on Animation for high revenue, while Hulu, Netflix, and Prime highlight Documentaries and Westerns for ratings.

- Golden-Era Films: Rarely fully available; Prime and Netflix host most, but revenue data is limited.

- Iconic Films: Only ~24% of top-rated films (≥8/10, ≥7120 votes) are streamed. Netflix has the largest collection; Disney+ hosts fewer but higher-grossing titles.

- Revenue vs. Quantity: More films don’t always mean more revenue—strategic curation of high-impact titles drives profitability.

- Investment Implication: Platforms with high-performing films attract audiences and investors, emphasizing quality over quantity.