In [2]:
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as mp
import seaborn as sb

from pathlib import Path
from IPython.display import display
from prophet import Prophet
from scipy.stats import f_oneway

In [3]:
# Location of the project database, relative to this notebook.
DB_PATH = Path('../data/movies_project.db')

# sqlite3.connect() creates a brand-new, empty database when the file is missing,
# which would only surface later as a confusing "no such table" error.
# Fail fast with a clear message instead.
if not DB_PATH.is_file():
    raise FileNotFoundError(f"SQLite database not found: {DB_PATH.resolve()}")

# Single shared connection used by the query cells in this notebook.
db = sqlite3.connect(DB_PATH)

# Load every table in full into its own DataFrame.
movies_df = pd.read_sql_query("SELECT * FROM movies", db)
starring_df = pd.read_sql_query("SELECT * FROM starring", db)
genres_df = pd.read_sql_query("SELECT * FROM genres", db)
directors_df = pd.read_sql_query("SELECT * FROM directors", db)
platforms_df = pd.read_sql_query("SELECT * FROM platforms", db)

### Phase 2 — Revenue & Ratings Insights
If quality and budget influence success, how does that translate into revenue?

Question: *Which years produced the highest-rated and highest-revenue films?*

Top-rated years (filtered to exclude placeholders and unreliable data)

In [5]:
# Per-year averages of rating, revenue and ROI, ordered best-rated first
# (ties broken by higher average revenue).
#
# * Rows without a year cannot be attributed to any year, so they are excluded
#   instead of being grouped into a NULL bucket (which also keeps `year` an
#   integer column in the resulting DataFrame).
# * `||` is used for the percent suffix instead of CONCAT(): CONCAT() only
#   exists in SQLite 3.44+, while `||` works on every SQLite version. With
#   `||` a missing average ROI stays NULL rather than becoming a bare '%'.
# * NOTE(review): the revenue/budget floors look like dataset-derived
#   cut-offs — confirm their origin. Unlike the bottom-rated query, no minimum
#   `count` is applied here, so years with very few films can rank highly —
#   confirm this is intended.
query = """
SELECT
    year,
    ROUND(AVG(ratings),1) AS avg_ratings,
    ROUND(AVG(revenue),2) AS avg_revenue,
    ROUND(AVG(ROI),2) || '%' AS avg_ROI
FROM movies
WHERE
    year IS NOT NULL AND
    revenue >= 560471.5 AND
    budget >= 900000.0
GROUP BY year
ORDER BY avg_ratings DESC, avg_revenue DESC;
"""

q32 = pd.read_sql_query(query, db)
display(q32.head(10))

Unnamed: 0,year,avg_ratings,avg_revenue,avg_ROI
0,1925.0,7.8,6500000.0,232.07%
1,1939.0,7.5,51554382.89,1355.32%
2,1930.0,7.5,5635000.0,114.11%
3,1924.0,7.5,1490419.0,31.24%
4,1927.0,7.3,3266738.2,99.17%
5,1962.0,7.2,25219538.47,524.37%
6,1940.0,7.2,23471737.58,952.63%
7,1959.0,7.2,20193222.22,384.74%
8,1961.0,7.1,27135294.12,644.9%
9,1963.0,7.1,24257310.59,726.58%


#### Observation: 
1925 stands out as a “golden year,” with an average rating of ~7.8 and average revenue of \\$6.5M (≈ \\$120.4M in 2025 dollars).

Bottom-rated years

In [6]:
# Per-year averages of rating, revenue and ROI, ordered worst-rated first
# (ties broken by lower average revenue). Only rows meeting the revenue,
# budget and `count` floors are included.
#
# `||` is used for the percent suffix instead of CONCAT(): CONCAT() only exists
# in SQLite 3.44+, while `||` works on every SQLite version. With `||` a
# missing average ROI stays NULL rather than becoming a bare '%'.
# NOTE(review): the numeric thresholds look like dataset-derived cut-offs —
# confirm their origin.
query = """
SELECT
    year,
    ROUND(AVG(ratings),1) AS avg_ratings,
    ROUND(AVG(revenue),2) AS avg_revenue,
    ROUND(AVG(ROI),2) || '%' AS avg_ROI
FROM movies
WHERE
    revenue >= 560471.5 AND
    budget >= 900000.0 AND
    count >= 665
GROUP BY year
ORDER BY avg_ratings, avg_revenue;
"""

q33 = pd.read_sql_query(query, db)
display(q33.head(10))

Unnamed: 0,year,avg_ratings,avg_revenue,avg_ROI
0,1983,6.2,41989753.0,247.59%
1,1987,6.3,38695728.43,252.04%
2,1996,6.3,62874740.13,158.04%
3,1994,6.3,71053816.01,226.46%
4,2000,6.3,75735901.54,148.28%
5,2005,6.3,76348038.76,177.11%
6,2002,6.3,78599141.93,183.16%
7,2001,6.3,80284272.1,152.06%
8,1997,6.3,84087035.39,169.04%
9,2008,6.3,86366792.51,166.56%


#### Observation: 
1983 was the lowest-rated year, with an average rating of 6.2 and average revenue of \\$41.99M (≈ \\$136.16M in 2025 dollars).

#### “So Bad They’re Good” Films

In [7]:
# Films rated 3 or lower that still returned a positive ROI, most profitable
# first, restricted to rows meeting the revenue, budget and `count` floors.
#
# The displayed column keeps the name `ROI` but is text ("94.09%"). SQLite
# resolves a bare `ORDER BY ROI` to that result-column alias, which would sort
# the strings lexicographically (e.g. '9.5%' after '15.15%' when ascending).
# Qualifying the column as `movies.ROI` forces the numeric table column to be
# used for both filtering and ordering.
#
# `||` replaces CONCAT() because CONCAT() only exists in SQLite 3.44+.
query = """
SELECT
    title,
    release_date,
    ratings,
    revenue,
    movies.ROI || '%' AS ROI
FROM movies
WHERE
    revenue >= 560471.5 AND
    budget >= 900000.0 AND
    ratings <= 3 AND
    count >= 665 AND
    movies.ROI > 0
ORDER BY movies.ROI DESC;
"""

q4 = pd.read_sql_query(query, db)
display(q4.head(10))


Unnamed: 0,title,release_date,ratings,revenue,ROI
0,Dragonball Evolution,2009-03-12 00:00:00,2.7,58228460,94.09%
1,Disaster Movie,2008-08-29 00:00:00,2.5,34816824,39.27%
2,Crossover,2006-09-01 00:00:00,2.8,7000000,20.69%
3,House Of The Dead,2003-04-11 00:00:00,2.6,13818181,15.15%


Most successful low-rated film:
*Dragonball Evolution* (2009)

- Genres: Action, Adventure, Fantasy, Sci-Fi, Thriller

- Rating: 2.7

- Runtime: 1 hour 25 minutes

- Budget: \\$30M → Revenue: \\$58.2M (+697.5% vs global revenue median)

- Insight: Despite extremely low ratings, some films can be commercially successful.

In [8]:
# Look up the movie titled 'Dragonball Evolution' together with its genres.
# The inner join returns one row per matching genre record, so the movie's
# columns repeat on every row.
query = """
SELECT m.*, g.genre
FROM movies m
INNER JOIN genres g on g.movie_id = m.id
WHERE m.title = 'Dragonball Evolution';
"""

q41 = pd.read_sql_query(sql=query, con=db)
display(q41.iloc[:10])

Unnamed: 0,id,title,release_date,revenue,minutes,budget,ratings,count,year,ROI,genre
0,14164,Dragonball Evolution,2009-03-12 00:00:00,58228460,85,30000000,2.7,42798,2009,94.09,Action
1,14164,Dragonball Evolution,2009-03-12 00:00:00,58228460,85,30000000,2.7,42798,2009,94.09,Adventure
2,14164,Dragonball Evolution,2009-03-12 00:00:00,58228460,85,30000000,2.7,42798,2009,94.09,Fantasy
3,14164,Dragonball Evolution,2009-03-12 00:00:00,58228460,85,30000000,2.7,42798,2009,94.09,Science Fiction
4,14164,Dragonball Evolution,2009-03-12 00:00:00,58228460,85,30000000,2.7,42798,2009,94.09,Thriller


#### Revenue and Ratings by Genre


In [9]:
# Per-genre averages of rating, revenue and ROI, plus a rating tier label,
# ordered by average rating (highest first) and then by average ROI.
#
# Fixes relative to the previous version of this query:
# * Rating tiers are a cascading set of upper bounds. The earlier BETWEEN
#   ranges (..5.7, 5.71..6.2, 6.21..6.5, ...) left gaps, so an unrounded
#   average such as 6.505 matched no branch and received a NULL tier.
#   Values on the two-decimal grid are classified exactly as before.
# * The inner query keeps the average ROI numeric (`avg_roi_value`) and the
#   '%' suffix is only added in the outer SELECT, so the ORDER BY tiebreak
#   compares numbers instead of sorting text like '95.0%' above '210.6%'.
# * `||` replaces CONCAT(), which only exists in SQLite 3.44+.
#
# NOTE(review): this cell filters on `budget > 900000` while the other cells
# use `budget >= 900000.0` — confirm which bound is intended. It is left
# unchanged here so the reported figures are not altered.
query = """
SELECT
    genre,
    avg_ratings,
    avg_revenue,
    ROUND(avg_roi_value, 2) || '%' AS avg_roi,
    CASE
        WHEN avg_ratings <= 5.7 THEN 'Low'
        WHEN avg_ratings <= 6.2 THEN 'Lower Mid'
        WHEN avg_ratings <= 6.5 THEN 'Mid'
        WHEN avg_ratings <= 6.8 THEN 'Upper Mid'
        WHEN avg_ratings >  6.8 THEN 'High'
    END AS rating_tier
FROM (
    SELECT
        g.genre,
        AVG(m.ratings) AS avg_ratings,
        AVG(m.revenue) AS avg_revenue,
        AVG(m.ROI)     AS avg_roi_value
    FROM movies m
    INNER JOIN genres g
        ON g.movie_id = m.id
    WHERE
        m.revenue >= 560471.5 AND m.budget > 900000
    GROUP BY g.genre
) AS t
ORDER BY t.avg_ratings DESC, t.avg_roi_value DESC;
"""

q5 = pd.read_sql_query(query, db)
display(q5)


Unnamed: 0,genre,avg_ratings,avg_revenue,avg_roi,rating_tier
0,History,6.917553,51208110.0,141.73%,High
1,War,6.855041,64932600.0,210.6%,High
2,Western,6.673214,41550960.0,212.17%,Upper Mid
3,Documentary,6.65814,27524260.0,389.56%,Upper Mid
4,Drama,6.646459,49238060.0,221.66%,Upper Mid
5,Animation,6.601903,176732300.0,355.7%,Upper Mid
6,Music,6.480429,50469680.0,384.81%,Mid
7,Crime,6.478567,57271810.0,195.4%,Mid
8,Romance,6.393543,54321150.0,251.53%,Mid
9,Mystery,6.371638,58791380.0,248.86%,Mid


#### Observation

History films achieved the highest average rating (~6.92), while most genres cluster around a rating of 6.4, with an average revenue of approximately \\$80.9M.

When dividing ratings into bins:

- Low: ≤ 5.7

- Lower Mid: 5.71–6.2

- Mid: 6.21–6.5

- Upper Mid: 6.51–6.8

- High: ≥ 6.81

…the majority of genres fall into the Mid category.

### Interestingly:

- Animation generates the highest average revenue (~\\$176.7M) but does not have the highest ROI.

- History films are critically acclaimed (highest ratings) at 6.9.

- Documentaries are commercially efficient: they post the highest average ROI among the genres listed above (389.6%), i.e. profit of nearly four times budget.

#### Takeaways

- Critical acclaim and popularity diverge: high ratings (e.g., History at ~6.92) don't guarantee top revenue.

- Genre clustering: Most genres fall within the Mid rating tier (6.21–6.5), reinforcing that ratings are relatively consistent across the industry.

- Revenue outliers: Animation, despite being only Upper Mid in ratings, far outpaces all other genres in revenue (\\$176.7M).

- Commercial vs. critical success: Revenue leaders often don’t align with critical leaders, showing studios may prioritize marketable genres over critical prestige.

- Strategic insight: Success depends on balancing both critical ratings (prestige) and revenue (profitability). Animation and Adventure genres show the strongest commercial potential, while History and War highlight critical strength.