
# Disney Gross Prediction Notebook

Reproducible end-to-end pipeline that predicts the inflation-adjusted gross of Disney movies using scikit-learn.



## Setup
Install the dependencies listed in `requirements.txt`, then run the notebook from the repo root so that the relative paths (`data/disney_movies_total_gross.csv` and `artifacts/metrics.json`) resolve.


In [1]:

from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [2]:

# Seed reused by the train/test split and the gradient-boosting model below.
RANDOM_STATE = 42
# Input CSV and metrics output; both are relative to the working directory.
DATA_PATH = Path("data/disney_movies_total_gross.csv")
METRICS_PATH = Path("artifacts/metrics.json")
# Create the output directory up front; this is a no-op if it already exists.
METRICS_PATH.parent.mkdir(parents=True, exist_ok=True)

# Fail fast, reporting the absolute path, when the dataset is missing.
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH.resolve()}")


In [3]:

@dataclass
class Dataset:
    """Feature table bundled with the regression target it is paired with."""

    # Predictor columns handed to the preprocessing pipeline.
    features: pd.DataFrame
    # Value to predict, taken from the same rows as ``features``.
    target: pd.Series


def clean_currency(series: pd.Series) -> pd.Series:
    """Convert currency-formatted text such as ``"$1,234.50"`` to numbers.

    Args:
        series: Values to parse. Numeric series are passed through
            unchanged; anything else is converted to text first.

    Returns:
        A numeric series aligned with ``series``. Entries that contain no
        digits, or that do not form a valid number once symbols are
        removed, become ``NaN``.
    """
    # Already-numeric data must not be round-tripped through ``str``: a float
    # rendered in exponent form (e.g. ``1.5e-05``) would lose its ``e-`` and
    # be parsed back as a completely different value.
    if pd.api.types.is_numeric_dtype(series):
        return pd.to_numeric(series, errors="coerce")

    text = series.astype(str).str.strip()

    # Record the sign before stripping symbols: a leading minus or
    # accounting-style parentheses both denote a negative amount.
    negative = text.str.startswith("-", na=False) | (
        text.str.startswith("(", na=False) & text.str.endswith(")", na=False)
    )

    # Keep digits and the decimal point only; an empty remainder is missing.
    digits = text.str.replace(r"[^\d.]", "", regex=True).replace("", np.nan)
    values = pd.to_numeric(digits, errors="coerce")
    return values.where(~negative, -values)


def load_dataset(path: Path) -> Dataset:
    """Read the movie CSV at ``path`` and derive the modelling columns.

    Adds ``release_year`` (year of ``release_date``; unparseable dates become
    missing) and ``title_length`` (character count of ``movie_title``), and
    parses ``inflation_adjusted_gross`` into numbers. Rows missing any of
    those three values are dropped.

    Args:
        path: Location of the CSV file.

    Returns:
        A ``Dataset`` whose features are ``release_year``, ``title_length``,
        ``genre`` and ``MPAA_rating``, and whose target is
        ``inflation_adjusted_gross``.
    """
    raw = pd.read_csv(path)
    release_dates = pd.to_datetime(raw["release_date"], errors="coerce")

    prepared = raw.assign(
        release_year=release_dates.dt.year,
        title_length=raw["movie_title"].str.len(),
        inflation_adjusted_gross=clean_currency(raw["inflation_adjusted_gross"]),
    )

    # Genre and rating may stay missing here; only these columns gate a row.
    required = ["release_year", "inflation_adjusted_gross", "title_length"]
    complete = prepared.dropna(subset=required)

    feature_columns = ["release_year", "title_length", "genre", "MPAA_rating"]
    return Dataset(
        features=complete[feature_columns],
        target=complete["inflation_adjusted_gross"],
    )


def build_pipeline() -> Pipeline:
    """Assemble the unfitted preprocessing-plus-regressor pipeline.

    Numeric columns (``release_year``, ``title_length``) are median-imputed
    and standardised. Categorical columns (``genre``, ``MPAA_rating``) are
    imputed with the most frequent value and one-hot encoded to a dense
    array, with categories unseen during fitting ignored. The result feeds a
    ``HistGradientBoostingRegressor`` seeded with ``RANDOM_STATE``.

    Returns:
        A ``Pipeline`` with the steps ``"preprocess"`` and ``"model"``.
    """
    numeric_columns = ["release_year", "title_length"]
    categorical_columns = ["genre", "MPAA_rating"]

    median_then_scale = Pipeline(
        [
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
    )
    mode_then_onehot = Pipeline(
        [
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ]
    )

    column_steps = [
        ("num", median_then_scale, numeric_columns),
        ("cat", mode_then_onehot, categorical_columns),
    ]
    regressor = HistGradientBoostingRegressor(random_state=RANDOM_STATE)

    return Pipeline(
        [
            ("preprocess", ColumnTransformer(column_steps)),
            ("model", regressor),
        ]
    )


def evaluate(y_true: np.ndarray, y_pred: np.ndarray) -> dict[str, float]:
    """Score predictions with R², MAE and MAPE.

    Args:
        y_true: Observed target values (any array-like; converted with
            ``np.asarray``).
        y_pred: Predicted values, aligned element-wise with ``y_true``.

    Returns:
        A dict with the keys ``"r2"``, ``"mae"`` and ``"mape"``. ``"mape"``
        is a fraction rather than a percentage (``1.0`` means a 100% mean
        error). It is computed only over entries whose true value is
        non-zero, and is ``NaN`` when there are none.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Cast to builtin floats so all three metrics share one type and match
    # the annotated return type; the scorers may hand back NumPy scalars.
    metrics = {
        "r2": float(r2_score(y_true, y_pred)),
        "mae": float(mean_absolute_error(y_true, y_pred)),
    }

    # Exclude (near-)zero targets, where a relative error is undefined.
    nonzero_mask = np.abs(y_true) > 1e-9
    if nonzero_mask.any():
        actual = y_true[nonzero_mask]
        relative_errors = np.abs((actual - y_pred[nonzero_mask]) / actual)
        metrics["mape"] = float(np.mean(relative_errors))
    else:
        metrics["mape"] = float("nan")

    return metrics


In [4]:

# Load and clean the CSV, then report how many rows survived the cleaning.
dataset = load_dataset(DATA_PATH)
print(f"Samples: {len(dataset.features):,}")
# Trailing expression: the notebook renders the first feature rows.
dataset.features.head()


Samples: 579


Unnamed: 0,release_year,title_length,genre,MPAA_rating
0,1937,31,Musical,G
1,1940,9,Adventure,G
2,1940,8,Musical,G
3,1946,17,Adventure,G
4,1950,10,Drama,G


In [5]:

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.features,
    dataset.target,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Fit on the training rows only, then score on the held-out rows.
pipeline = build_pipeline()
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
metrics = evaluate(y_test, y_pred)
# Trailing expression: the notebook renders the metrics dict.
metrics


{'r2': 0.46876816432928536,
 'mae': 100709864.67138211,
 'mape': 5.5573472249098055}

In [6]:

# Persist metrics for parity with the CLI workflow
# NOTE(review): `evaluate` can report a NaN "mape"; by default json.dumps
# writes that as the bare token NaN, which strict JSON parsers reject —
# confirm downstream consumers tolerate it.
METRICS_PATH.write_text(json.dumps(metrics, indent=2))
# Trailing expression: the notebook renders the metrics as a one-row table.
pd.DataFrame([metrics])


Unnamed: 0,r2,mae,mape
0,0.468768,100709900.0,5.557347
