Movie Box Office Predictor

Imports and Constants

In [11]:
# Cell 1: Imports & constants 
import os
import ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# GUI-related imports (used in GUI cell)
import tkinter as tk
from tkinter import ttk, messagebox
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg

# Constants
# Shared notebook configuration; tune values here rather than inside later cells.
# File name of the movie metadata CSV (a relative path, so it resolves against the working directory).
CSV_PATH = "movies_metadata.csv"
MODEL_PATH = None   # not used when not saving to disk
# NOTE(review): no cell visible in this notebook applies SAMPLE_FRAC -- confirm whether sampling is still intended.
SAMPLE_FRAC = 0.50    # use 1.0 to train on full dataset (may be slow)
# Seed passed to train_test_split so the train/test partition is repeatable.
RANDOM_STATE = 42


Data Cleaning

In [12]:
# --- Data Cleaning ---
def parse_genres(genres_str):
    """Extract genre names from a stringified list of genre dicts.

    Parameters
    ----------
    genres_str : str | list | Any
        Usually the raw cell text, e.g. "[{'id': 18, 'name': 'Drama'}]".
        An already-parsed list is accepted as-is, so re-applying the function
        to its own kind of input does not blow up. Anything else (NaN, None,
        numbers, ...) is treated as "no genres".

    Returns
    -------
    list[str]
        One name per dict entry, in input order ("" when an entry has no
        "name" key). Empty when the value is missing, unparsable, or does not
        evaluate to a list.
    """
    if isinstance(genres_str, str):
        try:
            parsed = ast.literal_eval(genres_str)
        # Only the failures literal_eval is documented to raise on bad input;
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return []
    elif isinstance(genres_str, list):
        parsed = genres_str
    else:
        # NaN / None / numeric cells carry no genre information.
        return []

    if not isinstance(parsed, list):
        return []

    # Skip stray non-dict items instead of discarding the whole row.
    return [entry.get("name", "") for entry in parsed if isinstance(entry, dict)]

# Load data from the path configured in the constants cell, so pointing the
# notebook at another file only requires changing CSV_PATH.
# NOTE(review): SAMPLE_FRAC from the constants cell is not applied in this cell;
# every row of the CSV is loaded.
df_movies = pd.read_csv(CSV_PATH, low_memory=False)

# Clean numeric columns: values that cannot be parsed as numbers become NaN
numeric_cols = ['budget', 'revenue', 'runtime', 'popularity', 'vote_average', 'vote_count']
for col in numeric_cols:
    df_movies[col] = pd.to_numeric(df_movies[col], errors='coerce')

# Remove rows with missing essential values (target plus the two inputs that
# must be present; the remaining numeric columns are zero-filled later)
df_movies = df_movies.dropna(subset=['revenue', 'budget', 'popularity'])


Feature Engineering

In [13]:
# --- Feature Engineering ---

# Turn the raw genres text into a Python list of genre names per movie
df_movies['genres_list'] = df_movies['genres'].apply(parse_genres)

# Rank genres by how many movies carry them and keep the 12 most frequent
all_genres = df_movies['genres_list'].explode().value_counts()
top_genres = all_genres.index[:12].tolist()

# Add one 0/1 indicator column per retained genre
for g in top_genres:
    membership = [int(g in names) for names in df_movies['genres_list']]
    df_movies[f'genre_{g}'] = membership

# Approximate cast_count (some datasets may not have it)
def _count_cast(cast_str):
    """Return the number of entries in a stringified cast collection.

    Non-string cells, text that cannot be parsed as a Python literal, and
    literals that are not containers all count as 0, so a single malformed
    row cannot abort the whole cell.
    """
    if not isinstance(cast_str, str):
        return 0
    try:
        parsed_cast = ast.literal_eval(cast_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return 0
    return len(parsed_cast) if isinstance(parsed_cast, (list, tuple, set, dict)) else 0


if 'cast' in df_movies.columns:
    df_movies['cast_count'] = df_movies['cast'].apply(_count_cast)
else:
    # No cast information available: fall back to a placeholder drawn uniformly
    # from 1..9. It is seeded so that Restart & Run All reproduces the same
    # column (and therefore the same model).
    # NOTE(review): this placeholder is synthetic noise, not a real cast size;
    # consider dropping the feature when the column is absent.
    _cast_rng = np.random.default_rng(RANDOM_STATE)
    df_movies['cast_count'] = _cast_rng.integers(1, 10, size=len(df_movies))

# Compress the heavy right tails of the skewed inputs with log(1 + x)
for raw_col in ('budget', 'popularity', 'vote_count'):
    df_movies[f'log_{raw_col}'] = np.log1p(df_movies[raw_col])

# Model inputs: six numeric features followed by the genre indicator columns
numeric_features = [
    'log_budget',
    'runtime',
    'log_popularity',
    'vote_average',
    'log_vote_count',
    'cast_count',
]
genre_features = [f'genre_{g}' for g in top_genres]
feature_list = numeric_features + genre_features

# Any value still missing in a model input is replaced by 0
filled_features = df_movies[feature_list].fillna(0)
df_movies[feature_list] = filled_features


Model Training and Evaluation

In [None]:

# target in log-space
y = np.log1p(df_movies['revenue'])
X = df_movies[feature_list]

# Hold out 20% of the rows for evaluation; seeded for a repeatable partition
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Model: random forest regressor (an ensemble of depth-limited decision trees)
model = RandomForestRegressor(
    n_estimators=100,           # number of trees
    max_depth=12,               # max depth of each tree
    min_samples_split=10,       # min samples to split
    min_samples_leaf=5,         # min samples per leaf
    random_state=RANDOM_STATE,  # same seed as the split, taken from the constants cell
)
# perf_counter is monotonic, so the measured duration cannot be skewed by
# system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
model.fit(X_train, y_train)
training_time = time.perf_counter() - start_time

def _regression_scores(y_true, y_hat):
    """Return (RMSE, MAE, R2) for one pair of target / prediction arrays."""
    rmse = np.sqrt(mean_squared_error(y_true, y_hat))
    mae = mean_absolute_error(y_true, y_hat)
    r2 = r2_score(y_true, y_hat)
    return rmse, mae, r2


# Score the held-out rows in the space the model was trained in (log1p of revenue)
y_pred_log = model.predict(X_test)
rmse_log, mae_log, r2_log = _regression_scores(y_test, y_pred_log)

# Undo the log1p transform and score again in the original revenue units
y_test_usd = np.expm1(y_test)
y_pred_usd = np.expm1(y_pred_log)
rmse_usd, mae_usd, r2_usd = _regression_scores(y_test_usd, y_pred_usd)

# Median revenue over all rows of df_movies (original units)
median_rev = df_movies['revenue'].median()

# Report
report_lines = [
    "=== Model evaluation (on sample test split) ===",
    f"Log-space RMSE: {rmse_log:.4f}, MAE: {mae_log:.4f}, R2: {r2_log:.4f}",
    f"USD-space RMSE: ${rmse_usd:,.2f}, MAE: ${mae_usd:,.2f}, R2: {r2_usd:.4f}",
    f"Training time: {training_time:.4f} seconds",
]
for line in report_lines:
    print(line)

# Feature importances, largest first
importances = pd.Series(model.feature_importances_, index=feature_list)
fi = importances.sort_values(ascending=False)
print("Top features:")
display(fi.head(10))


=== Model evaluation (on sample test split) ===
Log-space RMSE: 3.7575, MAE: 1.8535, R2: 0.6168
USD-space RMSE: $30,203,599.45, MAE: $6,249,357.10, R2: 0.7440
Note: log-space metrics are more stable; USD metrics show real monetary error and can be dominated by outliers.

Top features:


log_budget        0.580630
log_vote_count    0.267425
log_popularity    0.043957
runtime           0.038394
vote_average      0.028228
cast_count        0.013886
genre_Comedy      0.005024
genre_Romance     0.004059
genre_Drama       0.003522
genre_Thriller    0.003043
dtype: float64

GUI

In [15]:
# --- GUI Application ---

def show_stats():
    """Open a child window with descriptive statistics of ``df_movies``.

    The window holds a single text widget filled, in order, with the ten
    highest-revenue titles, ``describe()`` output for the numeric columns,
    and each numeric column's correlation with revenue.
    """
    window = tk.Toplevel(root)
    window.title("Training Data Statistics")
    window.geometry("700x500")

    text_box = tk.Text(window, wrap='word')
    text_box.pack(expand=True, fill='both', padx=10, pady=10)

    def append(chunk):
        # Everything is written at the end of the widget, in call order.
        text_box.insert(tk.END, chunk)

    # Section 1: top earners
    append("Top 10 Movies by Revenue:\n")
    by_revenue = df_movies[['title','revenue']].sort_values('revenue', ascending=False)
    append(by_revenue.head(10).to_string(index=False))

    # Section 2: summary statistics
    append("\n\nSummary Statistics (Original Units):\n")
    summary_cols = ['budget','revenue','runtime','popularity','vote_average','vote_count','cast_count']
    append(df_movies[summary_cols].describe().to_string())

    # Section 3: correlation of every numeric column with revenue
    corr_cols = ['revenue','budget','runtime','popularity','vote_average','vote_count','cast_count']
    corr = df_movies[corr_cols].corr()['revenue']
    append("\n\nCorrelation with Revenue:\n")
    append(corr.to_string())

def predict_movie():
    """Predict revenue for the movie described in the form and show the result.

    Reads the six numeric entry widgets and the genre checkboxes, builds a
    one-row feature frame in ``feature_list`` order, runs ``model.predict`` and
    converts the log-space prediction back with ``expm1``. The result labels
    are updated with the predicted revenue and with whether it exceeds
    ``median_rev``.

    Each numeric field must parse as a finite, non-negative number. ``float()``
    also accepts "nan", "inf" and negative values, which would turn into
    NaN/inf features after ``log1p``, so they are rejected up front with a
    message naming the offending field. All failures are reported through a
    message box; nothing is raised to the Tk event loop.
    """
    form_fields = (
        ("Budget (USD)", entry_budget),
        ("Runtime (min)", entry_runtime),
        ("Popularity", entry_popularity),
        ("Vote Average", entry_voteavg),
        ("Vote Count", entry_votecount),
        ("Cast Count", entry_cast),
    )
    try:
        values = []
        for field_name, widget in form_fields:
            raw_text = widget.get().strip()
            try:
                value = float(raw_text)
            except ValueError:
                raise ValueError(f"{field_name} must be a number (got {raw_text!r}).") from None
            if not np.isfinite(value) or value < 0:
                raise ValueError(
                    f"{field_name} must be a finite, non-negative number (got {raw_text!r})."
                )
            values.append(value)
        budget, runtime, popularity, vote_avg, vote_count, cast_count = values

        # Checkbox i corresponds to top_genres[i]
        genres_sel = {g for g, var in zip(top_genres, genres_vars) if var.get()}

        fv = {
            'log_budget': np.log1p(budget),
            'runtime': runtime,
            'log_popularity': np.log1p(popularity),
            'vote_average': vote_avg,
            'log_vote_count': np.log1p(vote_count),
            'cast_count': cast_count
        }
        for g in top_genres:
            fv[f'genre_{g}'] = int(g in genres_sel)

        # columns=feature_list fixes the column order to the one used for training
        X_new = pd.DataFrame([fv], columns=feature_list)
        pred_log = model.predict(X_new)[0]
        pred_rev = np.expm1(pred_log)

        result_label.config(text=f"Predicted Revenue: ${pred_rev:,.0f}")
        success = "SUCCESS" if pred_rev > median_rev else "NOT SUCCESSFUL"
        success_label.config(text=f"Prediction: {success} (Threshold: ${median_rev:,.0f})")
    except Exception as e:
        # Covers both the validation errors above and anything raised by the model
        messagebox.showerror("Error", str(e))

# --- Tkinter GUI Layout ---
# Fixed-size main window
root = tk.Tk()
root.title("🎬 Movie Box Office Predictor")
root.geometry("600x650")
root.resizable(False, False)

# Title banner
header = ttk.Label(root, text="Movie Box Office Predictor", font=("Helvetica", 18, "bold"))
header.pack(pady=10)

# Top row: statistics button next to the form caption
frame_buttons = tk.Frame(root)
frame_buttons.pack(pady=5)
stats_button = ttk.Button(frame_buttons, text="Show Training Stats", command=show_stats)
stats_button.pack(side=tk.LEFT, padx=10)
form_caption = ttk.Label(frame_buttons, text="Predict New Movie", font=("Helvetica", 12, "bold"))
form_caption.pack(side=tk.LEFT, padx=10)

# Input form: one caption + entry pair per numeric feature
frame_form = tk.Frame(root)
frame_form.pack(pady=10, padx=10, fill='x')

labels = ["Budget (USD)", "Runtime (min)", "Popularity", "Vote Average", "Vote Count", "Cast Count"]
entries = []
for row, caption in enumerate(labels):
    ttk.Label(frame_form, text=f"{caption}:").grid(row=row, column=0, sticky=tk.W, pady=3)
    field = ttk.Entry(frame_form, width=20)
    field.grid(row=row, column=1, pady=3)
    entries.append(field)
# Unpacked in the same order as `labels`
(entry_budget, entry_runtime, entry_popularity,
 entry_voteavg, entry_votecount, entry_cast) = entries

# Genre checkboxes, four per row, placed below the numeric fields
ttk.Label(frame_form, text="Select Genres:").grid(row=6, column=0, sticky=tk.W, pady=3, columnspan=2)
genres_vars = [tk.BooleanVar() for _ in top_genres]
for position, (genre_name, flag) in enumerate(zip(top_genres, genres_vars)):
    row_offset, col = divmod(position, 4)
    box = ttk.Checkbutton(frame_form, text=genre_name, variable=flag)
    box.grid(row=7 + row_offset, column=col, sticky=tk.W)

# Predict button followed by the two output labels
ttk.Button(root, text="Predict", command=predict_movie).pack(pady=15)
result_label = ttk.Label(root, text="Predicted Revenue: $0", font=("Helvetica", 12))
result_label.pack(pady=3)
success_label = ttk.Label(root, text="Prediction: ---", font=("Helvetica", 12, "bold"))
success_label.pack(pady=3)

# Hand control to Tk; this call blocks until the window is closed
root.mainloop()
