# Compare Two Movies Prediction

This notebook demonstrates how to load the trained XGBoost models and compare the predicted performance of two movies side by side.

In [None]:
import joblib
import pandas as pd
import numpy as np
import json
import shap
import matplotlib.pyplot as plt

In [None]:
# Load Artifacts
# NOTE: joblib.load unpickles its input, and unpickling can execute arbitrary
# code. Only load artifact files that come from a trusted source.
models = {}
models['opening'] = joblib.load('../artifacts/model_opening.pkl')
models['revenue'] = joblib.load('../artifacts/model_revenue.pkl')
vectorizer = joblib.load('../artifacts/genre_vectorizer.pkl')
person_power = joblib.load('../artifacts/person_power.pkl')

# model_columns supplies the column labels for the feature frame built in
# preprocess_movie. Read it explicitly as UTF-8 so the result does not depend
# on the platform's default text encoding.
with open('../artifacts/model_columns.json', 'r', encoding='utf-8') as f:
    model_columns = json.load(f)

print("Models loaded successfully.")

In [None]:
def _mean_star_power(crew_str):
    """Return the mean ``person_power`` score of the people named in *crew_str*.

    The string is split on commas and every other entry, starting with the
    first, is taken as a person's name; the entries in between are skipped.
    Names missing from ``person_power`` contribute 0. Returns 0 when the
    string is empty or contains no usable name.
    """
    if not crew_str:
        return 0
    parts = [part.strip() for part in crew_str.split(',')]
    # Skip blank names (e.g. left behind by a trailing comma) so they are not
    # averaged in as zero-power people and do not dilute the mean.
    # NOTE(review): confirm the training pipeline treats blank entries the
    # same way, so that training and inference features stay aligned.
    names = [name for name in parts[0::2] if name]
    if not names:
        return 0
    return np.mean([person_power.get(name, 0) for name in names])


def preprocess_movie(movie_dict):
    """Build the one-row feature frame for a single movie.

    Args:
        movie_dict: Mapping with the keys 'budget', 'release_date', 'genres'
            and 'crew', plus an optional 'score' (0 is used when absent).

    Returns:
        A one-row ``pandas.DataFrame`` whose columns are ``model_columns``.
        The values are the six base features (log budget, release year,
        month and quarter, log star power, score) followed by the genre
        vector produced by ``vectorizer``.

    Raises:
        KeyError: If a required key is missing from *movie_dict*.
        ValueError: If the number of features built here differs from the
            number of entries in ``model_columns``.
    """
    # log1p compresses the wide range of budgets (and is defined at 0).
    log_budget = np.log1p(movie_dict['budget'])

    release = pd.to_datetime(movie_dict['release_date'])

    log_star_power = np.log1p(_mean_star_power(movie_dict['crew']))
    score = movie_dict.get('score', 0)

    genre_vec = vectorizer.transform([movie_dict['genres']]).toarray()[0]

    # Base features, in order: ['log_budget', 'release_year', 'release_month',
    # 'release_quarter', 'log_star_power', 'score'], followed by the genres.
    base_feats = [
        log_budget,
        release.year,
        release.month,
        release.quarter,
        log_star_power,
        score,
    ]
    full_feats = np.concatenate([base_feats, genre_vec])

    # Fail with an explicit message instead of an opaque pandas error when
    # the loaded artifacts are out of sync with this feature layout.
    if len(full_feats) != len(model_columns):
        raise ValueError(
            f"Built {len(full_feats)} features but model_columns lists "
            f"{len(model_columns)}; the loaded artifacts do not match."
        )

    return pd.DataFrame([full_feats], columns=model_columns)

In [None]:
# The two example movies to compare. Each one carries the fields that
# preprocess_movie reads, plus a display title used when printing results.
movie1 = dict(
    title='Space Adventure 2026',
    budget=200_000_000,
    release_date='2026-07-15',
    genres='Science Fiction, Action, Adventure',
    crew='Tom Cruise, Actor, Christopher Nolan, Director',
    score=85,
)

movie2 = dict(
    title='Romantic Comedy in Paris',
    budget=40_000_000,
    release_date='2026-02-14',
    genres='Romance, Comedy',
    crew='Jennifer Aniston, Actor, Adam Sandler, Actor',
    score=65,
)

In [None]:
# Turn each movie into the one-row feature frame the models consume.
X1, X2 = preprocess_movie(movie1), preprocess_movie(movie2)

# Each predict call receives a single row, so take element 0 of its result.
pred1_ow, pred1_rev = (
    models['opening'].predict(X1)[0],
    models['revenue'].predict(X1)[0],
)
pred2_ow, pred2_rev = (
    models['opening'].predict(X2)[0],
    models['revenue'].predict(X2)[0],
)

# Print both reports, one line per item; the second header starts with a
# newline so a blank line separates the two movies.
print(
    f"--- {movie1['title']} ---",
    f"Predicted Opening Weekend: ${pred1_ow:,.2f}",
    f"Predicted Total Revenue: ${pred1_rev:,.2f}",
    f"\n--- {movie2['title']} ---",
    f"Predicted Opening Weekend: ${pred2_ow:,.2f}",
    f"Predicted Total Revenue: ${pred2_rev:,.2f}",
    sep="\n",
)

In [None]:
# Explain the Total Revenue prediction for Movie 1 with SHAP.
# Build a tree explainer around the revenue model, then compute SHAP values
# for Movie 1's single feature row.
explainer = shap.TreeExplainer(model=models['revenue'])
shap_values = explainer.shap_values(X=X1)

# Draw the bar-type summary plot of those SHAP values against X1's features.
shap.summary_plot(shap_values, features=X1, plot_type="bar")