In [1]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
import pandas as pd

In [2]:
X = pd.read_pickle("../data/processed/matrix_ready_for_regression.pkl")
data = pd.read_pickle("../data/processed/final_merged_data.pkl")

In [3]:
# Remove F2P games (they have $0 revenue, not useful for prediction)
# $0 revenue would skew the regression results by creating a large cluster of points at zero and attributing profitability to features that indicate F2P status

y = data["log_estimated_revenue"]
mask_paid = X['f2p_flag'] == 0
X_paid = X[mask_paid].copy()
y_paid = y[mask_paid].copy()

In [4]:
# Target variable in log scale
X_paid.columns

Index(['peak_players', 'win', 'mac', 'linux', 'user_reviews', 'Price',
       'release_year', 'copies_sold_reviews_proxy', 'f2p_flag',
       'avg_sentiment', 'has_sentiment', 'Tags_Indie', 'Tags_Singleplayer',
       'Tags_Action', 'Tags_Casual', 'Tags_Adventure', 'Tags_2D',
       'Tags_Strategy', 'Tags_Simulation', 'Tags_RPG', 'Tags_Puzzle',
       'Tags_Atmospheric', 'Tags_Story Rich', 'Tags_Pixel Graphics',
       'Tags_Early Access', 'Tags_Multiplayer', 'Tags_3D', 'Tags_Colorful',
       'Tags_Cute', 'Tags_Arcade', 'Tags_First-Person', 'Tags_Exploration',
       'Tags_Fantasy', 'Tags_Funny', 'Tags_Shooter', 'Tags_Retro',
       'Tags_Platformer', 'Tags_Anime', 'Tags_Horror', 'Tags_Family Friendly',
       'Tags_Difficult', 'Genres_Indie', 'Genres_Action', 'Genres_Adventure',
       'Genres_Casual', 'Genres_Simulation', 'Genres_Strategy', 'Genres_RPG',
       'Genres_Free To Play', 'Genres_Early Access', 'Genres_Sports',
       'Genres_Racing', 'Genres_Massively Multiplayer', 'Gen

In [5]:
# Drop f2p_flag and related features
cols_to_drop = ['f2p_flag', 'peak_players', 'copies_sold_reviews_proxy', 'user_reviews']
if 'Genres_Free To Play' in X_paid.columns:
    cols_to_drop.append('Genres_Free To Play')

X_paid = X_paid.drop(columns=cols_to_drop)

print(f"Original dataset: {len(X)} games")
print(f"Paid games only: {len(X_paid)} games")
print(f"Removed features: {cols_to_drop}")

Original dataset: 37491 games
Paid games only: 30536 games
Removed features: ['f2p_flag', 'peak_players', 'copies_sold_reviews_proxy', 'user_reviews', 'Genres_Free To Play']


In [None]:
# Split data into training and testing sets (80% train, 20% test) 
X_train, X_test, y_train, y_test = train_test_split(
    X_paid, y_paid, test_size=0.2, random_state=42
)

In [9]:
# Creates and trains the XGBoost regression model
model = XGBRegressor(
    n_estimators=500, # Number of trees
    learning_rate=0.05, # Step size shrinkage, or how much each tree is allowed to correct the previous one
    max_depth=8, # Maximum tree depth for base learners, not overall tree depth
    subsample=0.8, # Trains each tree on 80% of the data to prevent overfitting
    colsample_bytree=0.8, # Use 80% of the features for each tree
    objective="reg:squarederror", # Regression with squared loss
    tree_method="hist", # Histogram based algorithms are known to be faster for large datasets
    random_state=42, 
    early_stopping_rounds=50,  # Stop training if no improvement in 50 rounds
    eval_metric="rmse" # Evaluation metric is Root Mean Squared Error (RMSE)
)

# X_train and y_train correspond to the dataset attributes and target variable
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=False
)

# Process: Build tree to predict y_train, then check performance on X_test and y_test, build another tree if not good enough, repeat until early stopping or max trees reached
# Each tree can be thought of as a weak learner that corrects the errors of the previous trees and has a max depth of 8 levels

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,50
,enable_categorical,False


In [10]:
# Model predicts LOG revenue, and we convert it back to actual revenue
y_pred_log = model.predict(X_test)
y_pred_revenue = np.exp(y_pred_log) 

# Also convert actual values back to dollars for comparison
y_test_revenue = np.exp(y_test)

In [11]:
rmse_log = np.sqrt(mean_squared_error(y_test, y_pred_log)) # RMSE on log scale 
mae_log = mean_absolute_error(y_test, y_pred_log) # MAE on log scale
r2_log = r2_score(y_test, y_pred_log) # R² on log scale

# Metrics on ACTUAL dollar scale (for interpretability), will be larger due to exponentiation
rmse_actual = np.sqrt(mean_squared_error(y_test_revenue, y_pred_revenue))
mae_actual = mean_absolute_error(y_test_revenue, y_pred_revenue)
r2_actual = r2_score(y_test_revenue, y_pred_revenue)

print("=" * 50)
print("REGRESSION PERFORMANCE (Revenue Prediction)")
print("=" * 50)
print("\nLog-Scale Metrics (what model optimizes):")
print(f"  RMSE: {rmse_log:.4f}")
print(f"  MAE:  {mae_log:.4f}")
print(f"  R²:   {r2_log:.4f}")

print("\nActual Dollar Metrics (for interpretation):")
print(f"  RMSE: ${rmse_actual:,.2f}")
print(f"  MAE:  ${mae_actual:,.2f}")
print(f"  R²:   {r2_actual:.4f}")

REGRESSION PERFORMANCE (Revenue Prediction)

Log-Scale Metrics (what model optimizes):
  RMSE: 1.3995
  MAE:  1.0599
  R²:   0.6864

Actual Dollar Metrics (for interpretation):
  RMSE: $26,568,658.94
  MAE:  $1,888,019.81
  R²:   0.1102


In [12]:
# Define success threshold
SUCCESS_THRESHOLD = 100_000  # $100k revenue

# Create binary labels using ACTUAL revenue (not log)
y_test_success = (y_test_revenue >= SUCCESS_THRESHOLD).astype(int)
y_pred_success = (y_pred_revenue >= SUCCESS_THRESHOLD).astype(int)


In [13]:
# Calculates the accuracy of the classification by comparing predicted success labels to actual success labels
accuracy = accuracy_score(y_test_success, y_pred_success)

# Check if we have both classes in predictions
unique_pred = np.unique(y_pred_success)
unique_actual = np.unique(y_test_success)

print("\n" + "=" * 50)
print(f"CLASSIFICATION PERFORMANCE (Success >= ${SUCCESS_THRESHOLD:,})")
print("=" * 50)

print(f"\nActual successes in test set: {y_test_success.sum()} / {len(y_test_success)} ({y_test_success.mean():.1%})") # Also known as prevalence
print(f"Predicted successes: {y_pred_success.sum()} / {len(y_pred_success)} ({y_pred_success.mean():.1%})") # Prevalence of predicted positives

print(f"\nRevenue distribution in test set:")
print(f"  Min:     ${y_test_revenue.min():,.2f}")
print(f"  25th %:  ${np.percentile(y_test_revenue, 25):,.2f}")
print(f"  Median:  ${np.percentile(y_test_revenue, 50):,.2f}")
print(f"  75th %:  ${np.percentile(y_test_revenue, 75):,.2f}")
print(f"  Max:     ${y_test_revenue.max():,.2f}")
print(f"  Mean:    ${y_test_revenue.mean():,.2f}")

# Only calculate precision/recall/F1 if both classes exist in predictions and actuals
if len(unique_pred) > 1 and len(unique_actual) > 1:
    precision = precision_score(y_test_success, y_pred_success, zero_division=0) # Zero division means if no predicted positives, precision is set to 0
    recall = recall_score(y_test_success, y_pred_success, zero_division=0)
    f1 = f1_score(y_test_success, y_pred_success, zero_division=0)
    cm = confusion_matrix(y_test_success, y_pred_success)
    
    print(f"\nAccuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f} (of predicted successes, % actually successful)")
    print(f"Recall:    {recall:.4f} (of actual successes, % we caught)")
    print(f"F1 Score:  {f1:.4f}")
    
    print("\nConfusion Matrix:")
    print(f"                 Predicted")
    print(f"               Fail  Success")
    print(f"Actual Fail    {cm[0,0]:4d}    {cm[0,1]:4d}")
    print(f"      Success  {cm[1,0]:4d}    {cm[1,1]:4d}")
else:
    print(f"\nAccuracy:  {accuracy:.4f}")
    print("\n⚠️  WARNING: Model predicts only one class!")
    print(f"   Try adjusting SUCCESS_THRESHOLD (current: ${SUCCESS_THRESHOLD:,})")
    print(f"   Revenue range in test set: ${y_test.min():,.2f} to ${y_test.max():,.2f}")
    print(f"   Median revenue: ${y_test.median():,.2f}")
    print(f"   Mean revenue: ${y_test.mean():,.2f}")



CLASSIFICATION PERFORMANCE (Success >= $100,000)

Actual successes in test set: 1453 / 6108 (23.8%)
Predicted successes: 1275 / 6108 (20.9%)

Revenue distribution in test set:
  Min:     $162.70
  25th %:  $3,294.40
  Median:  $13,486.75
  75th %:  $88,923.85
  Max:     $1,101,059,867.20
  Mean:    $2,068,424.07

Accuracy:  0.8579
Precision: 0.7294 (of predicted successes, % actually successful)
Recall:    0.6401 (of actual successes, % we caught)
F1 Score:  0.6818

Confusion Matrix:
                 Predicted
               Fail  Success
Actual Fail    4310     345
      Success   523     930


In [14]:
# This shows the top 20 most important features used by the model by importance score
# By default, the importance is calculated based on the number of times a feature is used to split the data across all trees (weight)
importances = pd.Series(model.feature_importances_, index=X_paid.columns)
top20 = importances.sort_values(ascending=False).head(20)

print("\n" + "=" * 50)
print("TOP 20 MOST IMPORTANT FEATURES")
print("=" * 50)
for feature, importance in top20.items():
    print(f"{feature:40s} {importance:.6f}")




TOP 20 MOST IMPORTANT FEATURES
has_sentiment                            0.194738
Categories_Steam Trading Cards           0.136802
Price                                    0.135973
Categories_Family Sharing                0.043950
avg_sentiment                            0.029159
Categories_Steam Cloud                   0.027535
Categories_Multi-player                  0.018761
Genres_Casual                            0.018234
Genres_Indie                             0.018047
release_year                             0.013673
Categories_Co-op                         0.013440
Categories_Steam Achievements            0.013182
Categories_Full controller support       0.012115
Genres_Simulation                        0.011979
Genres_Action                            0.009510
Categories_Partial Controller Support    0.009454
Categories_Single-player                 0.009423
Genres_RPG                               0.009370
Categories_PvP                           0.008403
Genres_Early Acces

In [18]:
X_test["Price"]

28200     2.99
26139     4.99
20278    19.99
35405     9.99
15107     7.99
         ...  
17821    19.99
9683      2.79
16215     5.99
8034      9.99
7258     13.99
Name: Price, Length: 6108, dtype: float64

In [None]:
# Analyze relationship between Price and Revenue
print("\n" + "=" * 50)
print("PRICE VS REVENUE INSIGHTS")
print("=" * 50)


if 'Price' in X_test.columns:
    X_test_reset = X_test.reset_index(drop=True)
    y_test_reset = y_test.reset_index(drop=True)
    
    # Convert y_test from log space -> dollars
    y_test_dollars = np.exp(y_test_reset)   # or np.expm1(...) depending on your transform

    # CORRELATION in dollar space
    price_revenue_corr = np.corrcoef(X_test_reset['Price'], y_test_dollars)[0, 1]
    print(f"Price–Revenue Correlation (dollars): {price_revenue_corr:.4f}")

    # PRICE BINS
    price_bins = [0, 10, 20, 30, 50, 100]

    for i in range(len(price_bins) - 1):
        mask = (X_test_reset['Price'] >= price_bins[i]) & (X_test_reset['Price'] < price_bins[i+1])

        if mask.sum() > 0:
            avg_rev = y_test_dollars[mask].mean()
            count = mask.sum()
            print(f"${price_bins[i]}-${price_bins[i+1]}: Avg Revenue ${avg_rev:,.2f} ({count} games)")



PRICE VS REVENUE INSIGHTS
Price–Revenue Correlation (dollars): 0.1553
$0-$10: Avg Revenue $370,615.73 (4232 games)
$10-$20: Avg Revenue $1,949,411.62 (1423 games)
$20-$30: Avg Revenue $6,712,427.72 (269 games)
$30-$50: Avg Revenue $23,376,405.38 (139 games)
$50-$100: Avg Revenue $89,882,342.34 (36 games)


In [None]:
if 'Price' in X_test.columns:
    X_test_reset = X_test.reset_index(drop=True)
    y_test_reset = y_test.reset_index(drop=True)

    # Convert log revenue → actual dollars
    y_test_dollars = np.exp(y_test_reset)

    # Price bins
    price_bins = [0, 10, 20, 30, 50, 100]

    print("==================================================")
    print("PRICE VS REVENUE INSIGHTS (MEDIAN)")
    print("==================================================")

    for i in range(len(price_bins) - 1):
        low = price_bins[i]
        high = price_bins[i+1]

        mask = (X_test_reset['Price'] >= low) & (X_test_reset['Price'] < high)
        count = mask.sum()

        if count > 0:
            median_rev = np.median(y_test_dollars[mask])
            print(f"${low}-${high}: Median Revenue ${median_rev:,.2f} ({count} games)")


PRICE VS REVENUE INSIGHTS (MEDIAN)
$0-$10: Median Revenue $5,978.50 (4232 games)
$10-$20: Median Revenue $77,962.00 (1423 games)
$20-$30: Median Revenue $323,871.40 (269 games)
$30-$50: Median Revenue $1,471,633.00 (139 games)
$50-$100: Median Revenue $4,315,731.70 (36 games)


In [16]:

print("\n" + "=" * 50)
print("SAMPLE PREDICTIONS")
print("=" * 50)

# Show 5 random sample predictions with actual vs predicted revenue and success/fail labels
# This is data from the test set, so the model has not seen these games during training
sample_indices = np.random.choice(len(y_test), size=5, replace=False)
for idx in sample_indices:
    actual = y_test_revenue.iloc[idx]
    predicted = y_pred_revenue[idx]
    actual_label = "SUCCESS" if actual >= SUCCESS_THRESHOLD else "FAIL"
    pred_label = "SUCCESS" if predicted >= SUCCESS_THRESHOLD else "FAIL"
    
    print(f"\nGame #{idx}")
    print(f"  Actual:    ${actual:,.2f} ({actual_label})")
    print(f"  Predicted: ${predicted:,.2f} ({pred_label})")
    print(f"  Match: {'✓' if actual_label == pred_label else '✗'}")


SAMPLE PREDICTIONS

Game #519
  Actual:    $71,953.00 (FAIL)
  Predicted: $45,069.61 (FAIL)
  Match: ✓

Game #6061
  Actual:    $2,389.00 (FAIL)
  Predicted: $2,149.72 (FAIL)
  Match: ✓

Game #15
  Actual:    $1,647.70 (FAIL)
  Predicted: $5,235.33 (FAIL)
  Match: ✓

Game #2868
  Actual:    $4,111,615.50 (SUCCESS)
  Predicted: $289,808.53 (SUCCESS)
  Match: ✓

Game #5961
  Actual:    $29,442.00 (FAIL)
  Predicted: $25,068.68 (FAIL)
  Match: ✓
