In [15]:
import os

import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import median_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.preprocessing import StandardScaler

In [16]:
# Location of the preprocessed movie dataset. The default is the original
# author's local checkout; set the MOVIEDB_JSON environment variable to point
# at the file on another machine instead of editing this cell.
DATA_PATH = os.environ.get(
    'MOVIEDB_JSON',
    r'C:\Code\IT3190E_Group_32\Source Code\Data\preprocessed_movieDB.json',
)

# Load the data and discard the `title` column, which is not used as a model
# feature. The drop returns a new frame instead of mutating with inplace=True.
df = pd.read_json(DATA_PATH).drop(columns='title')

Unnamed: 0,duration,user_score,number_of_vote,budget,revenue,restriction,year,month,star_power_actors,star_power_directors,...,genre_history,genre_horror,genre_music,genre_mystery,genre_romance,genre_science fiction,genre_thriller,genre_tv movie,genre_war,genre_western
0,98,58,2560,4000000,4257354,5,1995,12,9.098045e+08,5.001889e+08,...,0,0,0,0,0,0,0,0,0,0
1,109,65,321,21000000,12136938,5,1993,10,5.573047e+08,3.604714e+08,...,0,0,0,0,0,0,1,0,0,0
2,121,82,19970,11000000,775398007,3,1977,5,1.336120e+09,3.341551e+09,...,0,0,0,0,0,1,0,0,0,0
3,100,78,18701,94000000,940335536,0,2003,5,3.378325e+09,1.745747e+09,...,0,0,0,0,0,0,0,0,0,0
4,142,85,26574,55000000,677387716,4,1994,7,9.892606e+08,3.613586e+09,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6229,97,71,1992,2900000,1700000,5,2018,9,1.694463e+07,8.954249e+06,...,0,1,0,0,0,0,0,0,0,0
6230,147,67,5769,165000000,1001978080,4,2022,6,1.420629e+09,1.001978e+09,...,0,0,0,0,0,1,0,0,0,0
6231,110,76,3628,20000000,294803785,4,2023,10,3.598118e+08,2.948038e+08,...,0,1,0,1,0,0,0,0,0,0
6232,140,80,1,70000,75000,2,2009,5,7.500000e+04,1.480475e+09,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Put the two monetary columns on a log scale. np.log1p(x) evaluates
# log(1 + x) directly: it is the idiomatic (and, for small x, more accurate)
# spelling of np.log(x + 1). The +1 offset keeps zero values finite.
# NOTE: this overwrites the raw values, so run this cell exactly once per load
# of `df` -- re-running it would apply the logarithm a second time.
for money_col in ('revenue', 'budget'):
    df[money_col] = np.log1p(df[money_col])
df

Unnamed: 0,duration,user_score,number_of_vote,budget,revenue,restriction,year,month,star_power_actors,star_power_directors,...,genre_history,genre_horror,genre_music,genre_mystery,genre_romance,genre_science fiction,genre_thriller,genre_tv movie,genre_war,genre_western
0,98,58,2560,15.201805,15.264159,5,1995,12,9.098045e+08,5.001889e+08,...,0,0,0,0,0,0,0,0,0,0
1,109,65,321,16.860033,16.311764,5,1993,10,5.573047e+08,3.604714e+08,...,0,0,0,0,0,0,1,0,0,0
2,121,82,19970,16.213406,20.468887,3,1977,5,1.336120e+09,3.341551e+09,...,0,0,0,0,0,1,0,0,0,0
3,100,78,18701,18.358805,20.661747,0,2003,5,3.378325e+09,1.745747e+09,...,0,0,0,0,0,0,0,0,0,0
4,142,85,26574,17.822844,20.333754,4,1994,7,9.892606e+08,3.613586e+09,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6229,97,71,1992,14.880222,14.346139,5,2018,9,1.694463e+07,8.954249e+06,...,0,1,0,0,0,0,0,0,0,0
6230,147,67,5769,18.921456,20.725242,4,2022,6,1.420629e+09,1.001978e+09,...,0,0,0,0,0,1,0,0,0,0
6231,110,76,3628,16.811243,19.501821,4,2023,10,3.598118e+08,2.948038e+08,...,0,1,0,1,0,0,0,0,0,0
6232,140,80,1,11.156265,11.225257,2,2009,5,7.500000e+04,1.480475e+09,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Target: revenue, which is on the log(revenue + 1) scale at this point.
y = df['revenue']
# Features: every remaining column.
# NOTE(review): in the displayed data, star_power_actors / star_power_directors
# equal the row's own raw revenue for some movies (e.g. row 6232: 7.5e4 vs
# revenue 75000) -- possible target leakage; confirm how they were built.
X = df.drop(columns=['revenue'])

In [19]:
# Hold out 20% of the rows for final testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits so no test-set statistics are used when fitting.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [21]:
# Baseline gradient-boosted regression trees with hand-picked hyperparameters,
# trained on the scaled training split.
baseline_params = dict(
    loss='squared_error',
    learning_rate=0.05,
    n_estimators=300,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',   # consider only sqrt(n_features) candidates per split
    random_state=42,       # fixed seed for reproducible feature subsampling
)
gbdt = GradientBoostingRegressor(**baseline_params)
gbdt.fit(X_train_scaled, y_train)

In [22]:
# Evaluate the baseline model on the held-out split. The target is
# log(revenue + 1), so both error metrics are in log units, not currency.
y_pred = gbdt.predict(X_test_scaled)
# median_absolute_error is the *median* of |y_true - y_pred|; it is labelled
# MedAE so it is not mistaken for the mean absolute error that "MAE"
# conventionally denotes.
medae = median_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)

print(f'GBDT Test MedAE: {medae}')
print(f'GBDT Test RMSE: {rmse}')
print(f'GBDT Test R²: {r2}')

GBDT Test MAE: 0.4734918648148785
GBDT Test RMSE: 1.1630179439156336
GBDT Test R²: 0.8022476389551915


In [23]:
# Search space for the randomized hyperparameter search.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], and
# randint(low, high) samples integers from low up to high - 1.
param_dist = {
    'learning_rate': uniform(0.01, 0.09),      # 0.01 .. 0.10
    'n_estimators': randint(100, 501),         # 100 .. 500
    'max_depth': randint(3, 8),                # 3 .. 7
    'min_samples_split': randint(2, 11),       # 2 .. 10
    'min_samples_leaf': randint(1, 5),         # 1 .. 4
    'max_features': ['sqrt', 'log2', None],
    'subsample': uniform(0.6, 0.4)             # 0.6 .. 1.0
}

# Shuffled 5-fold cross-validation with a fixed seed so folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# The unfitted template estimator gets its own name: rebinding `gbdt` here
# would replace the fitted baseline model with an unfitted one, so re-running
# the baseline evaluation cell afterwards would fail.
search_estimator = GradientBoostingRegressor(random_state=42)
# No `scoring` is passed, so candidates are ranked by the estimator's own
# score method. 250 candidates x 5 folds = 1250 fits; expect a long run.
random_search = RandomizedSearchCV(
    estimator=search_estimator,
    param_distributions=param_dist,
    n_iter=250,
    cv=cv,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 250 candidates, totalling 1250 fits


In [24]:
# Keep handles to the winning model and the hyperparameters that produced it.
# best_estimator_ is available because the search was built with the default
# refit behaviour.
best_model = random_search.best_estimator_
best_params = random_search.best_params_

In [25]:
# Evaluate the tuned model on the held-out split. The target is
# log(revenue + 1), so both error metrics are in log units, not currency.
y_pred = best_model.predict(X_test_scaled)

# median_absolute_error is the *median* of |y_true - y_pred|; it is labelled
# MedAE so it is not mistaken for the mean absolute error that "MAE"
# conventionally denotes.
medae = median_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)

print(f'Best Parameters: {best_params}')
print(f'GBDT MedAE: {medae}')
print(f'GBDT RMSE: {rmse}')
print(f'GBDT R²: {r2}')

Best Parameters: {'learning_rate': 0.050124947756823204, 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 199, 'subsample': 0.6571467271687763}
GBDT MAE: 0.4167697734713691
GBDT RMSE: 1.1368209242166454
GBDT R²: 0.8110560615590713
