In [1]:
from pandas import read_json
from sklearn.metrics import mean_absolute_error as mae
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV , KFold
import joblib
from sklearn.metrics import r2_score

df = read_json(r'C:\Code\IT3190E_Group_32\Source Code\Data\preprocessed_movieDB.json')

In [2]:
df

Unnamed: 0,title,duration,user_score,number_of_vote,budget,revenue,restriction,year,month,star_power_actors,...,genre_history,genre_horror,genre_music,genre_mystery,genre_romance,genre_science fiction,genre_thriller,genre_tv movie,genre_war,genre_western
0,Four Rooms,98,58,2560,4000000,4257354,5,1995,12,9.098045e+08,...,0,0,0,0,0,0,0,0,0,0
1,Judgment Night,109,65,321,21000000,12136938,5,1993,10,5.573047e+08,...,0,0,0,0,0,0,1,0,0,0
2,Star Wars,121,82,19970,11000000,775398007,3,1977,5,1.336120e+09,...,0,0,0,0,0,1,0,0,0,0
3,Finding Nemo,100,78,18701,94000000,940335536,0,2003,5,3.378325e+09,...,0,0,0,0,0,0,0,0,0,0
4,Forrest Gump,142,85,26574,55000000,677387716,4,1994,7,9.892606e+08,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6229,Climax,97,71,1992,2900000,1700000,5,2018,9,1.694463e+07,...,0,1,0,0,0,0,0,0,0,0
6230,Jurassic World Dominion,147,67,5769,165000000,1001978080,4,2022,6,1.420629e+09,...,0,0,0,0,0,1,0,0,0,0
6231,Five Nights at Freddy's,110,76,3628,20000000,294803785,4,2023,10,3.598118e+08,...,0,1,0,1,0,0,0,0,0,0
6232,Alpviram,140,80,1,70000,75000,2,2009,5,7.500000e+04,...,0,0,0,0,0,0,0,0,0,0


In [3]:
import numpy as np
cols =['revenue','budget']
for col in cols:
    df[col] = np.log(df[col] + 1)


In [4]:
X = df.drop(['revenue','title'], axis=1)
y = df['revenue'].values

In [5]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [6]:
# Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest

In [7]:
rf_reg = RandomForestRegressor()
rf_reg.fit(X_train,y_train)

In [8]:
val_preds = rf_reg.predict(X_val)
validation_error = mae(y_val, val_preds)
print(f'Validation Error: {validation_error}')

Validation Error: 0.7436820620409428


In [9]:

y_pred = rf_reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = rf_reg.score(X_test, y_test)

print(f'Mean Squared Error: {mse:.2f}')
print(f'R-squared: {r2:.2f}')

Mean Squared Error: 1.11
R-squared: 0.83


In [10]:


param_grid = {
    'n_estimators': [ 100, 200,300],
    'max_depth': [5,10,15, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf = RandomForestRegressor()
cv = KFold(n_splits=5, shuffle=True, random_state=42)
# Perform grid search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, scoring=['r2','neg_mean_squared_error'], refit='neg_mean_squared_error', verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.3s
[CV] END max_depth=5, min_samples_leaf

In [11]:
print(f"Best Hyperparameters: {grid_search.best_params_}")
y_pred = grid_search.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

Best Hyperparameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Mean Squared Error: 1.10
R-squared: 0.83


In [12]:

best_model = grid_search.best_estimator_
cross_val_scores = cross_val_score(best_model, X, y, cv=5)
print("Cross-Validation Scores:", cross_val_scores)
print("Mean Cross-Validation Score:", np.mean(cross_val_scores))

Cross-Validation Scores: [0.72081534 0.74650209 0.72180229 0.77858459 0.87523789]
Mean Cross-Validation Score: 0.7685884383984212


In [13]:
model = RandomForestRegressor(max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=200)
cv = KFold(n_splits=5, shuffle=True, random_state=42)
# Fit the model to the data
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

Mean Squared Error: 1.09
R-squared: 0.83
