In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Train a random-forest regressor that predicts the "popularity" column of the
# preprocessed dataset, tune it with an exhaustive cross-validated grid search,
# report hold-out metrics, and persist the best model to disk.

one_m_data = pd.read_csv('./processed_data.csv')

# Target is the "popularity" column; every remaining column is used as a feature.
y = one_m_data["popularity"].values
X = one_m_data.drop(["popularity"], axis=1)

# 80/20 train/test split with a fixed seed so runs are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

regressor = RandomForestRegressor(random_state=42)

# Hyper-parameter grid.
# - "auto" was removed from `max_features` in scikit-learn 1.3 (deprecated in
#   1.1); for regressors it meant "use all features", which is spelled 1.0.
# - `oob_score` is intentionally NOT searched: it only adds an out-of-bag
#   estimate after fitting and has no effect on the fitted trees or on the
#   cross-validated score, so including it merely doubles the number of fits.
# NOTE: this is still 10 * 7 * 6 * 6 * 3 = 7560 candidates (x5 folds); consider
# RandomizedSearchCV if the run time is prohibitive.
param_grid = {
    'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500, 550],
    'max_depth': [10, 20, 30, 40, 50, 60, 70],
    'min_samples_split': [2, 3, 5, 7, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6],
    'max_features': [1.0, "sqrt", "log2"],
}

# 5-fold CV minimising MSE (scikit-learn maximises scores, hence the negated
# metric). n_jobs=-1 spreads the candidate fits over all available cores; the
# selected parameters are unaffected because every fit is seeded.
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

# With the default refit=True, best_estimator_ has already been refitted on the
# whole training split using the best parameter combination.
best_regressor = grid_search.best_estimator_
y_pred = best_regressor.predict(X_test)

# Evaluate on the held-out test split.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

# Persist the tuned model for later inference.
joblib.dump(best_regressor, 'random_forest_regressor.pkl')