
# Random Forest



In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space: every combination below is tried by the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],        # how many trees to grow
    max_depth=[None, 10, 20, 30],       # depth cap per tree (None = no cap)
    min_samples_split=[2, 10, 20],      # samples a node needs before it may split
)

# Base estimator; the fixed seed keeps runs reproducible.
rf = RandomForestRegressor(random_state=42)

# Exhaustive search scored by R^2 with 3-fold cross-validation on the training data,
# using all available cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    scoring='r2',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and keep the corresponding fitted model.
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")
best_rf_model = grid_search.best_estimator_

# Predictions on the held-out validation and test splits (in that order).
y_pred_val_best_rf, y_pred_test_best_rf = (
    best_rf_model.predict(features) for features in (X_val, X_test)
)

# Validation metrics: MSE first, then R^2.
mse_val_best_rf, r2_val_best_rf = (
    metric(y_val, y_pred_val_best_rf) for metric in (mean_squared_error, r2_score)
)
print(f"Best Random Forest - Mean Squared Error (Validation Data): {mse_val_best_rf}")
print(f"Best Random Forest - R-squared (Validation Data): {r2_val_best_rf}")

# Test metrics, computed the same way.
mse_test_best_rf, r2_test_best_rf = (
    metric(y_test, y_pred_test_best_rf) for metric in (mean_squared_error, r2_score)
)
print(f"Best Random Forest - Mean Squared Error (Test Data): {mse_test_best_rf}")
print(f"Best Random Forest - R-squared (Test Data): {r2_test_best_rf}")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Rank the features from most to least important according to the tuned forest.
importances = best_rf_model.feature_importances_
indices = np.flip(np.argsort(importances))

# Bar chart of the importances in ranked order, labelled with the column names.
plt.figure(figsize=(10, 6))
plt.bar(range(X_train.shape[1]), importances[indices], align="center")
plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)
plt.ylabel('Importance')
plt.title("Feature Importances (Best Random Forest)")
plt.show()


def _plot_actual_vs_predicted(actual, predicted, color, title):
    """Scatter predictions against actual values with a y = x reference line.

    The reference line spans the minimum to the maximum of ``actual``.
    """
    lowest, highest = actual.min(), actual.max()
    plt.figure(figsize=(8, 6))
    plt.scatter(actual, predicted, color=color, alpha=0.3, label='Predicted vs Actual')
    plt.plot([lowest, highest], [lowest, highest], color='red', lw=2, label='Perfect Fit Line')
    plt.xlabel('Actual Energy')
    plt.ylabel('Predicted Energy')
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()


# Validation split first, then the test split.
_plot_actual_vs_predicted(
    y_val,
    y_pred_val_best_rf,
    'green',
    'Validation Data: Actual vs Predicted Energy (Best Random Forest)',
)
_plot_actual_vs_predicted(
    y_test,
    y_pred_test_best_rf,
    'blue',
    'Test Data: Actual vs Predicted Energy (Best Random Forest)',
)


In [None]:
import matplotlib.pyplot as plt

# Actual-vs-predicted scatter plots for the validation split and then the test
# split, each with a red y = x line spanning the range of the actual values.
# NOTE(review): these are the ridge predictions (y_pred_*_ridge) even though the
# cell sits under the "Random Forest" heading — confirm the placement is intended.
for _actual, _predicted, _color, _title in (
    (y_val, y_pred_val_ridge, 'green', 'Validation Data: Actual vs Predicted Energy'),
    (y_test, y_pred_test_ridge, 'blue', 'Test Data: Actual vs Predicted Energy'),
):
    _low, _high = _actual.min(), _actual.max()
    plt.figure(figsize=(8, 6))
    plt.scatter(_actual, _predicted, color=_color, alpha=0.3, label='Predicted vs Actual')
    plt.plot([_low, _high], [_low, _high], color='red', lw=2, label='Perfect Fit Line')
    plt.xlabel('Actual Energy')
    plt.ylabel('Predicted Energy')
    plt.title(_title)
    plt.legend()
    plt.grid(True)
    plt.show()


In [None]:
# Residuals of the test-set predictions (actual minus predicted).
residuals = y_test - y_pred_test

# plt.scatter requires x and y of the same size, so a multi-column feature table
# cannot be plotted against the 1-D residuals. Use the loudness column (the
# quantity named on the x-axis) when X_test exposes one; otherwise fall back to
# X_test unchanged, which is the original behaviour.
# NOTE(review): assumes the feature is stored in a column called "loudness"
# (matched case-insensitively) — confirm against the dataset.
_loudness_cols = [c for c in getattr(X_test, "columns", []) if str(c).lower() == "loudness"]
_x_values = X_test[_loudness_cols[0]] if _loudness_cols else X_test

plt.scatter(_x_values, residuals)
plt.axhline(y=0, color='r', linestyle='--')  # zero-error reference line
plt.xlabel('Loudness')
plt.ylabel('Residuals')
plt.title('Residuals Plot')
plt.show()
