# Random Forest Tutorial
## A Complete Guide to Random Forest Classification and Regression

This notebook covers:
- Random Forest fundamentals
- Classification example
- Regression example
- Feature importance
- Hyperparameter tuning
- Model evaluation

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, r2_score
from sklearn.datasets import load_iris, fetch_california_housing
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8';
# older releases only ship the un-suffixed 'seaborn' name. Pick whichever this
# installation provides so the notebook runs on both. If neither is present,
# Matplotlib's default style simply stays active.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

## 1. Random Forest Classification Example
### Using the Iris Dataset

In [None]:
# Fetch the Iris data and wrap the feature matrix in a DataFrame with named columns
iris = load_iris()
y_iris = iris.target
X_iris = pd.DataFrame(iris.data, columns=iris.feature_names)

# Quick look: dimensions, a few sample rows, and the class names
print("Dataset shape:", X_iris.shape)
print("\nFirst 5 rows:", X_iris.head(), sep="\n")
print("\nTarget classes:", iris.target_names)

In [None]:
# Hold out 30% of the samples for testing; stratify so each split keeps
# the same class proportions as the full dataset
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    X_iris,
    y_iris,
    test_size=0.3,
    stratify=y_iris,
    random_state=42,
)

# Shallow 100-tree forest, trained on all available cores
rf_classifier = RandomForestClassifier(
    n_estimators=100, max_depth=3, n_jobs=-1, random_state=42
)
rf_classifier.fit(X_train_iris, y_train_iris)

# Per-class probabilities and hard class predictions for the held-out samples
y_pred_proba_iris = rf_classifier.predict_proba(X_test_iris)
y_pred_iris = rf_classifier.predict(X_test_iris)

print("Classification Results:")
print(classification_report(y_test_iris, y_pred_iris, target_names=iris.target_names))

In [None]:
# Confusion matrix of the held-out Iris predictions, drawn as an annotated heatmap
cm = confusion_matrix(y_test_iris, y_pred_iris)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=iris.target_names,
    yticklabels=iris.target_names,
    ax=ax,
)
ax.set_title('Confusion Matrix - Random Forest Classification')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
# Pair every Iris feature with the importance the fitted classifier reports,
# ordered from most to least important
feature_importance = pd.DataFrame(
    {
        'feature': X_iris.columns,
        'importance': rf_classifier.feature_importances_,
    }
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

# Horizontal bar chart of the ranking
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance - Random Forest Classification')
ax.set_xlabel('Importance Score')
plt.show()

print("Feature Importance Ranking:")
print(feature_importance)

## 2. Random Forest Regression Example
### Using the California Housing Dataset

In [None]:
# Fetch the California housing data and wrap the features in a labelled DataFrame
housing = fetch_california_housing()
y_california = housing.target
X_california = pd.DataFrame(data=housing.data, columns=housing.feature_names)

# Quick look at the features
print("Dataset shape:", X_california.shape)
print("\nFirst 5 rows:", X_california.head(), sep="\n")

# Summary statistics of the regression target
print("\nTarget statistics:")
print(f"Mean: {y_california.mean():.2f}")
print(f"Std: {y_california.std():.2f}")
print(f"Min: {y_california.min():.2f}")
print(f"Max: {y_california.max():.2f}")

In [None]:
# 70/30 train/test partition of the housing data
X_train_california, X_test_california, y_train_california, y_test_california = train_test_split(
    X_california,
    y_california,
    random_state=42,
    test_size=0.3,
)

# 100-tree forest with depth capped at 10, trained on all available cores
rf_regressor = RandomForestRegressor(
    n_estimators=100, max_depth=10, n_jobs=-1, random_state=42
)
rf_regressor.fit(X_train_california, y_train_california)

# Predictions for the held-out rows
y_pred_california = rf_regressor.predict(X_test_california)

# Error metrics on the test set (RMSE is the square root of the MSE)
r2 = r2_score(y_test_california, y_pred_california)
mse = mean_squared_error(y_test_california, y_pred_california)
rmse = np.sqrt(mse)

print("Regression Results:")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.3f}")
print(f"Mean Absolute Error: {np.mean(np.abs(y_test_california - y_pred_california)):.2f}")

In [None]:
# Two side-by-side diagnostics for the regressor:
# left = predicted vs. actual, right = residuals vs. predicted
fig, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: points on the dashed red diagonal are perfect predictions
diagonal = [y_test_california.min(), y_test_california.max()]
ax_fit.scatter(y_test_california, y_pred_california, alpha=0.6)
ax_fit.plot(diagonal, diagonal, 'r--', lw=2)
ax_fit.set_xlabel('Actual Values')
ax_fit.set_ylabel('Predicted Values')
ax_fit.set_title('Actual vs Predicted Values')

# Right panel: residual = actual - predicted, with a zero reference line
residuals = y_test_california - y_pred_california
ax_resid.scatter(y_pred_california, residuals, alpha=0.6)
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set_xlabel('Predicted Values')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title('Residuals Plot')

fig.tight_layout()
plt.show()

In [None]:
# Pair every housing feature with the importance the fitted regressor reports,
# ordered from most to least important
feature_importance_reg = pd.DataFrame(
    {
        'feature': X_california.columns,
        'importance': rf_regressor.feature_importances_,
    }
)
feature_importance_reg = feature_importance_reg.sort_values(by='importance', ascending=False)

# Horizontal bar chart of the ranking
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=feature_importance_reg, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance - Random Forest Regression')
ax.set_xlabel('Importance Score')
plt.show()

# head() shows the first five rows of the sorted table
print("Top 5 Most Important Features:")
print(feature_importance_reg.head())

## 3. Hyperparameter Tuning with Grid Search

In [None]:
# Candidate values for each hyperparameter; the search tries every combination
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy
rf_grid = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_grid,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

print("Performing Grid Search...")
grid_search.fit(X_train_iris, y_train_iris)

print("\nBest Parameters:", grid_search.best_params_, sep="\n")
print(f"\nBest Cross-Validation Score: {grid_search.best_score_:.3f}")

# Evaluate the selected model on the held-out Iris test split
best_rf = grid_search.best_estimator_
y_pred_best = best_rf.predict(X_test_iris)
print(f"\nTest Accuracy with Best Model: {best_rf.score(X_test_iris, y_test_iris):.3f}")

## 4. Cross-Validation and Model Comparison

In [None]:
# Cross-validated accuracy for forests of increasing size
n_estimators_range = [10, 50, 100, 200, 500]
cv_scores = []

for n_est in n_estimators_range:
    rf_temp = RandomForestClassifier(random_state=42, n_estimators=n_est)
    scores = cross_val_score(
        rf_temp,
        X_train_iris,
        y_train_iris,
        scoring='accuracy',
        cv=5,
    )
    # Report the mean fold accuracy with a +/- two-standard-deviation spread
    print(f"n_estimators={n_est}: CV Score = {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
    cv_scores.append(scores.mean())

# Mean accuracy as a function of forest size
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(n_estimators_range, cv_scores, 'bo-', linewidth=2, markersize=8)
ax.set_xlabel('Number of Estimators')
ax.set_ylabel('Cross-Validation Accuracy')
ax.set_title('Random Forest Performance vs Number of Estimators')
ax.grid(True, alpha=0.3)
plt.show()

## 5. Understanding Random Forest Parameters

In [None]:
# Demonstrate the effect of max_features parameter
#
# max_features sets how many features each split may consider. Iris has only
# four features, so per the scikit-learn rules 'sqrt' (sqrt(4)), 'log2'
# (log2(4)) and 0.5 (int(0.5 * 4)) all resolve to 2 features per split. With
# the fixed random_state those three settings are expected to give the same
# scores. The integer settings 1 and 3 are included so the comparison also
# covers the remaining distinct values (None means all 4 features) and the
# effect of the parameter is actually visible.
max_features_options = ['sqrt', 'log2', None, 0.5, 1, 3]
feature_results = []

for max_feat in max_features_options:
    rf_temp = RandomForestClassifier(
        n_estimators=100,
        max_features=max_feat,
        random_state=42
    )
    # scoring is spelled out to match the axis label below and the
    # n_estimators comparison; accuracy is also the classifier default
    scores = cross_val_score(rf_temp, X_train_iris, y_train_iris, cv=5, scoring='accuracy')
    feature_results.append({
        # str() so None becomes a usable label for the table and the bar chart
        'max_features': str(max_feat),
        'mean_score': scores.mean(),
        'std_score': scores.std()
    })

feature_df = pd.DataFrame(feature_results)
print("Effect of max_features parameter:")
print(feature_df)

# Visualize: one bar per setting, error bars show the fold-to-fold std
plt.figure(figsize=(10, 6))
plt.bar(feature_df['max_features'], feature_df['mean_score'],
        yerr=feature_df['std_score'], capsize=5)
plt.xlabel('max_features Parameter')
plt.ylabel('Cross-Validation Accuracy')
plt.title('Effect of max_features on Random Forest Performance')
plt.show()

## 6. Out-of-Bag (OOB) Score

In [None]:
# Random Forest with OOB scoring
# oob_score=True asks the forest to also report an out-of-bag score after
# fitting, exposed as the oob_score_ attribute.
rf_oob = RandomForestClassifier(
    n_estimators=100,
    oob_score=True,
    random_state=42
)

rf_oob.fit(X_train_iris, y_train_iris)

print(f"Out-of-Bag Score: {rf_oob.oob_score_:.3f}")
print(f"Test Score: {rf_oob.score(X_test_iris, y_test_iris):.3f}")

# Compare OOB score with different n_estimators
#
# A single forest is grown incrementally: with warm_start=True each fit()
# call keeps the trees that already exist and only trains the additional
# ones. That trains 200 trees in total instead of refitting a fresh forest
# at every size (10 + 20 + ... + 200 = 2100 trees).
oob_scores = []
estimator_range = range(10, 201, 10)

rf_temp = RandomForestClassifier(
    warm_start=True,
    oob_score=True,
    random_state=42
)

for n_est in estimator_range:
    rf_temp.set_params(n_estimators=n_est)
    rf_temp.fit(X_train_iris, y_train_iris)
    oob_scores.append(rf_temp.oob_score_)

plt.figure(figsize=(10, 6))
plt.plot(estimator_range, oob_scores, 'g-', linewidth=2)
plt.xlabel('Number of Estimators')
plt.ylabel('OOB Score')
plt.title('Out-of-Bag Score vs Number of Estimators')
plt.grid(True, alpha=0.3)
plt.show()

## 7. Summary and Best Practices

### Parameter Tuning Guidelines

1. **Start with default parameters**, then tune if needed
2. **n_estimators**: More trees = better performance (diminishing returns after ~100)
3. **max_depth**: Control overfitting (None for full depth, 3-10 for regularization)
4. **max_features**: `'sqrt'` for classification, `'log2'` or 1/3 for regression
5. **min_samples_split/leaf**: Increase to prevent overfitting
6. **Use OOB score** for quick model evaluation
7. **Feature importance** helps with feature selection
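8. **Random Forest handles** missing values and mixed data types well in principle, but in scikit-learn categorical features must be numerically encoded first, and native missing-value (NaN) support is only available in recent releases (1.4+)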
9. **No need for feature scaling**
10. **Use n_jobs=-1** for parallel processing

### Key Advantages

- Reduces overfitting compared to single decision trees
- Provides feature importance rankings
- Handles both numerical and categorical features (scikit-learn requires categorical features to be encoded as numbers first)
- Robust to outliers
- Works well with default parameters

### Key Disadvantages

- Can overfit with very noisy data
- Less interpretable than single decision trees
- Memory intensive for large datasets
- Biased towards categorical variables with more levels (high-cardinality features tend to receive inflated impurity-based importance scores)
