# Homework 5 RF Accuracy Improvement

This assignment is inspired by examples of Shan-Hung Wu from National Tsing Hua University.

Requirement: improve the accuracy per feature (the average accuracy divided by the number of features used) of the following code from 0.03 to at least 0.45, while keeping the accuracy above 0.92.

Here are three hints:

1. You can improve the ratio by picking out or "creating" several features.
2. Tune the hyperparameters.
3. The ratio can be improved from 0.03 up to 0.47.

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier


# load the breast_cancer dataset
# The loader is called once; X and y are read from the returned object's
# `data` and `target` attributes instead of loading the dataset a second time
# with return_X_y=True.
init_data = load_breast_cancer()
X, y = init_data.data, init_data.target

print(X.shape)

# TODO Select some features (X), hint: based on the connections with
# our Y (importance? correlation?)
# TODO need 5 fold cross validation
# TODO Tune parameters for RandomForestClassifier
# TODO Calculate Average accuracy score
# TODO Calculate Average (accuracy score/number of features)

(569, 30)
Accuracy: 0.95
Accuracy per feature: 0.47


In [4]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Purpose of this cell: pick the most important features with a random forest,
# tune a RandomForestClassifier on those features, and report the accuracy and
# the accuracy-per-feature ratio (accuracy / number of selected features).

# Load the dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Feature Selection using RandomForest importance
# NOTE(review): the selector is fit on ALL rows, i.e. before the train/test
# split and the cross-validation below, so the evaluation rows also influence
# which features are kept. The reported scores may therefore be optimistic --
# confirm this protocol is acceptable before reusing it elsewhere.
feature_selector = RandomForestClassifier(n_estimators=100, random_state=42)
feature_selector.fit(X, y)

# Get feature importances
importances = feature_selector.feature_importances_

# Use SelectFromModel to select features based on importance.
# The threshold is THRESHOLD_SCALE times the mean importance. A larger scale is
# stricter (fewer features kept); a smaller one (e.g. 1.5) keeps more features.
THRESHOLD_SCALE = 3.5
threshold = THRESHOLD_SCALE * np.mean(importances)
selector = SelectFromModel(feature_selector, threshold=threshold, prefit=True)

# Reduce X to selected features
X_selected = selector.transform(X)
selected_feature_indices = selector.get_support(indices=True)

# Fail early with a clear message when the threshold is so strict that nothing
# is selected; otherwise the problem only surfaces later as an estimator error
# or as a division by zero in the accuracy-per-feature computation.
if X_selected.shape[1] == 0:
    raise ValueError(
        f"No feature reaches the importance threshold {threshold:.4f}; "
        "lower THRESHOLD_SCALE."
    )

# Print selected feature indices and their importance
selected_importances = importances[selected_feature_indices]
print(f"Selected Feature Indices: {selected_feature_indices}")
print(f"Selected Feature Importances: {selected_importances}")
print(f"Number of Selected Features: {X_selected.shape[1]}")

# Split data into training and testing sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Hyperparameter tuning using GridSearchCV
# The grid spans 3 * 2 * 4 * 3 * 3 = 216 parameter combinations, each scored
# with 5-fold cross-validation on the training split.
param_grid = {
    'n_estimators': [50, 100, 150],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize RandomForestClassifier
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit model (training split only)
grid_search.fit(X_train, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best model to predict on the held-out test split
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

# Calculate accuracy on the held-out test split
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Cross-validation to check overall performance: 5-fold CV of the tuned
# configuration over all rows of the selected features. No `scoring` is passed,
# so the estimator's default score is used (mean accuracy for classifiers).
cv_scores = cross_val_score(best_rf, X_selected, y, cv=5)
cv_accuracy = np.mean(cv_scores)
print(f"Cross-validated Accuracy: {cv_accuracy:.4f}")

# Calculate Accuracy per feature (num_features is non-zero thanks to the guard above)
num_features = X_selected.shape[1]
accuracy_per_feature = cv_accuracy / num_features
print(f"Accuracy per feature: {accuracy_per_feature:.4f}")

# Check if the conditions are met
assert cv_accuracy > 0.92, "Accuracy should be more than 0.92"
assert accuracy_per_feature > 0.45, "Accuracy per feature should be more than 0.45"


Selected Feature Indices: [23 27]
Selected Feature Importances: [0.13935694 0.13222509]
Number of Selected Features: 2
Best Hyperparameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 150}
Accuracy: 0.9561
Cross-validated Accuracy: 0.9403
Accuracy per feature: 0.4701
