# Lecture 14 Notes

The plan:
1. set up imports
2. load the data
3. split into training and testing data
4. define the different hyperparameter values
5. loop over combinations of hyperparameters, train model, and evaluate accuracy

In [12]:
# set up imports
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

In [2]:
# load the data
digits = load_digits()

X = digits.data
y = digits.target

X.shape, y.shape

((1797, 64), (1797,))

In [3]:
# do the train test split (75% of data for training)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

y_train

array([5, 2, 0, ..., 2, 7, 1], shape=(1347,))

In [8]:
# define hyperparameter values to test
max_samples = [0.25, 0.5, 0.75, 0.99, 1]
max_features = [0.25, 0.5, 0.75, 0.99, 1]

In [None]:
results = []

for max_s in max_samples:
    for max_f in max_features:
        # create a random forest classifier with these parameters
        rf_clf = RandomForestClassifier(max_samples=max_s, max_features=max_f, random_state=42)

        # fit the model
        rf_clf.fit(X_train, y_train)

        # make predictions
        y_preds = rf_clf.predict(X_test)

        # evaluate accuracy
        acc = accuracy_score(y_test, y_preds)

        results.append({
            'max features': max_f,
            'max samples': max_s,
            'accuracy': acc,
        })

result_df = pd.DataFrame(results)
result_df 

Unnamed: 0,max features,max samples,accuracy
0,0.25,0.25,0.962222
1,0.5,0.25,0.964444
2,0.75,0.25,0.957778
3,0.99,0.25,0.96
4,1.0,0.25,0.953333
5,0.25,0.5,0.975556
6,0.5,0.5,0.968889
7,0.75,0.5,0.968889
8,0.99,0.5,0.964444
9,1.0,0.5,0.962222


In [26]:
# define hyperparameter values to test
n_estimator_list = [5, 10, 50, 100, 200, 1000]
max_depth_list = [2, 4, 6, 8, 10, None]

In [28]:
results = []

for num_estimator in n_estimator_list:
    for max_depth in max_depth_list:
        # create a random forest classifier with these parameters
        rf_clf = RandomForestClassifier(n_estimators=num_estimator, max_depth=max_depth, random_state=42)

        # fit the model
        rf_clf.fit(X_train, y_train)

        # make predictions
        y_preds = rf_clf.predict(X_test)

        # evaluate accuracy
        acc = accuracy_score(y_test, y_preds)

        results.append({
            'num estimators': num_estimator,
            'max depth': max_depth,
            'accuracy': acc,
        })

result_df = pd.DataFrame(results)
result_df 

Unnamed: 0,num estimators,max depth,accuracy
0,5,2.0,0.646667
1,5,4.0,0.815556
2,5,6.0,0.895556
3,5,8.0,0.922222
4,5,10.0,0.913333
5,5,,0.931111
6,10,2.0,0.726667
7,10,4.0,0.884444
8,10,6.0,0.924444
9,10,8.0,0.955556
