# Ranom Forest

In [291]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
import data_preprocess as dp
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from joblib import dump

## Preprocess training data

In [292]:
# Load training and test data
features_train, labels_train = dp.load_training_data() 
features_test, labels_test = dp.load_test_data()  

# Initialize the Random Forest Classifier
random_forest_model = RandomForestClassifier()

# Initialize PCA with 8 components
pca = PCA(n_components=8)

# Fit PCA on training data
pca.fit(features_train)

# Transform training and test data to its first 8 principal components
features_train_pca = pca.transform(features_train)
features_test_pca = pca.transform(features_test)

# Print the transformed training data
features_train_pca

array([[ 4.48091200e+01,  2.39357625e+00,  1.81904250e+00, ...,
        -3.56951687e-02, -3.43122903e-02, -7.93313095e-02],
       [-9.57552934e+00, -6.82532997e+00,  3.74211277e+00, ...,
        -5.28223275e-02, -1.40999621e-01,  9.16049756e-03],
       [-2.95063958e+01, -4.38364477e+00, -1.72558142e-01, ...,
         6.64725324e-02, -1.38461124e-01,  1.27169872e-01],
       ...,
       [ 1.95381795e+01,  4.08599756e+00, -1.16878183e+00, ...,
        -4.26433129e-02, -1.17503983e-01, -1.38066501e-02],
       [ 8.88923869e+01, -1.57313963e+01,  6.96616887e-03, ...,
         6.27896605e-01,  1.07027659e+00, -6.08949387e-02],
       [-1.26992464e+01,  2.08694387e+00,  1.69257957e+00, ...,
        -9.58516496e-02, -6.90567671e-02, -1.03406103e-01]])

## 5-fold cross validation

In [293]:
# Define hyperparameters for the model
hyperparameters = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 50, 100],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True]
}

# Initialize GridSearchCV with 5-fold cross validation
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=hyperparameters, cv=5, scoring='accuracy', n_jobs=-1, verbose=3)

# Fit GridSearchCV on training data
grid_search.fit(features_train, labels_train)

# Print the best parameters
grid_search.best_params_

Fitting 5 folds for each of 162 candidates, totalling 810 fits


{'bootstrap': True,
 'max_depth': 100,
 'max_features': 'log2',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 50}

## Train the final model and make predictions

In [299]:
# Fit the model on the training data
grid_search.best_estimator_.fit(features_train_pca, labels_train)

# Predict the labels of the test set
predictions = grid_search.best_estimator_.predict(features_test_pca)

# calculate the accuracy and f1 score of the model
f1 = f1_score(labels_test, predictions, average='weighted')
accuracy = accuracy_score(predictions, labels_test)

# Save the model
dump(grid_search.best_estimator_, 'models/random_forest.joblib')

# Print
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

Accuracy: 0.7125
F1 Score: 0.7009427024058242
