In [12]:
import numpy as np
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, r2_score

In [14]:
dataDict = pickle.load(open('./data.pickle', 'rb'))

data = np.asarray(dataDict['data'])
labels = np.asarray(dataDict['labels'])

In [16]:
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size = 0.3, stratify=labels, random_state=6)

In [18]:

model1 = RandomForestClassifier(n_estimators=150, max_depth = 150, random_state=10, min_samples_split= 2, min_samples_leaf=1, max_features='sqrt')



model1.fit(x_train, y_train)



y_predict = model1.predict(x_test)



score = accuracy_score(y_predict, y_test)
score

0.8388998035363457

In [19]:
f = open('model.pickle', 'wb')
pickle.dump({'model': model1}, f)
f.close()

In [21]:
modelDict = pickle.load(open('./model.pickle', 'rb'))
model = modelDict['model']

y_predict = model1.predict(x_test)
score = accuracy_score(y_predict, y_test)
score

0.8388998035363457

In [24]:


y_predict = model1.predict(x_test)
score = accuracy_score(y_predict, y_test)
score

0.8388998035363457

In [26]:
precision = precision_score(y_test, y_predict, average='weighted')
print(f"Precision: {precision:.2f}")

Precision: 0.85


In [28]:
recall = recall_score(y_test, y_predict, average='weighted')
print(f"Recall: {recall:.2f}")

Recall: 0.84


In [30]:
f1 = f1_score(y_test, y_predict, average='weighted')
print(f"F1 Score: {f1:.2f}")

F1 Score: 0.84


In [351]:
report = classification_report(y_test, y_predict)
print("\nClassification Report:\n", report)


Classification Report:
               precision    recall  f1-score   support

           A       0.68      0.91      0.78        23
           B       0.90      1.00      0.95        18
           C       1.00      0.93      0.96        14
           D       0.83      1.00      0.91        20
           E       0.81      0.72      0.76        18
           F       1.00      0.96      0.98        23
           G       0.84      1.00      0.91        21
           H       1.00      0.88      0.94        17
           I       0.86      0.96      0.91        26
           J       0.64      0.56      0.60        25
           K       0.91      1.00      0.95        20
           L       0.92      0.92      0.92        24
           M       0.92      0.71      0.80        17
           N       0.75      0.94      0.83        16
           O       1.00      0.78      0.88        18
           P       1.00      0.81      0.90        16
           Q       0.79      0.94      0.86        16
  

# Parameters

n_estimators the more the higher accuracy
max_depth the more the higher accuracy
min_samples_split the less the higher accuracy
min_samples_leaf the less the higher accuracy
max_features = sqrt is better than log in

n_estimators=150, max_depth = 150, random_state=10, min_samples_split= 2, min_samples_leaf=1, max_features='sqrt'
accuracy -> 83.9%
    
n_estimators=1, max_depth = 1, random_state=10, min_samples_split= 150, min_samples_leaf=150, max_features='log2'
accuracy -> 9.2%



In [305]:
model1 = RandomForestClassifier(n_estimators=1, max_depth = 1, random_state=10, min_samples_split= 150, min_samples_leaf=150, max_features='log2')



model1.fit(x_train, y_train)



y_predict = model1.predict(x_test)

score = accuracy_score(y_predict, y_test)
score

0.0855457227138643

In [315]:
#Parameter tuning for random forest code from here https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
#Gread search code from here https://scikit-learn.org/1.6/modules/generated/sklearn.model_selection.GridSearchCV.html

from sklearn.model_selection import RandomizedSearchCV

mode2 = RandomForestClassifier(random_state=10)

# Define the parameter grid for RandomizedSearchCV
param_grid = {
     'n_estimators': [10, 20, 30, 40, 50],
    'max_depth': [None, 1, 4, 6, 7, 9, 10, 15, 20, 30],
    'min_samples_split': [2, 3, 5, 10],
    'min_samples_leaf': [1, 2, 4, 5, 6, 7],
    'max_features': ['sqrt', 'log2', None]
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=mode2,
    param_distributions=param_grid,
    n_iter= 1,  # Number of random combinations to try
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # Utilize all CPU core
    random_state=10
)
random_search.fit(x_train, y_train)

# Get the best model from RandomizedSearchCV
best_model = random_search.best_estimator_

# Make predictions on the test set
y_predict = best_model.predict(x_test)

# Evaluate accuracy
score = accuracy_score(y_predict, y_test)

# Print results
print(f"Best Parameters: {random_search.best_params_}")
print(f"Accuracy Score: {score}")

Best Parameters: {'n_estimators': 50, 'min_samples_split': 3, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 6}
Accuracy Score: 0.5958702064896755


Parameter turning:

1)
Best Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None}
Accuracy Score: 0.805500982318271

2) Best Parameters: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None}
Accuracy Score: 0.740667976424361
3) Best Parameters: {'n_estimators': 400, 'min_samples_split': 3, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 6}
Accuracy Score: 0.64047151277013754


2