In [None]:
import numpy as np
from typing import Tuple

def load_data(filename: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Load the feature matrix and both label vectors from an ``.npz`` archive.

    Args:
        filename: Path to an ``.npz`` file that stores the arrays
            ``features``, ``domains`` and ``digits``.

    Returns:
        A ``(features, domains, digits)`` tuple of the arrays read from the
        archive, in that order.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        KeyError: If one of the three expected arrays is missing.
    """
    # np.load on an .npz returns a lazily-read NpzFile that keeps the
    # underlying file open; read all three arrays inside the context manager
    # so the handle is closed before returning.
    with np.load(filename) as data:
        return data['features'], data['domains'], data['digits']

# Read both archives; each one yields (features, domain labels, digit labels).
(features_train, domains_train, digits_train), (features_test, domains_test, digits_test) = (
    load_data(archive) for archive in ('train_data.npz', 'test_data.npz')
)

# Sanity check on the training split: matrix shape first, then the distinct
# domain ids together with how many samples each one has.
print(features_train.shape, np.unique(domains_train, return_counts=True), sep='\n')

# The same check for the test split (currently disabled):
# print(features_test.shape)
# print(np.unique(domains_test, return_counts=True))

(25000, 1024)
(array([0, 1, 2, 3, 4]), array([5000, 5000, 5000, 5000, 5000]))


In [None]:
# Glue the two label vectors onto the feature matrix as trailing columns, so
# each row carries its features followed by its domain id and its digit.
train = np.hstack((
    features_train,
    domains_train.reshape(-1, 1),
    digits_train.reshape(-1, 1),
))
test = np.hstack((
    features_test,
    domains_test.reshape(-1, 1),
    digits_test.reshape(-1, 1),
))
print(train.shape, test.shape, sep='\n')

(25000, 1026)
(25000, 1026)


## Finding the best hyperparameters for the random forest

In [None]:
# The hyperparameter search is expensive, so fit it on a 10% subsample of the
# combined training matrix; `dummy` receives the remaining 90%.
from sklearn.model_selection import train_test_split

param_train, dummy = train_test_split(
    train,
    train_size=0.1,
    random_state=42,
)
param_train.shape

(2500, 1026)

In [None]:
# Coarse grid search: scan a wide range of forest sizes and tree depths to
# estimate the region in which the best hyperparameters lie.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import time
start = time.time()

n_estimators = list(range(100, 300, 20))
max_depths = list(range(10, 50, 4))
estimated_n_estimator = None
estimated_max_depth = None
best_accuracy = 0

# Fit on the 10% subsample and score on the remaining 90% of the *training*
# data (`dummy`). Scoring candidates on `test` would leak the test set into
# model selection and make the final test accuracy optimistically biased.
# Columns 0..1023 are the features; column 1025 holds the digit label.
fit_features, fit_labels = param_train[:, 0:1024], param_train[:, 1025]
val_features, val_labels = dummy[:, 0:1024], dummy[:, 1025]

for n_estimator in n_estimators:
    for max_depth in max_depths:
        # n_jobs=-1 builds the trees on all cores; with a fixed random_state
        # the fitted forest does not depend on the number of jobs.
        clf = RandomForestClassifier(
            max_depth=max_depth,
            n_estimators=n_estimator,
            random_state=42,
            n_jobs=-1,
        )
        clf.fit(fit_features, fit_labels)
        # accuracy_score expects (y_true, y_pred).
        acc = accuracy_score(val_labels, clf.predict(val_features))
        # Strict '>' keeps the first combination that reaches the best score.
        if acc > best_accuracy:
            best_accuracy = acc
            estimated_n_estimator = n_estimator
            estimated_max_depth = max_depth

print(best_accuracy)
print(estimated_max_depth)
print(estimated_n_estimator)
# Elapsed wall-clock seconds for the whole search (shown as the cell output).
time.time() - start

KeyboardInterrupt: ignored

In [None]:
# Refined grid search: test hyperparameters in a narrow window around the
# coarse estimates to pick the final values.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import time
start = time.time()

# The coarse search leaves these as None if it was interrupted before any
# candidate was scored; fail with a clear message instead of a TypeError.
if estimated_n_estimator is None or estimated_max_depth is None:
    raise RuntimeError(
        "estimated_n_estimator / estimated_max_depth are not set; "
        "run the coarse search cell to completion first."
    )

n_estimators = list(range(estimated_n_estimator - 20, estimated_n_estimator + 20, 2))
max_depths = list(range(estimated_max_depth - 4, estimated_max_depth + 4))
best_n_estimator = None
best_max_depth = None
best_accuracy = 0

# Fit on the 10% subsample and score on the remaining 90% of the *training*
# data (`dummy`). Scoring candidates on `test` would leak the test set into
# model selection and make the final test accuracy optimistically biased.
# Columns 0..1023 are the features; column 1025 holds the digit label.
fit_features, fit_labels = param_train[:, 0:1024], param_train[:, 1025]
val_features, val_labels = dummy[:, 0:1024], dummy[:, 1025]

for n_estimator in n_estimators:
    for max_depth in max_depths:
        # n_jobs=-1 builds the trees on all cores; with a fixed random_state
        # the fitted forest does not depend on the number of jobs.
        clf = RandomForestClassifier(
            max_depth=max_depth,
            n_estimators=n_estimator,
            random_state=42,
            n_jobs=-1,
        )
        clf.fit(fit_features, fit_labels)
        # accuracy_score expects (y_true, y_pred).
        acc = accuracy_score(val_labels, clf.predict(val_features))
        # Strict '>' keeps the first combination that reaches the best score.
        if acc > best_accuracy:
            best_accuracy = acc
            best_n_estimator = n_estimator
            best_max_depth = max_depth

print(best_accuracy)
print(best_max_depth)
print(best_n_estimator)
# Elapsed wall-clock seconds for the whole search (shown as the cell output).
time.time() - start

0.75252
22
298


1159.21426820755

In [None]:
# Refined grid search: test hyperparameters in a narrow window around the
# coarse estimates to pick the final values.
# NOTE(review): this cell repeats the refined search of the preceding cell;
# consider keeping only one of them.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import time
start = time.time()

# The coarse search leaves these as None if it was interrupted before any
# candidate was scored; fail with a clear message instead of a TypeError.
if estimated_n_estimator is None or estimated_max_depth is None:
    raise RuntimeError(
        "estimated_n_estimator / estimated_max_depth are not set; "
        "run the coarse search cell to completion first."
    )

n_estimators = list(range(estimated_n_estimator - 20, estimated_n_estimator + 20, 2))
max_depths = list(range(estimated_max_depth - 4, estimated_max_depth + 4))
best_n_estimator = None
best_max_depth = None
best_accuracy = 0

# Fit on the 10% subsample and score on the remaining 90% of the *training*
# data (`dummy`). Scoring candidates on `test` would leak the test set into
# model selection and make the final test accuracy optimistically biased.
# Columns 0..1023 are the features; column 1025 holds the digit label.
fit_features, fit_labels = param_train[:, 0:1024], param_train[:, 1025]
val_features, val_labels = dummy[:, 0:1024], dummy[:, 1025]

for n_estimator in n_estimators:
    for max_depth in max_depths:
        # n_jobs=-1 builds the trees on all cores; with a fixed random_state
        # the fitted forest does not depend on the number of jobs.
        clf = RandomForestClassifier(
            max_depth=max_depth,
            n_estimators=n_estimator,
            random_state=42,
            n_jobs=-1,
        )
        clf.fit(fit_features, fit_labels)
        # accuracy_score expects (y_true, y_pred).
        acc = accuracy_score(val_labels, clf.predict(val_features))
        # Strict '>' keeps the first combination that reaches the best score.
        if acc > best_accuracy:
            best_accuracy = acc
            best_n_estimator = n_estimator
            best_max_depth = max_depth

print(best_accuracy)
print(best_max_depth)
print(best_n_estimator)
# Elapsed wall-clock seconds for the whole search (shown as the cell output).
time.time() - start

0.75252
22
298


1159.21426820755

In [None]:
# Train the final forest on the complete training set using the tuned
# hyperparameters from the search.
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(
    n_estimators=best_n_estimator,
    max_depth=best_max_depth,
    random_state=0,
)
clf.fit(features_train, digits_train)


In [None]:
# Train the final forest on the complete training set using the tuned
# hyperparameters from the search.
# NOTE(review): this repeats the fit of the preceding cell.
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(
    n_estimators=best_n_estimator,
    max_depth=best_max_depth,
    random_state=0,
)
clf.fit(features_train, digits_train)


In [None]:
# Predict a digit label for every test sample with the current `clf`.
predict_y = clf.predict(features_test)

In [None]:
# Fraction of test samples whose predicted digit equals the true digit.
from sklearn.metrics import accuracy_score

accuracy_score(y_true=digits_test, y_pred=predict_y)

0.8784

In [None]:
# Fraction of test samples whose predicted digit equals the true digit.
# NOTE(review): this repeats the accuracy computation of the preceding cell.
from sklearn.metrics import accuracy_score

accuracy_score(y_true=digits_test, y_pred=predict_y)

0.8784

In [None]:
# Peek at the digit-label column, which is the last (index 1025) column of
# the 1026-column training matrix.
train[:, -1]

array([8., 2., 5., ..., 6., 2., 4.])

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
import time
start = time.time()
# NOTE(review): this synthetic dataset is not used by the search below, which
# fits on `param_train`; it is kept only in case other cells rely on X / y.
X, y = make_classification(n_samples=1000, n_features=10, random_state=42)

# Coarse grid over forest size and maximum tree depth.
param_grid = {
    'n_estimators': list(range(100, 300, 20)),
    'max_depth': list(range(10, 50, 4)),
}

# Seed the forest so that repeated runs of the search select the same
# parameters (the manual search cells also use random_state=42).
rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated grid search on the 10% training subsample. The
# candidate fits are independent, so n_jobs=-1 spreads them over all cores.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)
# Columns 0..1023 are the features; column 1025 holds the digit label.
grid_search.fit(param_train[:, 0:1024], param_train[:, 1025])

# Retrieve the best parameters and their mean cross-validated score.
# NOTE: this overwrites `best_max_depth` set by the manual search cells.
best_n_estimators = grid_search.best_params_['n_estimators']
best_max_depth = grid_search.best_params_['max_depth']
best_score = grid_search.best_score_

# Print the results
print("Best n_estimators:", best_n_estimators)
print("Best max_depth:", best_max_depth)
print("Best score:", best_score)
# Elapsed wall-clock seconds for the whole search (shown as the cell output).
time.time() - start

Best n_estimators: 280
Best max_depth: 26
Best score: 0.82
