In [1]:
from scipy.io import arff
import pandas as pd

# Parse the ARFF file: loadarff yields a (records, metadata) pair.
arff_contents = arff.loadarff('../dataset.arff')
data, meta = arff_contents[0], arff_contents[1]

# Wrap the parsed records in a DataFrame for the cells that follow.
df = pd.DataFrame(data=data)

In [8]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Convert every column of `df` to integers.
#
# The values produced by the ARFF loader are bytes objects (hence the UTF-8
# decode). The conversion is written so that this cell can safely be run more
# than once: a column that has already been converted (or that was numeric
# to begin with) no longer holds bytes, and calling `.str.decode` on it would
# raise. Only bytes values are decoded; everything else is passed straight
# to the integer cast.
for column in df.columns:
    values = df[column]
    if values.dtype == object:
        values = values.map(
            lambda v: v.decode('utf-8') if isinstance(v, bytes) else v
        )
    df[column] = values.astype(int)

# 'Result' is the prediction target; every other column is a feature.
y = df['Result']
X = df.drop(columns='Result')

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps
# the partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Base estimator handed to the hyper-parameter search.
# The forest is seeded explicitly: the search's own random_state only fixes
# which parameter combinations are sampled, not the randomness used while
# each forest is built, so without this the scores (and possibly the chosen
# parameters) change on every run.
rf_classifier = RandomForestClassifier(random_state=42)


# Candidate values for each random-forest hyper-parameter.
param_grid = dict(
    n_estimators=list(range(10, 210, 10)),    # 10, 20, ..., 200
    max_depth=[None, *range(10, 110, 10)],    # None, then 10, 20, ..., 100
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Randomized search: 100 sampled parameter combinations, each scored by
# accuracy with 5-fold cross-validation, using all available cores.
random_search = RandomizedSearchCV(
    estimator=rf_classifier,
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
)

In [9]:
# Run the search on the training split only; the test split stays untouched
# until the final evaluation.
random_search.fit(X=X_train, y=y_train)


In [10]:
# Score the search's best model on the held-out test split.
y_pred = random_search.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the winning configuration, its cross-validation score from the
# search, and the accuracy on the held-out data.
for label, value in (
    ("Best Parameters: ", random_search.best_params_),
    ("Best Accuracy: ", random_search.best_score_),
    ("Test Accuracy: ", test_accuracy),
):
    print(label, value)

Best Parameters:  {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 100, 'bootstrap': False}
Best Accuracy:  0.9711666994927727
Test Accuracy:  0.968340117593849
