In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

**Load the dataset**

In [70]:
df = pd.read_csv("datasaurus.csv")

**Preprocess the data**

In [71]:
le = LabelEncoder()
df['dataset'] = le.fit_transform(df['dataset'])

**Split the data**

In [72]:
X = df.drop('dataset', axis=1)
y = df['dataset']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Train the baseline models**

In [73]:
clf_j48 = DecisionTreeClassifier()
clf_rf = RandomForestClassifier()

clf_j48.fit(X_train, y_train)
clf_rf.fit(X_train, y_train)

**Evaluate the baseline models**

In [74]:
y_pred_j48 = clf_j48.predict(X_test)
y_pred_rf = clf_rf.predict(X_test)

print("J48")
print(classification_report(y_test, y_pred_j48))

print("Random Forest")
print(classification_report(y_test, y_pred_rf))

J48
              precision    recall  f1-score   support

           0       0.21      0.21      0.21        28
           1       0.38      0.35      0.37        37
           2       0.36      0.50      0.42        28
           3       0.18      0.16      0.17        25
           4       0.54      0.65      0.59        20
           5       0.81      0.72      0.76        36
           6       0.28      0.29      0.28        28
           7       0.24      0.29      0.26        24
           8       0.25      0.21      0.23        29
           9       0.70      0.70      0.70        27
          10       0.60      0.45      0.52        33
          11       0.25      0.23      0.24        30
          12       0.31      0.36      0.33        25

    accuracy                           0.40       370
   macro avg       0.39      0.39      0.39       370
weighted avg       0.40      0.40      0.40       370

Random Forest
              precision    recall  f1-score   support

      

**Use Random Search and Grid Search**

In [75]:
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 20, 40],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4],
    'bootstrap': [True]
}

grid_search = GridSearchCV(estimator=clf_rf, param_grid=param_grid, cv=3)
random_search = RandomizedSearchCV(estimator=clf_rf, param_distributions=param_grid, cv=3)

grid_search.fit(X_train, y_train)
random_search.fit(X_train, y_train)

**Train the models with the optimal parameters**

In [76]:
clf_rf_grid = RandomForestClassifier(**grid_search.best_params_)
clf_rf_random = RandomForestClassifier(**random_search.best_params_)

clf_rf_grid.fit(X_train, y_train)
clf_rf_random.fit(X_train, y_train)

**Evaluate the models with the optimal parameters**

In [77]:
y_pred_rf_grid = clf_rf_grid.predict(X_test)
y_pred_rf_random = clf_rf_random.predict(X_test)

print("Random Forest Grid Search")
print(classification_report(y_test, y_pred_rf_grid))

print("Random Forest Random Search")
print(classification_report(y_test, y_pred_rf_random))

Random Forest Grid Search
              precision    recall  f1-score   support

           0       0.29      0.25      0.27        28
           1       0.30      0.27      0.29        37
           2       0.54      0.75      0.63        28
           3       0.28      0.20      0.23        25
           4       0.83      0.95      0.88        20
           5       0.68      0.75      0.71        36
           6       0.36      0.32      0.34        28
           7       0.18      0.17      0.17        24
           8       0.25      0.17      0.20        29
           9       0.61      0.74      0.67        27
          10       0.73      0.73      0.73        33
          11       0.31      0.27      0.29        30
          12       0.44      0.60      0.51        25

    accuracy                           0.47       370
   macro avg       0.45      0.47      0.45       370
weighted avg       0.44      0.47      0.45       370

Random Forest Random Search
              precision  