In [1]:
# 27/03/2024
# CSC354 – Assignmen2 – ML – Decision Trees
# Yahya Irfan
# FA21-BSE-044
"""
Download the Datasaurus Dozen dataset from the following link
Link: https://www.openintro.org/data/csv/datasaurus.csv
Note: Please open the dataset file first for manual inspection before performing any experiments.
Use this dataset for a classification task using decision trees. Specifically, use J48 and Random Forest 
classifiers for predicting the type of ‘dataset’ within the Datasaurus Dozen. Start with a baseline model with 
default parameters. Then find optimal parameters for the model using both Random and Grid search methods. 
You are free to use any train/test split, however, only experiment with models’ parameters, keeping rest of the 
settings constant throughout the experiments.
"""

import pandas as pd
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt


df = pd.read_csv("datasaurus.csv")
df.head()

inputs = df.drop('dataset',axis='columns')
# inputs

target = df.drop(['x','y'],axis='columns')
# target
le_dataset= LabelEncoder()

target['dataset_d'] = le_dataset.fit_transform(target['dataset'])
target = target.drop('dataset',axis='columns').values.ravel()
print(target)

X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2, random_state=42)

dt_model = tree.DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train,y_train)

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train,y_train)

dt_pred = dt_model.predict(X_test)
rf_pred = rf_model.predict(X_test)

dt_accuracy = accuracy_score(y_test, dt_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)

print("Baseline Decision Tree Accuracy:", dt_accuracy)
print("Baseline Random Forest Accuracy:", rf_accuracy)

# Random Search for Decision Tree
random_param_dist = {
    'criterion': ['entropy'],
    'max_depth': [10, 20,30],
    'min_samples_split': [4, 6, 10],
    'min_samples_leaf': [1, 5],
    'max_features': ['sqrt', 'log2', None]
}
random_search_tree = RandomizedSearchCV(dt_model, param_distributions=random_param_dist, n_iter=50, cv=5, random_state=42)
random_search_tree.fit(X_train, y_train)
random_search_tree_predictions = random_search_tree.predict(X_test)
random_search_accuracy = accuracy_score(y_test, random_search_tree_predictions)
print("Random Search Decision Tree Accuracy:", random_search_accuracy)

# Grid Search for Decision Tree
grid_param_grid = {
    'criterion': ['entropy'],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 4, 7, 10],
    'min_samples_leaf': [1, 2, 6, 10],
    'max_features': ['sqrt', 'log2']
}
grid_search_tree = GridSearchCV(dt_model, param_grid=grid_param_grid, cv=5)
grid_search_tree.fit(X_train, y_train)
grid_search_tree_predictions = grid_search_tree.predict(X_test)
grid_search_accuracy = accuracy_score(y_test, grid_search_tree_predictions)
print("Grid Search Decision Tree Accuracy:", grid_search_accuracy)


# Random Search for Random  Forest
random_param_dist = {
     'n_estimators': [100, 200],
    'criterion': ['entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': ['sqrt', 'log2']
}
random_search_tree = RandomizedSearchCV(rf_model, param_distributions=random_param_dist, n_iter=50, cv=5, random_state=42)
random_search_tree.fit(X_train, y_train)
random_search_tree_predictions = random_search_tree.predict(X_test)
random_search_accuracy = accuracy_score(y_test, random_search_tree_predictions)
print("Random Search Random Forest Accuracy:", random_search_accuracy)

# Grid Search for Random Forest
grid_param_grid = {
    'n_estimators': [50, 100, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}
grid_search_tree = GridSearchCV(rf_model, param_grid=grid_param_grid, cv=5)
grid_search_tree.fit(X_train, y_train)
grid_search_tree_predictions = grid_search_tree.predict(X_test)
grid_search_accuracy = accuracy_score(y_test, grid_search_tree_predictions)
print("Grid Search Random Forest Accuracy:", grid_search_accuracy)


[ 3  3  3 ... 11 11 11]
Baseline Decision Tree Accuracy: 0.3972972972972973
Baseline Random Forest Accuracy: 0.46216216216216216
Random Search Decision Tree Accuracy: 0.3945945945945946
Grid Search Decision Tree Accuracy: 0.3972972972972973
Random Search Random Forest Accuracy: 0.4648648648648649
