In [4]:
# Train and test 4 different classifiers
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text, export_graphviz
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from io import StringIO  
from IPython.display import Image  
import pydotplus

# Helper function to visualize Decision Trees
def plot_decision_tree(clf, feature_names, class_names, filename):
    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data, filled=True, rounded=True, special_characters=True, feature_names=feature_names, class_names=class_names)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
    graph.write_gif(filename)
    Image(graph.create_png())

In [7]:
# Load Penguin dataset
penguin_path = r'C:\Users\ibrah\OneDrive\Desktop\COMP472-A1-datasets\penguins.csv'
penguin_data = pd.read_csv(penguin_path)

# For Penguin dataset
penguin_features = pd.get_dummies(penguin_data.drop('species', axis=1))  # One-hot encode categorical features
penguin_target = penguin_data['species']

penguin_X_train, penguin_X_test, penguin_y_train, penguin_y_test = train_test_split(
    penguin_features, penguin_target, test_size=0.2, random_state=42
)

In [8]:
# (a) Base-DT
base_dt = DecisionTreeClassifier(random_state=42)
base_dt.fit(penguin_X_train, penguin_y_train)
base_dt_predictions = base_dt.predict(penguin_X_test)
base_dt_accuracy = accuracy_score(penguin_y_test, base_dt_predictions)

# Visualize the Base-DT (for simplicity, showing only text representation)
print("Base-DT Accuracy:", base_dt_accuracy)
print("Base-DT Tree:")
print(export_text(base_dt, feature_names=penguin_features.columns))

Base-DT Accuracy: 1.0
Base-DT Tree:
|--- flipper_length_mm <= 206.50
|   |--- culmen_length_mm <= 43.35
|   |   |--- culmen_length_mm <= 42.35
|   |   |   |--- culmen_depth_mm <= 16.70
|   |   |   |   |--- culmen_length_mm <= 39.50
|   |   |   |   |   |--- class: Adelie
|   |   |   |   |--- culmen_length_mm >  39.50
|   |   |   |   |   |--- class: Chinstrap
|   |   |   |--- culmen_depth_mm >  16.70
|   |   |   |   |--- class: Adelie
|   |   |--- culmen_length_mm >  42.35
|   |   |   |--- flipper_length_mm <= 189.50
|   |   |   |   |--- class: Chinstrap
|   |   |   |--- flipper_length_mm >  189.50
|   |   |   |   |--- class: Adelie
|   |--- culmen_length_mm >  43.35
|   |   |--- island_Dream <= 0.50
|   |   |   |--- sex_FEMALE <= 0.50
|   |   |   |   |--- class: Adelie
|   |   |   |--- sex_FEMALE >  0.50
|   |   |   |   |--- class: Gentoo
|   |   |--- island_Dream >  0.50
|   |   |   |--- culmen_length_mm <= 44.65
|   |   |   |   |--- culmen_depth_mm <= 18.90
|   |   |   |   |   |--- cl

In [9]:
# (b) Top-DT
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

grid_search_dt = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_dt, cv=5, scoring='accuracy')
grid_search_dt.fit(penguin_X_train, penguin_y_train)
top_dt = grid_search_dt.best_estimator_
top_dt_predictions = top_dt.predict(penguin_X_test)
top_dt_accuracy = accuracy_score(penguin_y_test, top_dt_predictions)

# Visualize the Top-DT (for simplicity, showing only text representation)
print("\nTop-DT Accuracy:", top_dt_accuracy)
print("Top-DT Tree:")
print(export_text(top_dt, feature_names=penguin_features.columns))


Top-DT Accuracy: 1.0
Top-DT Tree:
|--- flipper_length_mm <= 206.50
|   |--- culmen_length_mm <= 43.35
|   |   |--- culmen_length_mm <= 42.35
|   |   |   |--- culmen_depth_mm <= 16.70
|   |   |   |   |--- culmen_length_mm <= 39.50
|   |   |   |   |   |--- class: Adelie
|   |   |   |   |--- culmen_length_mm >  39.50
|   |   |   |   |   |--- class: Chinstrap
|   |   |   |--- culmen_depth_mm >  16.70
|   |   |   |   |--- class: Adelie
|   |   |--- culmen_length_mm >  42.35
|   |   |   |--- flipper_length_mm <= 189.50
|   |   |   |   |--- class: Chinstrap
|   |   |   |--- flipper_length_mm >  189.50
|   |   |   |   |--- class: Adelie
|   |--- culmen_length_mm >  43.35
|   |   |--- island_Dream <= 0.50
|   |   |   |--- sex_FEMALE <= 0.50
|   |   |   |   |--- class: Adelie
|   |   |   |--- sex_FEMALE >  0.50
|   |   |   |   |--- class: Gentoo
|   |   |--- island_Dream >  0.50
|   |   |   |--- culmen_length_mm <= 44.65
|   |   |   |   |--- culmen_depth_mm <= 18.90
|   |   |   |   |   |--- cla

In [10]:
# (b) Top-DT
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

grid_search_dt = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_dt, cv=5, scoring='accuracy')
grid_search_dt.fit(penguin_X_train, penguin_y_train)
top_dt = grid_search_dt.best_estimator_
top_dt_predictions = top_dt.predict(penguin_X_test)
top_dt_accuracy = accuracy_score(penguin_y_test, top_dt_predictions)

# Visualize the Top-DT (for simplicity, showing only text representation)
print("\nTop-DT Accuracy:", top_dt_accuracy)
print("Top-DT Tree:")
print(export_text(top_dt, feature_names=penguin_features.columns))


Top-DT Accuracy: 1.0
Top-DT Tree:
|--- flipper_length_mm <= 206.50
|   |--- culmen_length_mm <= 43.35
|   |   |--- culmen_length_mm <= 42.35
|   |   |   |--- culmen_depth_mm <= 16.70
|   |   |   |   |--- culmen_length_mm <= 39.50
|   |   |   |   |   |--- class: Adelie
|   |   |   |   |--- culmen_length_mm >  39.50
|   |   |   |   |   |--- class: Chinstrap
|   |   |   |--- culmen_depth_mm >  16.70
|   |   |   |   |--- class: Adelie
|   |   |--- culmen_length_mm >  42.35
|   |   |   |--- flipper_length_mm <= 189.50
|   |   |   |   |--- class: Chinstrap
|   |   |   |--- flipper_length_mm >  189.50
|   |   |   |   |--- class: Adelie
|   |--- culmen_length_mm >  43.35
|   |   |--- island_Dream <= 0.50
|   |   |   |--- sex_FEMALE <= 0.50
|   |   |   |   |--- class: Adelie
|   |   |   |--- sex_FEMALE >  0.50
|   |   |   |   |--- class: Gentoo
|   |   |--- island_Dream >  0.50
|   |   |   |--- culmen_length_mm <= 44.65
|   |   |   |   |--- culmen_depth_mm <= 18.90
|   |   |   |   |   |--- cla

In [11]:
# (c) Base-MLP
base_mlp = MLPClassifier(hidden_layer_sizes=(100, 100), activation='logistic', solver='sgd', random_state=42)
base_mlp.fit(penguin_X_train, penguin_y_train)
base_mlp_predictions = base_mlp.predict(penguin_X_test)
base_mlp_accuracy = accuracy_score(penguin_y_test, base_mlp_predictions)

print("\nBase-MLP Accuracy:", base_mlp_accuracy)


Base-MLP Accuracy: 0.4626865671641791


In [12]:
# (d) Top-MLP
param_grid_mlp = {
    'activation': ['sigmoid', 'tanh', 'relu'],
    'hidden_layer_sizes': [(30, 50), (10, 10, 10)],
    'solver': ['adam', 'sgd']
}

grid_search_mlp = GridSearchCV(MLPClassifier(random_state=42), param_grid_mlp, cv=5, scoring='accuracy')
grid_search_mlp.fit(penguin_X_train, penguin_y_train)
top_mlp = grid_search_mlp.best_estimator_
top_mlp_predictions = top_mlp.predict(penguin_X_test)
top_mlp_accuracy = accuracy_score(penguin_y_test, top_mlp_predictions)

print("\nTop-MLP Accuracy:", top_mlp_accuracy)




Top-MLP Accuracy: 0.6268656716417911


20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ibrah\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ibrah\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Users\ibrah\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\ibrah\AppData\Local\Programs\Python\Python311\Lib\