In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

In [2]:
# Cell 2: Load Penguin Dataset and Prepare Data
penguin_path = r'C:\Users\ibrah\OneDrive\Desktop\COMP472-A1-datasets\penguins.csv'
penguin_data = pd.read_csv(penguin_path)

In [3]:
# Convert features into 1-hot vectors, so data can be used for training
categorical_columns = ['island', 'sex']
penguin_categorical = penguin_data[categorical_columns]

# Initialize the OneHotEncoder
encoder = OneHotEncoder()

# Fit and transform the categorical columns
one_hot_encoded = encoder.fit_transform(penguin_categorical).toarray()

# Create a DataFrame with the one-hot encoded features
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

# Concatenate the one-hot encoded DataFrame with the original DataFrame, dropping the original categorical columns
penguin_data_encoded = pd.concat([penguin_data, one_hot_df], axis=1).drop(categorical_columns, axis=1)

In [4]:
# Split the dataset using train_test_split
penguin_features = penguin_data_encoded.drop('species', axis=1)
penguin_target = penguin_data_encoded['species']

penguin_X_train, penguin_X_test, penguin_y_train, penguin_y_test = train_test_split(
    penguin_features, penguin_target, test_size=0.2, random_state=42
)

In [5]:
# Cell 3: Train Base Decision Tree Model
base_dt_model = DecisionTreeClassifier()
base_dt_model.fit(penguin_X_train, penguin_y_train)

In [6]:
# Cell 4: Predict and Evaluate Base Decision Tree Model
penguin_y_pred_base_dt = base_dt_model.predict(penguin_X_test)

conf_matrix_base_dt = confusion_matrix(penguin_y_test, penguin_y_pred_base_dt)
precision_recall_f1_base_dt = precision_recall_fscore_support(penguin_y_test, penguin_y_pred_base_dt, average=None)
accuracy_base_dt = accuracy_score(penguin_y_test, penguin_y_pred_base_dt)

In [7]:
# Display the results
print("Base-DT Model:")
print("Hyperparameters: Default")
print("(B) Confusion Matrix:")
print(conf_matrix_base_dt)

# Additional formatting for precision, recall, and f1
class_labels = ['Adelie', 'Chinstrap', 'Gentoo']
report = classification_report(penguin_y_test, penguin_y_pred_base_dt, target_names=class_labels)
print("(C) Precision, Recall, and F1:")
print(report)
# Display accuracy, macro-average F1, and weighted-average F1
print("(D) Accuracy:", accuracy_base_dt)
print("(D) Macro-average F1:", f1_score(penguin_y_test, penguin_y_pred_base_dt, average='macro'))
print("(D) Weighted-average F1:", f1_score(penguin_y_test, penguin_y_pred_base_dt, average='weighted'))

# Write the results to a file
output_path = 'penguin_performance.txt'
with open(output_path, 'a') as file:
    file.write("Base-DT Model:\n")
    file.write("Hyperparameters: Default\n")
    file.write("(B) Confusion Matrix:\n")
    file.write(str(conf_matrix_base_dt) + '\n')
    file.write("(C) Precision, Recall, and F1:\n")
    file.write(report + '\n')
    file.write("(D) Accuracy: {}\n".format(accuracy_base_dt))
    file.write("(D) Macro-average F1: {}\n".format(f1_score(penguin_y_test, penguin_y_pred_base_dt, average='macro')))
    file.write("(D) Weighted-average F1: {}\n".format(f1_score(penguin_y_test, penguin_y_pred_base_dt, average='weighted')))
    file.write("--------------------\n")

Base-DT Model:
Hyperparameters: Default
(B) Confusion Matrix:
[[31  0  0]
 [ 0 13  0]
 [ 0  0 23]]
(C) Precision, Recall, and F1:
              precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00        31
   Chinstrap       1.00      1.00      1.00        13
      Gentoo       1.00      1.00      1.00        23

    accuracy                           1.00        67
   macro avg       1.00      1.00      1.00        67
weighted avg       1.00      1.00      1.00        67

(D) Accuracy: 1.0
(D) Macro-average F1: 1.0
(D) Weighted-average F1: 1.0


In [8]:
# part 5 for 4b
# Define the hyperparameters for grid search
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Initialize the Decision Tree model
top_dt_model = DecisionTreeClassifier()

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(top_dt_model, param_grid, cv=5)
grid_search.fit(penguin_X_train, penguin_y_train)

# Get the best hyperparameters from the grid search
best_params = grid_search.best_params_

# Train the Top-DT model with the best hyperparameters
top_dt_model = DecisionTreeClassifier(**best_params)
top_dt_model.fit(penguin_X_train, penguin_y_train)


In [9]:
# Predict on the test set
penguin_y_pred_top_dt = top_dt_model.predict(penguin_X_test)

# Evaluate the model
conf_matrix_top_dt = confusion_matrix(penguin_y_test, penguin_y_pred_top_dt)
precision_recall_f1_top_dt = precision_recall_fscore_support(penguin_y_test, penguin_y_pred_top_dt, average=None)
accuracy_top_dt = accuracy_score(penguin_y_test, penguin_y_pred_top_dt)


In [10]:
# Display the results
print("Top Decision Tree Model:")
print("(A) Model Description:")
print("Top Decision Tree with hyperparameters:", grid_search.best_params_)
print("(B) Confusion Matrix:")
print(conf_matrix_top_dt)

# Additional formatting for precision, recall, and f1
class_labels = ['Adelie', 'Chinstrap', 'Gentoo']
report = classification_report(penguin_y_test, penguin_y_pred_top_dt, target_names=class_labels, zero_division=1)
print("(C) Precision, Recall, and F1:")
print(report)

# Display accuracy, macro-average F1, and weighted-average F1
accuracy_top_dt = accuracy_score(penguin_y_test, penguin_y_pred_top_dt)
macro_avg_f1_top_dt = f1_score(penguin_y_test, penguin_y_pred_top_dt, average='macro')
weighted_avg_f1_top_dt = f1_score(penguin_y_test, penguin_y_pred_top_dt, average='weighted')

print("(D) Accuracy:", accuracy_top_dt)
print("(D) Macro-average F1:", macro_avg_f1_top_dt)
print("(D) Weighted-average F1:", weighted_avg_f1_top_dt)

# Write results to file
with open("penguin_performance.txt", "a") as file:
    file.write("Top Decision Tree Model:\n")
    file.write("(A) Model Description:\nTop Decision Tree with hyperparameters: {}\n".format(grid_search.best_params_))
    file.write("(B) Confusion Matrix:\n{}\n".format(conf_matrix_top_dt))
    
    # Additional formatting for precision, recall, and f1
    file.write("(C) Precision, Recall, and F1:\n{}\n".format(report))
    
    file.write("(D) Accuracy: {}\n".format(accuracy_top_dt))
    file.write("(D) Macro-average F1: {}\n".format(macro_avg_f1_top_dt))
    file.write("(D) Weighted-average F1: {}\n".format(weighted_avg_f1_top_dt))
    
    file.write("--------------------\n")  # Separator


Top Decision Tree Model:
(A) Model Description:
Top Decision Tree with hyperparameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 2}
(B) Confusion Matrix:
[[31  0  0]
 [ 0 13  0]
 [ 0  0 23]]
(C) Precision, Recall, and F1:
              precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00        31
   Chinstrap       1.00      1.00      1.00        13
      Gentoo       1.00      1.00      1.00        23

    accuracy                           1.00        67
   macro avg       1.00      1.00      1.00        67
weighted avg       1.00      1.00      1.00        67

(D) Accuracy: 1.0
(D) Macro-average F1: 1.0
(D) Weighted-average F1: 1.0


In [11]:
# part 5 for penguin 4c
# Base MLP Model
print("--------------------")
print("Base MLP Model")

# Initialize the Base MLP model
base_mlp_model = MLPClassifier(hidden_layer_sizes=(100, 100), activation='logistic', solver='sgd', random_state=42)

--------------------
Base MLP Model


In [12]:
# Train the model
base_mlp_model.fit(penguin_X_train, penguin_y_train)

In [13]:
# Predict on the test set
penguin_y_pred_base_mlp = base_mlp_model.predict(penguin_X_test)

In [14]:
# Cell 5
# Evaluate the model
conf_matrix_base_mlp = confusion_matrix(penguin_y_test, penguin_y_pred_base_mlp)
precision_recall_f1_base_mlp = precision_recall_fscore_support(penguin_y_test, penguin_y_pred_base_mlp, average=None, zero_division=1)  # Set zero_division parameter
accuracy_base_mlp = accuracy_score(penguin_y_test, penguin_y_pred_base_mlp)


In [15]:
# Display the results
print("Base MLP Model:")
print("(A) Model Description:")
print("Base Multi-Layered Perceptron with 2 hidden layers of 100+100 neurons, sigmoid/logistic as activation function, stochastic gradient descent, and default values for the rest of the parameters.")
print("(B) Confusion Matrix:")
print(conf_matrix_base_mlp)

Base MLP Model:
(A) Model Description:
Base Multi-Layered Perceptron with 2 hidden layers of 100+100 neurons, sigmoid/logistic as activation function, stochastic gradient descent, and default values for the rest of the parameters.
(B) Confusion Matrix:
[[31  0  0]
 [13  0  0]
 [23  0  0]]


In [16]:
# Additional formatting for precision, recall, and f1
class_labels = ['Adelie', 'Chinstrap', 'Gentoo']
report = classification_report(penguin_y_test, penguin_y_pred_base_mlp, target_names=class_labels, zero_division=1)  # Set zero_division parameter
print("(C) Precision, Recall, and F1:")
print(report)

(C) Precision, Recall, and F1:
              precision    recall  f1-score   support

      Adelie       0.46      1.00      0.63        31
   Chinstrap       1.00      0.00      0.00        13
      Gentoo       1.00      0.00      0.00        23

    accuracy                           0.46        67
   macro avg       0.82      0.33      0.21        67
weighted avg       0.75      0.46      0.29        67



In [17]:
# Display accuracy, macro-average F1, and weighted-average F1
print("(D) Accuracy:", accuracy_base_mlp)
print("(D) Macro-average F1:", f1_score(penguin_y_test, penguin_y_pred_base_mlp, average='macro'))
print("(D) Weighted-average F1:", f1_score(penguin_y_test, penguin_y_pred_base_mlp, average='weighted'))


(D) Accuracy: 0.4626865671641791
(D) Macro-average F1: 0.2108843537414966
(D) Weighted-average F1: 0.29272007310386844


In [18]:
# Write the results to a file
output_path = 'penguin_performance.txt'
with open(output_path, 'a') as file:
    file.write("Base MLP Model:\n")
    file.write("(A) Model Description:\n")
    file.write("Base Multi-Layered Perceptron with 2 hidden layers of 100+100 neurons, sigmoid/logistic as activation function, stochastic gradient descent, and default values for the rest of the parameters.\n")
    file.write("(B) Confusion Matrix:\n")
    file.write(str(conf_matrix_base_mlp) + '\n')
    file.write("(C) Precision, Recall, and F1:\n")
    file.write(report + '\n')
    file.write("(D) Accuracy: {}\n".format(accuracy_base_mlp))
    file.write("(D) Macro-average F1: {}\n".format(f1_score(penguin_y_test, penguin_y_pred_base_mlp, average='macro')))
    file.write("(D) Weighted-average F1: {}\n".format(f1_score(penguin_y_test, penguin_y_pred_base_mlp, average='weighted')))
    file.write("--------------------\n")

In [19]:
# Define hyperparameter values for grid search
param_grid_mlp = {
    'activation': ['logistic', 'tanh', 'relu'],
    'hidden_layer_sizes': [(30, 50), (10, 10, 10)],
    'solver': ['adam', 'sgd']
}

# Initialize the Top MLP model
top_mlp_model = MLPClassifier(random_state=42, max_iter=1000)  # Increase max_iter to avoid convergence warning

In [20]:
# Perform grid search to find the best hyperparameters
grid_search_mlp = GridSearchCV(
    top_mlp_model,
    param_grid=param_grid_mlp,
    cv=10,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)
grid_search_mlp.fit(penguin_X_train, penguin_y_train)

Fitting 10 folds for each of 12 candidates, totalling 120 fits


In [21]:
# Get the best hyperparameters from the grid search
best_params_mlp = grid_search_mlp.best_estimator_.get_params()

In [22]:
# Predict on the test set using the best model
penguin_y_pred_top_mlp = grid_search_mlp.predict(penguin_X_test)

# Evaluate the model
conf_matrix_top_mlp = confusion_matrix(penguin_y_test, penguin_y_pred_top_mlp)
precision_recall_f1_top_mlp = precision_recall_fscore_support(penguin_y_test, penguin_y_pred_top_mlp, average=None, zero_division=1)
accuracy_top_mlp = accuracy_score(penguin_y_test, penguin_y_pred_top_mlp)
macro_avg_f1_top_mlp = f1_score(penguin_y_test, penguin_y_pred_top_mlp, average='macro')
weighted_avg_f1_top_mlp = f1_score(penguin_y_test, penguin_y_pred_top_mlp, average='weighted')


In [23]:
# Display the results
print("Top MLP Model:")
print("(A) Model Description:")
print("Top MLP with hyperparameters:", best_params_mlp)
print("(B) Confusion Matrix:")
print(conf_matrix_top_mlp)

# Additional formatting for precision, recall, and f1
class_labels = ['Adelie', 'Chinstrap', 'Gentoo']
report_top_mlp = classification_report(penguin_y_test, penguin_y_pred_top_mlp, target_names=class_labels, zero_division=1)
print("(C) Precision, Recall, and F1:")
print(report_top_mlp)

# Display accuracy, macro-average F1, and weighted-average F1
print("(D) Accuracy:", accuracy_top_mlp)
print("(D) Macro-average F1:", macro_avg_f1_top_mlp)
print("(D) Weighted-average F1:", weighted_avg_f1_top_mlp)


Top MLP Model:
(A) Model Description:
Top MLP with hyperparameters: {'activation': 'logistic', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (30, 50), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_fun': 15000, 'max_iter': 1000, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': 42, 'shuffle': True, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}
(B) Confusion Matrix:
[[29  0  2]
 [13  0  0]
 [ 1  0 22]]
(C) Precision, Recall, and F1:
              precision    recall  f1-score   support

      Adelie       0.67      0.94      0.78        31
   Chinstrap       1.00      0.00      0.00        13
      Gentoo       0.92      0.96      0.94        23

    accuracy                           0.76        67
   macro avg       0.86      0.63      0.57        67
weighted avg       0.82    

In [24]:
# Display the results
print("Top MLP Model:")
print("(A) Model Description:")
print("Top MLP with hyperparameters:", best_params_mlp)
print("(B) Confusion Matrix:")
print(conf_matrix_top_mlp)

# Additional formatting for precision, recall, and f1
class_labels = ['Adelie', 'Chinstrap', 'Gentoo']
report_top_mlp = classification_report(penguin_y_test, penguin_y_pred_top_mlp, target_names=class_labels, zero_division=1)
print("(C) Precision, Recall, and F1:")
print(report_top_mlp)

# Display accuracy, macro-average F1, and weighted-average F1
print("(D) Accuracy:", accuracy_top_mlp)
print("(D) Macro-average F1:", macro_avg_f1_top_mlp)
print("(D) Weighted-average F1:", weighted_avg_f1_top_mlp)    

with open("penguin_performance.txt", "a") as file:
    file.write("Top MLP Model:\n")
    file.write("(A) Model Description:\nTop MLP with hyperparameters: {}\n".format(best_params_mlp))
    file.write("(B) Confusion Matrix:\n{}\n".format(conf_matrix_top_mlp))

    # Additional formatting for precision, recall, and f1
    class_labels = ['Adelie', 'Chinstrap', 'Gentoo']
    report_top_mlp = classification_report(penguin_y_test, penguin_y_pred_top_mlp, target_names=class_labels, zero_division=1)
    file.write("(C) Precision, Recall, and F1:\n{}\n".format(report_top_mlp))

    file.write("(D) Accuracy: {}\n".format(accuracy_top_mlp))
    file.write("(D) Macro-average F1: {}\n".format(macro_avg_f1_top_mlp))
    file.write("(D) Weighted-average F1: {}\n".format(weighted_avg_f1_top_mlp))

    file.write("--------------------\n")  # Separator


Top MLP Model:
(A) Model Description:
Top MLP with hyperparameters: {'activation': 'logistic', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (30, 50), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_fun': 15000, 'max_iter': 1000, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': 42, 'shuffle': True, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}
(B) Confusion Matrix:
[[29  0  2]
 [13  0  0]
 [ 1  0 22]]
(C) Precision, Recall, and F1:
              precision    recall  f1-score   support

      Adelie       0.67      0.94      0.78        31
   Chinstrap       1.00      0.00      0.00        13
      Gentoo       0.92      0.96      0.94        23

    accuracy                           0.76        67
   macro avg       0.86      0.63      0.57        67
weighted avg       0.82    