In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
# Cell 2: Load Penguin Dataset and Prepare Data
penguin_path = r'C:\Users\ibrah\OneDrive\Desktop\COMP472-A1-datasets\penguins.csv'
penguin_data = pd.read_csv(penguin_path)

In [3]:
# Convert features into 1-hot vectors, so data can be used for training
categorical_columns = ['island', 'sex']
penguin_categorical = penguin_data[categorical_columns]

# Initialize the OneHotEncoder
encoder = OneHotEncoder()

# Fit and transform the categorical columns
one_hot_encoded = encoder.fit_transform(penguin_categorical).toarray()

# Create a DataFrame with the one-hot encoded features
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

# Concatenate the one-hot encoded DataFrame with the original DataFrame, dropping the original categorical columns
penguin_data_encoded = pd.concat([penguin_data, one_hot_df], axis=1).drop(categorical_columns, axis=1)

In [4]:
# Split the dataset using train_test_split
penguin_features = penguin_data_encoded.drop('species', axis=1)
penguin_target = penguin_data_encoded['species']

penguin_X_train, penguin_X_test, penguin_y_train, penguin_y_test = train_test_split(
    penguin_features, penguin_target, test_size=0.2, random_state=42
)

In [5]:
# Cell 3: Train Base Decision Tree Model
base_dt_model = DecisionTreeClassifier()
base_dt_model.fit(penguin_X_train, penguin_y_train)

In [6]:
# Cell 4: Predict and Evaluate Base Decision Tree Model
penguin_y_pred_base_dt = base_dt_model.predict(penguin_X_test)

conf_matrix_base_dt = confusion_matrix(penguin_y_test, penguin_y_pred_base_dt)
precision_recall_f1_base_dt = precision_recall_fscore_support(penguin_y_test, penguin_y_pred_base_dt, average=None)
accuracy_base_dt = accuracy_score(penguin_y_test, penguin_y_pred_base_dt)

In [7]:
# Cell 5: Display Results and Write to File
# (A) Model Description
model_description = "Base Decision Tree Model"
print(model_description)
with open('penguin-performance.txt', 'a') as file:
    file.write("\n" + "-"*20 + "\n" + model_description + "\n")

# (B) Confusion Matrix
print("Confusion Matrix:\n", conf_matrix_base_dt)
with open('penguin-performance.txt', 'a') as file:
    file.write("\n(B) Confusion Matrix:\n" + str(conf_matrix_base_dt) + "\n")

# (C) Precision, Recall, F1 for Each Class
print("Precision, Recall, and F1-Score for Each Class:\n", precision_recall_f1_base_dt)
with open('penguin-performance.txt', 'a') as file:
    file.write("\n(C) Precision, Recall, and F1-Score for Each Class:\n" + str(precision_recall_f1_base_dt) + "\n")

# (D) Accuracy, Macro-average F1, Weighted-average F1
accuracy_base_dt = accuracy_score(penguin_y_test, penguin_y_pred_base_dt)
macro_avg_f1_base_dt = f1_score(penguin_y_test, penguin_y_pred_base_dt, average='macro')
weighted_avg_f1_base_dt = f1_score(penguin_y_test, penguin_y_pred_base_dt, average='weighted')

print("Accuracy: {}".format(accuracy_base_dt))
print("Macro-average F1: {}".format(macro_avg_f1_base_dt))
print("Weighted-average F1: {}".format(weighted_avg_f1_base_dt))

with open('penguin-performance.txt', 'a') as file:
    file.write("\n(D) Accuracy: {}\n".format(accuracy_base_dt))
    file.write("(D) Macro-average F1: {}\n".format(macro_avg_f1_base_dt))
    file.write("(D) Weighted-average F1: {}\n".format(weighted_avg_f1_base_dt))


Base Decision Tree Model
Confusion Matrix:
 [[31  0  0]
 [ 0 13  0]
 [ 0  0 23]]
Precision, Recall, and F1-Score for Each Class:
 (array([1., 1., 1.]), array([1., 1., 1.]), array([1., 1., 1.]), array([31, 13, 23], dtype=int64))
Accuracy: 1.0
Macro-average F1: 1.0
Weighted-average F1: 1.0


In [8]:
# part 5 for 4b
# Define the hyperparameters for grid search
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Initialize the Decision Tree model
top_dt_model = DecisionTreeClassifier()

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(top_dt_model, param_grid, cv=5)
grid_search.fit(penguin_X_train, penguin_y_train)

# Get the best hyperparameters from the grid search
best_params = grid_search.best_params_

# Train the Top-DT model with the best hyperparameters
top_dt_model = DecisionTreeClassifier(**best_params)
top_dt_model.fit(penguin_X_train, penguin_y_train)


In [9]:
# Predict on the test set
penguin_y_pred_top_dt = top_dt_model.predict(penguin_X_test)

# Evaluate the model
conf_matrix_top_dt = confusion_matrix(penguin_y_test, penguin_y_pred_top_dt)
precision_recall_f1_top_dt = precision_recall_fscore_support(penguin_y_test, penguin_y_pred_top_dt, average=None)
accuracy_top_dt = accuracy_score(penguin_y_test, penguin_y_pred_top_dt)
macro_avg_f1_top_dt = f1_score(penguin_y_test, penguin_y_pred_top_dt, average='macro')
weighted_avg_f1_top_dt = f1_score(penguin_y_test, penguin_y_pred_top_dt, average='weighted')

# Display the results
print("Top Decision Tree Model Evaluation:")
print("(B) Confusion Matrix:")
print(conf_matrix_top_dt)

print("\n(C) Precision, Recall, and F1-Score for Each Class:")
precision_top_dt, recall_top_dt, f1_top_dt, support_top_dt = precision_recall_f1_top_dt
print("Precision (Class 0 - Adelie):", precision_top_dt[0])
print("Recall (Class 0 - Adelie):", recall_top_dt[0])
print("F1-Score (Class 0 - Adelie):", f1_top_dt[0])

print("\n(D) Accuracy:", accuracy_top_dt)
print("(D) Macro-average F1:", macro_avg_f1_top_dt)
print("(D) Weighted-average F1:", weighted_avg_f1_top_dt)

# Append results to the penguin-performance.txt file
with open('penguin-performance.txt', 'a') as file:
    file.write("--------------------\n")
    file.write("Top Decision Tree Model\n\n")
    file.write("(B) Confusion Matrix:\n")
    file.write(str(conf_matrix_top_dt) + "\n\n")

    file.write("(C) Precision, Recall, and F1-Score for Each Class:\n")
    file.write("Precision (Class 0 - Adelie): {}\n".format(precision_top_dt[0]))
    file.write("Recall (Class 0 - Adelie): {}\n".format(recall_top_dt[0]))
    file.write("F1-Score (Class 0 - Adelie): {}\n\n".format(f1_top_dt[0]))

    file.write("(D) Accuracy: {}\n".format(accuracy_top_dt))
    file.write("(D) Macro-average F1: {}\n".format(macro_avg_f1_top_dt))
    file.write("(D) Weighted-average F1: {}\n".format(weighted_avg_f1_top_dt))


Top Decision Tree Model Evaluation:
(B) Confusion Matrix:
[[30  1  0]
 [ 0 13  0]
 [ 0  0 23]]

(C) Precision, Recall, and F1-Score for Each Class:
Precision (Class 0 - Adelie): 1.0
Recall (Class 0 - Adelie): 0.967741935483871
F1-Score (Class 0 - Adelie): 0.9836065573770492

(D) Accuracy: 0.9850746268656716
(D) Macro-average F1: 0.9821898401133374
(D) Weighted-average F1: 0.9852286835404036


In [11]:
# part 5 for penguin 4c
# Base MLP Model
print("--------------------")
print("Base MLP Model")

# (A) Model Description
print("\n(A) Model Description:")
print("Base Multi-Layered Perceptron with 2 hidden layers of 100+100 neurons, sigmoid/logistic as activation function, "
      "stochastic gradient descent, and default values for the rest of the parameters.")

# (B) Confusion Matrix, (C) Precision, Recall, F1-Score, (D) Accuracy, Macro-average F1, Weighted-average F1
base_mlp_model = MLPClassifier(hidden_layer_sizes=(100, 100), activation='logistic', solver='sgd', random_state=42)

# Train the model
base_mlp_model.fit(penguin_X_train, penguin_y_train)

# Predict on the test set
penguin_y_pred_base_mlp = base_mlp_model.predict(penguin_X_test)


--------------------
Base MLP Model

(A) Model Description:
Base Multi-Layered Perceptron with 2 hidden layers of 100+100 neurons, sigmoid/logistic as activation function, stochastic gradient descent, and default values for the rest of the parameters.


In [13]:
# Evaluate the model
conf_matrix_base_mlp = confusion_matrix(penguin_y_test, penguin_y_pred_base_mlp)
# Set zero_division parameter to 'warn' or 1
precision_recall_f1_base_mlp = precision_recall_fscore_support(
    penguin_y_test, penguin_y_pred_base_mlp, average=None, zero_division=1
)
accuracy_base_mlp = accuracy_score(penguin_y_test, penguin_y_pred_base_mlp)
macro_avg_f1_base_mlp = f1_score(penguin_y_test, penguin_y_pred_base_mlp, average='macro')
weighted_avg_f1_base_mlp = f1_score(penguin_y_test, penguin_y_pred_base_mlp, average='weighted')

In [14]:
# Display the results
print("\n(B) Confusion Matrix:")
print(conf_matrix_base_mlp)

print("\n(C) Precision, Recall, and F1-Score for Each Class:")
print(precision_recall_f1_base_mlp)

print("\n(D) Accuracy:", accuracy_base_mlp)
print("(D) Macro-average F1:", macro_avg_f1_base_mlp)
print("(D) Weighted-average F1:", weighted_avg_f1_base_mlp)

# Define the file path
penguin_performance_file = "penguin-performance.txt"

# Open the file in append mode
with open(penguin_performance_file, "a") as file:
    # (A) Model Description
    file.write("--------------------\n")
    file.write("Base MLP Model\n\n")
    file.write("(A) Model Description:\n")
    file.write("Base Multi-Layered Perceptron with 2 hidden layers of 100+100 neurons, sigmoid/logistic as activation function, "
               "stochastic gradient descent, and default values for the rest of the parameters.\n")

    # (B) Confusion Matrix
    file.write("\n(B) Confusion Matrix:\n")
    file.write(str(conf_matrix_base_mlp) + "\n")

    # (C) Precision, Recall, and F1-Score for Each Class
    file.write("\n(C) Precision, Recall, and F1-Score for Each Class:\n")
    file.write(str(precision_recall_f1_base_mlp) + "\n")

    # (D) Accuracy, Macro-average F1, Weighted-average F1
    file.write("\n(D) Accuracy: {}\n".format(accuracy_base_mlp))
    file.write("(D) Macro-average F1: {}\n".format(macro_avg_f1_base_mlp))
    file.write("(D) Weighted-average F1: {}\n".format(weighted_avg_f1_base_mlp))

# Display a message
print("Results written to", penguin_performance_file)



(B) Confusion Matrix:
[[31  0  0]
 [13  0  0]
 [23  0  0]]

(C) Precision, Recall, and F1-Score for Each Class:
(array([0.46268657, 1.        , 1.        ]), array([1., 0., 0.]), array([0.63265306, 0.        , 0.        ]), array([31, 13, 23], dtype=int64))

(D) Accuracy: 0.4626865671641791
(D) Macro-average F1: 0.2108843537414966
(D) Weighted-average F1: 0.29272007310386844
Results written to penguin-performance.txt


In [16]:
# Initialize the Top-MLP model
top_mlp_model = MLPClassifier()

# Define the hyperparameter grid
param_grid_mlp = {
    'activation': ['sigmoid', 'tanh', 'relu'],
    'hidden_layer_sizes': [(30, 50), (10, 10, 10)],
    'solver': ['adam', 'sgd'],
}

# Perform grid search to find the best hyperparameters
grid_search_mlp = GridSearchCV(top_mlp_model, param_grid_mlp, cv=5)
grid_search_mlp.fit(penguin_X_train_scaled, penguin_y_train)

NameError: name 'penguin_X_train_scaled' is not defined

In [None]:
# Get the best hyperparameters from the grid search
best_params_mlp = grid_search_mlp.best_params_

# Predict on the test set using the best model
penguin_y_pred_top_mlp = grid_search_mlp.predict(penguin_X_test_scaled)

# Evaluate the Top-MLP model
conf_matrix_top_mlp = confusion_matrix(penguin_y_test, penguin_y_pred_top_mlp)
precision_recall_f1_top_mlp = precision_recall_fscore_support(
    penguin_y_test, penguin_y_pred_top_mlp, average=None
)
accuracy_top_mlp = accuracy_score(penguin_y_test, penguin_y_pred_top_mlp)

In [None]:
# Display the results
print("--------------------")
print("Top Multi-Layered Perceptron Model\n")
print("(A) Best Hyperparameters:", best_params_mlp)
print("\n(B) Confusion Matrix:\n", conf_matrix_top_mlp)
print("\n(C) Precision, Recall, and F1-Score for Each Class:\n", precision_recall_f1_top_mlp)
print("(D) Accuracy:", accuracy_top_mlp)

# Write the results to the performance file
with open("penguin-performance.txt", "a") as file:
    file.write("--------------------\n")
    file.write("Top Multi-Layered Perceptron Model\n\n")
    file.write("(A) Best Hyperparameters: {}\n".format(best_params_mlp))
    file.write("\n(B) Confusion Matrix:\n{}\n".format(conf_matrix_top_mlp))
    file.write("\n(C) Precision, Recall, and F1-Score for Each Class:\n{}\n".format(precision_recall_f1_top_mlp))
    file.write("(D) Accuracy: {}\n".format(accuracy_top_mlp))