In [1]:
"""
For each of the 4 classifiers above 4(a), 4(b), 4(c) and 4(d), append the following information in a file called
penguin-performance.txt / abalone-performance.txt: (to make it easier for the TAs, make sure that
your output for each sub-question below is clearly marked in your output file, using the headings (A),
(B) . . .)
(A) a clear separator (a sequence of hyphens or stars) and a string clearly describing the model (e.g. the
model name + hyper-parameter values that you changed). In the case of Top-DT and Top-MLP,
display the best hyperparameters found by the gridsearch.
(B) the confusion matrix
(C) the precision, recall, and F1-measure for each class
(D) the accuracy, macro-average F1 and weighted-average F1 of the model
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text, export_graphviz
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from io import StringIO  
from IPython.display import Image  
import pydotplus

In [2]:
# Load Penguin dataset
penguin_path = r'C:\Users\ibrah\OneDrive\Desktop\COMP472-A1-datasets\penguins.csv'
penguin_data = pd.read_csv(penguin_path)

# For Penguin dataset
penguin_features = pd.get_dummies(penguin_data.drop('species', axis=1))  # One-hot encode categorical features
penguin_target = penguin_data['species']

penguin_X_train, penguin_X_test, penguin_y_train, penguin_y_test = train_test_split(
    penguin_features, penguin_target, test_size=0.2, random_state=42
)

In [3]:
# (a) Base-DT
base_dt = DecisionTreeClassifier(random_state=42)
base_dt.fit(penguin_X_train, penguin_y_train)
base_dt_predictions = base_dt.predict(penguin_X_test)

# Output file for Base-DT
output_file_base_dt = 'penguin-performance.txt'

# Append information to the output file
with open(output_file_base_dt, 'a') as f:
    f.write("Base-DT Model:\n")
    f.write("Hyperparameters: Default\n")
    f.write("Confusion Matrix:\n")
    f.write(str(confusion_matrix(penguin_y_test, base_dt_predictions)) + "\n")
    f.write("Precision, Recall, and F1:\n")
    f.write(str(classification_report(penguin_y_test, base_dt_predictions)) + "\n")
    f.write("Accuracy, Macro-average F1, Weighted-average F1:\n")
    f.write(str(accuracy_score(penguin_y_test, base_dt_predictions)) + "\n")
    f.write("-" * 50 + "\n\n")

In [4]:
# (b) Top-DT
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10],  # Update with your choices
    'min_samples_split': [2, 5, 10]  # Update with your choices
}

grid_search_dt = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_dt, cv=5, scoring='accuracy')
grid_search_dt.fit(penguin_X_train, penguin_y_train)
top_dt = grid_search_dt.best_estimator_
top_dt_predictions = top_dt.predict(penguin_X_test)

# Output file for Top-DT
output_file_top_dt = 'penguin-performance.txt'

# Append information to the output file
with open(output_file_top_dt, 'a') as f:
    f.write("Top-DT Model:\n")
    f.write("Best Hyperparameters: {}\n".format(grid_search_dt.best_params_))
    f.write("Confusion Matrix:\n")
    f.write(str(confusion_matrix(penguin_y_test, top_dt_predictions)) + "\n")
    f.write("Precision, Recall, and F1:\n")
    f.write(str(classification_report(penguin_y_test, top_dt_predictions)) + "\n")
    f.write("Accuracy, Macro-average F1, Weighted-average F1:\n")
    f.write(str(accuracy_score(penguin_y_test, top_dt_predictions)) + "\n")
    f.write("-" * 50 + "\n\n")

In [5]:
# (c) Base-MLP
base_mlp = MLPClassifier(hidden_layer_sizes=(100, 100), activation='logistic', solver='sgd', max_iter=500, random_state=42)
base_mlp.fit(penguin_X_train, penguin_y_train)
base_mlp_predictions = base_mlp.predict(penguin_X_test)

# Output file for Base-MLP
output_file_base_mlp = 'penguin-performance.txt'

# Append information to the output file
with open(output_file_base_mlp, 'a') as f:
    f.write("Base-MLP Model:\n")
    f.write("Hyperparameters: Default\n")
    f.write("Confusion Matrix:\n")
    f.write(str(confusion_matrix(penguin_y_test, base_mlp_predictions)) + "\n")
    f.write("Precision, Recall, and F1:\n")
    f.write(str(classification_report(penguin_y_test, base_mlp_predictions)) + "\n")
    f.write("Accuracy, Macro-average F1, Weighted-average F1:\n")
    f.write(str(accuracy_score(penguin_y_test, base_mlp_predictions)) + "\n")
    f.write("-" * 50 + "\n\n")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
# (d) Top-MLP
param_grid_mlp = {
    'activation': ['logistic', 'tanh', 'relu'],
    'hidden_layer_sizes': [(30, 50), (10, 10, 10)],
    'solver': ['adam', 'sgd'],
    'max_iter': [500, 1000]
}

grid_search_mlp = GridSearchCV(MLPClassifier(random_state=42), param_grid_mlp, cv=5, scoring='accuracy')
grid_search_mlp.fit(penguin_X_train, penguin_y_train)
top_mlp = grid_search_mlp.best_estimator_
top_mlp_predictions = top_mlp.predict(penguin_X_test)

# Output file for Top-MLP
output_file_top_mlp = 'penguin-performance.txt'

# Append information to the output file
with open(output_file_top_mlp, 'a') as f:
    f.write("Top-MLP Model:\n")
    f.write("Best Hyperparameters: {}\n".format(grid_search_mlp.best_params_))
    f.write("Confusion Matrix:\n")
    f.write(str(confusion_matrix(penguin_y_test, top_mlp_predictions)) + "\n")
    f.write("Precision, Recall, and F1:\n")
    f.write(str(classification_report(penguin_y_test, top_mlp_predictions)) + "\n")
    f.write("Accuracy, Macro-average F1, Weighted-average F1:\n")
    f.write(str(accuracy_score(penguin_y_test, top_mlp_predictions)) + "\n")
    f.write("-" * 50 + "\n\n")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
