# SENG 474 A02: Assignment 1
Sean McAuliffe, V00913346  
February 4, 2023

---
## Environment Setup

This step installs the project requirements and imports modules used for implementation.

In [47]:
# !pip3 install numpy
# !pip3 install pandas
# !pip3 install scikit-learn  # the PyPI package is "scikit-learn"; it is imported as "sklearn"
# !pip3 install matplotlib
# !pip3 install graphviz

import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

---
## Data Preprocessing
This block loads the cleaned_adult.csv file, shuffles the examples, and partitions the data into training and test sets.

In [44]:
# Importing the dataset from .csv
print("Importing the income dataset...\n")
# skip_header=1 drops the first line of the file; every remaining field is parsed as an integer
income_dataset = np.genfromtxt('./cleaned_adult.csv', delimiter=',', skip_header=1, dtype=int)

# Shuffle the rows of the dataset (in place) so that examples appear in random order.
# A dedicated, seeded generator is used instead of the unseeded global RNG so that the
# row order -- and therefore the train/test split built from it -- is identical on
# every Restart & Run All, without touching NumPy's global random state.
SHUFFLE_SEED = 474
np.random.default_rng(SHUFFLE_SEED).shuffle(income_dataset)

# Separate the columns: everything but the last column is a feature,
# the last column is the label
dataset_features = income_dataset[:, :-1]
dataset_labels = income_dataset[:, -1]

# Summarise both arrays (number of dimensions, shape, total element count)
for summary_title, summary_array in (
    ("Dataset Features:", dataset_features),
    ("Dataset Labels:", dataset_labels),
):
    print(summary_title)
    print(f"  Num. dimensions: {summary_array.ndim}")
    print(f"  Data shape: {summary_array.shape}")
    print(f"  Size: {summary_array.size}\n")


def split_dataset(features, labels, training_percent):
    """Split row-aligned features and labels into training and testing sets.

    The first ``int(training_percent * n_rows)`` rows become the training set and
    the remaining rows become the testing set. No shuffling happens here, so the
    caller must shuffle beforehand if a random split is wanted.

    Args:
        features: Array whose first axis indexes examples.
        labels: Array whose first axis indexes examples; must have the same
            number of rows as ``features``.
        training_percent: Fraction of rows, in the range [0, 1], assigned to
            the training set.

    Returns:
        A tuple ``(training_features, training_labels, testing_features,
        testing_labels)`` of slices of the inputs.

    Raises:
        ValueError: If ``features`` and ``labels`` have different row counts, or
            if ``training_percent`` is outside [0, 1].
    """
    n_rows = features.shape[0]

    # A row-count mismatch would otherwise yield silently misaligned partitions
    if labels.shape[0] != n_rows:
        raise ValueError(
            f"features has {n_rows} rows but labels has {labels.shape[0]} rows"
        )

    # Negative or >1 fractions slice "successfully" but not in the intended way
    # (the comparison is also False for NaN, which is rejected here too)
    if not 0 <= training_percent <= 1:
        raise ValueError(
            f"training_percent must be between 0 and 1, got {training_percent}"
        )

    # One shared cut point keeps features and labels aligned
    split_index = int(training_percent * n_rows)
    return (
        features[:split_index],
        labels[:split_index],
        features[split_index:],
        labels[split_index:],
    )

# Partition the data: the first 80% of the rows are used for training,
# the remaining 20% are held out for testing
training_features, training_labels, testing_features, testing_labels = split_dataset(
    dataset_features, dataset_labels, 0.8
)

# Summarise each partition (number of dimensions, shape, total element count)
for partition_title, partition_array in (
    ("Training Features:", training_features),
    ("Training Labels:", training_labels),
    ("Testing Features:", testing_features),
    ("Testing Labels:", testing_labels),
):
    print(partition_title)
    print(f"  Num. dimensions: {partition_array.ndim}")
    print(f"  Data shape: {partition_array.shape}")
    print(f"  Size: {partition_array.size}\n")



Importing the income dataset...

Dataset Features:
  Num. dimensions: 2
  Data shape: (45222, 104)
  Size: 4703088

Dataset Labels:
  Num. dimensions: 1
  Data shape: (45222,)
  Size: 45222

Training Features:
  Num. dimensions: 2
  Data shape: (36177, 104)
  Size: 3762408

Training Labels:
  Num. dimensions: 1
  Data shape: (36177,)
  Size: 36177

Testing Features:
  Num. dimensions: 2
  Data shape: (9045, 104)
  Size: 940680

Testing Labels:
  Num. dimensions: 1
  Data shape: (9045,)
  Size: 9045



---
### Decision Trees (With Pruning)
1. Decision Tree: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
2. iris dataset: https://en.wikipedia.org/wiki/Iris_flower_data_set  
3. Hyperparameter `max_depth`: the maximum depth of the tree. If None, nodes are expanded until all leaves are pure or until all leaves contain fewer than min_samples_split samples.
4. Post process pruning: https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py

In [57]:
# This visualization function is taken from the Lab 1 code provided by the TA
# Takes in a DecisionTreeClassifier model. X represents the data, y represents
# the labels
def visualize_classifier(model, X, y, ax=None, cmap='rainbow'):
    """Scatter the points in X coloured by y, over the model's predicted regions.

    Only columns 0 and 1 of X are plotted, and the model is asked to predict on
    two-column inputs, so this is meant for models trained on two features.

    Args:
        model: Object with a ``predict`` method; it is only used for prediction
            here, not fitted.
        X: 2-D array of points; column 0 is drawn on the x axis, column 1 on the y axis.
        y: Array of labels used to colour the points.
        ax: Axes to draw on. Defaults to the current pyplot axes.
        cmap: Colormap name shared by the points and the filled regions.

    Returns:
        None. The drawing is done on ``ax``.
    """
    if not ax:
        ax = plt.gca()

    # Same colour limits for the scatter and the filled contours
    label_range = (y.min(), y.max())

    # Draw the data points above the regions (higher zorder) and hide the axes
    ax.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=cmap,
               clim=label_range, zorder=3)
    ax.axis('tight')
    ax.axis('off')
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # Evaluate the model's predictions on a 200 x 200 grid covering the visible area
    grid_x, grid_y = np.meshgrid(np.linspace(*xlim, num=200),
                                 np.linspace(*ylim, num=200))
    grid_points = np.column_stack((grid_x.ravel(), grid_y.ravel()))
    predicted = model.predict(grid_points).reshape(grid_x.shape)

    # Fill the predicted regions; the level boundaries sit at -0.5, 0.5, 1.5, ...
    # NOTE(review): this presumably expects integer labels 0..n_classes-1 -- confirm
    class_count = len(np.unique(y))
    band_edges = np.arange(class_count + 1) - 0.5
    ax.contourf(grid_x, grid_y, predicted, alpha=0.3,
                levels=band_edges,
                cmap=cmap, clim=label_range,
                zorder=1)

    # Restore the limits that were read before the contour plot was added
    ax.set(xlim=xlim, ylim=ylim)

# Fit a decision tree to the training data.
# The estimator is constructed and fitted exactly once (the original chained .fit() on
# the constructor and then called .fit() again, training the same tree twice).
# fit() returns the estimator itself, so `dtc` and `model` name the same fitted tree.
# random_state is fixed so that repeated runs on the same training data build the same
# tree; with random_state=None the fitted tree (and reported accuracy) can vary between runs.
dtc = DecisionTreeClassifier(random_state=0, max_depth=None, splitter="best", criterion="entropy")
model = dtc.fit(training_features, training_labels)

# Measure the accuracy of the model on the held-out testing data
print("Accuracy of the model on the testing data: {:.2f}%".format(model.score(testing_features, testing_labels) * 100))

# Post process pruning documentation
# https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py


Accuracy of the model on the testing data: 80.88%


---
### Random forests (Without Pruning)

---
### Neural Networks