In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import neat as neat
from pureples.shared.visualize import draw_net
from pureples.shared.substrate import Substrate
from pureples.es_hyperneat.es_hyperneat import ESNetwork
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# To test effects of leakage, there are two sets of data. One scaled on the entire dataset, and one scaled on the training set only.

# Load datasets scaled on entire dataset
X_train_full = pd.read_pickle('../data/train_test_sets/X_train_full.pkl')
X_test_full = pd.read_pickle('../data/train_test_sets/X_test_full.pkl')
y_train_full = pd.read_pickle('../data/train_test_sets/y_train_full.pkl')
y_test_full = pd.read_pickle('../data/train_test_sets/y_test_full.pkl')

# Load datasets scaled on training set only
X_train_scale = pd.read_pickle('../data/train_test_sets/X_train_scale.pkl')
X_test_scale = pd.read_pickle('../data/train_test_sets/X_test_scale.pkl')
y_train_scale = pd.read_pickle('../data/train_test_sets/y_train_scale.pkl')
y_test_scale = pd.read_pickle('../data/train_test_sets/y_test_scale.pkl')

In [3]:
# Track results
full_results = {}
scale_results = {}
tied_results = {}

# Models to test will be:
- Decision Tree Classifier
- Decision Tree Regressor
- Gradient Boosting Regression
- K Nearest Neighbors
- K Nearest Neighbors Grid Search
- Logistic Regression
- Neat Neural Network
- Random Forest Classifier
- Random Forest Grid Search
- Random Forest Regressor

# Decision Tree Classifier

In [4]:
# Decision Tree Classifier
def decision_tree_classifier(X_train, X_test, y_train, y_test, depth, criterion, type):
    dtc = DecisionTreeClassifier(max_depth=depth, criterion=criterion)
    dtc.fit(X_train, y_train)
    dtc_pred = dtc.predict(X_test)
    accuracy = accuracy_score(y_test, dtc_pred)
    return dtc_pred, accuracy, type

def decision_tree(X_train, X_test, y_train, y_test, scaled):
    depths = [None, 3, 5, 10]
    criterions = ['entropy', 'gini']
    models = []
    accuracies = []
    for depth in depths:
        for criterion in criterions:
            type = 'depth of ' + str(depth) + ' and the ' + criterion + ' criterion'
            dtc_model = decision_tree_classifier(X_train, X_test, y_train, y_test, depth=depth, criterion=criterion, type=type)
            models.append(dtc_model)
            accuracies.append(dtc_model[1])
    # Find best accuracy
    best_accuracy = accuracies.index(max(accuracies))
    # Find best model
    best_model = models[best_accuracy]
    print('Decision Tree Classifier ' + scaled + ' with a', best_model[2], 'has the best accuracy of ' + str(best_model[1]) + '.')
    print("Classification Report: \n", classification_report(y_test, best_model[0]))
    print("Confusion Matrix: \n", confusion_matrix(y_test, best_model[0]))
    print("Accuracy:", best_model[1])
    return round((best_model[1] * 100), 2)

In [5]:
# Decision Tree Classifier
full = decision_tree(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = decision_tree(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')
if full > scale:
    full_results['Decision Tree Classifier'] = full
scale_results['Decision Tree Classifier'] = scale

Decision Tree Classifier Scaled on Entire Dataset with a depth of 3 and the gini criterion has the best accuracy of 0.9473684210526315.
Classification Report: 
               precision    recall  f1-score   support

         0.0       0.95      0.97      0.96        71
         1.0       0.95      0.91      0.93        43

    accuracy                           0.95       114
   macro avg       0.95      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114

Confusion Matrix: 
 [[69  2]
 [ 4 39]]
Accuracy: 0.9473684210526315
Decision Tree Classifier Scaled on Training Data with a depth of None and the gini criterion has the best accuracy of 0.9385964912280702.
Classification Report: 
               precision    recall  f1-score   support

           0       0.95      0.96      0.95        77
           1       0.92      0.89      0.90        37

    accuracy                           0.94       114
   macro avg       0.93      0.93      0.93       114
weighted 

# Decision Tree Regressor

In [6]:
# Decision Tree Regressor
def dt_regressor(X, y, depth):
    dtr = DecisionTreeRegressor(max_depth=depth)
    dtr.fit(X, y)
    return dtr
def decision_tree_regressor(X_train, X_test, y_train, y_test, scaled):
    n_regressors = [3, 5, 7]
    types = []
    accuracies = []
    for depth in range(1, 10):
        for n in n_regressors:
            trees = []
            type = 'depth of ' + str(depth) + ' and ' + str(n) + ' regressors'
            X = X_train
            y = y_train
            dtr_model = dt_regressor(X, y, depth=depth)
            trees.append(dtr_model)
            y_pred = dtr_model.predict(X)
            for i in range(n-1):
                dtr_model = dt_regressor(X, y=y_pred, depth=depth)
                trees.append(dtr_model)
            dtr_pred = sum([tree.predict(X_test) for tree in trees])
            # scale dtr_pred to 0-1 based value is above or below 0.5
            dtr_pred = np.where(dtr_pred > 0.5, 1, 0)
            accuracy = accuracy_score(y_test, dtr_pred)
            types.append(type)
            accuracies.append(accuracy)
    # Find best accuracy
    best_accuracy = max(accuracies)
    accuracy_index = accuracies.index(best_accuracy)
    # Find the best type
    best_type = types[accuracy_index]
    print(f'Decision Tree Regressor {scaled} with a', best_type, 'has an accuracy of ' + str(accuracy) + '.')
    
    return round((best_accuracy * 100), 2)

In [7]:
# Decision Tree Regressor
full = decision_tree_regressor(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = decision_tree_regressor(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')
if full > scale:
    full_results['Decision Tree Regressor'] = full
    scale_results['Decision Tree Regressor'] = scale
elif scale > full:
    scale_results['Decision Tree Regressor'] = scale
else:
    tied_results['Decision Tree Regressor'] = scale

Decision Tree Regressor Scaled on Entire Dataset with a depth of 7 and 7 regressors has an accuracy of 0.9385964912280702.
Decision Tree Regressor Scaled on Training Data with a depth of 5 and 5 regressors has an accuracy of 0.9210526315789473.


# Gradient Boosting Classifier

In [8]:
# Gradient Boosting Classifier
def Gradient_Boosting_Classifier(X_train, X_test, y_train, y_test, scaled):
    learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
    models = {}
    for depth in range(1, 10):
        for learning_rate in learning_rates:
            gb = GradientBoostingClassifier(n_estimators=1, learning_rate = learning_rate, max_features=2, max_depth = depth, warm_start=True)
            for n_estimators in range(1, 20):
                gb.n_estimators = n_estimators
                gb.fit(X_train, y_train)
                accuracy = gb.score(X_test, y_test)
                # Store accuracy, depth, learning rate, and n_estimators
                models[accuracy] = [depth, learning_rate, n_estimators]
    # Find best accuracy
    best_accuracy = max(models.keys())

    print(f'Gradient Boosting {scaled} with a depth of', models[best_accuracy][0], ', learning rate of', models[best_accuracy][1], ', and n_estimators of', models[best_accuracy][2], 'has an accuracy of ' + str(accuracy) + '.')

    return round((best_accuracy * 100), 2)

In [9]:
# Gradient Boosting Classifier
full = Gradient_Boosting_Classifier(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = Gradient_Boosting_Classifier(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')
if full > scale:
    full_results['Gradient Boosting Classifier'] = full
    scale_results['Gradient Boosting Classifier'] = scale
elif scale > full:
    scale_results['Gradient Boosting Classifier'] = scale
else:
    tied_results['Gradient Boosting Classifier'] = scale

Gradient Boosting Scaled on Entire Dataset with a depth of 7 , learning rate of 0.5 , and n_estimators of 8 has an accuracy of 0.9298245614035088.
Gradient Boosting Scaled on Training Data with a depth of 9 , learning rate of 0.25 , and n_estimators of 17 has an accuracy of 0.9385964912280702.


Gradient Boosting with the best results has inconsistent parameters. During initial testing, results were:
- Training Data with a depth of 7 , learning rate of 0.1 , and n_estimators of 10 has an accuracy of 0.9473684210526315.
- Training Data with a depth of 8 , learning rate of 0.1 , and n_estimators of 5 has an accuracy of 0.9473684210526315.
- Training Data with a depth of 9 , learning rate of 1 , and n_estimators of 19 has an accuracy of 0.9649122807017544.
- Entire Dataset with a depth of 9 , learning rate of 0.5 , and n_estimators of 19 has an accuracy of 0.9385964912280702.
- Entire Dataset with a depth of 2 , learning rate of 0.25 , and n_estimators of 8 has an accuracy of 0.9298245614035088.
- Training Data with a depth of 9 , learning rate of 1 , and n_estimators of 19 has an accuracy of 0.9736842105263158.

# K Nearest Neighbors

In [10]:
def K_N_N(X_train, X_test, y_train, y_test, scaled):

    test_scores = []

    for i in range(1,20):

        knn = KNeighborsClassifier(i)
        knn.fit(X_train,y_train)
        test_scores.append(knn.score(X_test,y_test))
    max_test_score = max(test_scores)
    optimal_k = test_scores.index(max_test_score) + 1
    print(f'KNN {scaled} with a k of', optimal_k, 'has an accuracy of ' + str(max_test_score) + '.')
    return round((max_test_score * 100), 2)

In [11]:
full = K_N_N(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = K_N_N(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')
if full > scale:
    full_results['K Nearest Neighbors'] = full
    scale_results['K Nearest Neighbors'] = scale
elif scale > full:
    scale_results['K Nearest Neighbors'] = scale
else:
    tied_results['K Nearest Neighbors'] = full

KNN Scaled on Entire Dataset with a k of 3 has an accuracy of 0.9385964912280702.
KNN Scaled on Training Data with a k of 5 has an accuracy of 0.9824561403508771.


# K Nearest Neighbors Grid Search
We will not track the given accuracy, but useful to compare best estimated number of neighbors

In [12]:
def KNN_Grid_Search(X_train, X_test, y_train, y_test, scaled):
    knn = KNeighborsClassifier()
    param_grid = {'n_neighbors': np.arange(1, 25)}
    knn_gscv = GridSearchCV(knn, param_grid, cv=5)
    knn_gscv.fit(X_train, y_train)
    print(f'KNN Grid Search {scaled} with a k of', knn_gscv.best_params_['n_neighbors'], 'has an accuracy of ' + str(knn_gscv.best_score_) + '.')
    return round((knn_gscv.best_score_ * 100), 2)

In [13]:
full = KNN_Grid_Search(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = KNN_Grid_Search(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')


KNN Grid Search Scaled on Entire Dataset with a k of 18 has an accuracy of 0.956043956043956.
KNN Grid Search Scaled on Training Data with a k of 21 has an accuracy of 0.945054945054945.


# Logistic Regression

In [14]:
def logistic_regression(X_train, X_test, y_train, y_test, scaled):
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    lr_pred = lr.predict(X_test)
    accuracy = accuracy_score(y_test, lr_pred)
    print(f'Logistic Regression {scaled} has an accuracy of ' + str(accuracy) + '.')
    return round((accuracy * 100), 2)

In [15]:
full = logistic_regression(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = logistic_regression(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')
if full > scale:
    full_results['Logistic Regression'] = full
    scale_results['Logistic Regression'] = scale
elif scale > full:
    scale_results['Logistic Regression'] = scale
else:
    tied_results['Logistic Regression'] = full

Logistic Regression Scaled on Entire Dataset has an accuracy of 0.9122807017543859.
Logistic Regression Scaled on Training Data has an accuracy of 0.9473684210526315.


# Neat Neural Network

In [16]:
X_train_neat = X_train_scale.to_numpy()
y_train_neat = y_train_scale.to_numpy()
X_test_neat = X_test_scale.to_numpy()
y_test_neat = y_test_scale.to_numpy()

In [17]:
import math
def elu(z):
    return z if z > 0.0 else math.exp(z) - 1

def selu(z):
    lam = 1.0507009873554804934193349852946
    alpha = 1.6732632423543772848170429916717
    return lam * z if z > 0.0 else lam * alpha * (math.exp(z) - 1)

In [22]:
class Network:
    def __init__(self, genome, config):
        self.genome = genome
        self.config = config
        self.fitness = None
        self.net = neat.nn.FeedForwardNetwork.create(genome, config)
    def activate(self, X):
        return self.net.activate(X)
    def predict(self, X):
        return np.array([self.activate(x) for x in X])

stag_count = 0

def eval_genomes(genomes, config):
    global stag_count
    stag_count += 1
    if stag_count == 10:
        config.stagnation_config.max_stagnation = 15
    networks = []
    for genome_id, genome in genomes:
        networks.append(Network(genome, config))
    for network in networks:
        network.fitness = 0
    for network in networks:
        predictions = [np.argmax(network.activate(xi)) for xi in X_train_neat]
        network.fitness = accuracy_score(y_train_neat, predictions) * 100
    for genome_id, genome in genomes:
        genome.fitness = next(network.fitness for network in networks if network.genome == genome)

In [None]:
def run_neat(X_test_neat, y_test_neat, scaled):
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction, neat.DefaultSpeciesSet, neat.DefaultStagnation, '../neat_config.txt')
    config.genome_config.add_activation('elu', elu)
    config.genome_config.add_activation('selu', selu)
    p = neat.Population(config)
    p.add_reporter(neat.StdOutReporter(True))
    stats = neat.StatisticsReporter()
    p.add_reporter(stats)
    winner = p.run(eval_genomes, 500)
    winner_net = neat.nn.FeedForwardNetwork.create(winner, config)
    predictions = [np.argmax(winner_net.activate(xi)) for xi in X_test_neat]
    accuracy = accuracy_score(y_test_neat, predictions) * 100
    print(f'NEAT {scaled} has an accuracy of ' + str(accuracy) + '.')
    return winner


scale = run_neat(X_test_neat, y_test_neat, scaled='Scaled on Training Data')

In [None]:
config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction, neat.DefaultSpeciesSet, neat.DefaultStagnation, '../neat_config.txt')
config.genome_config.add_activation('elu', elu)
config.genome_config.add_activation('selu', selu)
winner_net = neat.nn.FeedForwardNetwork.create(scale, config)
predictions = [np.argmax(winner_net.activate(xi)) for xi in X_test_neat]
accuracy = round((accuracy_score(y_test_neat, predictions) * 100),2)
scale_results['NEAT'] = accuracy
print(f'NEAT has an accuracy of ' + str(accuracy_score) + '.')

# Random Forest Classifier

In [28]:
def Random_Forest_Classifier(X_train, X_test, y_train, y_test, scaled):
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(X_train, y_train)
    rf_pred = rf.predict(X_test)
    accuracy = accuracy_score(y_test, rf_pred)
    print(f'Random Forest Classifier {scaled} has an accuracy of ' + str(accuracy) + '.')
    return round((accuracy * 100), 2)

In [29]:
full = Random_Forest_Classifier(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = Random_Forest_Classifier(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')
if full > scale:
    full_results['Random Forest Classifier'] = full
    scale_results['Random Forest Classifier'] = scale
elif scale > full:
    scale_results['Random Forest Classifier'] = scale
else:
    tied_results['Random Forest Classifier'] = full

Random Forest Classifier Scaled on Entire Dataset has an accuracy of 0.9210526315789473.
Random Forest Classifier Scaled on Training Data has an accuracy of 0.9473684210526315.


# Random Forest Grid Search
Similar to KNN Grid Search, we will not track the given accuracy.

In [30]:
def Random_Forest_Grid_Search(X_train, X_test, y_train, y_test, scaled):
    rf = RandomForestClassifier()
    param_grid = {'n_estimators': np.arange(1, 25)}
    rf_gscv = GridSearchCV(rf, param_grid, cv=5)
    rf_gscv.fit(X_train, y_train)
    print(f'Random Forest Grid Search {scaled} with a n_estimator of', rf_gscv.best_params_['n_estimators'], 'has an accuracy of ' + str(rf_gscv.best_score_) + '.')
    return round((rf_gscv.best_score_ * 100), 2)

In [31]:
full = Random_Forest_Grid_Search(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = Random_Forest_Grid_Search(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')

Random Forest Grid Search Scaled on Entire Dataset with a n_estimator of 22 has an accuracy of 0.945054945054945.
Random Forest Grid Search Scaled on Training Data with a n_estimator of 12 has an accuracy of 0.9494505494505496.


# Random Forest Regressor

In [32]:
def Random_Forest_Regressor(X_train, X_test, y_train, y_test, scaled):
    rf = RandomForestRegressor(n_estimators=100)
    rf.fit(X_train, y_train)
    rf_pred = rf.predict(X_test)
    rf_pred = np.where(rf_pred > 0.5, 1, 0)
    accuracy = accuracy_score(y_test, rf_pred)
    print(f'Random Forest Regressor {scaled} has an accuracy of ' + str(accuracy) + '.')
    return round((accuracy * 100), 2)

In [33]:
full = Random_Forest_Regressor(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = Random_Forest_Regressor(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')
if full > scale:
    full_results['Random Forest Regressor'] = full
    scale_results['Random Forest Regressor'] = scale
elif scale > full:
    scale_results['Random Forest Regressor'] = scale
else:
    tied_results['Random Forest Regressor'] = full

Random Forest Regressor Scaled on Entire Dataset has an accuracy of 0.9298245614035088.
Random Forest Regressor Scaled on Training Data has an accuracy of 0.9385964912280702.


# Support Vector Machine

In [34]:
def svm(X_train, X_test, y_train, y_test, scaled):
    svm = SVC()
    svm.fit(X_train, y_train)
    svm_pred = svm.predict(X_test)
    accuracy = accuracy_score(y_test, svm_pred)
    print(f'Support Vector Machine {scaled} has an accuracy of ' + str(accuracy) + '.')
    return round((accuracy * 100), 2)

In [35]:
full = svm(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = svm(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')
if full > scale:
    full_results['Support Vector Machine'] = full
    scale_results['Support Vector Machine'] = scale
elif scale > full:
    scale_results['Support Vector Machine'] = scale
else:
    tied_results['Support Vector Machine'] = full

Support Vector Machine Scaled on Entire Dataset has an accuracy of 0.9210526315789473.
Support Vector Machine Scaled on Training Data has an accuracy of 0.9649122807017544.


# Results

In [39]:
# Print the results of full_results
print('Full Results:')
for key, value in full_results.items():
    print(key, 'has an accuracy of', value)

# Print the results of scale_results
print('\nScaled Results:')
for key, value in scale_results.items():
    print(key, 'has an accuracy of', value)

# Print the results of tied_results
print('\nTied Results:')
for key, value in tied_results.items():
    print(key, 'has an accuracy of', value)

Full Results:
Decision Tree Classifier has an accuracy of 94.74
Decision Tree Regressor has an accuracy of 94.74

Scaled Results:
Decision Tree Classifier has an accuracy of 93.86
Decision Tree Regressor has an accuracy of 92.11
Gradient Boosting Classifier has an accuracy of 97.37
K Nearest Neighbors has an accuracy of 98.25
Logistic Regression has an accuracy of 94.74
NEAT has an accuracy of 96.49
Random Forest Classifier has an accuracy of 94.74
Random Forest Regressor has an accuracy of 93.86
Support Vector Machine has an accuracy of 96.49

Tied Results:


The columns I've chosen to use are:
- Radius Mean
- Texture Mean
- Smoothness Mean
- Compactness Mean
- Concavity Mean
- Concave Points Mean
- Symmetry Mean
- Fractal Dimension Mean

Which on average, gives:
- K Nearest Neighbors with 98.25%
- Gradient Boosting Classifier with 97.37%
- Support Vector Machine with 96.49%
- NEAT Neural Network with 96.49%

When utilizing similar columns as Kaggle notebook (https://www.kaggle.com/code/priyanka841/breast-cancer-diagnostics-prediction)
- Radius Mean
- Texture Mean
- Smoothness Mean
- Compactness Mean
- Symmetry Mean
- Fractal Dimension Mean
- Radius Standard Error
- Texture Standard Error
- Smoothness Standard Error
- Compactness Standard Error
- Symmetry Standard Error
- Fractal Dimension Standard Error

Full Results:
- (None)

Scaled Results:
- Decision Tree Classifier has an accuracy of 95.61
- Decision Tree Regressor has an accuracy of 96.49
- Gradient Boosting Classifier has an accuracy of 99.12
- K Nearest Neighbors has an accuracy of 94.73
- Logistic Regression has an accuracy of 94.73
- Random Forest Classifier has an accuracy of 96.49
- Random Forest Regressor has an accuracy of 94.74
- Support Vector Machine has an accuracy of 93.86

Tied Results:
- (None)

# Conclusions

On average, the models trained on data that was scaled on training data, not the full dataset, score better. Leakage does seem to hinder the test accuracy.

My criteria for success was to find a model with higher accuracy than the example notebook by 'priyanka841'. Having originally chosen less columns (just the 'Mean' columns, not including the 'Standard Error'), I matched the SVM results at 96.49%. Beyond that, K Nearest Columns scored 98.25% and Gradient Boosting Classifier scored 97.37%.

When adjusting the columns in the dataset to match 'priyanka841' (addition of Standard Error, removing concavity and concave points), Gradiant Boosting Classifier reached an impressive 99.12%.

Additionally, the NEAT neural network was tested and was tied in 3rd place with 96.49%. The caveat, was that while the other models trained in less than 30 seconds, NEAT was allowed to train for ~80 minutes. While it's theoretically possible that the accuracy could improve, time invested must be considered.