In [426]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import neat as neat
from pureples.shared.visualize import draw_net
from pureples.shared.substrate import Substrate
from pureples.es_hyperneat.es_hyperneat import ESNetwork
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

In [427]:
# To test effects of leakage, there are two sets of data. One scaled on the entire dataset, and one scaled on the training set only.

# Load datasets scaled on entire dataset
X_train_full = pd.read_pickle('../data/train_test_sets/X_train_full.pkl')
X_test_full = pd.read_pickle('../data/train_test_sets/X_test_full.pkl')
y_train_full = pd.read_pickle('../data/train_test_sets/y_train_full.pkl')
y_test_full = pd.read_pickle('../data/train_test_sets/y_test_full.pkl')

# Load datasets scaled on training set only
X_train_scale = pd.read_pickle('../data/train_test_sets/X_train_scale.pkl')
X_test_scale = pd.read_pickle('../data/train_test_sets/X_test_scale.pkl')
y_train_scale = pd.read_pickle('../data/train_test_sets/y_train_scale.pkl')
y_test_scale = pd.read_pickle('../data/train_test_sets/y_test_scale.pkl')

In [428]:
# Track results
full_results = {}
scale_results = {}
tied_results = {}

# Models to test will be:
- Decision Tree Classifier
- Decision Tree Regressor
- Gradient Boosting Regression
- K Nearest Neighbors
- K Nearest Neighbors Grid Search
- Logistic Regression
- Neat Neural Network
- Random Forest Classifier
- Random Forest Grid Search
- Random Forest Regressor

# Decision Tree Classifier

In [429]:
# Decision Tree Classifier
def decision_tree_classifier(X_train, X_test, y_train, y_test, depth, criterion, d_type):
    dtc = DecisionTreeClassifier(max_depth=depth, criterion=criterion)
    dtc.fit(X_train, y_train)
    dtc_pred = dtc.predict(X_test)
    accuracy = accuracy_score(y_test, dtc_pred)
    f1 = f1_score(y_test, dtc_pred)
    return dtc_pred, accuracy, d_type, f1

def decision_tree(X_train, X_test, y_train, y_test, scaled):
    depths = [None, 3, 5, 10]
    criterions = ['entropy', 'gini']
    models = []
    f1_scores = []
    for depth in depths:
        for criterion in criterions:
            d_type = 'depth of ' + str(depth) + ' and the ' + criterion + ' criterion'
            dtc_model = decision_tree_classifier(X_train, X_test, y_train, y_test, depth=depth, criterion=criterion, d_type=d_type)
            models.append(dtc_model)
            f1_scores.append(dtc_model[3])
    # Find the best F1 score
    best_f1 = f1_scores.index(max(f1_scores))
    # Find best model
    best_model = models[best_f1]
    acc = round((best_model[1] * 100), 3)
    f1 = round(best_model[3], 3)
    print('Decision Tree Classifier ' + scaled + ' with a', best_model[2], 'has an f1-score of', f1, 'and had the best accuracy of ' + str(acc) + '.')
    print("Classification Report: \n", classification_report(y_test, best_model[0]))
    print("Confusion Matrix: \n", confusion_matrix(y_test, best_model[0]), '\n')
    return {'Accuracy': acc, 'F1-Score': f1}

In [430]:
# Decision Tree Classifier
full = decision_tree(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = decision_tree(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')

full_results['Decision Tree Classifier'] = full
scale_results['Decision Tree Classifier'] = scale

Decision Tree Classifier Scaled on Entire Dataset with a depth of None and the entropy criterion has an f1-score of 0.936 and had the best accuracy of 94.737.
Classification Report: 
               precision    recall  f1-score   support

         0.0       0.94      0.97      0.96        66
         1.0       0.96      0.92      0.94        48

    accuracy                           0.95       114
   macro avg       0.95      0.94      0.95       114
weighted avg       0.95      0.95      0.95       114

Confusion Matrix: 
 [[64  2]
 [ 4 44]] 

Decision Tree Classifier Scaled on Training Data with a depth of 3 and the entropy criterion has an f1-score of 0.933 and had the best accuracy of 95.614.
Classification Report: 
               precision    recall  f1-score   support

           0       0.99      0.95      0.97        78
           1       0.90      0.97      0.93        36

    accuracy                           0.96       114
   macro avg       0.94      0.96      0.95       

# Decision Tree Regressor

In [431]:
# Decision Tree Regressor
def dt_regressor(X, y, depth):
    dtr = DecisionTreeRegressor(max_depth=depth)
    dtr.fit(X, y)
    return dtr
def decision_tree_regressor(X_train, X_test, y_train, y_test, scaled):
    n_regressors = [3, 5, 7]
    types = []
    accuracies = []
    f1_scores = []
    for depth in range(1, 10):
        for n in n_regressors:
            trees = []
            type = 'depth of ' + str(depth) + ' and ' + str(n) + ' regressors'
            X = X_train
            y = y_train
            dtr_model = dt_regressor(X, y, depth=depth)
            trees.append(dtr_model)
            y_pred = dtr_model.predict(X)
            for i in range(n-1):
                dtr_model = dt_regressor(X, y=y_pred, depth=depth)
                trees.append(dtr_model)
            dtr_pred = sum([tree.predict(X_test) for tree in trees])
            # scale dtr_pred to 0-1 based value is above or below 0.5
            dtr_pred = np.where(dtr_pred > 0.5, 1, 0)
            accuracy = accuracy_score(y_test, dtr_pred)
            types.append(type)
            accuracies.append(accuracy)
            f1 = f1_score(y_test, dtr_pred)
            f1_scores.append(f1)
    # Find the best F1 score
    best_f1 = f1_scores.index(max(f1_scores))
    f1 = round(f1_scores[best_f1], 3)
    # Find accuracy
    best_accuracy = accuracies[best_f1]
    accuracy = round((best_accuracy * 100), 3)
    # Find the best type
    best_type = types[best_f1]
    print(f'Decision Tree Regressor {scaled} with a', best_type, '\n',
          'has an f1-score of', f1, 'and had the best accuracy of ' + str(accuracy) + '.', '\n')
    return {'Accuracy': accuracy, 'F1-Score': f1}

In [432]:
# Decision Tree Regressor
full = decision_tree_regressor(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = decision_tree_regressor(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')

full_results['Decision Tree Regressor'] = full
scale_results['Decision Tree Regressor'] = scale

Decision Tree Regressor Scaled on Entire Dataset with a depth of 3 and 3 regressors 
 has an f1-score of 0.928 and had the best accuracy of 93.86. 

Decision Tree Regressor Scaled on Training Data with a depth of 3 and 3 regressors 
 has an f1-score of 0.897 and had the best accuracy of 92.982. 



# Gradient Boosting Classifier

In [433]:
# Gradient Boosting Classifier
def Gradient_Boosting_Classifier(X_train, X_test, y_train, y_test, scaled):
    learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
    models = {}
    for depth in range(1, 10):
        for learning_rate in learning_rates:
            gb = GradientBoostingClassifier(n_estimators=1, learning_rate = learning_rate, max_features=2, max_depth = depth, warm_start=True)
            for n_estimators in range(1, 20):
                gb.n_estimators = n_estimators
                gb.fit(X_train, y_train)
                gb_pred = gb.predict(X_test)
                accuracy = gb.score(X_test, y_test)
                accuracy = round((accuracy * 100), 3)
                f1 = f1_score(y_test, gb_pred)
                f1 = round(f1, 3)
                # Store f1_score, accuracy, depth, learning_rate, n_estimators
                models[f1] = [depth, learning_rate, n_estimators, accuracy]

    # Find best accuracy
    best_f1 = max(models.keys())

    print(f'Gradient Boosting {scaled} with a depth of', models[best_f1][0], ', learning rate of', 
          models[best_f1][1], ', and n_estimators of', models[best_f1][2], '\n',
          'had an f1-score of', best_f1, 'and had the best accuracy of ' + str(models[best_f1][3]) + '.', '\n')
    return {'Accuracy': models[best_f1][3], 'F1-Score': best_f1}

In [501]:
# Gradient Boosting Classifier
full = Gradient_Boosting_Classifier(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = Gradient_Boosting_Classifier(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')

full_results['Gradient Boosting Classifier'] = full
scale_results['Gradient Boosting Classifier'] = scale

Gradient Boosting Scaled on Entire Dataset with a depth of 9 , learning rate of 0.75 , and n_estimators of 6 
 had an f1-score of 0.947 and had the best accuracy of 95.614. 

Gradient Boosting Scaled on Training Data with a depth of 9 , learning rate of 0.1 , and n_estimators of 13 
 had an f1-score of 0.972 and had the best accuracy of 98.246. 



# K Nearest Neighbors

In [447]:
def K_N_N(X_train, X_test, y_train, y_test, scaled):

    acc_scores = []
    f1_scores = []

    for i in range(1,20):

        knn = KNeighborsClassifier(i)
        knn.fit(X_train,y_train)
        acc_scores.append(knn.score(X_test,y_test))
        knn_pred = knn.predict(X_test)
        f1 = f1_score(y_test, knn_pred)
        f1_scores.append(f1)
    max_f1_score = max(f1_scores)
    optimal_k = f1_scores.index(max_f1_score) + 1
    f1 = round(max_f1_score, 3)
    acc = round((acc_scores[optimal_k] * 100), 3)

    print(f'KNN {scaled} with a k of', optimal_k, 'has an f1-score of',
           f1, 'and had the best accuracy of ' + str(acc) + '.', '\n')
    return {'Accuracy': acc, 'F1-Score': f1}

In [448]:
full = K_N_N(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = K_N_N(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')

full_results['K Nearest Neighbors'] = full
scale_results['K Nearest Neighbors'] = scale

KNN Scaled on Entire Dataset with a k of 3 has an f1-score of 0.926 and had the best accuracy of 92.105. 

KNN Scaled on Training Data with a k of 2 has an f1-score of 0.944 and had the best accuracy of 92.982. 



# K Nearest Neighbors Grid Search


In [449]:
def KNN_Grid_Search(X_train, X_test, y_train, y_test, scaled):
    knn = KNeighborsClassifier()
    param_grid = {'n_neighbors': np.arange(1, 25)}
    knn_gscv = GridSearchCV(knn, param_grid, cv=5)
    knn_gscv.fit(X_train, y_train)
    f1 = f1_score(y_test, knn_gscv.predict(X_test))
    f1 = round(f1, 3)
    acc = round((knn_gscv.best_score_ * 100), 3)

    print(f'KNN Grid Search {scaled} with a k of', knn_gscv.best_params_['n_neighbors'], 'has an f1-score of',
              f1, 'and had the best accuracy of ' + str(acc) + '.', '\n')
    return {'Accuracy': acc, 'F1-Score': f1}

In [450]:
full = KNN_Grid_Search(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = KNN_Grid_Search(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')

full_results['K Nearest Neighbors Grid Search'] = full
scale_results['K Nearest Neighbors Grid Search'] = scale

KNN Grid Search Scaled on Entire Dataset with a k of 10 has an f1-score of 0.911 and had the best accuracy of 94.945. 

KNN Grid Search Scaled on Training Data with a k of 12 has an f1-score of 0.944 and had the best accuracy of 95.165. 



# Logistic Regression

In [451]:
def logistic_regression(X_train, X_test, y_train, y_test, scaled):
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    lr_pred = lr.predict(X_test)
    accuracy = accuracy_score(y_test, lr_pred)
    accuracy = round((accuracy * 100), 3)
    f1 = f1_score(y_test, lr_pred)
    f1 = round(f1, 3)
    print(f'Logistic Regression {scaled} has an f1-score of', f1,
           'and had the best accuracy of ' + str(accuracy) + '.', '\n')
    return {'Accuracy': accuracy, 'F1-Score': f1}

In [452]:
full = logistic_regression(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = logistic_regression(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')

full_results['Logistic Regression'] = full
scale_results['Logistic Regression'] = scale

Logistic Regression Scaled on Entire Dataset has an f1-score of 0.86 and had the best accuracy of 89.474. 

Logistic Regression Scaled on Training Data has an f1-score of 0.933 and had the best accuracy of 95.614. 



# Neat Neural Network

In [453]:
X_train_neat = X_train_scale.to_numpy()
y_train_neat = y_train_scale.to_numpy()
X_test_neat = X_test_scale.to_numpy()
y_test_neat = y_test_scale.to_numpy()

In [454]:
import math
def elu(z):
    return z if z > 0.0 else math.exp(z) - 1

def selu(z):
    lam = 1.0507009873554804934193349852946
    alpha = 1.6732632423543772848170429916717
    return lam * z if z > 0.0 else lam * alpha * (math.exp(z) - 1)

In [455]:
class Network:
    def __init__(self, genome, config):
        self.genome = genome
        self.config = config
        self.fitness = None
        self.net = neat.nn.FeedForwardNetwork.create(genome, config)
    def activate(self, X):
        return self.net.activate(X)
    def predict(self, X):
        return np.array([self.activate(x) for x in X])

def eval_genomes(genomes, config):
    networks = []
    for genome_id, genome in genomes:
        networks.append(Network(genome, config))
    for network in networks:
        network.fitness = 0
    for network in networks:
        predictions = [np.argmax(network.activate(xi)) for xi in X_train_neat]
        network.fitness = (f1_score(y_train_neat, predictions)*100)
    for genome_id, genome in genomes:
        genome.fitness = next(network.fitness for network in networks if network.genome == genome)

In [456]:
def run_neat(X_test_neat, y_test_neat, scaled):
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction, neat.DefaultSpeciesSet, neat.DefaultStagnation, '../neat_config.txt')
    config.genome_config.add_activation('elu', elu)
    config.genome_config.add_activation('selu', selu)
    p = neat.Population(config)
    p.add_reporter(neat.StdOutReporter(True))
    stats = neat.StatisticsReporter()
    p.add_reporter(stats)
    winner = p.run(eval_genomes, 500)
    winner_net = neat.nn.FeedForwardNetwork.create(winner, config)
    predictions = [np.argmax(winner_net.activate(xi)) for xi in X_test_neat]
    accuracy = accuracy_score(y_test_neat, predictions)
    accuracy = round((accuracy * 100), 3)
    f1 = f1_score(y_test_neat, predictions)
    f1 = round(f1, 3)

    print(f'NEAT {scaled} has an f1 score of', f1, 'and has an accuracy of ' + str(accuracy) + '.')
    return winner

scale = run_neat(X_test_neat, y_test_neat, scaled='Scaled on Training Data')


 ****** Running generation 0 ****** 



Population's average fitness: 0.00000 stdev: 0.00000
Best fitness: 0.00000 - size: (8, 0) - species 1 - id 1
Average adjusted fitness: 0.000
Mean genetic distance 2.825, standard deviation 0.472
Population of 500 members in 60 species:
   ID   age  size  fitness  adj fit  stag
     1    0   319      0.0    0.000     0
     2    0     4       --       --     0
     3    0     3       --       --     0
     4    0     5       --       --     0
     5    0     6       --       --     0
     6    0    11       --       --     0
     7    0     6       --       --     0
     8    0     3       --       --     0
     9    0     4       --       --     0
    10    0     4       --       --     0
    11    0     1       --       --     0
    12    0     1       --       --     0
    13    0     5       --       --     0
    14    0     2       --       --     0
    15    0     2       --       --     0
    16    0     2       --       --     0
    17    0     5       --       --     0
    18  

In [457]:
config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction, neat.DefaultSpeciesSet, neat.DefaultStagnation, '../neat_config.txt')
config.genome_config.add_activation('elu', elu)
config.genome_config.add_activation('selu', selu)
winner_net = neat.nn.FeedForwardNetwork.create(scale, config)
predictions = [np.argmax(winner_net.activate(xi)) for xi in X_test_neat]
accuracy = round((accuracy_score(y_test_neat, predictions) * 100),3)
f1 = round(f1_score(y_test_neat, predictions), 3)
print(f'NEAT Scaled on Training Data has an f1 score of', f1, 'and has an accuracy of ' + str(accuracy) + '.')
scale_results['NEAT'] = {'Accuracy': accuracy, 'F1-Score': f1}

NEAT Scaled on Training Data has an f1 score of 0.933 and has an accuracy of 95.614.


# Random Forest Classifier

In [458]:
def Random_Forest_Classifier(X_train, X_test, y_train, y_test, scaled):
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(X_train, y_train)
    rf_pred = rf.predict(X_test)
    accuracy = accuracy_score(y_test, rf_pred)
    accuracy = round((accuracy * 100), 3)
    f1 = f1_score(y_test, rf_pred)
    f1 = round(f1, 3)
    print(f'Random Forest Classifier {scaled} has an f1-score of', f1,
           'and had the best accuracy of ' + str(accuracy) + '.', '\n')
    return {'Accuracy': accuracy, 'F1-Score': f1}

In [459]:
full = Random_Forest_Classifier(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = Random_Forest_Classifier(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')

full_results['Random Forest Classifier'] = full
scale_results['Random Forest Classifier'] = scale

Random Forest Classifier Scaled on Entire Dataset has an f1-score of 0.913 and had the best accuracy of 92.982. 

Random Forest Classifier Scaled on Training Data has an f1-score of 0.904 and had the best accuracy of 93.86. 



# Random Forest Grid Search


In [460]:
def Random_Forest_Grid_Search(X_train, X_test, y_train, y_test, scaled):
    rf = RandomForestClassifier()
    param_grid = {'n_estimators': np.arange(1, 25)}
    rf_gscv = GridSearchCV(rf, param_grid, cv=5)
    rf_gscv.fit(X_train, y_train)
    f1 = f1_score(y_test, rf_gscv.predict(X_test))
    f1 = round(f1, 3)
    acc = round((rf_gscv.best_score_ * 100), 3)

    print(f'Random Forest Grid Search {scaled} with a n_estimator of', rf_gscv.best_params_['n_estimators'], 
          '\nhas an f1-score of', f1, 'and had the best accuracy of ' + str(acc) + '.', '\n')
    return {'Accuracy': acc, 'F1-Score': f1}

In [461]:
full = Random_Forest_Grid_Search(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = Random_Forest_Grid_Search(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')

full_results['Random Forest Grid Search'] = full
scale_results['Random Forest Grid Search'] = scale

Random Forest Grid Search Scaled on Entire Dataset with a n_estimator of 14 
has an f1-score of 0.899 and had the best accuracy of 94.286. 

Random Forest Grid Search Scaled on Training Data with a n_estimator of 11 
has an f1-score of 0.897 and had the best accuracy of 94.505. 



# Random Forest Regressor

In [462]:
def Random_Forest_Regressor(X_train, X_test, y_train, y_test, scaled):
    rf = RandomForestRegressor(n_estimators=100)
    rf.fit(X_train, y_train)
    rf_pred = rf.predict(X_test)
    rf_pred = np.where(rf_pred > 0.5, 1, 0)
    accuracy = accuracy_score(y_test, rf_pred)
    accuracy = round((accuracy * 100), 3)
    f1 = f1_score(y_test, rf_pred)
    f1 = round(f1, 3)
    print(f'Random Forest Regressor {scaled} has an f1-score of', f1,
           'and had the best accuracy of ' + str(accuracy) + '.', '\n')
    return {'Accuracy': accuracy, 'F1-Score': f1}

In [463]:
full = Random_Forest_Regressor(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = Random_Forest_Regressor(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')

full_results['Random Forest Regressor'] = full
scale_results['Random Forest Regressor'] = scale

Random Forest Regressor Scaled on Entire Dataset has an f1-score of 0.926 and had the best accuracy of 93.86. 

Random Forest Regressor Scaled on Training Data has an f1-score of 0.946 and had the best accuracy of 96.491. 



# Support Vector Machine

In [464]:
def svm(X_train, X_test, y_train, y_test, scaled):
    svm = SVC()
    svm.fit(X_train, y_train)
    svm_pred = svm.predict(X_test)
    accuracy = accuracy_score(y_test, svm_pred)
    accuracy = round((accuracy * 100), 3)
    f1 = f1_score(y_test, svm_pred)
    f1 = round(f1, 3)

    print(f'Support Vector Machine {scaled} has an f1-score of', f1,
              'and had the best accuracy of ' + str(accuracy) + '.', '\n')
    return {'Accuracy': accuracy, 'F1-Score': f1}

In [465]:
full = svm(X_train_full, X_test_full, y_train_full, y_test_full, scaled='Scaled on Entire Dataset')
scale = svm(X_train_scale, X_test_scale, y_train_scale, y_test_scale, scaled='Scaled on Training Data')

full_results['Support Vector Machine'] = full
scale_results['Support Vector Machine'] = scale

Support Vector Machine Scaled on Entire Dataset has an f1-score of 0.899 and had the best accuracy of 92.105. 

Support Vector Machine Scaled on Training Data has an f1-score of 0.944 and had the best accuracy of 96.491. 



# Results

In [522]:
sorted_full_results = {k: v for k, v in sorted(full_results.items(), key=lambda item: item[1]['Accuracy'], reverse=True)}
full_results = pd.DataFrame.from_dict(sorted_full_results, orient='index')
full_results.index.names = ['Model']
print(full_results)

                                 Accuracy  F1-Score
Model                                              
Gradient Boosting Classifier       96.491     0.958
K Nearest Neighbors Grid Search    94.945     0.911
Decision Tree Classifier           94.737     0.936
Random Forest Grid Search          94.286     0.899
Decision Tree Regressor            93.860     0.928
Random Forest Regressor            93.860     0.926
Random Forest Classifier           92.982     0.913
K Nearest Neighbors                92.105     0.926
Support Vector Machine             92.105     0.899
Logistic Regression                89.474     0.860


In [523]:
sorted_scale_results = {k: v for k, v in sorted(scale_results.items(), key=lambda item: item[1]['Accuracy'], reverse=True)}
scale_results = pd.DataFrame.from_dict(sorted_scale_results, orient='index')
scale_results.index.names = ['Model']
print(scale_results)

                                 Accuracy  F1-Score
Model                                              
Gradient Boosting Classifier       98.246     0.972
Random Forest Regressor            96.491     0.946
Support Vector Machine             96.491     0.944
Decision Tree Classifier           95.614     0.933
Logistic Regression                95.614     0.933
NEAT                               95.614     0.933
K Nearest Neighbors Grid Search    95.165     0.944
Random Forest Grid Search          94.505     0.897
Random Forest Classifier           93.860     0.904
Decision Tree Regressor            92.982     0.897
K Nearest Neighbors                92.982     0.944


Winning model is Gradient Boosting Classifier

In [513]:
gb = GradientBoostingClassifier(n_estimators=13, learning_rate=1, max_features=2, max_depth=9, warm_start=True)
gb.fit(X_train_scale, y_train_scale)
gb_pred = gb.predict(X_test_scale)

print("Classification Report: \n", classification_report(y_test_scale, gb_pred))
print("Confusion Matrix: \n", confusion_matrix(y_test_scale, gb_pred), '\n')

Classification Report: 
               precision    recall  f1-score   support

           0       0.99      0.95      0.97        78
           1       0.90      0.97      0.93        36

    accuracy                           0.96       114
   macro avg       0.94      0.96      0.95       114
weighted avg       0.96      0.96      0.96       114

Confusion Matrix: 
 [[74  4]
 [ 1 35]] 



The columns I've chosen to use are:
- Radius Mean
- Texture Mean
- Smoothness Mean
- Compactness Mean
- Concavity Mean
- Concave Points Mean
- Symmetry Mean
- Fractal Dimension Mean

Which on average, gives:
- Gradient Boosting Classifier with 98.25%

And the following three models tied at 96.49%:
- Random Forest Regressor
- Support Vector Machine

When utilizing similar columns as Kaggle notebook (https://www.kaggle.com/code/priyanka841/breast-cancer-diagnostics-prediction)
- Radius Mean
- Texture Mean
- Smoothness Mean
- Compactness Mean
- Symmetry Mean
- Fractal Dimension Mean
- Radius Standard Error
- Texture Standard Error
- Smoothness Standard Error
- Compactness Standard Error
- Symmetry Standard Error
- Fractal Dimension Standard Error

Scaled Results:

![](../images/secondary_results.PNG)

# Conclusions

On average, the models trained on data that was scaled on training data, not the full dataset, score better. Leakage does seem to hinder the test accuracy.

My criteria for success was to find a model with higher accuracy than the example notebook by 'priyanka841'. Having originally chosen less columns (just the 'Mean' columns, not including the 'Standard Error'), I matched their highest accuracy for SVM results at 96.49% with 2 separate models. Beyond that, Gradient Boosting Classifier reached 98.25%.

When adjusting the columns in the dataset to match 'priyanka841' (addition of Standard Error, removing concavity and concave points), Gradiant Boosting Classifier only tied their SVM results of 96.49%.

Additionally, the NEAT neural network was tested and reached an accuracy of 93.86%. The caveat was that while the other models trained in less than 30 seconds, NEAT was allowed to train for ~16 minutes. While it's theoretically possible that the accuracy could improve, time invested must be considered.