In [None]:
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import seaborn as sns
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn import datasets
from sklearn import tree
from sklearn.decomposition import PCA
import keras
from keras.models import Sequential
from tensorflow.keras import layers
from keras.layers import Dense
from keras.layers import Dropout
import keras.backend as K
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import visualkeras
from sklearn.tree import export_graphviz
import graphviz

In [None]:
# The CSV has no header row, so the column names are supplied here:
# A1..A10 first, then the remaining fields, with Class_ASD (the target) last.
answerCols = [f"A{i}" for i in range(1, 11)]
otherCols = ["age", "gender", "ethnicity", "jaundice", "autism", "country",
             "used_app_before", "result", "age_desc", "relation", "Class_ASD"]

asdData = pd.read_csv('dataset.csv', header = None)
asdData.columns = answerCols + otherCols

In [None]:
# Preview the first five rows to sanity-check the column names assigned above.
asdData.head(5)

In [None]:
# Column dtypes and non-null counts of the raw frame, before any cleaning.
asdData.info()

In [None]:
# Store the A1..A10 columns with the generic object dtype (one loop instead of
# ten identical assignments).
for answerCol in [f"A{i}" for i in range(1, 11)]:
    asdData[answerCol] = asdData[answerCol].astype(object)

In [None]:
# Print the distinct values of every column, one array per column.
for colName, colValues in asdData.items():
    print(colValues.unique())

In [None]:
# Pie chart of the share of each Class_ASD value in the frame as loaded
# (this cell runs before the cleaning section).
asdPie = asdData['Class_ASD'].squeeze()
# NOTE(review): the labels are matched to the slices by position, and
# value_counts() orders slices by frequency - this presumes the
# "not diagnosed" class is the more frequent one; confirm against the data.
labels = ['Not Diagnosed', 'Diagnosed']
# Pull both slices slightly away from the centre.
explode = (0.05, 0.05)
asdPie.value_counts(normalize=True).plot.pie(labels = labels, fontsize = 12,
                                             ylabel = "",
                                             autopct='%1.1f%%', pctdistance = 1.2, labeldistance = 1.4, 
                                             colors =['darkseagreen', 'deepskyblue'],
                                             explode = explode,
                                             legend = True)
plt.title("Pie Chart to Show the Proportion of Adults Diagnosed Vs. Those Not Diagnosed", pad = 20)
# Save before show(): the figure is written to asdPie.png in the working directory.
plt.savefig('asdPie.png')
plt.show()

## Data Cleaning

In [None]:
#Missing Values
# Rows whose ethnicity is the '?' placeholder are removed from the frame.
unknownEthnicity = asdData['ethnicity'] == '?'
asdData = asdData.drop(index = asdData[unknownEthnicity].index)
asdData.info()

In [None]:
# Store age as 64-bit integers.
asdData['age'] = asdData['age'].astype('int64')

In [None]:
#Outliers
#Fixing age: the value 383 is replaced by 38. The replacement is limited to the
#age column so that a genuine 383 in any other column can never be rewritten
#by accident (the previous frame-wide replace() touched every column).
asdData['age'] = asdData['age'].replace(383, 38)
# Summary statistics of the numeric columns, one row per column.
summary = asdData.describe().transpose()
summary

In [None]:
#Duplicates
# 'others' and 'Others' are two spellings of one category; unify them.
# Note that replace() is applied to every column of the frame, not only ethnicity.
asdData = asdData.replace('others', 'Others')
# Print the remaining ethnicity values to confirm the duplicate is gone.
print(asdData['ethnicity'].unique())

## Data Visualisation

In [None]:
#age and gender based on the class
# Box plot of age per gender, with one box per Class_ASD value.
sns.set_style("darkgrid")
sns.set_palette("Set2")
sns.boxplot(data = asdData, x = 'gender', y = 'age', hue = 'Class_ASD')
plt.title("BoxPlot To Show the Age Distribution of Men and Women and Their Class", pad=20)
plt.show()

In [None]:
# Box plot of the result column per gender, with one box per Class_ASD value.
sns.set_palette("Set2")
sns.boxplot(data = asdData, x = 'gender', y = 'result', hue = 'Class_ASD')
# One tick for every integer from 0 to 10.
plt.yticks(list(range(11)))
plt.show()

In [None]:
# Row counts per Class_ASD value, split by the jaundice column.
sns.countplot(data = asdData, x = 'Class_ASD', hue = 'jaundice')
plt.show()

In [None]:
# Row counts per Class_ASD value, split by the autism column.
sns.countplot(data = asdData, x = 'Class_ASD', hue = 'autism')
plt.show()

In [None]:
# 2x2 grid combining the four exploratory plots above into one saved figure.
fig, axs = plt.subplots(2,2, figsize = (10,10))
fig.tight_layout(pad=5.0)
sns.set_style("darkgrid")
sns.set_palette("Set2")
# Top-left: age per gender and class.
sns.boxplot(x = asdData['gender'],
            y = asdData['age'],
            hue = asdData['Class_ASD'], ax = axs[0,0])
axs[0,0].set_title("Age Distribution of men and women \n and Being Classed With ASD")

# Top-right: result per gender and class.
sns.boxplot(x = asdData['gender'],
            y = asdData['result'],
           hue = asdData['Class_ASD'], ax = axs[0,1])
axs[0,1].set_yticks([0, 1,  2, 3,  4, 5,  6, 7, 8, 9,  10])
axs[0,1].set_title("Result Distribution of men and women \n and Being Classed With ASD")

# The palette is switched before each count plot, so the statement order
# in this cell determines which colours each panel gets.
sns.set_palette("coolwarm_r")

# Bottom-left: class counts split by the jaundice column.
sns.countplot(data = asdData, x = 'Class_ASD', hue = 'jaundice', ax=axs[1,0])
axs[1,0].set_title("Relationship Between Jaundice Diagnosis At Birth \n and Being Classed With ASD")
axs[1,0].legend(title = 'Jaundice',title_fontsize = 12, fontsize=12)

sns.set_palette("BrBG_r")

# Bottom-right: class counts split by the autism column.
sns.countplot(data = asdData, x = 'Class_ASD', hue = 'autism', ax = axs[1,1])
axs[1,1].set_title("Relationship Between Having a Relative With Autism \n and Being Classed With ASD")
axs[1,1].legend(loc = 'upper right', title = 'ASD in Family', title_fontsize = 12,  fontsize=12)
fig.suptitle("Relationships Between Categorical Feature Variables and The Target Class (Classed With ASD)", fontsize = 16, fontweight = 'bold', y = 1.00)
# Written to the working directory; the file name is kept as-is because it is
# an output path other material may refer to.
plt.savefig('visulisaitonGrid.png')
plt.show()

## Test/ Train Split

In [None]:
#Variable reduction
# Remove four columns that are not used as model inputs.
asdData = asdData.drop(['country', 'used_app_before', 'result', 'age_desc'], axis = 1)

In [None]:
# One-hot encode every categorical column; each dummy column is prefixed with
# the name of the column it came from.
categoricalCols = [f"A{i}" for i in range(1, 11)] + ["gender", "ethnicity", "jaundice", "autism", "relation"]
asdData = pd.get_dummies(asdData, columns = categoricalCols, prefix = categoricalCols)


In [None]:
#Normalising Age Data
# Rescale age to the [0, 1] range with a min-max scaler.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test partition made later in the notebook, so the test rows contribute
# to the min/max used for scaling. Consider fitting on the training rows only.
scaler = MinMaxScaler()
cols_to_norm = ['age']
asdData[cols_to_norm] = scaler.fit_transform(asdData[cols_to_norm])

In [None]:
random.seed(704)

# 80/20 partition: sample the training rows (fixed random_state), the rows that
# were not sampled form the test set.
train_dataset = asdData.sample(frac=0.8, random_state=0)
test_dataset = asdData.drop(train_dataset.index)

# Separate the response (Class_ASD) from the predictor columns.
trainLabels = train_dataset['Class_ASD']
testLabels = test_dataset['Class_ASD']
trainFeatures = train_dataset.drop(columns = 'Class_ASD')
testFeatures = test_dataset.drop(columns = 'Class_ASD')

In [None]:
# Convert the training labels to integer codes; the fitted encoder keeps the
# class -> code mapping in label_encoder.classes_.
label_encoder = LabelEncoder()
trainLabels = label_encoder.fit_transform(trainLabels)

In [None]:
# Encode the test labels with the encoder already fitted on the training
# labels, so both partitions are guaranteed to share one class -> integer
# mapping. (A second, independently fitted encoder would number the classes
# differently if a class happened to be missing from the test partition.)
testLabels = label_encoder.transform(testLabels)

In [None]:
# Models below are fed plain NumPy arrays rather than DataFrames.
trainFeatures, testFeatures = trainFeatures.to_numpy(), testFeatures.to_numpy()

In [None]:
random.seed(704)
#Cross Val data split
# crossX is the same object as asdData (no copy is made), so the pop() below
# also removes Class_ASD from asdData. The tree-export cell later reads
# asdData.columns and relies on that. Because the column is gone afterwards,
# running this cell a second time raises a KeyError.
crossX = asdData
crossY = crossX.pop('Class_ASD')

# Plain arrays for scikit-learn / Keras.
crossX = crossX.values
crossY = crossY.values

In [None]:
# Encode the full label vector and report which integer each class received.
label_encoder = LabelEncoder()
crossY = label_encoder.fit_transform(crossY)
classCodes = label_encoder.transform(label_encoder.classes_)
label_encoder_name_mapping = {name: code for name, code in zip(label_encoder.classes_, classCodes)}
print("Mapping of Label Encoded Classes", label_encoder_name_mapping, sep="\n")

## Decision Tree

In [None]:
#K-fold Cross Validation:
def cross_validation(model, _X, _y, _cv=10):
    """Cross-validate `model` and collect per-fold and mean scores.

    Parameters
    ----------
    model : estimator handed to `cross_validate`.
    _X, _y : feature matrix and target vector.
    _cv : value passed through as `cv` (10 by default).

    Returns
    -------
    dict
        For the training and the validation side, and for each of accuracy,
        precision, recall and F1: the array of per-fold scores and their mean.
        Only the mean accuracies are multiplied by 100; every other value is
        left on the scale returned by `cross_validate`.
    """
    scored_metrics = ['accuracy', 'precision', 'recall', 'f1']
    results = cross_validate(estimator = model, X = _X, y = _y, cv = _cv,
                             scoring = scored_metrics, return_train_score = True)

    # (metric, wording in the "... scores" key, wording in the "Mean ..." key)
    wording = [('accuracy', 'Accuracy', 'Accuracy'),
               ('precision', 'Precision', 'Precision'),
               ('recall', 'Recall', 'Recall'),
               ('f1', 'F1', 'F1 Score')]

    report = {}
    for prefix, split in (('train', 'Training'), ('test', 'Validation')):
        for metric, scores_word, mean_word in wording:
            fold_scores = results[prefix + '_' + metric]
            if metric == 'accuracy':
                mean_score = fold_scores.mean()*100
            else:
                mean_score = fold_scores.mean()
            report[split + ' ' + scores_word + ' scores'] = fold_scores
            report['Mean ' + split + ' ' + mean_word] = mean_score
    return report

In [None]:
#Grouped bar chart to visualise training and validation results in each fold
def make_fold_labels(n_folds):
    """Return the tick labels "1st Fold", "2nd Fold", ... for `n_folds` folds."""
    labels = []
    for k in range(1, n_folds + 1):
        if 10 <= k % 100 <= 20:
            # 10th-20th (and 110th-120th, ...) always take "th"
            suffix = "th"
        else:
            suffix = {1: "st", 2: "nd", 3: "rd"}.get(k % 10, "th")
        labels.append(f"{k}{suffix} Fold")
    return labels


def plot_result(x_label, y_label, plot_title, train_data, val_data):
    """Draw training and validation scores side by side, one bar pair per fold.

    Parameters
    ----------
    x_label, y_label : axis labels.
    plot_title : figure title.
    train_data, val_data : one score per fold; both must have the same length.

    The number of folds is taken from `len(train_data)`, so the chart is no
    longer tied to exactly ten folds. The y axis is fixed to the 0.4-1 range.
    """
    plt.figure(figsize=(12,6))
    labels = make_fold_labels(len(train_data))
    X_axis = np.arange(len(labels))
    plt.ylim(0.40000, 1)
    # Each pair is centred on its tick: training to the left, validation to the right.
    plt.bar(X_axis-0.2, train_data, 0.4, color='cornflowerblue', label='Training')
    plt.bar(X_axis+0.2, val_data, 0.4, color='coral', label='Validation')
    plt.title(plot_title, fontsize=20, pad = 20)
    plt.xticks(X_axis, labels)
    plt.xlabel(x_label, fontsize=14)
    plt.ylabel(y_label, fontsize=14)
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
random.seed(704)
#Create
# random.seed() only seeds Python's own generator, which scikit-learn does not
# use. The tree gets an explicit random_state so that ties between equally good
# splits are broken the same way on every run.
decTree = DecisionTreeClassifier(criterion = 'entropy', random_state = 704)
#fit
# 10-fold cross-validation on the full data set.
decTreeResults = cross_validation(decTree, crossX, crossY, 10)
decTreeResults

In [None]:
# Per-fold training vs validation accuracy of the unpruned decision tree.
plot_result("Decision Tree", "Accuracy", "Accuracy scores in 10-folds",
           decTreeResults["Training Accuracy scores"],
           decTreeResults["Validation Accuracy scores"])

In [None]:
# Cost-complexity pruning path on the training partition: the candidate alpha
# values and the impurity reported for each of them.
path = decTree.cost_complexity_pruning_path(trainFeatures, trainLabels)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
print(ccp_alphas)

In [None]:
#For each alpha we will append our model to a list
# One tree is fitted on the training partition per candidate alpha.
decTrees = []
for ccp_alpha in ccp_alphas:
    decTree = DecisionTreeClassifier(criterion = 'entropy', ccp_alpha = ccp_alpha, random_state = 0)
    decTrees.append(decTree.fit(trainFeatures, trainLabels))

In [None]:
#Plot accuracy vs. Alpha graph - value of alpha for maximum training accuracy
# Score every pruned tree on both partitions.
train_acc = []
test_acc = []
for dec in decTrees:
    train_acc.append(accuracy_score(dec.predict(trainFeatures), trainLabels))
    test_acc.append(accuracy_score(dec.predict(testFeatures), testLabels))

curves = ((train_acc, 'Train Accuracy'), (test_acc, 'Test Accuracy'))
# Markers first, then the step lines, in the same order as before.
for scores, _curveLabel in curves:
    plt.scatter(ccp_alphas, scores)
for scores, curveLabel in curves:
    plt.plot(ccp_alphas, scores, label = curveLabel, drawstyle = "steps-post")
plt.legend()
plt.title("Accuracy Vs Alpha")
plt.savefig('Alpha.png')
plt.show()

In [None]:
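#An alpha of 0.01 gives the maximum test accuracy alongside a near-optimal training accuracy.
#The pruned model is more generalised and should perform better on unseen data.
#NOTE: this comment quotes alpha = 0.01, but the next cell fits the tree with ccp_alpha = 0.04 - confirm which value is intended.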

In [None]:
random.seed(704)
# Pruned tree; random_state makes the fit repeatable (random.seed() alone does
# not affect scikit-learn).
decTreeAlph = DecisionTreeClassifier(ccp_alpha =  0.04, criterion = 'entropy', random_state = 704)
decTreeAlph.fit(trainFeatures, trainLabels)

# Feature names are every column except the target, whether or not the target
# column is still present in asdData at this point.
featureNames = [col for col in asdData.columns if col != 'Class_ASD']
# Class names must be the target's class labels in encoded order, which is what
# label_encoder.classes_ holds. The previous code passed the last two *feature*
# column names, which mislabelled the leaves of the drawn tree.
classNames = [str(cls) for cls in label_encoder.classes_]

dot_data = tree.export_graphviz(decTreeAlph, out_file=None,
                                feature_names=featureNames,
                                class_names=classNames,
                                filled=True)

# Draw graph
graph = graphviz.Source(dot_data, format="png")
graph


In [None]:
random.seed(704)
# 10-fold cross-validation of the pruned tree on the full data set.
decTreeResAlph = cross_validation(decTreeAlph, crossX, crossY, 10)
decTreeResAlph

In [None]:
# Per-fold training vs validation accuracy of the pruned decision tree.
plot_result("Pruned Tree", "Accuracy", "Accuracy scores in 10-folds",
           decTreeResAlph["Training Accuracy scores"],
           decTreeResAlph["Validation Accuracy scores"])

In [None]:
# Hard-coded per-fold accuracies for the unpruned and pruned tree
# (presumably copied from an earlier run's output - confirm before reuse).
UnPrunTrain = [1] * 10
PrunTrain = [
    0.88868613, 0.88868613, 0.88321168, 0.88138686, 0.87773723,
    0.88868613, 0.88868613, 0.87408759, 0.87773723, 0.90346084,
]

UnprunVal = [
    0.86885246, 0.95081967, 0.85245902, 0.95081967, 0.98360656,
    0.95081967, 0.90163934, 0.8852459, 0.95081967, 0.88333333,
]
PrunVal = [
    0.85245902, 0.85245902, 0.90163934, 0.91803279, 0.95081967,
    0.85245902, 0.85245902, 0.78688525, 0.95081967, 0.78333333,
]

In [None]:
# Tick labels for the ten cross-validation folds.
foldLabels = [ordinal + " Fold" for ordinal in
              ("1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th", "10th")]

In [None]:
# Wide table: one row per fold, one column per (pruning, split) combination.
decColumns = ['Fold', 'Not Pruned Training', 'Pruned Training',
              'Not Pruned Validation', 'Pruned Validation']
decData = dict(zip(decColumns, [foldLabels, UnPrunTrain, PrunTrain, UnprunVal, PrunVal]))

dec = pd.DataFrame(decData)

In [None]:
# Compare the recorded per-fold accuracy of the unpruned and the pruned tree.
# This cell previously referenced `df` and `fold`, which are only created in
# the Results section, and asked for a 'Model' hue that `dec` does not have,
# so it could not run top-to-bottom. It now plots from `dec` alone.
decLong = dec.melt(id_vars = 'Fold', var_name = 'Series', value_name = 'Accuracy')
# Split the wide column names back into the two things they encode.
decLong['Model'] = np.where(decLong['Series'].str.startswith('Not Pruned'), 'Not Pruned', 'Pruned')
decLong['Split'] = np.where(decLong['Series'].str.endswith('Training'), 'Training', 'Validation')

f, axs = plt.subplots(2, figsize = (10,6), sharex=True, sharey=True)
for ax, split in zip(axs, ['Training', 'Validation']):
    sns.barplot(data = decLong[decLong['Split'] == split],
                x = "Fold", y = "Accuracy", hue = "Model", ax = ax)
    ax.set_ylabel(split + ' Accuracy')
    ax.set_xlabel('Fold Number')
    ax.set_title(split + ' Accuracy Score')
    # Rotate the existing tick labels instead of supplying a new label list.
    ax.tick_params(axis = 'x', labelrotation = 45)

f.suptitle("Accuracy of the Unpruned and Pruned Decision Tree Across 10 Folds", fontsize = 16, fontweight = 'bold', y = 0.99)

plt.show()

In [None]:
# Two stacked bar charts built from the live cross-validation results:
# the unpruned tree on top, the pruned tree below. Saved as barcomparison.png.
X_Axis = np.arange(len(foldLabels))
f, axs = plt.subplots(2, figsize = (20,20), sharex=True, sharey=True)
# NOTE(review): `ax` is not used again in this cell.
ax = plt.gca()
# plt.xticks acts on the current axes; the x axis is shared between both panels.
plt.xticks(X_Axis, foldLabels)
# Top panel: training (left bar) vs validation (right bar) before pruning.
axs[0].bar(X_Axis -0.2, decTreeResults["Training Accuracy scores"], 0.4, color = 'cornflowerblue', label = 'Training')
axs[0].bar(X_Axis +0.2, decTreeResults["Validation Accuracy scores"], 0.4, color = 'coral', label = 'Validation')
axs[0].set_ylim(0.40000, 1)
axs[0].set_ylabel('Accuracy Score', fontsize = 18)
axs[0].set_xticklabels(labels = foldLabels, rotation=45)
axs[0].set_title('Accuracy Levels Before Pruning', fontsize = 30)
axs[0].grid()
axs[0].legend(prop = {'size': 16})


# Bottom panel: the same comparison after pruning (no legend is drawn here).
axs[1].bar(X_Axis -0.2, decTreeResAlph["Training Accuracy scores"], 0.4, color = 'cornflowerblue', label = 'Training')
axs[1].bar(X_Axis +0.2, decTreeResAlph["Validation Accuracy scores"], 0.4, color = 'coral', label = 'Validation')
axs[1].set_ylim(0.40000, 1)
axs[1].set_ylabel('Accuracy Score', fontsize = 18)
axs[1].set_xticklabels(labels = foldLabels, rotation=45, fontsize = 18)
axs[1].set_title('Accuracy Levels After Pruning', fontsize = 30)
axs[1].grid()
axs[1].set_xlabel('Fold Number', fontsize = 18)

f.suptitle("Training and Validation Accuracy in an UnPruned and Pruned Decision Tree", fontsize = 32, fontweight = 'bold', y = 0.94)
plt.savefig('barcomparison.png')
plt.show()


## Random Forests

In [None]:
# Seed Python's generator and NumPy's global generator. scikit-learn
# estimators created without a random_state draw from NumPy's global state,
# not from the `random` module, so seeding `random` alone changes nothing.
random.seed(704)
np.random.seed(704)

In [None]:
#Optimise the forest
#No. trees
n_estimators = [int(x) for x in np.linspace (start = 50, stop = 300, num = 10)]
#No. features to consider every split
# 'auto' means the same as 'sqrt' for a classifier (so it only doubled the
# search time) and is no longer accepted by recent scikit-learn releases;
# 'log2' gives the search a genuinely different second option.
max_features = ['sqrt', 'log2']
#Max no. levels in tree (None = grow until the other stopping rules apply)
max_depth = [int(x) for x in np.linspace (10, 110, num = 11)]
max_depth.append(None)
#min no. samples required to split node
min_samples_split = [2,5,10]
#min no. samples at each leaf node
min_samples_leaf = [1, 2, 4]
#Method of selecting samples for training each tree
bootstrap = [True, False]

#Create grid
ran_grid = {'n_estimators': n_estimators,
           'max_features': max_features,
           'max_depth': max_depth,
           'min_samples_split': min_samples_split,
           'min_samples_leaf': min_samples_leaf,
           'bootstrap': bootstrap}

In [None]:
random.seed(704)
# Exhaustive search over ran_grid. The forest gets an explicit random_state:
# random.seed() does not reach scikit-learn, so without it every candidate
# would be scored on differently randomised forests from run to run.
gscvran = GridSearchCV(estimator = RandomForestClassifier(random_state = 704), param_grid = ran_grid)
gscvran.fit(crossX, crossY)

In [None]:
# Best mean cross-validated score found by the search and the parameters that produced it.
print(gscvran.best_score_)
print(gscvran.best_params_)

In [None]:
random.seed(704)
# NOTE(review): n_estimators=70 and max_depth=5 are not values contained in
# ran_grid, so they were not produced by the grid search above - confirm where
# they come from, or build the forest from gscvran.best_params_.
# random_state makes the forest repeatable (random.seed() does not).
gridTree = RandomForestClassifier(n_estimators=70, max_depth = 5, min_samples_leaf = 1,
                                 min_samples_split = 2, criterion = 'gini',
                                 random_state = 704)
gridTreeRes = cross_validation(gridTree, crossX, crossY, 10)
gridTreeRes

In [None]:
# Per-fold training vs validation accuracy of the random forest.
plot_result("Optimised Tree", "Accuracy", "Accuracy scores in 10-folds",
           gridTreeRes["Training Accuracy scores"],
           gridTreeRes["Validation Accuracy scores"])

## Neural Network

In [None]:
#Function to vary number of nodes
def FindLayersLinear(n_layers, first_layer_nodes, last_layer_nodes):
    """Return layer widths spaced linearly from the first to the last width.

    Parameters
    ----------
    n_layers : number of widths to produce.
    first_layer_nodes : width of the first layer.
    last_layer_nodes : width of the last layer.

    Returns
    -------
    list of int
        `n_layers` widths, each rounded up to a whole number. A single layer
        gets the first width (this case used to raise ZeroDivisionError);
        zero or fewer layers give an empty list.
    """
    if n_layers <= 0:
        return []
    if n_layers == 1:
        return [math.ceil(first_layer_nodes)]
    span = last_layer_nodes - first_layer_nodes
    # Interpolate from the end points instead of repeatedly adding a float
    # increment: accumulated rounding error could otherwise push the last
    # width just past its target before the ceil().
    return [math.ceil(first_layer_nodes + span * i / (n_layers - 1))
            for i in range(n_layers)]

FindLayersLinear(3,45,3)

In [None]:
### Function to change tensorflow model parameters
def createmodel(n_layers, first_layer_nodes, last_layer_nodes, activation_func, loss_func):
    """Build and compile a dense binary classifier for the grid search.

    Parameters
    ----------
    n_layers : layer count; `n_layers - 1` hidden layers are added before the
        single-node output layer.
    first_layer_nodes, last_layer_nodes : end points handed to
        FindLayersLinear to size the hidden layers.
    activation_func : activation of the hidden layers.
    loss_func : loss passed to `compile`.

    Returns
    -------
    The compiled Sequential model (adam optimiser, accuracy metric).
    """
    model = Sequential()
    n_inputs = trainFeatures.shape[1]
    # With a single layer there are no hidden widths to compute.
    n_nodes = FindLayersLinear(n_layers, first_layer_nodes, last_layer_nodes) if n_layers > 1 else []
    for i in range(1, n_layers):
        if i ==1:
            model.add(Dense(first_layer_nodes, input_dim=n_inputs, activation=activation_func))
        else:
            model.add(Dense(n_nodes[i-1], activation=activation_func))
    #Output layer for binary classification have single node.
    # It always uses a sigmoid so the output lies in [0, 1]; reusing the hidden
    # activation here (relu / tanh) produced values that are not probabilities.
    if n_layers > 1:
        model.add(Dense(1, activation='sigmoid'))
    else:
        # No hidden layer was added, so the output layer declares the input width.
        model.add(Dense(1, input_dim=n_inputs, activation='sigmoid'))
    model.compile(optimizer = 'adam', loss=loss_func, metrics = ["accuracy"])
    
    return model

#Wrap into scikit-learn
# createmodel is handed over as the build function of the wrapper.
# Note that `model` is reused for other objects later in the notebook.
model = KerasClassifier(build_fn = createmodel, verbose=False)

In [None]:
#Functions for changing model parameters
# Every combination of the values below is tried by the grid search.
activation_funcs = ['sigmoid', 'relu', 'tanh']
loss_funcs = ['binary_crossentropy', 'hinge']
param_grid = {
    'n_layers': [1,2,3],
    'first_layer_nodes': [45,30,15],
    'last_layer_nodes': [3],
    'activation_func': activation_funcs,
    'loss_func': loss_funcs,
    'batch_size': [100],
    'epochs': [20,150],
}
grid = GridSearchCV(estimator = model, param_grid = param_grid)

In [None]:
# Seed all three generators involved: Python's, NumPy's global state and
# TensorFlow's. random.seed() on its own does not affect weight
# initialisation or batch shuffling.
random.seed(704)
np.random.seed(704)
tf.random.set_seed(704)
#Fit Grid Object With Data
grid.fit(crossX,crossY)

In [None]:
# Best mean cross-validated score of the network search and its parameters.
print(grid.best_score_)
print(grid.best_params_)

In [None]:
def plot_loss(modelFit):
    """Plot the training and validation loss curves stored in `modelFit.history`.

    `modelFit.history` must contain the keys 'loss' and 'val_loss'.
    The curves are drawn on the current pyplot figure; nothing is returned.
    """
    for curve in ('loss', 'val_loss'):
        plt.plot(modelFit.history[curve], label = curve)
    plt.xlabel('Epoch')
    plt.ylabel('Error [class]')
    plt.legend()
    plt.grid(True)

In [None]:
# Seed Python, NumPy and TensorFlow; random.seed() alone does not make the
# weight initialisation or dropout masks repeatable.
random.seed(704)
np.random.seed(704)
tf.random.set_seed(704)
# Three hidden layers of 45 relu units, each followed by 50% dropout, and a
# sigmoid output for the binary class. The input width is read from the
# training matrix instead of being hard-coded, so the model still builds if
# the one-hot encoding yields a different number of columns.
asdSeqModel2 = tf.keras.Sequential([
    tf.keras.Input(shape=(trainFeatures.shape[1],)),
    layers.Dense(45, activation = 'relu'),
    layers.Dropout(0.5),
    layers.Dense(45, activation = 'relu'),
    layers.Dropout(0.5),
    layers.Dense(45, activation = 'relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation = 'sigmoid')
])
asdSeqModel2.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])


In [None]:
%%time
# Train for 80 epochs; 20% of the training partition is held back by Keras
# as validation data (validation_split).
modelFit2 = asdSeqModel2.fit(
    trainFeatures,
    trainLabels,
    epochs = 80,
    verbose = 0,
    validation_split = 0.2)

#Visualise training progress
# One row per epoch with the recorded losses/metrics; show the last epochs.
hist2 = pd.DataFrame(modelFit2.history)
hist2['epoch'] = modelFit2.epoch
hist2.tail()

In [None]:
# Loss curves of the run above, saved as 'trainingcurve' (no file extension given).
plot_loss(modelFit2)
plt.savefig('trainingcurve')

In [None]:
def get_f1(y_true, y_pred): #taken from old keras source code
    """F1 score computed with Keras backend operations.

    Both inputs are clipped to [0, 1] and rounded before counting, so
    predictions are effectively thresholded by rounding. K.epsilon() is added
    to every denominator to avoid division by zero.
    """
    def _rounded_count(values):
        # clip to [0, 1], round to 0/1, then add up the ones
        return K.sum(K.round(K.clip(values, 0, 1)))

    hits = _rounded_count(y_true * y_pred)
    actual_positives = _rounded_count(y_true)
    flagged_positives = _rounded_count(y_pred)
    precision = hits / (flagged_positives + K.epsilon())
    recall = hits / (actual_positives + K.epsilon())
    return 2*(precision*recall)/(precision+recall+K.epsilon())


def create_model(n_features=42):
    """Build and compile the dense network used in the K-fold loop.

    Parameters
    ----------
    n_features : width of the input layer. The default of 42 is the value that
        used to be hard-coded; pass `X.shape[1]` to follow the data.

    Returns
    -------
    A compiled Sequential model: three hidden layers of 45 relu units, each
    followed by 50% dropout, and a single sigmoid output. It is compiled with
    binary cross-entropy, adam, and the metrics get_f1 and accuracy.
    """
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(n_features,)),
        layers.Dense(45, activation = 'relu'),
        layers.Dropout(0.5),
        layers.Dense(45, activation = 'relu'),
        layers.Dropout(0.5),
        layers.Dense(45, activation = 'relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation = 'sigmoid')
    ])

    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = [get_f1, 'accuracy'])

    return model

In [None]:
# Seed Python, NumPy and TensorFlow so the fold results are repeatable;
# random.seed() alone does not reach Keras.
random.seed(704)
np.random.seed(704)
tf.random.set_seed(704)
#Fold Dataset and Train Model
#No. Folds
n_split = 10

# One entry per fold; each entry is the per-epoch history of that metric.
loss = []
accuracy = []
validationLoss = []
validationAccuracy = []
validationF1 = []

for train_index, test_index in KFold(n_split).split(crossX):
    x_train, x_test = crossX[train_index], crossX[test_index]
    y_train, y_test = crossY[train_index], crossY[test_index]
    
    # A fresh, untrained network for every fold.
    model = create_model()
    hist = model.fit(x_train, y_train, validation_split=0.2, epochs = 80, verbose = 0)
    
    loss.append(hist.history['loss'])
    accuracy.append(hist.history['accuracy'])
    validationLoss.append(hist.history['val_loss'])
    validationAccuracy.append(hist.history['val_accuracy'])
    # Keras stores validation metrics under a 'val_' prefix; the un-prefixed
    # 'get_f1' key read here before is the F1 on the *training* data.
    validationF1.append(hist.history['val_get_f1'])
    # Loss and metrics on the held-out fold (printed only, not stored).
    print('Model Evaluation ', model.evaluate(x_test,y_test))

In [None]:
# Collapse each fold's per-epoch history to its mean over all epochs.
# np.mean also accepts a value that is already a scalar, so re-running this
# cell is harmless (sum()/len() on the already-averaged floats raised a
# TypeError on a second run).
loss = [np.mean(subList) for subList in loss]
accuracy = [np.mean(subList) for subList in accuracy]
validationLoss = [np.mean(subList) for subList in validationLoss]
validationAccuracy = [np.mean(subList) for subList in validationAccuracy]

In [None]:
# Mean F1 over all epochs for each fold. np.mean keeps the cell re-runnable:
# it leaves an already-averaged scalar unchanged instead of raising.
validationF1 = [np.mean(subList) for subList in validationF1]

In [None]:
# Per-fold training vs validation accuracy of the network. Each bar is the
# average over all training epochs of that fold, not the final-epoch value.
plot_result("Sequential Neural Network", "Accuracy", "Accuracy scores in 10-folds",
           accuracy,
           validationAccuracy)

## Results

In [None]:
# Per-fold training accuracy of the unpruned decision tree.
decTreeResults["Training Accuracy scores"]

In [None]:
# Long-format columns for the model comparison: 3 models x 10 folds = 30 rows.
# The scores are hard-coded (presumably copied from an earlier run's output -
# confirm before relying on them). Note that `model` is rebound to a list here.
foldNames = ["1st Fold", "2nd Fold", "3rd Fold", "4th Fold", "5th Fold",
             "6th Fold", "7th Fold", "8th Fold", "9th Fold", "10th Fold"]
modelNames = ["Decision Tree", "Random Forests", "Sequential Network"]

# Fold labels repeat once per model; each model name repeats once per fold.
fold = foldNames * len(modelNames)
model = [name for name in modelNames for _unused in foldNames]

valAcc = [
    # Decision Tree
    0.90163934, 0.91803279, 0.86885246, 0.96721311, 0.95081967, 0.90163934, 0.8852459, 0.8852459, 0.96721311, 0.86666667,
    # Random Forests
    0.96721311, 0.93442623, 0.95081967, 0.93442623, 0.98360656, 0.91803279, 0.90163934, 0.86885246, 0.96721311, 0.81666667,
    # Sequential Network
    0.9712500050663948, 0.9596590913832188, 0.9688636481761932, 0.9815909147262574, 0.9656818218529224, 0.982954553514719, 0.9781818233430386, 0.9090909130871296, 0.8930681683123112, 0.913068202137947,
]

f1Score = [
    # Decision Tree
    0.85, 0.84848485, 0.78947368, 0.94444444, 0.90909091, 0.83333333, 0.78787879, 0.82926829, 0.94444444, 0.73333333,
    # Random Forests
    0.94444444, 0.88235294, 0.91428571, 0.875, 0.97142857, 0.84848485, 0.82352941, 0.8, 0.94444444, 0.56,
    # Sequential Network
    0.9137816399335861, 0.9133445382118225, 0.9041821813210845, 0.9311458569020032, 0.9105896070599556, 0.8897312300279736, 0.9122078403830528, 0.9253039043396711, 0.9122180918231606, 0.9262904550880193,
]

In [None]:
#Data for the line graphs of all models' accuracy and F1 scores across the 10 folds
# One row per (model, fold) pair.
data = dict(Fold = fold, Model = model, Accuracy = valAcc, F1 = f1Score)

df = pd.DataFrame(data)

In [None]:
# Validation accuracy per fold, one line (plus markers) per model.
sns.set_style("darkgrid")
sns.lineplot(data=df, x="Fold",y="Accuracy", hue="Model")
# Markers on top of the lines; the legend from the line plot is reused.
sns.scatterplot(data=df, x="Fold",y="Accuracy", hue="Model", legend = False)
plt.ylim(0.5, 1)
plt.xlabel('Fold Number')
plt.ylabel('Validation Accuracy')
plt.xticks(rotation=45)
plt.title("The Validation Accuracy of Each Model Across 10-Validation Folds", pad = 20, fontweight = 'bold', fontsize =14)
plt.show()

In [None]:
# F1 score per fold, one line (plus markers) per model.
sns.set_style("darkgrid")
sns.lineplot(data=df, x="Fold",y="F1", hue="Model")
# Markers on top of the lines; the legend from the line plot is reused.
sns.scatterplot(data=df, x="Fold",y="F1", hue="Model", legend = False)
plt.ylim(0.5, 1)
plt.xlabel('Fold Number')
plt.ylabel('Validation F1 Score')
plt.xticks(rotation=45)
plt.title("The F1 Score of Each Model Across 10-Validation Folds", pad = 20, fontweight = 'bold', fontsize =14)
plt.show()

In [None]:
# Accuracy (top) and F1 (bottom) per fold for all three models, saved as 'modelLines'.
f, axs = plt.subplots(2, figsize = (10,6), sharex=True, sharey=True)
sns.lineplot(data=df, x="Fold",y="Accuracy", hue="Model", ax = axs[0])
sns.scatterplot(data=df, x="Fold",y="Accuracy", hue="Model", ax = axs[0], legend = False)
axs[0].set_ylabel('Validation Score')
axs[0].set_xlabel('Fold Number')
# Rotate the tick labels seaborn already placed. Passing `fold` to
# set_xticklabels supplied 30 labels for 10 ticks, which recent matplotlib
# versions reject.
axs[0].tick_params(axis = 'x', labelrotation = 45)
axs[0].set_title('Accuracy Score')

sns.lineplot(data=df, x="Fold",y="F1", hue="Model", ax = axs[1], legend = False)
sns.scatterplot(data=df, x="Fold",y="F1", hue="Model", ax = axs[1], legend = False)
axs[1].set_ylabel('F1 Score')
axs[1].set_xlabel('Fold Number')
axs[1].tick_params(axis = 'x', labelrotation = 45)
axs[1].set_title('F1Score')

f.suptitle("Accuracy and F1 Score For All Models Across 10 Folds", fontsize = 16, fontweight = 'bold', y = 0.99)

plt.savefig('modelLines')
plt.show()

In [None]:
# Overall mean of the network's per-fold validation accuracies.
sum(validationAccuracy)/ len(validationAccuracy)