# Imports
<h3> These imports are shared across the whole application and not specific to either model </h3>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json as json

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

<h3>SVM Imports</h3>
<p>These imports are specifically related to the SVM's functionality</p>

In [None]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

from sklearn.pipeline import make_pipeline

<h3>ADA Ensembles</h3>
<p>These imports are specifically related to the ADA models</p>

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# Getting the Data ready for interpreting
<p>In this stage we are using the Pandas library to load the CSV data file.</p>
	<div style="margin-left: 20px;">
		<p>- This helps by giving us functionality to use a wide array of methods</p>
	</div>
	<p>The data is then printed to allow for easy referencing and understanding of the base data</p>



In [None]:
# Read the diabetes dataset from disk into a DataFrame
data = pd.read_csv('diabetes.csv')

# Preview the first ten rows, then report the (rows, columns) shape
print(data.head(10))
print(data.shape)

<h3>Importance of Splitting Test and Training Data in Machine Learning Models</h3>

In Machine Learning, it is essential to split the dataset into training and test sets.
<div style="margin-left: 20px;">
	<p>The importance of having a <strong> Validation Dataset </strong> is to have a selection of data against which the learned model will be scored. Because the true outcome of each of those inputs is known, it can be compared against the prediction made by the model. This is the most reliable way to test. An alternative would be to test against the training data; however, this would not be a real-world example of predicting new labels. Knowing the success rate of the model's predictions means the model can be adjusted until the performance is satisfactory.
	</p>
</div>
Additionally, from the <strong>data</strong> provided by the CSV, the outcome column must be dropped to form the list of inputs (<strong>X</strong>), and everything besides the outcome must be dropped to form the list of outputs (<strong>y</strong>).


In [None]:
# Separate the label column (y) from the remaining feature columns (X)
y = data["Outcome"]
X = data.drop(columns="Outcome")

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# shuffle, and therefore the split, the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
	X,
	y,
	test_size=0.2,
	random_state=42,
)

In [None]:
# Hyper-parameter grid for the SVM search: every key maps to the candidate
# values tried for the corresponding SVC setting.
svm_tests = {
	'C': np.linspace(1, 100, 100),
	# Spelling of the key is relied on elsewhere in this file; svm_model
	# forwards the value to SVC as `tol`.
	'tollerance': np.linspace(0.0001, 0.1, 100),
	'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
	'max_iter': np.linspace(10, 10000, 100).astype(int),
	'decision_function_shape': ['ovo', 'ovr'],
	'probability': [True, False],
	# Candidates for the 'shrinking' setting that current_svm_data and
	# svm_model carry, so the grid covers every key of the template below.
	'shrinking': [True, False],
}

# Template holding the single combination currently under evaluation; the
# search overwrites these values in place before building each svm_model.
current_svm_data = {
	'kernel': None,
	'max_iter': None,
	'decision_function_shape': None,
	'probability': None,
	'shrinking': None,
	'C': None,
	'tollerance': None,
}

In [None]:
# Hyper-parameter grid for the AdaBoost search.
#   'Estimator' - settings handed to the RandomForestClassifier that
#                 ada_model uses as the boosted estimator
#   'Params'    - settings handed to the AdaBoostClassifier itself
ada_ensambles_tests = {
	'Estimator': {
		'n_estimators': np.linspace(1, 200, 10).astype(int),
		'criterion': ['gini', 'entropy', 'log_loss'],
		'max_features': ['sqrt', 'log2'],
		'bootstrap': [True, False],
		'min_samples_split': np.linspace(2, 11, 10).astype(int),
		'min_samples_leaf': np.linspace(2, 6, 5).astype(int),
	},
	'Params': {
		'n_estimators': np.linspace(1, 100, 10).astype(int),
		'learning_rate': np.linspace(0.1, 3, 10),
		'algorithim': ['SAMME', 'SAMME.R'],
	},
}

# Template for the combination currently under evaluation: the same two
# sections and keys as the grid above, with every value starting as None.
current_ada_data = {
	section: dict.fromkeys(grid)
	for section, grid in ada_ensambles_tests.items()
}

In [None]:
# Module-level bookkeeping shared by the search helpers: the candidates kept
# so far and, at the same index, the score recorded for each of them.
solution_list, accuracy_list = [], []

In [None]:
def ratios(y_true, y_pred):
    """Compute four binary-classification rates from a confusion matrix.

    The confusion matrix is built for the labels 0 and 1 only.

    Args:
        y_true: The known labels.
        y_pred: The predicted labels, aligned with ``y_true``.

    Returns:
        A tuple ``(fnr, recall, precision, specificity)``. Any rate whose
        denominator is zero is reported as 0.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    def _rate(part, whole):
        # Guarded division: an empty denominator yields 0 instead of an error.
        return part / whole if whole > 0 else 0

    actual_positives = fn + tp

    return (
        _rate(fn, actual_positives),   # false negative ratio
        _rate(tp, actual_positives),   # recall
        _rate(tp, tp + fp),            # precision
        _rate(tn, tn + fp),            # specificity
    )

In [None]:
class svm_model:
	"""One SVC hyper-parameter combination together with its test-set scores."""

	def __init__(self, data):
		"""Copy the hyper-parameters out of ``data``.

		Args:
			data: Mapping with the keys 'kernel', 'max_iter',
				'decision_function_shape', 'probability', 'shrinking',
				'tollerance' and 'C'.
		"""
		# Hyper-parameters later forwarded to SVC
		self.kernel = data['kernel']
		self.max_iter = data['max_iter']
		self.func_shape = data['decision_function_shape']
		self.probability = data['probability']
		self.shrinking = data['shrinking']
		self.tollerance = data['tollerance']
		self.C = data['C']

		# Scores; they stay None until predict() has run
		self.closed_fnr = None
		self.closed_recall = None
		self.closed_f1 = None
		self.closed_precision = None
		self.closed_accuracy = None
		self.closed_specificity = None

		# Predictions for X_test; also filled in by predict()
		self.closed_pred = None

	def predict(self):
		"""Fit a scaled SVC on the training split and score it on the test split.

		Reads the module-level X_train, y_train, X_test and y_test, and stores
		the predictions and every metric on the instance.
		"""
		classifier = SVC(
			kernel=self.kernel,
			max_iter=self.max_iter,
			decision_function_shape=self.func_shape,
			probability=self.probability,
			shrinking=self.shrinking,
			C=self.C,
			tol=self.tollerance,
		)
		# Features are standardised before they reach the SVC
		closed_clf = make_pipeline(StandardScaler(), classifier)
		closed_clf.fit(X_train, y_train)

		self.closed_pred = closed_clf.predict(X_test)
		self.closed_accuracy = accuracy_score(y_test, self.closed_pred)
		self.closed_f1 = f1_score(y_test, self.closed_pred)

		rates = ratios(y_test, self.closed_pred)
		(
			self.closed_fnr,
			self.closed_recall,
			self.closed_precision,
			self.closed_specificity,
		) = rates

In [None]:
class ada_model:
	"""One AdaBoost-over-random-forest configuration with its test-set scores."""

	def __init__(self, current_test):
		"""Snapshot the hyper-parameters of this configuration.

		Args:
			current_test: Mapping with an 'Estimator' dict (settings for the
				RandomForestClassifier) and a 'Params' dict (settings for the
				AdaBoostClassifier).
		"""
		# Copy the nested dicts: the caller reuses and overwrites the same
		# dict objects for every combination, so holding references would
		# make every stored model report the most recent settings.
		self.clf_estimator = dict(current_test['Estimator'])
		self.clf_params = dict(current_test['Params'])

		# Scores; they stay None until predict() has run
		self.closed_fnr = None
		self.closed_recall = None
		self.closed_f1 = None
		self.closed_precision = None
		self.closed_accuracy = None
		self.closed_specificity = None

		# Predictions for X_test; also filled in by predict()
		self.closed_pred = None

	def predict(self):
		"""Fit the boosted forest on the training split and score it on the test split.

		Reads the module-level X_train, y_train, X_test and y_test, and stores
		the predictions and every metric on the instance.
		"""
		closed_hype_clf = RandomForestClassifier(
			n_estimators=self.clf_estimator['n_estimators'],
			criterion=self.clf_estimator['criterion'],
			max_features=self.clf_estimator['max_features'],
			bootstrap=self.clf_estimator['bootstrap'],
			min_samples_split=self.clf_estimator['min_samples_split'],
			min_samples_leaf=self.clf_estimator['min_samples_leaf'],
			n_jobs=-1,
		)

		# The forest is the estimator that AdaBoost boosts
		closed_clf = AdaBoostClassifier(
			closed_hype_clf,
			n_estimators=self.clf_params['n_estimators'],
			learning_rate=self.clf_params['learning_rate'],
			algorithm=self.clf_params['algorithim'],
			random_state=1,
		)

		closed_clf.fit(X_train, y_train)

		self.closed_pred = closed_clf.predict(X_test)
		self.closed_accuracy = accuracy_score(y_test, self.closed_pred)
		self.closed_f1 = f1_score(y_test, self.closed_pred)
		(
			self.closed_fnr,
			self.closed_recall,
			self.closed_precision,
			self.closed_specificity,
		) = ratios(y_test, self.closed_pred)

In [None]:
def convert_to_serializable(obj):
    """Convert numpy values into plain Python objects for ``json.dumps``.

    Intended for use as the ``default=`` hook of ``json.dumps``. Every numpy
    integer width is handled, not only ``np.int32``, because the width that
    ``astype(int)`` produces depends on the platform.

    Args:
        obj: The value to convert.

    Returns:
        ``bool``, ``int`` or ``float`` for the matching numpy scalar, a
        (nested) list for an ndarray, and ``obj`` itself for anything else.
    """
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    # Unknown types are passed back untouched
    return obj

In [None]:
def write_json(class_name, output_file):
	"""Append one evaluated model to ``output_file`` as an indented JSON object.

	The record has three sections: "features" (the model's hyper-parameters,
	left empty unless the model is an svm_model or an ada_model), "results"
	(its metrics) and "predictions" (predicted and true test labels, each
	stored as the string form of a list). The text written is the JSON
	object followed by a comma and a newline.

	Args:
		class_name: An evaluated model instance, despite the parameter name.
		output_file: Path of the file to append to.
	"""
	results = {
		"accuracy": class_name.closed_accuracy,        # float
		"f1": class_name.closed_f1,                    # float
		"fnr": class_name.closed_fnr,                  # float
		"recall": class_name.closed_recall,            # float
		"precision": class_name.closed_precision,      # float
		"specificity": class_name.closed_specificity,  # float
	}
	predictions = {
		"y_pred": str(class_name.closed_pred.tolist()),  # list rendered as text
		"y_true": str(y_test.tolist()),                  # list rendered as text
	}

	features = {}
	if isinstance(class_name, svm_model):
		features = {
			"kernel": str(class_name.kernel),
			"max_iter": class_name.max_iter,
			"decision_function_shape": str(class_name.func_shape),
			"probability": class_name.probability,
			"shrinking": class_name.shrinking,
			"tollerance": class_name.tollerance,
			"C": class_name.C,
		}
	elif isinstance(class_name, ada_model):
		forest = class_name.clf_estimator
		boost = class_name.clf_params
		features = {
			"Estimator": {
				"n_estimators": forest['n_estimators'],
				"criterion": str(forest['criterion']),
				"max_features": str(forest['max_features']),
				"bootstrap": forest['bootstrap'],
				"min_samples_split": forest['min_samples_split'],
				"min_samples_leaf": forest['min_samples_leaf'],
			},
			"Params": {
				"n_estimators": boost['n_estimators'],
				"learning_rate": boost['learning_rate'],
				"algorithim": str(boost['algorithim']),
			},
		}

	class_dict = {
		"features": features,
		"results": results,
		"predictions": predictions,
	}

	# Values json cannot encode natively are routed through convert_to_serializable
	serialized = json.dumps(class_dict, default=convert_to_serializable, indent=4)

	with open(output_file, 'a') as f:
		f.write(f"{serialized},\n")
		

In [None]:
def check_worst(current_test, solution_name):
	"""Keep ``current_test`` when its false-negative rate undercuts a stored score.

	The candidate's ``closed_fnr`` is compared with the entries of
	``accuracy_list``. If it is lower than at least one of them, the candidate
	is appended to ``solution_list``, its rate to ``accuracy_list``, and it is
	written out with ``write_json``. Both lists are then trimmed by dropping
	their oldest entry once they hold more than 10 items. Nothing happens when
	``solution_list`` is empty.

	NOTE(review): despite the name, this keeps candidates with a *lower*
	false-negative rate - confirm the name/metric pairing is intended.

	Args:
		current_test: An evaluated model exposing ``closed_fnr``.
		solution_name: Path handed to ``write_json``.
	"""
	undercuts_a_stored_score = any(
		current_test.closed_fnr < accuracy_list[slot]
		for slot in range(len(solution_list))
	)
	if not undercuts_a_stored_score:
		return

	solution_list.append(current_test)
	accuracy_list.append(current_test.closed_fnr)

	write_json(current_test, solution_name)

	# Cap the history at 10 entries by discarding the oldest one
	if len(solution_list) > 10:
		solution_list.pop(0)
		accuracy_list.pop(0)

In [None]:
def check_best(current_test, solution_name):
	"""Keep ``current_test`` when its false-negative rate exceeds a stored score.

	The candidate's ``closed_fnr`` is compared with the entries of
	``accuracy_list``. If it is higher than at least one of them, the
	candidate is appended to ``solution_list``, its rate to ``accuracy_list``,
	and it is written out with ``write_json``. Both lists are then trimmed by
	dropping their oldest entry once they hold more than 10 items. Nothing
	happens when ``solution_list`` is empty.

	NOTE(review): this keeps candidates with a *higher* false-negative rate,
	and the list it compares against is called accuracy_list - confirm
	whether closed_accuracy was the intended metric.

	Args:
		current_test: An evaluated model exposing ``closed_fnr``.
		solution_name: Path handed to ``write_json``.
	"""
	exceeds_a_stored_score = any(
		current_test.closed_fnr > accuracy_list[slot]
		for slot in range(len(solution_list))
	)
	if not exceeds_a_stored_score:
		return

	solution_list.append(current_test)
	accuracy_list.append(current_test.closed_fnr)

	write_json(current_test, solution_name)

	# Cap the history at 10 entries by discarding the oldest one
	if len(solution_list) > 10:
		solution_list.pop(0)
		accuracy_list.pop(0)

In [None]:
def svmDepthFirstSearch(data):
	"""Evaluate every SVM hyper-parameter combination in ``data``.

	For each combination the module-level ``current_svm_data`` is overwritten,
	an ``svm_model`` is built and fitted, and ``check_worst`` decides whether
	the result is recorded in "svm_results.json". Progress is written to
	"svm_iterations.txt" after each pass over the 'C' values. When the search
	ends, the results file is closed off as a JSON array.

	Args:
		data: Mapping with the keys 'kernel', 'decision_function_shape',
			'probability', 'max_iter', 'tollerance' and 'C', each holding the
			candidate values. An optional 'shrinking' entry supplies the
			candidates for the shrinking flag; without it the 'probability'
			values are reused for that flag.
	"""
	import itertools  # local import keeps this cell self-contained

	results_path = "svm_results.json"

	shrinking_options = data['shrinking'] if 'shrinking' in data else data['probability']

	# Every axis except 'C', listed in nesting order (last one varies fastest)
	outer_axes = (
		data['kernel'],
		data['decision_function_shape'],
		data['probability'],
		shrinking_options,
		data['max_iter'],
		data['tollerance'],
	)

	number_of_tests = len(data['C'])
	for axis in outer_axes:
		number_of_tests *= len(axis)

	with open(results_path, 'w') as f:
		f.write("[\n")

	# Seed the shared lists so check_worst has a score to compare against
	solution_list.append(current_svm_data)
	accuracy_list.append(1.0)

	iterationCount = 0

	for kernel, function_shape, probability, shrinking, max_iter, tol in itertools.product(*outer_axes):
		current_svm_data['kernel'] = kernel
		current_svm_data['decision_function_shape'] = function_shape
		current_svm_data['probability'] = probability
		current_svm_data['shrinking'] = shrinking
		current_svm_data['max_iter'] = max_iter
		current_svm_data['tollerance'] = tol

		for C in data['C']:
			current_svm_data['C'] = C

			current_test = svm_model(current_svm_data)
			current_test.predict()

			check_worst(current_test, results_path)

			iterationCount += 1

		with open("svm_iterations.txt", 'w') as f:
			f.write(f"Iteration: {iterationCount}/{number_of_tests}")

	# Close the array in the same file that received the opening bracket.
	# Records are appended with a trailing separator, so the last comma has
	# to go before the closing bracket for the file to be valid JSON.
	with open(results_path, 'r') as f:
		body = f.read().rstrip()
	if body.endswith(','):
		body = body[:-1]
	with open(results_path, 'w') as f:
		f.write(body + "\n]\n")

In [None]:
def adaDepthSearch(data):
	"""Evaluate every AdaBoost / random-forest combination in ``data``.

	For each combination the module-level ``current_ada_data`` is overwritten,
	an ``ada_model`` is built and fitted, and ``check_best`` decides whether
	the result is recorded in "ada_results.json". Progress is written to
	"ada_iterations.txt" after each pass over the 'algorithim' values. When
	the search ends, the results file is closed off as a JSON array.

	Args:
		data: Mapping with an 'Estimator' section ('n_estimators',
			'criterion', 'max_features', 'bootstrap', 'min_samples_split',
			'min_samples_leaf') and a 'Params' section ('n_estimators',
			'learning_rate', 'algorithim'), each key holding the candidate
			values.
	"""
	import itertools  # local import keeps this cell self-contained

	results_path = "ada_results.json"

	estimator_grid = data['Estimator']
	params_grid = data['Params']

	# Listed in nesting order (last one varies fastest)
	estimator_keys = (
		'n_estimators',
		'criterion',
		'max_features',
		'bootstrap',
		'min_samples_split',
		'min_samples_leaf',
	)
	params_keys = ('n_estimators', 'learning_rate', 'algorithim')

	number_of_tests = 1
	for key in estimator_keys:
		number_of_tests *= len(estimator_grid[key])
	for key in params_keys:
		number_of_tests *= len(params_grid[key])

	with open(results_path, 'w') as f:
		f.write("[\n")

	# Seed the shared lists so check_best has a score to compare against
	solution_list.append(current_ada_data)
	accuracy_list.append(0.0)

	iterationCount = 0

	estimator_axes = [estimator_grid[key] for key in estimator_keys]
	for estimator_values in itertools.product(*estimator_axes):
		current_ada_data['Estimator'].update(zip(estimator_keys, estimator_values))

		for n_estimators_params, learning_rate in itertools.product(
			params_grid['n_estimators'], params_grid['learning_rate']
		):
			current_ada_data['Params']['n_estimators'] = n_estimators_params
			current_ada_data['Params']['learning_rate'] = learning_rate

			for algorithim in params_grid['algorithim']:
				current_ada_data['Params']['algorithim'] = algorithim

				current_test = ada_model(current_ada_data)
				current_test.predict()

				check_best(current_test, results_path)

				iterationCount += 1

			with open("ada_iterations.txt", 'w') as f:
				f.write(f"Iteration: {iterationCount}/{number_of_tests}")

	# Close the array in the same file that received the opening bracket.
	# Records are appended with a trailing separator, so the last comma has
	# to go before the closing bracket for the file to be valid JSON.
	with open(results_path, 'r') as f:
		body = f.read().rstrip()
	if body.endswith(','):
		body = body[:-1]
	with open(results_path, 'w') as f:
		f.write(body + "\n]\n")

In [None]:
# svmDepthFirstSearch(svm_tests)

In [None]:
# adaDepthSearch(ada_ensambles_tests)