In [251]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score, precision_score, f1_score

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [252]:
def ratios(y_true, y_pred):
    # Get the confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    
    # Calculate False Negative Ratio
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    
	# Calculate Recall
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    
	# Calculate Precision
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    
	# Calculate specificity
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    
    return fnr, recall, precision, specificity

In [253]:
data = pd.read_csv('diabetes.csv')

print(data[:10])
print(data.shape)

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   
5            5      116             74              0        0  25.6   
6            3       78             50             32       88  31.0   
7           10      115              0              0        0  35.3   
8            2      197             70             45      543  30.5   
9            8      125             96              0        0   0.0   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   2

In [254]:
##split the data into training and testing data
X = data.drop('Outcome', axis=1)
y = data['Outcome']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [255]:
test_possibilitys = {
	'Class': {
		'n_estimators': np.linspace(1, 200, 10).astype(int),
		'criterion': ['gini', 'entropy', 'log_loss'],
		'max_features': ['sqrt', 'log2'],
		'bootstrap': [True, False],
		'min_samples_split': np.linspace(2, 11, 10).astype(int),
		'min_samples_leaf': np.linspace(2, 6, 5).astype(int)
	},
	'Params': {
		'n_estimators': np.linspace(1, 100, 10).astype(int),
		'learning_rate': np.linspace(0.1, 3, 10),
		'algorithim': ['SAMME', 'SAMME.R']
	}
}

In [256]:
current_test = {
	'clf': RandomForestClassifier,
	'clf_params': {},
	'params': {}
}

In [257]:
solution_list = []
accuracy_list = []

iteration_count = 0

In [258]:
class test:
	def __init__(self, current_test):
		self.hype_clf = current_test['clf'](
									n_estimators=current_test['clf_params']['n_estimators'], 
									criterion=current_test['clf_params']['criterion'],
									max_features=current_test['clf_params']['max_features'], 
									bootstrap=current_test['clf_params']['bootstrap'],
									min_samples_split=current_test['clf_params']['min_samples_split'], 
									min_samples_leaf=current_test['clf_params']['min_samples_leaf'], 
									n_jobs=-1
								)
			
		
		self.clf = AdaBoostClassifier(
							self.hype_clf,
							n_estimators=current_test['params']['n_estimators'],
							learning_rate=current_test['params']['learning_rate'],
							algorithm=current_test['params']['algorithim'],
							random_state=1)
		
		self.clf.fit(X_train, y_train)
		self.predsy_pred = self.clf.predict(X_test)

		self.fnr, self.recall, self.precision, self.specificity = ratios(y_test, self.predsy_pred)
		self.accuracy = accuracy_score(y_test, self.predsy_pred)
		self.f1 = f1_score(y_test, self.predsy_pred)

In [259]:
def check_best(current_test):

	if (current_test.accuracy > 0.7):
	
		if current_test.accuracy > accuracy_list[0]:
			solution_list.append(current_test)
			accuracy_list.append(current_test.accuracy)

			print("New best solution found: ", current_test.accuracy)

			with open('ada_best_params.txt', 'a') as f:
				f.write(str(current_test.__dict__) + ",\n")



			if(len(solution_list) > 10):
				solution_list.pop(0)
				accuracy_list.pop(0)



In [260]:
total_tests = (
	len(test_possibilitys['Class']['n_estimators']) * 
	len(test_possibilitys['Class']['criterion']) * 
	len(test_possibilitys['Class']['max_features']) * 
	len(test_possibilitys['Class']['bootstrap']) * 
	len(test_possibilitys['Class']['min_samples_split']) * 
	len(test_possibilitys['Class']['min_samples_leaf']) * 
	len(test_possibilitys['Params']['n_estimators']) * 
	len(test_possibilitys['Params']['learning_rate']) * 
	len(test_possibilitys['Params']['algorithim'])
)
print("Total Tests: ", total_tests)



solution_list.append(current_test)
accuracy_list.append(0.0)

with open('ada_best_params.txt', 'w') as f:
	f.write("[\n")


for _n_estimators in test_possibilitys['Class']['n_estimators']:
	for _citeration in test_possibilitys['Class']['criterion']:
		for _max_features in test_possibilitys['Class']['max_features']:
			for _bootstrap in test_possibilitys['Class']['bootstrap']:
				for _min_samples_split in test_possibilitys['Class']['min_samples_split']:
					for _min_samples_leaf in test_possibilitys['Class']['min_samples_leaf']:


						#Test the hyper paramaters of the estimator
						current_test['clf_params']['n_estimators'] = _n_estimators
						current_test['clf_params']['criterion'] = _citeration
						current_test['clf_params']['max_features'] = _max_features
						current_test['clf_params']['bootstrap'] = _bootstrap
						current_test['clf_params']['min_samples_split'] = _min_samples_split
						current_test['clf_params']['min_samples_leaf'] = _min_samples_leaf


						for _ada_estimator in test_possibilitys['Params']['n_estimators']:
							for _ada_learning_rate in test_possibilitys['Params']['learning_rate']:
								for _ada_algorithm in test_possibilitys['Params']['algorithim']:


									#Test the hyper paramaters of the estimator
									current_test['params']['n_estimators'] = _ada_estimator
									current_test['params']['learning_rate'] = _ada_learning_rate
									current_test['params']['algorithim'] = _ada_algorithm

									#Test the model
									IterationTest = test(current_test)
									check_best(IterationTest)

									iteration_count += 1
									
						with open('ada_iterations.txt', 'w') as f:
							f.write(str(iteration_count) + "/" + str(total_tests))

Total Tests:  1200000


New best solution found:  0.7662337662337663
New best solution found:  0.7012987012987013
New best solution found:  0.7467532467532467
New best solution found:  0.7207792207792207
New best solution found:  0.7532467532467533
New best solution found:  0.7337662337662337
New best solution found:  0.7012987012987013
New best solution found:  0.7077922077922078
New best solution found:  0.7532467532467533
New best solution found:  0.7662337662337663
New best solution found:  0.7727272727272727
New best solution found:  0.7142857142857143
New best solution found:  0.7532467532467533
New best solution found:  0.7467532467532467
New best solution found:  0.7792207792207793
New best solution found:  0.7402597402597403
New best solution found:  0.7077922077922078
New best solution found:  0.7532467532467533
New best solution found:  0.7597402597402597
New best solution found:  0.7792207792207793
New best solution found:  0.7792207792207793
New best solution found:  0.7272727272727273
New best s

In [None]:
hyp_clf = RandomForestClassifier()

clf = AdaBoostClassifier(hyp_clf,
    n_estimators=100, learning_rate=2)


# the train-test split is done in each iteration of cross validation
scores = cross_val_score(clf, X_train, y_train, cv=5)
print(scores)
#find the input features that are most important



clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

acc = accuracy_score(y_test, predictions)
confusion = confusion_matrix(y_test, predictions)

print('Accuracy:', acc)
print('Confusion matrix:')
print(confusion)

print('Classification report:')
print(classification_report(y_test, predictions))

print('Recall:', recall_score(y_test, predictions))
print('Precision:', precision_score(y_test, predictions))
print('F1 Score:', f1_score(y_test, predictions))



[0.77235772 0.79674797 0.7398374  0.74796748 0.78688525]
Accuracy: 0.7402597402597403
Confusion matrix:
[[79 20]
 [20 35]]
Classification report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80        99
           1       0.64      0.64      0.64        55

    accuracy                           0.74       154
   macro avg       0.72      0.72      0.72       154
weighted avg       0.74      0.74      0.74       154

Recall: 0.6363636363636364
Precision: 0.6363636363636364
F1 Score: 0.6363636363636364
