In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import warnings
warnings.filterwarnings("ignore")

class Classification:
    """Cross-validate an AdaBoost pipeline on a CSV file and label unseen rows.

    The pipeline is chi-squared feature selection (``SelectKBest``) followed
    by an ``AdaBoostClassifier`` whose base estimator and random state are
    tuned with ``GridSearchCV``.

    Parameters
    ----------
    path : str
        CSV file holding the labelled data; it must contain an ``Outcome``
        column, and its last column is excluded from the features.
    clf_opt : str
        Classifier selector. Only ``'ab'`` (AdaBoost) is supported.
    no_of_selected_features : int or None
        Number of features kept by ``SelectKBest``. ``None`` keeps all of them.
    """

    def __init__(self, path='C.csv', clf_opt='ab', no_of_selected_features=None):
        self.path = path
        self.clf_opt = clf_opt
        if no_of_selected_features is None:
            self.no_of_selected_features = None
        else:
            self.no_of_selected_features = int(no_of_selected_features)

    def classification_pipeline(self):
        """Return the classifier and its grid-search parameter space.

        Returns
        -------
        tuple
            ``(clf, clf_parameters)`` where ``clf_parameters`` is keyed with
            the ``clf__`` prefix expected by the pipeline step named ``clf``.

        Raises
        ------
        ValueError
            If ``self.clf_opt`` is not a supported classifier option.
        """
        if self.clf_opt != 'ab':
            # The original called sys.exit() without importing sys, which
            # surfaced as a NameError; fail with an explicit error instead.
            raise ValueError(
                'Select a valid classifier: unsupported clf_opt '
                + repr(self.clf_opt) + " (expected 'ab')"
            )

        print('\n\t### Training AdaBoost Classifier ### \n')
        logistic = LogisticRegression(solver='liblinear', class_weight='balanced')
        tree = DecisionTreeClassifier(max_depth=50)
        # NOTE(review): `base_estimator` and algorithm='SAMME.R' are deprecated
        # in recent scikit-learn releases -- confirm the installed version
        # before upgrading.
        clf = AdaBoostClassifier(algorithm='SAMME.R', n_estimators=100)
        clf_parameters = {
            'clf__base_estimator': (logistic, tree),
            'clf__random_state': (0, 10),
        }
        return clf, clf_parameters

    def get_class_statistics(self, labels):
        """Print the number of instances of each class found in ``labels``."""
        class_statistics = Counter(labels)
        print('\n Class \t\t Number of Instances \n')
        for label, n_instances in class_statistics.items():
            print('\t' + str(label) + '\t\t\t' + str(n_instances))

    def get_data(self, filename=None):
        """Load features and labels from a CSV file.

        Parameters
        ----------
        filename : str or None
            CSV file to read. Falls back to ``self.path`` when ``None``.

        Returns
        -------
        tuple
            ``(data, labels)``: every column but the last as a DataFrame, and
            the ``Outcome`` column as a Series.

        Raises
        ------
        KeyError
            If the file has no ``Outcome`` column.
        """
        source = self.path if filename is None else filename
        reader = pd.read_csv(source)
        # NOTE(review): assumes 'Outcome' is the last column; if it is not,
        # the label stays inside the feature matrix -- confirm the CSV layout.
        data = reader.iloc[:, :-1]
        labels = reader['Outcome']
        self.get_class_statistics(labels)
        return data, labels

    def _resolve_k(self, n_features):
        """Return a ``k`` for ``SelectKBest`` that is valid for ``n_features``.

        ``'all'`` is returned when no feature count was configured or when the
        configured count is not smaller than the number of available features.
        """
        k = self.no_of_selected_features
        if k is None or k >= n_features:
            return 'all'
        return k

    def train_classifier(self, X_train, y_train):
        """Grid-search the pipeline on the training split.

        Parameters
        ----------
        X_train : 2-D array-like
            Training features; chi2 scoring requires non-negative values.
        y_train : array-like
            Training labels.

        Returns
        -------
        Pipeline
            The best pipeline found by the grid search.
        """
        clf, clf_parameters = self.classification_pipeline()
        k = self._resolve_k(np.shape(X_train)[1])
        pipeline = Pipeline([
            ('feature_selection', SelectKBest(chi2, k=k)),
            ('clf', clf),
        ])
        grid = GridSearchCV(pipeline, clf_parameters, scoring='f1_micro', cv=10)
        grid.fit(X_train, y_train)

        print("\nBest Parameters:")
        print(grid.best_params_)
        return grid.best_estimator_

    def evaluate_classifier(self, clf, X_test, y_test):
        """Return ``(accuracy, classification report text)`` for ``clf``."""
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        return accuracy, report

    def predict_unseen_data(self, clf, unseen_data):
        """Return the labels ``clf`` predicts for ``unseen_data``."""
        return clf.predict(unseen_data)

    def classify(self, unseen_path='tst4.csv'):
        """Run 5-fold stratified cross-validation, then predict unseen rows.

        Prints the fold-wise predicted probabilities, a classification report
        and macro-averaged metrics pooled over all held-out folds, and finally
        the predictions for the rows of ``unseen_path``.

        Parameters
        ----------
        unseen_path : str
            CSV file whose columns are all treated as features.
        """
        data, labels = self.get_data(self.path)
        data = np.asarray(data)
        # Plain arrays make the fold indices positional regardless of the
        # index the labels carried as a pandas Series.
        labels = np.asarray(labels)

        skf = StratifiedKFold(n_splits=5)
        predicted_class_labels = []
        actual_class_labels = []
        clf = None

        for fold, (train_index, test_index) in enumerate(skf.split(data, labels), start=1):
            X_train, y_train = data[train_index], labels[train_index]
            X_test, y_test = data[test_index], labels[test_index]

            print('Training Phase ' + str(fold))
            clf = self.train_classifier(X_train, y_train)

            predicted = clf.predict(X_test)
            predicted_probability = clf.predict_proba(X_test)
            print(predicted_probability)

            actual_class_labels.extend(y_test)
            predicted_class_labels.extend(predicted)

        # classification_report pairs target_names with the *sorted* labels,
        # so the names must be sorted too (first-appearance order can swap them).
        class_names = [str(label) for label in np.unique(labels)]

        # NOTE: despite the header text, these figures are computed on the
        # held-out fold of every split, not on the training rows.
        print('\n ##### Classification Report on Training Data ##### \n')
        print(classification_report(actual_class_labels, predicted_class_labels, target_names=class_names))

        pr = precision_score(actual_class_labels, predicted_class_labels, average='macro')
        print('\n Precision:\t' + str(pr))

        rl = recall_score(actual_class_labels, predicted_class_labels, average='macro')
        print('\n Recall:\t' + str(rl))

        fm = f1_score(actual_class_labels, predicted_class_labels, average='macro')
        print('\n F1-Score:\t' + str(fm))

        ac = accuracy_score(actual_class_labels, predicted_class_labels)
        print('\n Accuracy:\t' + str(ac))

        unseen_data = pd.read_csv(unseen_path)
        unseen_data_features = np.asarray(unseen_data)

        # NOTE(review): `clf` is the model fitted in the last fold only (4/5 of
        # the data); consider refitting on the full data set before predicting.
        predictions_unseen_data = self.predict_unseen_data(clf, unseen_data_features)
        print('\n ##### Predictions on Unseen Data ##### \n')
        print(predictions_unseen_data)
        print(len(predictions_unseen_data))


if __name__ == "__main__":
    # Script entry point: AdaBoost option, keeping the 32 best chi2 features.
    classifier = Classification(
        clf_opt='ab',
        no_of_selected_features=32,
    )
    classifier.classify()



 Class 		 Number of Instances 

	0			1213
	1			1213
Training Phase 1

	### Training AdaBoost Classifier ### 


Best Parameters:
{'clf__base_estimator': DecisionTreeClassifier(max_depth=50), 'clf__random_state': 0}
[[1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.22044605e-16]
 [1.00000000e+00 2.2204460