In [5]:
# importing the important libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from collections import Counter
import warnings
warnings.filterwarnings("ignore")


In [6]:
#  defining the classifiers and pipeline
class Classification:
    def __init__(self, path='C.csv', clf_opt='lr', no_of_selected_features=None):
        self.path = path
        self.clf_opt = clf_opt
        self.no_of_selected_features = int(no_of_selected_features) if no_of_selected_features is not None else None

    def classification_pipeline(self):
        if self.clf_opt == 'lr':
            print('\n\t### Training Logistic Regression Classifier ### \n')
            clf = LogisticRegression(class_weight='balanced',random_state=42)
            # setting up the parameters
            clf_parameters = {
    'clf__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'clf__C': [0.001, 0.01, 0.1, 1, 10],
    'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
            }
        else:
            print('Select a valid classifier \n')
            sys.exit(0)
        return clf, clf_parameters

    def get_class_statistics(self, labels):
        class_statistics = Counter(labels)
        print('\n Class \t\t Number of Instances \n')
        for item in list(class_statistics.keys()):
            print('\t' + str(item) + '\t\t\t' + str(class_statistics[item]))

    def get_data(self, filename):
        reader = pd.read_csv(self.path)
        data = reader.iloc[:, :-1]
        labels = reader['Outcome']
        self.get_class_statistics(labels)
        return data, labels

    def train_classifier(self, X_train, y_train):
        clf, clf_parameters = self.classification_pipeline()
        pipeline = Pipeline([
            ('feature_selection', SelectKBest(chi2, k=self.no_of_selected_features)),
            ('clf', clf),
        ])
        grid = GridSearchCV(pipeline, clf_parameters, scoring='f1_micro', cv=10)
        grid.fit(X_train, y_train)
        clf = grid.best_estimator_
        
        print("\nBest Parameters:")
        print(grid.best_params_)
        return clf

    def evaluate_classifier(self, clf, X_test, y_test):
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        return accuracy, report

    def predict_unseen_data(self, clf, unseen_data):
        return clf.predict(unseen_data)

    def classify(self):
        data, labels = self.get_data('C.csv')
        data = np.asarray(data)

        skf = StratifiedKFold(n_splits=5)
        predicted_class_labels = []
        actual_class_labels = []
        count = 0
        probs = []

        for train_index, test_index in skf.split(data, labels):
            X_train = data[train_index]
            y_train = labels[train_index]
            X_test = data[test_index]
            y_test = labels[test_index]

            count += 1
            print('Training Phase ' + str(count))
            clf = self.train_classifier(X_train, y_train)

            predicted = clf.predict(X_test)
            predicted_probability = clf.predict_proba(X_test)  # Not applicable for Logistic Regression
            print(predicted_probability)

            for item in predicted_probability:
                probs.append(float(max(item)))

            for item in y_test:
                actual_class_labels.append(item)
            for item in predicted:
                predicted_class_labels.append(item)

        class_names = list(Counter(labels).keys())
        class_names = [str(x) for x in class_names]

        print('\n ##### Classification Report on Training Data ##### \n')
        print(classification_report(actual_class_labels, predicted_class_labels, target_names=class_names))

        pr = precision_score(actual_class_labels, predicted_class_labels, average='macro')
        print('\n Precision:\t' + str(pr))

        rl = recall_score(actual_class_labels, predicted_class_labels, average='macro')
        print('\n Recall:\t' + str(rl))

        fm = f1_score(actual_class_labels, predicted_class_labels, average='macro')
        print('\n F1-Score:\t' + str(fm))

        ac = accuracy_score(actual_class_labels, predicted_class_labels)
        print('\n Accuracy:\t' + str(ac))

        # predicting the value of unseen data
        unseen_data = pd.read_csv('tst4.csv')  
        unseen_data_features = np.asarray(unseen_data.iloc[:, :])

        predictions_unseen_data = self.predict_unseen_data(clf, unseen_data_features)
        print('\n ##### Predictions on Unseen Data ##### \n')

        print(predictions_unseen_data)
        # print(len(predictions_unseen_data))


In [7]:
if __name__ == "__main__":
    classifier = Classification(clf_opt='lr', no_of_selected_features=32)
    classifier.classify()



 Class 		 Number of Instances 

	0			1213
	1			1213
Training Phase 1

	### Training Logistic Regression Classifier ### 


Best Parameters:
{'clf__C': 10, 'clf__penalty': 'l1', 'clf__solver': 'liblinear'}
[[1.00000000e+00 4.58162881e-18]
 [1.00000000e+00 1.27374259e-18]
 [1.00000000e+00 2.75169548e-19]
 [1.00000000e+00 5.45777931e-19]
 [1.00000000e+00 5.88295484e-18]
 [1.00000000e+00 3.52509101e-19]
 [1.00000000e+00 2.61647550e-19]
 [1.00000000e+00 9.28704745e-19]
 [1.00000000e+00 4.25936388e-20]
 [1.00000000e+00 6.74536093e-18]
 [1.00000000e+00 2.34740636e-18]
 [1.00000000e+00 7.74578453e-19]
 [1.00000000e+00 4.64923842e-18]
 [1.00000000e+00 6.23240993e-18]
 [1.00000000e+00 2.34399740e-18]
 [1.00000000e+00 2.98910791e-18]
 [1.00000000e+00 5.12924986e-18]
 [1.00000000e+00 5.19248722e-18]
 [1.00000000e+00 1.47288953e-17]
 [1.00000000e+00 4.23472694e-18]
 [1.00000000e+00 2.88483960e-18]
 [1.00000000e+00 6.97485833e-18]
 [1.00000000e+00 3.28944589e-19]
 [1.00000000e+00 5.24218842e-19]
 [1