# Etapa 3 - Machine Learning

## Imports

In [None]:
import sys
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score, f1_score, recall_score, matthews_corrcoef, ConfusionMatrixDisplay, precision_score)
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV, HalvingGridSearchCV

%matplotlib inline

# Functions

Metrics

In [None]:
def metrics(label_test, predict):
    """Print classification metrics and plot the confusion matrix.

    Accuracy, weighted recall, weighted precision, weighted F1, the Matthews
    correlation coefficient, the confusion matrix and the per-class
    classification report are printed, then the confusion matrix is drawn
    with matplotlib.

    Undefined scores (a class that is never predicted or never present) are
    reported as 0 everywhere, so the summary scores and the per-class report
    agree with each other.

    Args:
        label_test: True labels of the evaluated samples.
        predict: Predicted labels, aligned with ``label_test``.

    Returns:
        None. Everything is printed or plotted.
    """
    # One convention for undefined precision/recall/F1 across all calls below.
    zero_div = 0

    print('METRICS:')
    print(f"Accuracy score:\n{accuracy_score(label_test, predict)}\n")
    print(f"Recall score:\n{recall_score(label_test, predict, average='weighted', zero_division=zero_div)}\n")
    print(f"Precision score:\n{precision_score(label_test, predict, average='weighted', zero_division=zero_div)}\n")
    print(f"F1-score:\n{f1_score(label_test, predict, average='weighted', zero_division=zero_div)}\n")
    print(f"MCC score:\n{matthews_corrcoef(label_test, predict)}\n")
    print(f"Confusion matrix:\n{confusion_matrix(label_test, predict)}\n")
    print(f"Classification report:\n{classification_report(label_test, predict, zero_division=zero_div)}\n")
    ConfusionMatrixDisplay.from_predictions(label_test, predict)
    plt.show()

ML

In [None]:
def ml(model, dataset, labels, param=None, rand=None):
    """Evaluate ``model`` on a hold-out split, optionally tuning it first.

    Steps:
      1. Split ``dataset``/``labels`` 70/30 into train and test sets.
      2. Print the mean accuracy of a stratified 5-fold cross-validation on
         the training set.
      3. Fit ``model`` on the training set and print its test accuracy.
      4. If ``param`` is given, run ``HalvingGridSearchCV`` over it on the
         training set, print the best parameters and the test accuracy of
         the best estimator, and keep its predictions when they are at least
         as accurate as the base model's.
      5. Print the full metric report for the retained predictions.

    Args:
        model: scikit-learn classifier; it is fitted in place on the
            training split.
        dataset: Feature matrix.
        labels: Target labels aligned with ``dataset``.
        param: Optional parameter grid for ``HalvingGridSearchCV``.
            ``None`` skips the optimization step.
        rand: Optional seed. When given, it is used for the train/test
            split, the fold shuffling and the halving search so a run can be
            reproduced. ``None`` keeps the unseeded behaviour.

    Returns:
        None. All results are printed or plotted.
    """
    # SPLIT - seeded with `rand` so the hold-out set is reproducible.
    data_train, data_test, label_train, label_test = train_test_split(
        dataset, labels, test_size=0.3, random_state=rand)

    # k-fold: StratifiedKFold raises if random_state is set while
    # shuffle=False, so shuffle only when a seed was actually supplied.
    kfold = StratifiedKFold(n_splits=5, shuffle=rand is not None, random_state=rand)

    # Cross validation
    scores_scoring = cross_val_score(model, X=data_train, y=label_train, cv=kfold, scoring='accuracy')
    print(f'Cross Validation accuracy score: {np.mean(scores_scoring)}\n')

    # model training - FIT
    model.fit(data_train, label_train)

    # PREDICT
    predict = model.predict(X=data_test)
    base_model = accuracy_score(label_test, predict)
    print('Base Model Accuracy: {:.3f}\n'.format(base_model))

    if param is not None:
        # OPTIMIZATION
        search = HalvingGridSearchCV(estimator=model, param_grid=param, cv=kfold, scoring='accuracy',
                                     random_state=rand, n_jobs=-1)
        search.fit(X=data_train, y=label_train)

        best_params = search.best_params_
        print(f'{best_params}\n')

        # Best estimator found by the search, evaluated on the test split.
        best_model = search.best_estimator_
        predict_opt = best_model.predict(X=data_test)
        opt_model = accuracy_score(label_test, predict_opt)
        print('Optimized Model Accuracy: {:.3f}\n'.format(opt_model))

        # A relative improvement is undefined when the base accuracy is 0.
        if base_model:
            improv = (opt_model - base_model) / base_model * 100
            print('Optimized model improved {:.3f}% over base model.\n'.format(improv))
        else:
            print('Base model accuracy is 0; relative improvement is undefined.\n')

        # NOTE(review): picking base vs. tuned predictions by test accuracy
        # makes the metrics reported below optimistically biased.
        if opt_model >= base_model:
            predict = predict_opt

    # Metrics
    metrics(label_test, predict)

The *ml* function first runs a cross-validation on the training split to evaluate how well the model adapts to the data. It then fits the model, predicts the test-set labels and reports the corresponding accuracy. Finally, when a parameter grid is supplied, it runs a hyperparameter optimization and uses the best parameters to predict the test-set labels and compute their accuracy.

# Binary Classification

### Load data

In [None]:
# Binary-classification inputs: one CSV per feature representation, each
# carrying its features plus an "activity" target column.
descriptors = pd.read_csv('../dataset/binary_class/descriptors_fs.csv', sep=',')
fingerprint = pd.read_csv('../dataset/binary_class/rdk_fs.csv', sep=',')

# Separate the label column from the feature columns of each table.
descriptors_label_b = descriptors["activity"]
descriptors_data_b = descriptors.drop(columns="activity")
fingerprint_label_b = fingerprint["activity"]
fingerprint_data_b = fingerprint.drop(columns="activity")

## Descriptors

### Random Forest

In [None]:
# Random forest on the binary-task descriptor features. The grid covers tree
# count, split criterion, features considered per split and bootstrapping.
params_rf = dict(
    n_estimators=range(10, 211, 50),
    criterion=['entropy', 'gini'],
    max_features=['sqrt', 'log2', None],
    bootstrap=[True, False],
)
rf = RandomForestClassifier(n_jobs=-1)

ml(model=rf, dataset=descriptors_data_b, labels=descriptors_label_b, param=params_rf)

### Gaussian Naive Bayes

In [None]:
# Gaussian naive Bayes on the binary-task descriptor features; no grid is
# passed, so ml() skips the optimization step.
nb = GaussianNB()

ml(model=nb, dataset=descriptors_data_b, labels=descriptors_label_b)

### K Nearest Neighbors

In [None]:
# k-nearest neighbours on the binary-task descriptor features.
# NOTE(review): scikit-learn documents leaf_size as a speed/memory setting of
# the tree-based neighbour search, so this grid axis probably does not change
# accuracy - confirm whether it is worth searching.
params_knn = dict(
    n_neighbors=range(2, 11, 2),
    weights=['distance', 'uniform'],
    leaf_size=range(10, 50, 10),
    p=[1, 2],
)
knn = KNeighborsClassifier(n_jobs=-1)

ml(model=knn, dataset=descriptors_data_b, labels=descriptors_label_b, param=params_knn)

### Voting Classifier
This model will include all the previous models.

In [None]:
# Soft-voting ensemble over the rf, nb and knn objects created in the cells
# above. ml() does not return its tuned estimator, so rf and knn enter the
# ensemble with the hyperparameters they were constructed with.
voting = VotingClassifier(
    estimators=[
        ('Random Forest', rf),
        ('Naive Bayes', nb),
        ('KNN', knn),
    ],
    voting='soft',
    n_jobs=-1,
)

ml(model=voting, dataset=descriptors_data_b, labels=descriptors_label_b)

### Neural Network

In [None]:
# Multi-layer perceptron (with early stopping) on the binary-task descriptor
# features.
# NOTE(review): scikit-learn documents learning_rate as used only with
# solver='sgd'; the solver is left at its default here, so this grid axis
# likely has no effect - confirm the intended solver.
params_nn = dict(
    activation=['identity', 'logistic', 'tanh', 'relu'],
    learning_rate=['constant', 'invscaling', 'adaptive'],
)
nn = MLPClassifier(early_stopping=True)

ml(model=nn, dataset=descriptors_data_b, labels=descriptors_label_b, param=params_nn)

## Fingerprints

### Random Forest

In [None]:
# Random forest on the binary-task fingerprint features. The grid covers tree
# count, split criterion, features considered per split and bootstrapping.
params_rf = dict(
    n_estimators=range(10, 211, 50),
    criterion=['entropy', 'gini'],
    max_features=['sqrt', 'log2', None],
    bootstrap=[True, False],
)
rf = RandomForestClassifier(n_jobs=-1)

ml(model=rf, dataset=fingerprint_data_b, labels=fingerprint_label_b, param=params_rf)

### Gaussian Naive Bayes

In [None]:
# Gaussian naive Bayes on the binary-task fingerprint features; no grid is
# passed, so ml() skips the optimization step.
nb = GaussianNB()

ml(model=nb, dataset=fingerprint_data_b, labels=fingerprint_label_b)

### K Nearest Neighbors

In [None]:
# k-nearest neighbours on the binary-task fingerprint features.
# NOTE(review): scikit-learn documents leaf_size as a speed/memory setting of
# the tree-based neighbour search, so this grid axis probably does not change
# accuracy - confirm whether it is worth searching.
params_knn = dict(
    n_neighbors=range(2, 11, 2),
    weights=['distance', 'uniform'],
    leaf_size=range(10, 50, 10),
    p=[1, 2],
)
knn = KNeighborsClassifier(n_jobs=-1)

ml(model=knn, dataset=fingerprint_data_b, labels=fingerprint_label_b, param=params_knn)

### Voting Classifier
This model will include all the previous models.

In [None]:
# Soft-voting ensemble over the rf, nb and knn objects created in the cells
# above. ml() does not return its tuned estimator, so rf and knn enter the
# ensemble with the hyperparameters they were constructed with.
voting = VotingClassifier(
    estimators=[
        ('Random Forest', rf),
        ('Naive Bayes', nb),
        ('KNN', knn),
    ],
    voting='soft',
    n_jobs=-1,
)

ml(model=voting, dataset=fingerprint_data_b, labels=fingerprint_label_b)

### Neural Network

In [None]:
# Multi-layer perceptron (with early stopping) on the binary-task fingerprint
# features.
# NOTE(review): scikit-learn documents learning_rate as used only with
# solver='sgd'; the solver is left at its default here, so this grid axis
# likely has no effect - confirm the intended solver.
params_nn = dict(
    activation=['identity', 'logistic', 'tanh', 'relu'],
    learning_rate=['constant', 'invscaling', 'adaptive'],
)
nn = MLPClassifier(early_stopping=True)

ml(model=nn, dataset=fingerprint_data_b, labels=fingerprint_label_b, param=params_nn)

# Multiclass

### Load data

In [None]:
# Multiclass inputs: one CSV per feature representation, each carrying its
# features plus an "Activity at 46.23 uM" target column.
descriptors = pd.read_csv('../dataset/multiclass/descriptors_fs.csv', sep=',')
fingerprint = pd.read_csv('../dataset/multiclass/rdk_fs.csv', sep=',')

# Separate the label column from the feature columns of each table.
descriptors_label_m = descriptors["Activity at 46.23 uM"]
descriptors_data_m = descriptors.drop(columns="Activity at 46.23 uM")
fingerprint_label_m = fingerprint["Activity at 46.23 uM"]
fingerprint_data_m = fingerprint.drop(columns="Activity at 46.23 uM")

## Descriptors

### Random Forest

In [None]:
# Random forest on the multiclass descriptor features. The grid covers tree
# count, split criterion, features considered per split and bootstrapping.
params_rf = dict(
    n_estimators=range(10, 211, 50),
    criterion=['entropy', 'gini'],
    max_features=['sqrt', 'log2', None],
    bootstrap=[True, False],
)
rf = RandomForestClassifier(n_jobs=-1)

ml(model=rf, dataset=descriptors_data_m, labels=descriptors_label_m, param=params_rf)

### Gaussian Naive Bayes

In [None]:
# Gaussian naive Bayes on the multiclass descriptor features; no grid is
# passed, so ml() skips the optimization step.
nb = GaussianNB()

ml(model=nb, dataset=descriptors_data_m, labels=descriptors_label_m)

### K Nearest Neighbors

In [None]:
# k-nearest neighbours on the multiclass descriptor features.
# NOTE(review): scikit-learn documents leaf_size as a speed/memory setting of
# the tree-based neighbour search, so this grid axis probably does not change
# accuracy - confirm whether it is worth searching.
params_knn = dict(
    n_neighbors=range(2, 11, 2),
    weights=['distance', 'uniform'],
    leaf_size=range(10, 50, 10),
    p=[1, 2],
)
knn = KNeighborsClassifier(n_jobs=-1)

ml(model=knn, dataset=descriptors_data_m, labels=descriptors_label_m, param=params_knn)

### Voting Classifier
This model will include all the previous models.

In [None]:
# Soft-voting ensemble over the rf, nb and knn objects created in the cells
# above. ml() does not return its tuned estimator, so rf and knn enter the
# ensemble with the hyperparameters they were constructed with.
voting = VotingClassifier(
    estimators=[
        ('Random Forest', rf),
        ('Naive Bayes', nb),
        ('KNN', knn),
    ],
    voting='soft',
    n_jobs=-1,
)

ml(model=voting, dataset=descriptors_data_m, labels=descriptors_label_m)

### Neural Network

In [None]:
# Multi-layer perceptron (with early stopping) on the multiclass descriptor
# features.
# NOTE(review): scikit-learn documents learning_rate as used only with
# solver='sgd'; the solver is left at its default here, so this grid axis
# likely has no effect - confirm the intended solver.
params_nn = dict(
    activation=['identity', 'logistic', 'tanh', 'relu'],
    learning_rate=['constant', 'invscaling', 'adaptive'],
)
nn = MLPClassifier(early_stopping=True)

ml(model=nn, dataset=descriptors_data_m, labels=descriptors_label_m, param=params_nn)

## Fingerprints

### Random Forest

In [None]:
# Random forest on the multiclass fingerprint features. The grid covers tree
# count, split criterion, features considered per split and bootstrapping.
params_rf = dict(
    n_estimators=range(10, 211, 50),
    criterion=['entropy', 'gini'],
    max_features=['sqrt', 'log2', None],
    bootstrap=[True, False],
)
rf = RandomForestClassifier(n_jobs=-1)

ml(model=rf, dataset=fingerprint_data_m, labels=fingerprint_label_m, param=params_rf)

### Gaussian Naive Bayes

In [None]:
# Gaussian naive Bayes on the multiclass fingerprint features; no grid is
# passed, so ml() skips the optimization step.
nb = GaussianNB()

ml(model=nb, dataset=fingerprint_data_m, labels=fingerprint_label_m)

### K Nearest Neighbors

In [None]:
# k-nearest neighbours on the multiclass fingerprint features.
# NOTE(review): scikit-learn documents leaf_size as a speed/memory setting of
# the tree-based neighbour search, so this grid axis probably does not change
# accuracy - confirm whether it is worth searching.
params_knn = dict(
    n_neighbors=range(2, 11, 2),
    weights=['distance', 'uniform'],
    leaf_size=range(10, 50, 10),
    p=[1, 2],
)
knn = KNeighborsClassifier(n_jobs=-1)

ml(model=knn, dataset=fingerprint_data_m, labels=fingerprint_label_m, param=params_knn)

### Voting Classifier
This model will include all the previous models.

In [None]:
# Soft-voting ensemble over the rf, nb and knn objects created in the cells
# above. ml() does not return its tuned estimator, so rf and knn enter the
# ensemble with the hyperparameters they were constructed with.
voting = VotingClassifier(
    estimators=[
        ('Random Forest', rf),
        ('Naive Bayes', nb),
        ('KNN', knn),
    ],
    voting='soft',
    n_jobs=-1,
)

ml(model=voting, dataset=fingerprint_data_m, labels=fingerprint_label_m)

### Neural Network

In [None]:
# Multi-layer perceptron (with early stopping) on the multiclass fingerprint
# features.
# NOTE(review): scikit-learn documents learning_rate as used only with
# solver='sgd'; the solver is left at its default here, so this grid axis
# likely has no effect - confirm the intended solver.
params_nn = dict(
    activation=['identity', 'logistic', 'tanh', 'relu'],
    learning_rate=['constant', 'invscaling', 'adaptive'],
)
nn = MLPClassifier(early_stopping=True)

ml(model=nn, dataset=fingerprint_data_m, labels=fingerprint_label_m, param=params_nn)