In [1]:
# Imports
import pandas as pd
import numpy as np
# Models and selection methods
from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
# Binary classifier metrics
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix, accuracy_score
#Pré-Processamento de dados
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.feature_selection import SelectFromModel, SequentialFeatureSelector

  from pandas import MultiIndex, Int64Index


In [2]:
# Estatisticas para classificadores
def printClassResults(truth, preds):
    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(truth, preds))
    print("The Recall is: %7.4f" % recall_score(truth, preds))
    print("The F1 score is: %7.4f" % f1_score(truth, preds))
    matthews = matthews_corrcoef(truth, preds)
    print("The Matthews correlation coefficient is: %7.4f" % matthews)
    print()
    print("This is the Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(truth, preds)))

# Previsao de resultados com cross validation
def CrossValidation(X_TRAIN, y_TRAIN, kf, model):
    TRUTH=None
    PREDS=None
    for train_index, test_index in kf.split(X_TRAIN):
        X_train, X_test = X_TRAIN[train_index], X_TRAIN[test_index]
        y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]
        temp_model = clone(model)
        temp_model.fit(X_train, y_train)
        preds = temp_model.predict(X_test)
        if TRUTH is None:
            PREDS=preds
            TRUTH=y_test
        else:
            PREDS=np.hstack((PREDS, preds))
            TRUTH=np.hstack((TRUTH, y_test))
    return (TRUTH, PREDS)
    
# Model testing rapido
def naif_model_testing(X_train_bio, y_train_bio):
    X_train, X_test, y_train, y_test = train_test_split(X_train_bio, y_train_bio, test_size=0.25, random_state=27)
    rfr= RandomForestClassifier(n_jobs=8)
    rfr.fit(X_train, y_train)
    dtr= DecisionTreeClassifier(max_depth=5)
    dtr.fit(X_train, y_train)
    lmr=LogisticRegression(n_jobs=8)
    lmr.fit(X_train, y_train)
    rf_preds=rfr.predict(X_test)
    dt_preds=dtr.predict(X_test)
    lr_preds=lmr.predict(X_test)
    scores = [f1_score(y_test, rf_preds),f1_score(y_test, dt_preds),f1_score(y_test, lr_preds)]
    print("F1 RFs: %7.4f" % f1_score(y_test, rf_preds))
    print("F1 DTs: %7.4f" % f1_score(y_test, dt_preds))
    print("F1 LRs: %7.4f" % f1_score(y_test, lr_preds))
    print("F1 Avg:  %7.4f" % (sum(scores) / len(scores)))
    return (sum(scores) / len(scores))

#Escolhas de features metodo stepwise
def Step_for(X_train, X_test, y_train):
    
    N,M=X_train.shape

    #Vamos usar random forests
    rfr=RandomForestClassifier(random_state=45, n_jobs=8)
    sfs = SequentialFeatureSelector(rfr, n_features_to_select=10)
    sfs.fit(X_train, y_train)

    #get the relevant columns
    features=sfs.get_support()
    Features_selected =np.arange(M)[features]
    print("The features selected are columns: ", Features_selected)

    nX_train=sfs.transform(X_train)
    nX_test=sfs.transform(X_test)

    f1_avg = naif_model_testing(nX_train, y_train)
    return (f1_avg, nX_train, nX_test)
    
def ML_Sel(X_train, X_test, y_train, thresh):
    N,M=X_train.shape

    rfr=RandomForestClassifier(random_state=45, n_jobs=8)
    sel = SelectFromModel(estimator=rfr, threshold=thresh)
    sel.fit(X_train, y_train)
    
    print("Default threshold: ", sel.threshold_)
    features=sel.get_support()
    Features_selected =np.arange(M)[features]
    print("The features selected are columns: ", Features_selected)
    nX_train=sel.transform(X_train)
    nX_test=sel.transform(X_test)
    f1_avg = naif_model_testing(nX_train, y_train)
    return (f1_avg, nX_train, nX_test)

#Função de peso para o KNN
def gaussian(dsts):
    kernel_width = .5
    weights = np.exp(-(dsts**2)/kernel_width)
    return weights

## Pré-Processamento dos dados
Preparação do dataset - importação, normalização e preenchimento dos missing values

In [3]:
#Criar dataframe
bio_a = pd.read_csv('biodegradable_a.csv')
#Separação das 41 variáveis do y
X_bio_a=bio_a.drop(columns=["Biodegradable"])
y_bio_a=bio_a['Biodegradable'].apply(lambda x : 1 if x == 'RB' else 0)
#Converter para numpy array
Xc_bio= X_bio_a.to_numpy()
yc_bio= y_bio_a.to_numpy()
# Divisão do dataset em training set e independent validation set
X_bio_train, X_bio_test, y_bio_train, y_bio_test = train_test_split(Xc_bio, yc_bio, test_size=0.25, random_state=512)
# Kfold
kf = KFold(n_splits=16, shuffle=True, random_state = 274)

In [4]:
# Converter os tipos de floats para outros, para puder determinar as categoricas pelo tipo Int64
X_train_temp = pd.DataFrame(X_bio_train).convert_dtypes()
X_test_temp = pd.DataFrame(X_bio_test).convert_dtypes()

# Imputting das variaveis categoricas utilizando a moda
imputer_categoricas = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Selecionar as colunas com variaveis Int64
imputer_categoricas.fit(X_train_temp.select_dtypes("Int64"))
#Criar um dataframe com as varaiveis categoricas sem missing values
X_train_categorical = imputer_categoricas.transform(X_train_temp.select_dtypes("Int64"))
X_test_categorical = imputer_categoricas.transform(X_test_temp.select_dtypes("Int64"))

#Criar um dataframe com as variaveis continuas
X_train_continuos = X_train_temp.select_dtypes(exclude="Int64")
X_test_continuos = X_test_temp.select_dtypes(exclude="Int64")

# Juntar os dataframes das categoricas e continuas com os indices de coluna originais, dar sort das colunas por indice, e converter para float para puder fazer scaling
X_bio_train_step1 = X_train_continuos.join(pd.DataFrame(X_train_categorical, columns=X_train_temp.select_dtypes("Int64").columns)).sort_index(axis=1).astype(float)
X_bio_test_step1 = X_test_continuos.join(pd.DataFrame(X_test_categorical, columns=X_train_temp.select_dtypes("Int64").columns)).sort_index(axis=1).astype(float)

Passamos agora à normalização dos dados. Vão ser escolhidos os seguintes métodos de normalização para comparar mais tarde: MinMax Scaler, Standard Scaler e Power Transformer

In [5]:
#Passemos à normalização dos dados
scaler_p = PowerTransformer()
scaler_st = StandardScaler()
scaler_min= MinMaxScaler()

#Transformar dados
X_bio_train_p=scaler_p.fit_transform(X_bio_train_step1)
X_bio_test_p=scaler_p.fit_transform(X_bio_test_step1)

X_bio_train_st=scaler_st.fit_transform(X_bio_train_step1)
X_bio_test_st=scaler_st.fit_transform(X_bio_test_step1)

X_bio_train_min=scaler_min.fit_transform(X_bio_train_step1)
X_bio_test_min=scaler_min.fit_transform(X_bio_test_step1)

Com os dados normalizados, podemos passar à imputação dos valores em falta. Escolheu-se o método de imputação utilizando o K-Nearest Neighbours em todos 

In [6]:
#Tratamento dos Missing values -> Utilizar Imputação de KNN

imputer_continuas = KNNImputer(n_neighbors=3, weights="uniform")


datasets_scaled = []

imputer_continuas.fit(X_bio_train_p)
X_bio_train_p=imputer_continuas.transform(X_bio_train_p)
X_bio_test_p=imputer_continuas.transform(X_bio_test_p)
datasets_scaled.append(("PowerTransformer",X_bio_train_p, X_bio_test_p))

imputer_continuas.fit(X_bio_train_st)
imputer_categoricas.fit(X_bio_train_st)
X_bio_train_st=imputer_continuas.transform(X_bio_train_st)
X_bio_test_st=imputer_continuas.transform(X_bio_test_st)
datasets_scaled.append(("StandardScaler",X_bio_train_st, X_bio_test_st))

imputer_continuas.fit(X_bio_train_min)
X_bio_train_min=imputer_continuas.transform(X_bio_train_min)
X_bio_test_min=imputer_continuas.transform(X_bio_test_min)
datasets_scaled.append(("MinMax",X_bio_train_min, X_bio_test_min))

Resta apenas ver quais são as variáveis mais relevantes. Para tal, vamos utilizar dois métodos diferentes e posteriormente comparar: Stepwise Feature Selection e Random Forests para a seleção de Features

In [7]:
datasets_reduced = []
for name, x_train_scaled, x_test_scaled in datasets_scaled:
    print("Scaling:", name)
    print("Stepwise")
    datasets_reduced.append(Step_for(x_train_scaled, x_test_scaled, y_bio_train))
    print("Random Forests")
    datasets_reduced.append(ML_Sel(x_train_scaled, x_test_scaled, y_bio_train, 0.035))
    print()


Scaling: PowerTransformer
Stepwise
The features selected are columns:  [ 2  5  6  7 10 15 17 33 35 36]
F1 RFs:  0.9822
F1 DTs:  0.9704
F1 LRs:  0.9517
F1 Avg:   0.9681
Random Forests
Default threshold:  0.035
The features selected are columns:  [ 0  2  4  5  6 10 21 26 33 35 40]
F1 RFs:  0.9790
F1 DTs:  0.9646
F1 LRs:  0.9634
F1 Avg:   0.9690

Scaling: StandardScaler
Stepwise
The features selected are columns:  [ 2  3  4  5 10 15 22 31 33 37]
F1 RFs:  0.9748
F1 DTs:  0.9667
F1 LRs:  0.9696
F1 Avg:   0.9704
Random Forests
Default threshold:  0.035
The features selected are columns:  [ 0  2  4  5  6 10 21 33 35 40]
F1 RFs:  0.9789
F1 DTs:  0.9652
F1 LRs:  0.9690
F1 Avg:   0.9710

Scaling: MinMax
Stepwise
The features selected are columns:  [ 2  4  5  6 10 15 22 33 37 40]
F1 RFs:  0.9782
F1 DTs:  0.9709
F1 LRs:  0.9521
F1 Avg:   0.9671
Random Forests
Default threshold:  0.035
The features selected are columns:  [ 0  2  4  5  6 10 21 33 35 40]
F1 RFs:  0.9782
F1 DTs:  0.9639
F1 LRs:  0.955

In [8]:
# escolher o conjunto de treino com maior f1_avg
_,X_train, X_test = max(datasets_reduced, key= lambda x : x[0])

## Modelos

Nesta segunda parte iremos criar modelos que consigam prevêr se um químico é ou não biodegradável. Iremos também otimizar estes modelos consoante os seus hiperparâmetros. Os modelos a ser utilizados são: Decision Tree, Regressão Logística, KNN, SVM, Random Forests e XGBoost

In [9]:
# Vamos guardar aqui os modelos e seu respetivo mathews correlaction coef
models = []

**Decision Tree Classifier**

In [10]:
params = [
    {"max_depth" : [6,8,10,12,14,16,18,20,22,24,26,28,30],
    "min_samples_leaf" : [1,2,5,10, 15, 20],
    "min_samples_split" : [2,5,10, 20, 25, 30],
    "criterion":['gini','entropy']}]

grid_search_treeclass = GridSearchCV(
    DecisionTreeClassifier(), params, scoring="f1", cv=kf, n_jobs=-1)

grid_search_treeclass.fit(X_train,  y_bio_train)
print("Melhores Parâmetros:", grid_search_treeclass.best_params_,"\n")
(Truth, Preds) = CrossValidation(X_test, y_bio_test, kf, grid_search_treeclass.best_estimator_)
printClassResults(Truth, Preds)
models.append((matthews_corrcoef(Truth, Preds), grid_search_treeclass.best_estimator_))

Melhores Parâmetros: {'criterion': 'gini', 'max_depth': 12, 'min_samples_leaf': 1, 'min_samples_split': 5} 

The Accuracy is:  0.9299
The Precision is:  0.9608
The Recall is:  0.9569
The F1 score is:  0.9588
The Matthews correlation coefficient is:  0.7222

This is the Confusion Matrix
     0    1
0  129   38
1   42  932


**Regressão Logistica**

In [11]:
params = [
    {"C" : [x*0.1 for x in range(1,11)],
    "max_iter" : [999999]}]

grid_search_log = GridSearchCV(
    LogisticRegression(), params, scoring="f1", cv=kf, n_jobs=8)

grid_search_log.fit(X_train, y_bio_train)
print("Melhores Parâmetros:", grid_search_log.best_params_,"\n")
(Truth, Preds) = CrossValidation(X_test, y_bio_test, kf, grid_search_log.best_estimator_)
printClassResults(Truth, Preds)
models.append((matthews_corrcoef(Truth, Preds), grid_search_log.best_estimator_))

Melhores Parâmetros: {'C': 0.6000000000000001, 'max_iter': 999999} 

The Accuracy is:  0.9457
The Precision is:  0.9479
The Recall is:  0.9908
The F1 score is:  0.9689
The Matthews correlation coefficient is:  0.7675

This is the Confusion Matrix
     0    1
0  114   53
1    9  965


**Gaussian Naive Bayes**

In [12]:
gaussNB = GaussianNB()
gaussNB.fit(X_train, y_bio_train)
(Truth, Preds) = CrossValidation(X_test, y_bio_test, kf, gaussNB)
printClassResults(Truth, Preds)
models.append((matthews_corrcoef(Truth, Preds), gaussNB))

The Accuracy is:  0.9229
The Precision is:  0.9567
The Recall is:  0.9528
The F1 score is:  0.9547
The Matthews correlation coefficient is:  0.6945

This is the Confusion Matrix
     0    1
0  125   42
1   46  928


**KNN**

In [13]:
params = [{"n_neighbors": [1,2,3,4,5,6,7,8,9,10],
          "weights":["uniform", "distance", gaussian]}]

grid_search_knn = GridSearchCV(
    KNeighborsClassifier(), params, scoring="f1", cv=kf, n_jobs=8)

grid_search_knn.fit(X_train, y_bio_train)
print("Melhores Parâmetros:", grid_search_knn.best_params_,"\n")
(Truth, Preds) = CrossValidation(X_test, y_bio_test, kf, grid_search_knn.best_estimator_)
printClassResults(Truth, Preds)
models.append((matthews_corrcoef(Truth, Preds), grid_search_knn.best_estimator_))

Melhores Parâmetros: {'n_neighbors': 4, 'weights': <function gaussian at 0x00000207401E2670>} 

The Accuracy is:  0.9483
The Precision is:  0.9561
The Recall is:  0.9846
The F1 score is:  0.9702
The Matthews correlation coefficient is:  0.7817

This is the Confusion Matrix
     0    1
0  123   44
1   15  959


**SVM**

In [14]:
params =[{"kernel": ['linear','rbf','sigmoid'],
         "gamma": [0.1,0.5,1,10,100,1000],
         "C": [0.1,1,10,100,1000]}]

grid_search_svc = GridSearchCV(
    SVC(), params, scoring="f1", cv=kf, n_jobs=8)

grid_search_svc.fit(X_train, y_bio_train)
print("Melhores Parâmetros:", grid_search_svc.best_params_,"\n")
(Truth, Preds) = CrossValidation(X_test, y_bio_test, kf, grid_search_svc.best_estimator_)
printClassResults(Truth, Preds)
models.append((matthews_corrcoef(Truth, Preds), grid_search_svc.best_estimator_))

Melhores Parâmetros: {'C': 10, 'gamma': 0.5, 'kernel': 'rbf'} 

The Accuracy is:  0.9483
The Precision is:  0.9626
The Recall is:  0.9774
The F1 score is:  0.9699
The Matthews correlation coefficient is:  0.7862

This is the Confusion Matrix
     0    1
0  130   37
1   22  952


**RANDOM FOREST**

In [15]:
params = [
    {"n_estimators": [10,100,1000],
    "max_depth" : [4,10,16,22,28],
    "min_samples_leaf" : [5,10,20],
    "min_samples_split" : [5,10,20,30],
    "criterion":['gini','entropy']}]

grid_search_rfc = GridSearchCV(
    RandomForestClassifier(), params, scoring="f1", cv=kf, n_jobs=8)

grid_search_rfc.fit(X_train,  y_bio_train)
print("Melhores Parâmetros:", grid_search_rfc.best_params_,"\n")
(Truth, Preds) = CrossValidation(X_test, y_bio_test, kf, grid_search_rfc.best_estimator_)
printClassResults(Truth, Preds)
models.append((matthews_corrcoef(Truth, Preds), grid_search_rfc.best_estimator_))

Melhores Parâmetros: {'criterion': 'entropy', 'max_depth': 22, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 1000} 

The Accuracy is:  0.9457
The Precision is:  0.9488
The Recall is:  0.9897
The F1 score is:  0.9688
The Matthews correlation coefficient is:  0.7677

This is the Confusion Matrix
     0    1
0  115   52
1   10  964


**ADABOOST**

In [16]:
params =[{"n_estimators": [10,100],
          "learning_rate": [0.01,0.1,1],
          "base_estimator": [GaussianNB(), RandomForestClassifier(max_depth=5)]}]

grid_search_ada = GridSearchCV(
    AdaBoostClassifier(), params, scoring="f1", cv=kf, n_jobs=-1)

grid_search_ada.fit(X_train, y_bio_train)
print("Melhores Parâmetros:", grid_search_ada.best_params_,"\n")
(Truth, Preds) = CrossValidation(X_test, y_bio_test, kf, grid_search_ada.best_estimator_)
printClassResults(Truth, Preds)
models.append((matthews_corrcoef(Truth, Preds), grid_search_ada.best_estimator_))

Melhores Parâmetros: {'base_estimator': RandomForestClassifier(max_depth=5), 'learning_rate': 1, 'n_estimators': 100} 

The Accuracy is:  0.9562
The Precision is:  0.9602
The Recall is:  0.9897
The F1 score is:  0.9747
The Matthews correlation coefficient is:  0.8158

This is the Confusion Matrix
     0    1
0  127   40
1   10  964


**XGBOOST**

In [17]:
params=[{"n_estimators": [10,100,500,1000],
        "max_depth" : [4,8,12,16,20,24,28],
        "learning_rate":[0.01,0.1,0.5,1]}]

grid_search_xgb = GridSearchCV(
    XGBClassifier(), params, scoring="f1", cv=kf, n_jobs=8)

grid_search_xgb.fit(X_train, y_bio_train)
(Truth, Preds) = CrossValidation(X_test, y_bio_test, kf, grid_search_xgb.best_estimator_)
models.append((matthews_corrcoef(Truth, Preds), grid_search_xgb.best_estimator_))





In [18]:
print("Melhores Parâmetros:", grid_search_xgb.best_params_,"\n")
printClassResults(Truth, Preds)

Melhores Parâmetros: {'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 1000} 

The Accuracy is:  0.9492
The Precision is:  0.9589
The Recall is:  0.9825
The F1 score is:  0.9706
The Matthews correlation coefficient is:  0.7869

This is the Confusion Matrix
     0    1
0  126   41
1   17  957


In [19]:
# Escolhemos automaticamente o melhor modelo baseado no matthews coeficient
# Lembremos que models = [(Matthews_coef, model_1),...]
best_model = max(models, key= lambda x: x[0])[1]
best_model

AdaBoostClassifier(base_estimator=RandomForestClassifier(max_depth=5),
                   learning_rate=1, n_estimators=100)