In [17]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Models and selection methods
from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
# Binary classifier metrics
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix, accuracy_score
# Linear regression metrics
from sklearn.metrics import explained_variance_score, mean_squared_error, max_error, mean_absolute_error
from scipy.stats import pearsonr
#Pré-Processamento de dados
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectFromModel, SequentialFeatureSelector

In [40]:
# Estatisticas para classificadores
def printClassResults(truth, preds):
    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(truth, preds))
    print("The Recall is: %7.4f" % recall_score(truth, preds))
    print("The F1 score is: %7.4f" % f1_score(truth, preds))
    print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(truth, preds))
    print()
    print("This is the Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(truth, preds)))

# Previsao de resultados com cross validation
def CrossValidation(X_TRAIN, y_TRAIN, kf, model):
    TRUTH=None
    PREDS=None
    for train_index, test_index in kf.split(X_TRAIN):
        X_train, X_test = X_TRAIN[train_index], X_TRAIN[test_index]
        y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]
        temp_model = clone(model)
        temp_model.fit(X_train, y_train)
        preds = temp_model.predict(X_test)
        if TRUTH is None:
            PREDS=preds
            TRUTH=y_test
        else:
            PREDS=np.hstack((PREDS, preds))
            TRUTH=np.hstack((TRUTH, y_test))
    return (TRUTH, PREDS)
    
# Model testing rapido
def naif_model_testing(X_train, X_test, y_train, y_test):
    rfr= RandomForestClassifier(n_jobs=-1)
    rfr.fit(X_train, y_train)
    dtr= DecisionTreeClassifier(max_depth=5)
    dtr.fit(X_train, y_train)
    lmr=LogisticRegression(n_jobs=-1)
    lmr.fit(X_train, y_train)
    rf_preds=rfr.predict(X_test)
    dt_preds=dtr.predict(X_test)
    lr_preds=lmr.predict(X_test)
    scores = [f1_score(y_test, rf_preds),f1_score(y_test, dt_preds),f1_score(y_test, lr_preds)]
    print("F1 RFs: %7.4f" % f1_score(y_test, rf_preds))
    print("F1 DTs: %7.4f" % f1_score(y_test, dt_preds))
    print("F1 LRs: %7.4f" % f1_score(y_test, lr_preds))
    print("F1 Avg:  %7.4f" % (sum(scores) / len(scores)))
    return (sum(scores) / len(scores))

def Step_for(X_train, X_test, y_train, y_test):
    
    N,M=X_train.shape

    #Vamos usar random forests
    rfr=RandomForestClassifier(random_state=45)
    sfs = SequentialFeatureSelector(rfr, n_features_to_select=10)
    sfs.fit(X_train, y_train)

    #get the relevant columns
    features=sfs.get_support()
    Features_selected =np.arange(M)[features]
    print("The features selected are columns: ", Features_selected)

    nX_train=sfs.transform(X_train)
    nX_test=sfs.transform(X_test)

    f1_avg = naif_model_testing(nX_train, nX_test, y_train, y_test)
    return (f1_avg, nX_train, nX_test)
    
def ML_Sel(X_train, X_test, y_train, y_test, thresh):
    N,M=X_train.shape

    rfr=RandomForestClassifier(random_state=45, n_jobs=-1)
    sel = SelectFromModel(estimator=rfr, threshold=thresh)
    sel.fit(X_train, y_train)
    
    print("Default threshold: ", sel.threshold_)
    features=sel.get_support()
    Features_selected =np.arange(M)[features]
    print("The features selected are columns: ", Features_selected)
    nX_train=sel.transform(X_train)
    nX_test=sel.transform(X_test)
    f1_avg = naif_model_testing(nX_train, nX_test, y_train, y_test)
    return (f1_avg, nX_train, nX_test)

## Pré-Processamento dos dados
Preparação do dataset - importação, normalização e preenchimento dos missing values

In [24]:
#Criar dataframe
bio_a = pd.read_csv('biodegradable_a.csv')
#Separação das 41 variáveis do y
X_bio_a=bio_a.drop(columns=["Biodegradable"])
y_bio_a=bio_a['Biodegradable'].apply(lambda x : 1 if x == 'RB' else 0)
#Converter para numpy array
Xc_bio= X_bio_a.to_numpy()
yc_bio= y_bio_a.to_numpy()
# Divisão do dataset em training set e independent validation set
X_bio_train, X_bio_test, y_bio_train, y_bio_test = train_test_split(Xc_bio, yc_bio, test_size=0.25, random_state=512)
# Kfold
kf = KFold(n_splits=16, shuffle=True, random_state = 274)

Passamos agora à normalização dos dados. Vão ser escolhidos os seguintes métodos de normalização para comparar mais tarde: MinMax Scaler, Standard Scaler e Power Transformer

In [12]:
#Passemos à normalização dos dados
scaler_p = PowerTransformer()
scaler_st = StandardScaler()
scaler_min= MinMaxScaler()

#Transformar dados
X_bio_train_p=scaler_p.fit_transform(X_bio_train)
X_bio_test_p=scaler_p.fit_transform(X_bio_test)

X_bio_train_st=scaler_st.fit_transform(X_bio_train)
X_bio_test_st=scaler_st.fit_transform(X_bio_test)

X_bio_train_min=scaler_min.fit_transform(X_bio_train)
X_bio_test_min=scaler_min.fit_transform(X_bio_test)

pd.DataFrame(X_bio_train_p)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,-1.857579,0.168452,-0.298474,-0.092755,-0.329914,-0.193061,-0.766057,-0.176119,-1.219043,1.235903,...,-0.113263,-0.457263,-0.427509,1.321538,-0.636159,2.246219,-0.468445,-0.949614,-0.222886,-0.233042
1,-1.854459,-1.019003,-0.298474,-0.092755,-0.329914,-0.193061,-0.766057,,0.819561,-0.615911,...,-0.113263,-0.457263,-0.427509,1.553357,-2.135502,0.987608,2.123623,-2.097699,-0.222886,
2,0.430395,1.193057,-0.298474,-0.092755,-0.329914,-0.193061,-0.766057,0.291224,-1.219043,0.109793,...,-0.113263,-0.457263,-0.427509,-0.874182,0.281114,0.540742,-0.468445,0.064084,-0.222886,-0.233042
3,0.421222,-0.036376,-0.298474,-0.092755,-0.329914,-0.193061,1.293260,1.092849,-1.219043,0.109793,...,-0.113263,-0.457263,-0.427509,-0.874182,0.456898,0.227531,-0.468445,0.252883,-0.222886,-0.233042
4,-0.751657,-1.775834,-0.298474,-0.092755,-0.329914,-0.193061,-0.766057,-0.466588,0.084134,-1.585803,...,-0.113263,-0.457263,-0.427509,0.821645,,-1.140070,-0.468445,-1.096429,-0.222886,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3418,1.664765,0.591478,3.349907,-0.092755,-0.329914,-0.193061,1.293260,,-1.219043,0.711807,...,-0.113263,-0.457263,-0.427509,1.321538,2.082672,,-0.468445,1.506523,-0.222886,-0.233042
3419,0.244807,-0.641362,-0.298474,-0.092755,-0.329914,-0.193061,-0.766057,-0.292831,0.819561,0.109793,...,-0.113263,-0.457263,-0.427509,1.553357,-0.191679,-0.552582,-0.468445,0.648379,-0.222886,-0.233042
3420,-0.562774,0.804821,-0.298474,-0.092755,-0.329914,-0.193061,-0.766057,,0.084134,0.109793,...,-0.113263,-0.457263,-0.427509,1.321538,0.404293,2.169303,-0.468445,-0.276885,-0.222886,-0.233042
3421,0.755840,-0.060657,-0.298474,-0.092755,-0.329914,-0.193061,1.293260,0.813510,1.329679,-1.585803,...,-0.113263,-0.457263,-0.427509,-0.874182,0.397339,-1.456878,-0.468445,0.039927,-0.222886,-0.233042


Com os dados normalizados, podemos passar à imputação dos valores em falta. Escolheu-se o método de imputação utilizando o K-Nearest Neighbours em todos 

In [14]:
#Tratamento dos Missing values -> Utilizar Imputação de KNN
imputer = KNNImputer(n_neighbors=3, weights="uniform")

imputer.fit(X_bio_train_p)
X_bio_train_p=imputer.transform(X_bio_train_p)
X_bio_test_p=imputer.transform(X_bio_test_p)

imputer.fit(X_bio_train_st)
X_bio_train_st=imputer.transform(X_bio_train_st)
X_bio_test_st=imputer.transform(X_bio_test_st)

imputer.fit(X_bio_train_min)
X_bio_train_min=imputer.transform(X_bio_train_min)
X_bio_test_min=imputer.transform(X_bio_test_min)


Resta apenas ver quais são as variáveis mais relevantes. Para tal, vamos utilizar dois métodos diferentes e posteriormente comparar: Stepwise Feature Selection e Random Forests para a seleção de Features

In [41]:
datasets = []

print("Stepwise")
datasets.append(Step_for(X_bio_train_st, X_bio_test_st, y_bio_train, y_bio_test))

print()
print("Random Forests")
datasets.append(ML_Sel(X_bio_train_st, X_bio_test_st, y_bio_train, y_bio_test, 0.035))

# escolher o conjunto de treino com maior f1_avg
_, X_train, X_test = max(datasets, key= lambda x : x[0])

Stepwise
The features selected are columns:  [ 0  2  5  6  7 10 15 21 22 33]
F1 RFs:  0.9775
F1 DTs:  0.9711
F1 LRs:  0.9493
F1 Avg:   0.9660

Random Forests
Default threshold:  0.035
The features selected are columns:  [ 0  2  4  5  6 10 21 33 35 40]
F1 RFs:  0.9769
F1 DTs:  0.9712
F1 LRs:  0.9651
F1 Avg:   0.9711


## Modelos

Nesta segunda parte iremos criar modelos que consigam prevêr se um químico é ou não biodegradável. Iremos também otimizar estes modelos consoante os seus hiperparâmetros. Os modelos a ser utilizados são: KNN, SVM, Random Forests, AdaBoost e XGBoost

Decision Tree Classifier

In [42]:
params = [
    {"max_depth" : [6,8,10,12,14,16,18,20,22,24,26,28,30],
    "min_samples_leaf" : [1,2,5,10,20],
    "min_samples_split" : [2,5,10,20],
    "criterion":['gini','entropy']}]

grid_search_treeclass = GridSearchCV(
    DecisionTreeClassifier(), params, scoring="f1", cv=kf, n_jobs=-1)

grid_search_treeclass.fit(X_train,  y_bio_train)
print(grid_search_treeclass.best_params_)

{'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 10, 'min_samples_split': 20}


In [43]:
printClassResults(*CrossValidation(X_train, y_bio_train, kf, grid_search_treeclass.best_estimator_))

The Accuracy is:  0.9489
The Precision is:  0.9669
The Recall is:  0.9719
The F1 score is:  0.9694
The Matthews correlation coefficient is:  0.8145

This is the Confusion Matrix
     0     1
0  477    95
1   80  2771


Regressão Logistica

In [44]:
params = [
    {"C" : [x*0.1 for x in range(1,11)],
    "max_iter" : [999999]}]

grid_search_log = GridSearchCV(
    LogisticRegression(), params, scoring="f1", cv=kf, n_jobs=-1)

grid_search_log.fit(X_train, y_bio_train)
print(grid_search_log.best_params_)

{'C': 0.7000000000000001, 'max_iter': 999999}


In [45]:
printClassResults(*CrossValidation(X_train, y_bio_train, kf, grid_search_log.best_estimator_))

The Accuracy is:  0.9375
The Precision is:  0.9426
The Recall is:  0.9849
The F1 score is:  0.9633
The Matthews correlation coefficient is:  0.7617

This is the Confusion Matrix
     0     1
0  401   171
1   43  2808


KNN

SVM

XGBOOST