In [14]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Models and selection methods
from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
# Binary classifier metrics
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix, accuracy_score
# Linear regression metrics
from sklearn.metrics import explained_variance_score, mean_squared_error, max_error, mean_absolute_error
from scipy.stats import pearsonr
#Scalers e Imputação
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, PowerTransformer
from sklearn.impute import KNNImputer
import seaborn as sns

In [3]:
# Summary statistics for classifiers
def printClassResults(truth, preds):
    """Print a text report comparing predicted labels against the true labels.

    The report lists accuracy, precision, recall, F1 score and the Matthews
    correlation coefficient (each formatted with ``%7.4f``), followed by the
    confusion matrix wrapped in a pandas DataFrame.

    Parameters
    ----------
    truth : array-like
        Ground-truth labels; passed as the first argument to every metric.
    preds : array-like
        Predicted labels; passed as the second argument to every metric.

    Returns
    -------
    None
        Everything is written to standard output.

    Notes
    -----
    Every metric is called with scikit-learn's default arguments.
    """
    # (format string, metric function) pairs, in the order they are printed.
    report_lines = (
        ("The Accuracy is: %7.4f", accuracy_score),
        ("The Precision is: %7.4f", precision_score),
        ("The Recall is: %7.4f", recall_score),
        ("The F1 score is: %7.4f", f1_score),
        ("The Matthews correlation coefficient is: %7.4f", matthews_corrcoef),
    )
    for template, metric in report_lines:
        print(template % metric(truth, preds))

    print()
    print("This is the Confusion Matrix")
    matrix_frame = pd.DataFrame(confusion_matrix(truth, preds))
    print(matrix_frame)

# Plot of the predictions against the real data
def displayPlot(preds, truth):
    """Show a square scatter plot of predictions versus true values.

    Parameters
    ----------
    preds : array-like
        Predicted values, drawn on the x axis.
    truth : array-like
        True values, drawn on the y axis.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    _, ax = plt.subplots(figsize=(5, 5))
    ax.scatter(preds, truth)
    # Red reference line through the origin with slope 1 (where preds == truth).
    ax.axline((0, 0), slope=1, color="red")
    ax.grid()
    plt.show()

## Pré-Processamento dos dados
Preparação do dataset: importação, divisão em conjuntos de treino e de validação independente, normalização e preenchimento dos valores em falta (missing values).

In [13]:
# Load the dataset into a DataFrame
bio_a = pd.read_csv('biodegradable_a.csv')

# Separate the target column (y) from the 41 feature columns (X)
y_bio_a = bio_a['Biodegradable']
X_bio_a = bio_a.drop(columns=["Biodegradable"])

# Convert both parts to NumPy arrays
Xc_bio = X_bio_a.to_numpy()
yc_bio = y_bio_a.to_numpy()
print(Xc_bio)

# Split the dataset into a training set and an independent validation set
# (25% held out; fixed random_state so the split is reproducible)
X_bio_train, X_bio_test, y_bio_train, y_bio_test = train_test_split(
    Xc_bio,
    yc_bio,
    test_size=0.25,
    random_state=512,
)

[[3.919      2.6909     0.         ... 7.253      0.         0.        ]
 [4.17       2.1144     0.         ... 7.257      0.         0.        ]
 [3.932      3.2512     0.         ... 7.601      0.         0.        ]
 ...
 [4.29477099 3.47122594 0.         ... 7.69932352 0.         0.        ]
 [4.56037572 3.89639031 0.         ... 7.90880199 0.         0.        ]
 [4.04509735 2.84718513 0.         ... 6.86331402 0.         0.        ]]


In [18]:
# Next step: normalise the data
# NOTE: from here on this is only an experiment (just so the rest of the code can proceed)
scaler = PowerTransformer()
# Estimate the transform parameters from the training split only.
X_bio_train_n = scaler.fit_transform(X_bio_train)
# Apply the training-fitted transform to the held-out split. Calling
# fit_transform here would re-estimate the parameters from the test data,
# which puts the two splits on inconsistent scales and lets test-set
# statistics into the preprocessing.
X_bio_test_n = scaler.transform(X_bio_test)

print(X_bio_train_n)

[[-1.85757882  0.16845239 -0.29847422 ... -0.94961448 -0.22288565
  -0.23304178]
 [-1.85445921 -1.0190031  -0.29847422 ... -2.09769934 -0.22288565
          nan]
 [ 0.43039516  1.19305664 -0.29847422 ...  0.06408421 -0.22288565
  -0.23304178]
 ...
 [-0.56277432  0.80482105 -0.29847422 ... -0.27688505 -0.22288565
  -0.23304178]
 [ 0.75583959 -0.06065668 -0.29847422 ...  0.03992748 -0.22288565
  -0.23304178]
 [ 1.05482077 -0.19308493 -0.29847422 ...  0.76443062  4.48660609
  -0.23304178]]


In [19]:
# Missing-value handling -> KNN imputation
imputer = KNNImputer(n_neighbors=2, weights="uniform")
# Keep the imputed training matrix; previously the result was only displayed,
# so X_bio_train_n still contained NaN afterwards.
X_bio_train_imp = imputer.fit_transform(X_bio_train_n)
# Fill the held-out split with the imputer fitted on the training split
# (transform only, no re-fit on test data).
X_bio_test_imp = imputer.transform(X_bio_test_n)
# Last expression of the cell: display the imputed training data as a DataFrame.
pd.DataFrame(X_bio_train_imp)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,-1.857579,0.168452,-0.298474,-0.092755,-0.329914,-0.193061,-0.766057,-0.176119,-1.219043,1.235903,...,-0.113263,-0.457263,-0.427509,1.321538,-0.636159,2.246219,-0.468445,-0.949614,-0.222886,-0.233042
1,-1.854459,-1.019003,-0.298474,-0.092755,-0.329914,-0.193061,-0.766057,-1.944854,0.819561,-0.615911,...,-0.113263,-0.457263,-0.427509,1.553357,-2.135502,0.987608,2.123623,-2.097699,-0.222886,-0.233042
2,0.430395,1.193057,-0.298474,-0.092755,-0.329914,-0.193061,-0.766057,0.291224,-1.219043,0.109793,...,-0.113263,-0.457263,-0.427509,-0.874182,0.281114,0.540742,-0.468445,0.064084,-0.222886,-0.233042
3,0.421222,-0.036376,-0.298474,-0.092755,-0.329914,-0.193061,1.293260,1.092849,-1.219043,0.109793,...,-0.113263,-0.457263,-0.427509,-0.874182,0.456898,0.227531,-0.468445,0.252883,-0.222886,-0.233042
4,-0.751657,-1.775834,-0.298474,-0.092755,-0.329914,-0.193061,-0.766057,-0.466588,0.084134,-1.585803,...,-0.113263,-0.457263,-0.427509,0.821645,-0.735963,-1.140070,-0.468445,-1.096429,-0.222886,-0.233042
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3418,1.664765,0.591478,3.349907,-0.092755,-0.329914,-0.193061,1.293260,0.257541,-1.219043,0.711807,...,-0.113263,-0.457263,-0.427509,1.321538,2.082672,0.859662,-0.468445,1.506523,-0.222886,-0.233042
3419,0.244807,-0.641362,-0.298474,-0.092755,-0.329914,-0.193061,-0.766057,-0.292831,0.819561,0.109793,...,-0.113263,-0.457263,-0.427509,1.553357,-0.191679,-0.552582,-0.468445,0.648379,-0.222886,-0.233042
3420,-0.562774,0.804821,-0.298474,-0.092755,-0.329914,-0.193061,-0.766057,-1.396728,0.084134,0.109793,...,-0.113263,-0.457263,-0.427509,1.321538,0.404293,2.169303,-0.468445,-0.276885,-0.222886,-0.233042
3421,0.755840,-0.060657,-0.298474,-0.092755,-0.329914,-0.193061,1.293260,0.813510,1.329679,-1.585803,...,-0.113263,-0.457263,-0.427509,-0.874182,0.397339,-1.456878,-0.468445,0.039927,-0.222886,-0.233042
