In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, KFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, PowerTransformer
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import seaborn as sns
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.svm import SVC
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
# Load data from CSV file
data = pd.read_csv('biodegradable_a.csv')
#Since the Biodegradable has no Numbers we have to separate it from de data set 
# select all columns except for 'Biodegradable'
dataNB = data.drop('Biodegradable', axis=1)
# create a separate variable for 'Biodegradable'
BioD = data['Biodegradable']
#now we are gointto separte the data in a 80/20% split portion 
dataNB_80, dataNB_20, BioD_80, BioD_20 = train_test_split(dataNB, BioD, test_size=0.2, random_state=42)
BioD_80_Bin = BioD_80.replace({'RB': 1, 'NRB': 0})
#First we are going to use 2 different methods to replace all the missing values
#Then we are going to normalize the dataset using 4 different methods and classify using each one to test the best outcome
#Using Univariate Imputation
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
dataNB_Imputed = pd.DataFrame(imputer.fit_transform(dataNB_80))
#Using KNN Imputation, for now we decided to use 4 neighbours
neighboursImp = KNNImputer(n_neighbors=4, weights="uniform")
dataNB_NeighboursImp = pd.DataFrame(neighboursImp.fit_transform(dataNB_80))
#Let's star with the Standar Scaler
scalerS = StandardScaler()
scalerNB_Imputed = scalerS.fit_transform(dataNB_Imputed)
scalerNB_NeighboursImp = scalerS.fit_transform(dataNB_NeighboursImp)
# normalize the data using MinMaxScaler
MinMax = MinMaxScaler()
MinMaxNB_Imputed = MinMax.fit_transform(dataNB_Imputed)
MinMaxNB_Neigbours = MinMax.fit_transform(dataNB_NeighboursImp)
#normalize the data using PowerTransformer
normalizerPT = PowerTransformer()
PT_NB_Imputed = normalizerPT.fit_transform(dataNB_Imputed)
PT_NB_Neigbours = normalizerPT.fit_transform(dataNB_NeighboursImp)
#normalize the data using Normalizer
theNormalizer = Normalizer()
imputed_Normalised_NB = theNormalizer.fit_transform(dataNB_Imputed)
neighbours_Normalised_NB = theNormalizer.fit_transform(dataNB_NeighboursImp)

In [2]:
scalers = [scalerNB_Imputed, scalerNB_NeighboursImp, MinMaxNB_Imputed, MinMaxNB_Neigbours, PT_NB_Imputed, PT_NB_Neigbours, imputed_Normalised_NB, neighbours_Normalised_NB]
logistic_regression = LogisticRegression()
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier()
kf = KFold(n_splits=5, shuffle=True)
scaler_accuracies = {}
scaler_precision = {}
scaler_f1 = {}
for scaler in scalers:
    accuracies = []
    precisions = []
    f1s = []
    for train_index, test_index in kf.split(scaler):
        X_train, X_test = scaler[train_index], scaler[test_index]
        y_train, y_test = BioD_80_Bin.iloc[train_index], BioD_80_Bin.iloc[test_index]
        logistic_regression.fit(X_train, y_train)
        decision_tree.fit(X_train, y_train)
        random_forest.fit(X_train, y_train)
        # Make predictions
        lr_pred = logistic_regression.predict(X_test)
        dt_pred = decision_tree.predict(X_test)
        rf_pred = random_forest.predict(X_test)
        # Calculate accuracies
        lr_accuracy = accuracy_score(y_test, lr_pred)
        dt_accuracy = accuracy_score(y_test, dt_pred)
        rf_accuracy = accuracy_score(y_test, rf_pred)
        # Calculate precision
        lr_precison = precision_score(y_test, lr_pred)
        dt_precison = precision_score(y_test, dt_pred)
        rf_precison = precision_score(y_test, rf_pred)
        # Calculate f1
        lr_f1 = f1_score(y_test, lr_pred)
        dt_f1 = f1_score(y_test, dt_pred)
        rf_f1 = f1_score(y_test, rf_pred)
        # Store the average accuracy for this fold
        accuracies.append((lr_accuracy + dt_accuracy + rf_accuracy) / 3)
        precisions.append((lr_precison + dt_precison + rf_precison) / 3)
        f1s.append((lr_f1 + dt_f1 + rf_f1) / 3)
    scaler_accuracies[str(scaler)] = sum(accuracies) / len(accuracies)
    scaler_precision[str(scaler)] = sum(precisions) / len(precisions)
    scaler_f1[str(scaler)] = sum(f1s) / len(f1s)
# Print the average accuracy for each scaler
for scaler in scalers:
    accuracy1 = scaler_accuracies[str(scaler)]
    accuracy2 = scaler_precision[str(scaler)]
    accuracy3 = scaler_f1[str(scaler)]
    print("Average Accuracy: {:.4f}".format(accuracy1))
    print("Average precision: {:.4f}".format(accuracy2))
    print("Average f1: {:.4f}".format(accuracy3))
    print("\n")    
#After analyzing all the vales from all scalers we concluded that the best scaler to use is the scalerNB_NeighboursImp, which is the normalized data set using the minmax scaler and with all the values
#it's the dataset normalized by the standard scaler method and imputed with the KNN Imputed method

Average Accuracy: 0.9572
Average precision: 0.9685
Average f1: 0.9747


Average Accuracy: 0.9594
Average precision: 0.9708
Average f1: 0.9759


Average Accuracy: 0.9500
Average precision: 0.9587
Average f1: 0.9707


Average Accuracy: 0.9500
Average precision: 0.9587
Average f1: 0.9707


Average Accuracy: 0.9539
Average precision: 0.9653
Average f1: 0.9728


Average Accuracy: 0.9554
Average precision: 0.9669
Average f1: 0.9736


Average Accuracy: 0.9271
Average precision: 0.9381
Average f1: 0.9583


Average Accuracy: 0.9296
Average precision: 0.9399
Average f1: 0.9598




In [3]:
normalized_Final = pd.DataFrame(scalerS.fit_transform(dataNB_NeighboursImp), columns=dataNB_80.columns)
#Now we are goin to use several method to eliminate unwanted varuables starting by Feature selection Using correlation
# Calculate Pearson correlation coefficient matrix
corr_matrix = pd.DataFrame(np.corrcoef(normalized_Final.T))
target_corr = corr_matrix.iloc[0, 1:]
# Sort the correlation values in descending order
sorted_corr = target_corr.abs().sort_values(ascending=False)
# Select the top 5 variables with the highest absolute correlation
bottom_5_variables = sorted_corr.tail(5)
bottom_5_variable_names = bottom_5_variables.index.map(lambda x: normalized_Final.columns[x])
correlation_normalized_Final = normalized_Final.drop(bottom_5_variable_names, axis=1)

In [4]:
#now we are goin to use feature selection using stepwise methods, this time we are using Sequential Selector form scikit
BioD_80_Bin = BioD_80.replace({'RB': 1, 'NRB': 0})
X_train, X_test, y_train, y_test = train_test_split(normalized_Final, BioD_80_Bin, test_size=0.2)
lmr=LinearRegression()
sfs = SequentialFeatureSelector(lmr, n_features_to_select=36) #36 melhores ou seja ignora as últimas 5
sfs.fit(X_train, y_train)
#get the relevant columns
features=sfs.get_support()
features_selected = np.where(features)[0]
frontward_normalized_Final = normalized_Final.iloc[:, features_selected]
#the same but this time using backwards:
sfs = SequentialFeatureSelector(lmr, n_features_to_select=36, direction= 'backward') #36 melhores ou seja ignora as últimas 5
sfs.fit(X_train, y_train)
#get the relevant columns
features2=sfs.get_support()
features_selected2 = np.where(features)[0]
backward_normalized_Final = normalized_Final.iloc[:, features_selected2]
#now to select the best features we are going to use linear and Kernel PCA starting with linear
from sklearn.decomposition import PCA
pca = PCA(n_components=normalized_Final.shape[1])
pca.fit(normalized_Final)
explained_variance_ratio = pca.explained_variance_ratio_
sorted_variance_ratio = np.sort(explained_variance_ratio)
column_names = normalized_Final.columns
bottom_5_column_names = column_names[np.argsort(explained_variance_ratio)[:5]]
PCA_normalized_Final = normalized_Final.drop(columns=bottom_5_column_names)

In [5]:
#now we will test to see wich method produces the best results
cuted = [correlation_normalized_Final, frontward_normalized_Final, backward_normalized_Final, PCA_normalized_Final]
rfc = RandomForestClassifier()
lmr = LogisticRegression()
dtc = DecisionTreeClassifier(max_depth=5)
kf = KFold(n_splits=5, shuffle=True)
method_scores = {}
for cuter in cuted:
    scores = []
    for train_index, test_index in kf.split(cuter):
        X_train, X_test = cuter.iloc[train_index], cuter.iloc[test_index]
        y_train, y_test = BioD_80_Bin.iloc[train_index], BioD_80_Bin.iloc[test_index]
        rfc.fit(X_train, y_train)
        lmr.fit(X_train, y_train)
        dtc.fit(X_train, y_train)
        rf_preds = rfc.predict(X_test)
        lr_preds = lmr.predict(X_test)
        dt_preds = dtc.predict(X_test)
        score_rf = explained_variance_score(y_test, rf_preds)
        score_lr = explained_variance_score(y_test, lr_preds)
        score_dt = explained_variance_score(y_test, dt_preds)
        scores.append((score_rf + score_lr + score_dt) / 3)
    method_scores[str(cuter)] = sum(scores) / len(scores)   
dataNB_N = pd.DataFrame(neighboursImp.fit_transform(dataNB))
dataNB_NMM = pd.DataFrame(scalerS.fit_transform(dataNB_N), columns=dataNB.columns)
final_dataSet = dataNB_NMM.drop(bottom_5_column_names, axis = 1)
final_dataSet['Biodegradable'] = BioD.values

In [6]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(final_dataSet.drop('Biodegradable', axis=1), final_dataSet['Biodegradable'], test_size=0.2, random_state=42)
eval_metrics = []
# Test each model
models = [
    ('Decision Tree', DecisionTreeClassifier()),
    ('Logistic Regression', LogisticRegression()),
    ('Naive Bayes', GaussianNB()),
    ('KNN', KNeighborsClassifier()),
    ('SVM', SVC())
]
for name, model in models:
    # Perform cross-validation
    scores = cross_val_score(model, X_train, y_train, cv=5)  # 5-fold cross-validation
    accuracy = scores.mean()
    print(name)
    print(f"Cross-Validation Accuracy: {accuracy:.4f}")
    # Append evaluation metrics
    eval_metrics.append((name, accuracy))
# Compare evaluation metrics and choose the best model
best_model = max(eval_metrics, key=lambda x: x[1])
best_model_name, best_model_accuracy = best_model
print("Best Model:")
print(f"Model: {best_model_name}")
print(f"Cross-Validation Accuracy: {best_model_accuracy:.4f}")

Decision Tree
Cross-Validation Accuracy: 0.9474
Logistic Regression
Cross-Validation Accuracy: 0.9521
Naive Bayes
Cross-Validation Accuracy: 0.9417
KNN
Cross-Validation Accuracy: 0.9548
SVM
Cross-Validation Accuracy: 0.9638
Best Model:
Model: SVM
Cross-Validation Accuracy: 0.9638


In [7]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(final_dataSet.drop('Biodegradable', axis=1), final_dataSet['Biodegradable'], test_size=0.2, random_state=42)
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [1, 10, 100, 1000],
    'gamma': [1e-1, 1e-3, 1e-5, 1e-7]
}
# Create the SVC model
model = SVC()
# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
# Get the best model and its performance
best_model = grid_search.best_estimator_
best_model_accuracy = grid_search.best_score_
print("Best Model:")
print(f"Model: {best_model}")
print(f"Cross-Validation Accuracy: {best_model_accuracy:.4f}")
# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
cr = classification_report(y_test, y_pred)
print(cr)

Best Model:
Model: SVC(C=10, gamma=0.1)
Cross-Validation Accuracy: 0.9690
              precision    recall  f1-score   support

         NRB       0.94      0.92      0.93       156
          RB       0.98      0.99      0.99       757

    accuracy                           0.98       913
   macro avg       0.96      0.95      0.96       913
weighted avg       0.98      0.98      0.98       913



In [8]:
# Supondo que X_train contenha a matriz de recursos de treinamento e y_train contenha a variável alvo de treinamento
# Supondo que final_dataSet seja o nome do DataFrame contendo os dados completos
# Crie um classificador SVC
modelo = SVC()
# Crie um objeto SequentialFeatureSelector e especifique o modelo, a métrica de avaliação e o número de recursos a serem selecionados
sfs = SequentialFeatureSelector(modelo, n_features_to_select=10)
# Ajuste o SFS na matriz de recursos de treinamento e variável alvo de treinamento
sfs.fit(X_train, y_train)
# Obtenha os índices dos recursos selecionados
indices_recursos_selecionados = sfs.get_support(indices=True)
# Obtenha os nomes dos recursos selecionados
recursos_selecionados = final_dataSet.columns[indices_recursos_selecionados]
# Imprima os recursos selecionados
print("Most significant features:")
for recurso in recursos_selecionados:
    print(recurso)

Most significant features:
nHM
NssssC
nCb
nCp
F03
nArNO2
SdO
nCrt
F02_CN
nHDon
