In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, PowerTransformer
import seaborn as sns
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVC


# Load data from CSV file
data = pd.read_csv('biodegradable_a.csv')
#print(data.head())

#Since the Biodegradable has no Numbers we have to separate it from de data set 

# select all columns except for 'Biodegradable'
dataNB = data.drop('Biodegradable', axis=1)

# create a separate variable for 'Biodegradable'
BioD = data['Biodegradable']


#now we are gointto separte the data in a 80/20% split portion 
dataNB_80, dataNB_20, BioD_80, BioD_20 = train_test_split(dataNB, BioD, test_size=0.2, random_state=42)


#First we are going to use 2 different methods to replace all the missing values
#Then we are going to normalize the dataset using 4 different methods and classify using each one to test the best outcome

#Using Univariate Imputation
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
dataNB_Imputed = pd.DataFrame(imputer.fit_transform(dataNB_80))

#Using KNN Imputation, for now we decided to use 4 neighbours
neighboursImp = KNNImputer(n_neighbors=4, weights="uniform")
dataNB_NeighboursImp = pd.DataFrame(neighboursImp.fit_transform(dataNB_80))

#Let's star with the Standar Scaler
scalerS = StandardScaler()
scalerNB_Imputed = scalerS.fit_transform(dataNB_Imputed)
scalerNB_NeighboursImp = scalerS.fit_transform(dataNB_NeighboursImp)

# normalize the data using MinMaxScaler
MinMax = MinMaxScaler()
MinMaxNB_Imputed = MinMax.fit_transform(dataNB_Imputed)
MinMaxNB_Neigbours = MinMax.fit_transform(dataNB_NeighboursImp)

#normalize the data using PowerTransformer
normalizerPT = PowerTransformer()
PT_NB_Imputed = normalizerPT.fit_transform(dataNB_Imputed)
PT_NB_Neigbours = normalizerPT.fit_transform(dataNB_NeighboursImp)

#normalize the data using Normalizer
theNormalizer = Normalizer()
imputed_Normalised_NB = theNormalizer.fit_transform(dataNB_Imputed)
neighbours_Normalised_NB = theNormalizer.fit_transform(dataNB_NeighboursImp)



In [9]:
#We now are going to determine which normalised model works better for this dataset using simple cross validation
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix

scalers = [scalerNB_Imputed, scalerNB_NeighboursImp, MinMaxNB_Imputed, MinMaxNB_Neigbours, PT_NB_Imputed, PT_NB_Neigbours, imputed_Normalised_NB, neighbours_Normalised_NB]

mdl = DecisionTreeClassifier(min_samples_leaf=5)

for scaler in scalers:

    X_train, X_test, y_train, y_test = train_test_split(scaler, BioD_80, test_size=0.2)
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)

    
    print("For class NRB:")
    print("The Precision is: %7.4f" % precision_score(y_test, preds, pos_label='NRB'))
    print("The Recall is: %7.4f" % recall_score(y_test, preds, pos_label='NRB'))
    print("The F1 score is: %7.4f" % f1_score(y_test, preds, pos_label='NRB'))

    print("\nFor class RB:")
    print("The Precision is: %7.4f" % precision_score(y_test, preds, pos_label='RB'))
    print("The Recall is: %7.4f" % recall_score(y_test, preds, pos_label='RB'))
    print("The F1 score is: %7.4f" % f1_score(y_test, preds, pos_label='RB'))
    print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(y_test, preds))
    print("\n")

#After analyzing all the vales from all scalers we concluded that the best scaler to use is the MinMaxNB_Neigbours, which is the normalized data set using the minmax scaler and with all the values
# Replaced with KNN Imputation method

For class NRB:
The Precision is:  0.8349
The Recall is:  0.8585
The F1 score is:  0.8465

For class RB:
The Precision is:  0.9759
The Recall is:  0.9712
The F1 score is:  0.9735
The Matthews correlation coefficient is:  0.8202


For class NRB:
The Precision is:  0.8879
The Recall is:  0.8120
The F1 score is:  0.8482

For class RB:
The Precision is:  0.9647
The Recall is:  0.9805
The F1 score is:  0.9725
The Matthews correlation coefficient is:  0.8220


For class NRB:
The Precision is:  0.9062
The Recall is:  0.8131
The F1 score is:  0.8571

For class RB:
The Precision is:  0.9685
The Recall is:  0.9856
The F1 score is:  0.9770
The Matthews correlation coefficient is:  0.8358


For class NRB:
The Precision is:  0.9024
The Recall is:  0.8740
The F1 score is:  0.8880

For class RB:
The Precision is:  0.9737
The Recall is:  0.9801
The F1 score is:  0.9769
The Matthews correlation coefficient is:  0.8651


For class NRB:
The Precision is:  0.8468
The Recall is:  0.7966
The F1 score is:  0.

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import explained_variance_score, mean_squared_error
from sklearn.feature_selection import SequentialFeatureSelector
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score, confusion_matrix,matthews_corrcoef, precision_score, recall_score
from sklearn.metrics import explained_variance_score, mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

normalized_Final = pd.DataFrame(MinMax.fit_transform(dataNB_NeighboursImp), columns=dataNB_80.columns)

#print(normalized_Final)
#Now we are goin to use several method to eliminate unwanted varuables starting by Feature selection Using correlation
# Calculate Pearson correlation coefficient matrix
corr_matrix = pd.DataFrame(np.corrcoef(normalized_Final.T))

target_corr = corr_matrix.iloc[0, 1:]

# Sort the correlation values in descending order
sorted_corr = target_corr.abs().sort_values(ascending=False)

# Select the top 5 variables with the highest absolute correlation
bottom_5_variables = sorted_corr.tail(5)


bottom_5_variable_names = bottom_5_variables.index.map(lambda x: normalized_Final.columns[x])

correlation_normalized_Final = normalized_Final.drop(bottom_5_variable_names, axis=1)

#print(correlation_normalized_Final)



In [None]:
#now we are goin to use feature selection using stepwise methods, this time we ar eusing Sequential Selector form scikit

BioD_80_Bin = BioD_80.replace({'RB': 1, 'NRB': 0})

X_train, X_test, y_train, y_test = train_test_split(normalized_Final, BioD_80_Bin, test_size=0.2)

lmr=LinearRegression()
sfs = SequentialFeatureSelector(lmr, n_features_to_select=36) #36 melhores ou seja ignora as últimas 5
sfs.fit(X_train, y_train)


#get the relevant columns
features=sfs.get_support()
features_selected = np.where(features)[0]

frontward_normalized_Final = normalized_Final.iloc[:, features_selected]

#the same but this time using backwards:

sfs = SequentialFeatureSelector(lmr, n_features_to_select=36, direction= 'backward') #36 melhores ou seja ignora as últimas 5
sfs.fit(X_train, y_train)


#get the relevant columns
features2=sfs.get_support()
features_selected2 = np.where(features)[0]


backward_normalized_Final = normalized_Final.iloc[:, features_selected2]




In [None]:
#now to select the best features we are going to use linear and Kernel PCA starting with linear
from sklearn.decomposition import PCA

pca = PCA(n_components=normalized_Final.shape[1])
pca.fit(normalized_Final)
explained_variance_ratio = pca.explained_variance_ratio_

sorted_variance_ratio = np.sort(explained_variance_ratio)

column_names = normalized_Final.columns

bottom_5_column_names = column_names[np.argsort(explained_variance_ratio)[:5]]

PCA_normalized_Final = normalized_Final.drop(columns=bottom_5_column_names)

print(PCA_normalized_Final)




In [None]:
#now we will test to see wich method produces the best results

cuted = [correlation_normalized_Final, frontward_normalized_Final, backward_normalized_Final, PCA_normalized_Final]

for cuter in cuted:
    X_train, X_test, y_train, y_test = train_test_split(cuter, BioD_80_Bin, test_size=0.2, random_state=42)

    rfc = RandomForestClassifier()
    rfc.fit(X_train, y_train)

    lmr=LinearRegression()
    lmr.fit(X_train, y_train)

    dtc= DecisionTreeClassifier(max_depth=5)
    dtc.fit(X_train, y_train)

    rf_preds=rfc.predict(X_test)
    lr_preds=lmr.predict(X_test)
    dt_preds=dtc.predict(X_test)

    #print("RVE RFs: %7.4f" % explained_variance_score(y_test, rf_preds))
    #print("RVE LRs: %7.4f" % explained_variance_score(y_test, lr_preds))
    #print("RVE DTs: %7.4f" % explained_variance_score(y_test, dt_preds))
    #print("\n")

#AND SO WE DECIDED THAT THE BETS ONE OF THE NIGHT IS BACKWADR_NORMALIZINF FINAL
dataNB_N = pd.DataFrame(neighboursImp.fit_transform(dataNB))
dataNB_NMM = pd.DataFrame(MinMax.fit_transform(dataNB_N), columns=dataNB.columns)

final_dataSet = dataNB_NMM.iloc[:, features_selected2]

final_dataSet['Biodegradable'] = BioD.values








In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(final_dataSet.drop('Biodegradable', axis=1), final_dataSet['Biodegradable'], test_size=0.2, random_state=42)

eval_metrics = []

# Test each model
models = [
    ('Decision Tree', DecisionTreeClassifier()),
    ('Logistic Regression', LogisticRegression()),
    ('Naive Bayes', GaussianNB()),
    ('KNN', KNeighborsClassifier()),
    ('SVM', SVC())
]

for name, model in models:
    # Perform cross-validation
    scores = cross_val_score(model, X_train, y_train, cv=5)  # 5-fold cross-validation
    accuracy = scores.mean()
    
    print(name)
    print(f"Cross-Validation Accuracy: {accuracy:.4f}")
    
    # Fit the model on the entire training set
    model.fit(X_train, y_train)
    
    # Evaluate on the test set
    y_pred = model.predict(X_test)
    cr = classification_report(y_test, y_pred)
    print (cr)
    
    # Append evaluation metrics
    eval_metrics.append((name, accuracy))

# Compare evaluation metrics and choose the best model
best_model = max(eval_metrics, key=lambda x: x[1])
best_model_name, best_model_accuracy = best_model
print("Best Model:")
print(f"Model: {best_model_name}")
print(f"Cross-Validation Accuracy: {best_model_accuracy:.4f}")






In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(final_dataSet.drop('Biodegradable', axis=1), final_dataSet['Biodegradable'], test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10, 25],
    'min_samples_leaf': [1, 2, 3, 4, 5]
}

# Create the DecisionTreeClassifier model
model = DecisionTreeClassifier()
for i in range(1,6):
    print(f"teste : {i}")
# Perform grid search with cross-validation
    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(X_train, y_train)

# Get the best model and its performance
    best_model = grid_search.best_estimator_
    best_model_accuracy = grid_search.best_score_

    print("Best Model:")
    print(f"Model: {best_model}")
    print(f"Cross-Validation Accuracy: {best_model_accuracy:.4f}")

# Evaluate the best model on the test set
    y_pred = best_model.predict(X_test)
    cr = classification_report(y_test, y_pred)
    print(cr)
    


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SequentialFeatureSelector

# Supondo que X_train contenha a matriz de recursos de treinamento e y_train contenha a variável alvo de treinamento
# Supondo que final_dataSet seja o nome do DataFrame contendo os dados completos

# Crie um classificador de árvore de decisão
modelo = DecisionTreeClassifier()

# Crie um objeto SequentialFeatureSelector e especifique o modelo, a métrica de avaliação e o número de recursos a serem selecionados
sfs = SequentialFeatureSelector(modelo, n_features_to_select=10)

# Ajuste o SFS na matriz de recursos de treinamento e variável alvo de treinamento
sfs.fit(X_train, y_train)

# Obtenha os índices dos recursos selecionados
indices_recursos_selecionados = sfs.get_support(indices=True)

# Obtenha os nomes dos recursos selecionados
recursos_selecionados = final_dataSet.columns[indices_recursos_selecionados]

# Imprima os recursos selecionados
print("Most significant features:")
for recurso in recursos_selecionados:
    print(recurso)
