In [1]:
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv("soybean.csv")

#
SVC

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from yellowbrick.classifier import ConfusionMatrix

In [3]:
# Separar a variável alvo
df_data = df.drop(columns="class")
df_target = df["class"]

In [4]:
# Realizar o Label Encoder das variáveis
for i in range(len(df_data.columns)):
  df_data.iloc[:, i] = LabelEncoder().fit_transform(df_data.iloc[:, i])

In [5]:
# Separar as variáveis de treino e teste (70/30)
x_treinamento, x_teste, y_treinamento, y_teste = train_test_split(
  df_data,
  df_target,
  test_size=0.3,
  random_state=42
)

In [6]:
# Definir o modelo SVC
svc = SVC(random_state=42)

In [7]:
# Definir a grade de parâmetros a serem testados
param_grid = {
  'C': [0.1,1,10,100],
  'gamma':[1,0.1,0.01,0.001],
  'kernel':['rbf']
}

In [8]:
# Configurar o GridSearchCV
grid_search = GridSearchCV(
  svc,
  param_grid,
  refit=True,
  verbose=0,
  cv=5
)

In [None]:
# Realizar a busca pelos melhores hiperparâmetros
grid_search.fit(df_data, df_target)

In [None]:
grid_search.best_params_

In [11]:
# Criar o modelo de Support Vector Machine para Classificação
svm = SVC(
  C=10,
  gamma=0.01,
  kernel='rbf',
  random_state=42
)
svm.fit(x_treinamento, y_treinamento)

# Realizar a previsão
previsoes = svm.predict(x_teste)

# Calcular a taxa de acerto
taxa_acerto = accuracy_score(y_teste, previsoes)

In [None]:
taxa_acerto

In [None]:
# Criar Matriz Confusão
matrix = ConfusionMatrix(SVC(
  C=10,
  gamma=0.01,
  kernel='rbf',
  random_state=42
))
matrix.fit(x_treinamento, y_treinamento)
matrix.score(x_teste, y_teste)
matrix.poof()

#
Random Forest

In [9]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from yellowbrick.classifier import ConfusionMatrix
import matplotlib.pyplot as plt

In [None]:
df_target = df["class"]
df_data = df.drop(columns="class", axis=1)

In [4]:
# Realizar one-hot encoding para todas as colunas categóricas
df_encoded = pd.get_dummies(df_data)

In [5]:
# Dividir os dados em treino e teste
x_train, x_test, y_train, y_test = train_test_split(
  df_encoded,
  df_target,
  test_size=0.3,
  random_state=42
)

In [6]:
# Definir o modelo
rf = RandomForestClassifier(
  random_state=42
)

In [7]:
# Definir a grade de parâmetros a serem testados
param_grid = {
  'n_estimators':[100,200],
  'max_depth':[None, 10,20,30],
  'min_samples_split':[2,5,10],
  'min_samples_leaf':[1,2,4]
}

In [None]:
# Configurar o GridSearchCV
grid_search = GridSearchCV(rf, param_grid, refit=True, verbose=0, cv=5)

In [None]:
# Realizar a busca de hiperparâmetros
grid_search.fit(df_encoded, df_target)

In [None]:
grid_search.best_params_

In [None]:
# Treinar um modelo de Random Forest
model = RandomForestClassifier(
  n_estimators=200,
  max_depth=10,
  min_samples_split=5,
  min_samples_leaf=1,
  random_state=42
)
model.fit(x_train, y_train)

In [None]:
# Fazer previsões e avaliar o modelo
previsoes = model.predict(x_test)
taxa_acerto = accuracy_score(y_test, previsoes)
taxa_acerto

In [None]:
# Criar Matriz Confusão
plt.figure(figsize=(15,10))
matrix = ConfusionMatrix(RandomForestClassifier())
matrix.fit(x_train, y_train)
matrix.score(x_test, y_test)
matrix.poof()
plt.show()

#
Gradient Boosting Machine (GBM)

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from yellowbrick.classifier import ConfusionMatrix
import numpy as np
import optuna

In [24]:
df_target = df["class"]
df_data = df.drop(columns="class")
df_encoded = pd.get_dummies(df_data)

In [25]:
x_train, x_test, y_train, y_test = train_test_split(
  df_encoded,
  df_target,
  test_size=0.3,
  random_state=42
)

In [6]:
def objective(trial):
  n_estimators = trial.suggest_int('n_estimators', 100, 200)
  max_depth = trial.suggest_int('max_depth', 2, 10)
  min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
  
  model = GradientBoostingClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    random_state=42
  )
  
  model.fit(x_train, y_train)
  
  return taxa_acerto

In [None]:
# Controlar a saída do log
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Criar um estudo e otimizar
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

{'n_estimators': 148,
 'max_depth': 10,
 'min_samples_split': 8,
 'min_samples_leaf': 1}

In [None]:
study.best_params

In [38]:
# Inicializar o modelo
gbm = GradientBoostingClassifier(
  n_estimators=148,
  max_depth=10,
  min_samples_split=8,
  min_samples_leaf=1,
  random_state=42
  )

In [None]:
gbm.fit(x_train, y_train)

In [None]:
previsoes = gbm.predict(x_test)
taxa_acerto = accuracy_score(y_test, previsoes)
taxa_acerto

In [None]:
# Criar Matriz Confusão
plt.figure(figsize=(15,10))
matrix = ConfusionMatrix(GradientBoostingClassifier())
matrix.fit(x_train, y_train)
matrix.score(x_test, y_test)
matrix.poof()
plt.show()

#
KNeighborsClassifier

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from yellowbrick.classifier import ConfusionMatrix

In [4]:
df_target = df["class"]
df_data = df.drop(columns="class", axis=1)
df_encoded = pd.get_dummies(df_data)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(
  df_encoded,
  df_target,
  test_size=0.3,
  random_state=42
)

In [6]:
knn = KNeighborsClassifier()

In [11]:
param_grid = {
  'n_neighbors': [3, 5, 7, 9, 11],
  'weights': ['uniform', 'distance'],
  'metric': ['euclidean', 'manhattan', 'minkowski']
}

In [12]:
grid_search = GridSearchCV(knn, param_grid, refit=True,verbose=0, cv=5)

In [None]:
grid_search.fit(df_encoded, df_target)

In [None]:
grid_search.best_params_

In [15]:
model = KNeighborsClassifier(
  n_neighbors=3,
  metric='euclidean',
  weights='uniform'
)
model.fit(x_train, y_train)
  
previsoes = model.predict(x_test)
taxa_acerto = accuracy_score(y_test, previsoes)

In [None]:
taxa_acerto

In [None]:
# Criar Matriz Confusão
plt.figure(figsize=(15,10))
matrix = ConfusionMatrix(KNeighborsClassifier())
matrix.fit(x_train, y_train)
matrix.score(x_test, y_test)
matrix.poof()
plt.show()