In [1]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [3]:
import warnings
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import time

warnings.filterwarnings("ignore")

In [4]:
def logistic_regression_binary(X_train, X_test, y_train, y_test, debug=1):
    lr_model = LogisticRegression(random_state=42)
    
    param_grid = {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'lbfgs', 'saga'],
        'max_iter': [100, 200, 300]
    }

    grid_search = GridSearchCV(lr_model, param_grid, cv=10)
    start = time.time()
    grid_search.fit(X_train, y_train)
    end = time.time()

    execution_time = end - start

    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    class_report = classification_report(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)

    if debug == 1:
        # Visualizando resultados
        print("Logistic Regression")
        print("-" * 40)
        print('Tempo de execução:', execution_time)
        print('Acurácia:', accuracy)
        print('best_estimator_', grid_search.best_estimator_)
        print("\nMatris de Confusão:\n", confusion)
        print("\nRelatório de Classificação:\n", class_report)

In [5]:
def random_forest(X_train, X_test, y_train, y_test, debug=1):
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

    param_grid = {
        'n_estimators': [100, 500],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['auto', 'sqrt']
    }

    grid_search = GridSearchCV(rf_model, param_grid, cv=10)
    start = time.time()
    grid_search.fit(X_train, y_train)
    end = time.time()

    execution_time = end - start

    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    class_report = classification_report(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)

    class_report = classification_report(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)

    if debug == 1:
        # Visualizando resultados
        print("Random Forest")
        print("-" * 40)
        print('Tempo de execução:', execution_time)
        print('Acurácia:', accuracy)
        print('best_estimator_', grid_search.best_estimator_)
        print("\nMatris de Confusão:\n", confusion)
        print("\nRelatório de Classificação:\n", class_report)

In [6]:
def xgb(X_train, X_test, y_train, y_test, debug=1):
    xgb_model = XGBClassifier(random_state=42)

    param_grid = {
        'min_child_weight': [1, 3, 5],
        'gamma': [0, 0.5, 1],
        # 'colsample_bytree': [0.6, 1.0],
        # 'max_depth': [3, 4, 5, 6]
        'subsample': [0.8],
        'colsample_bytree': [0.8],
        'max_depth': [None, 3, 5, 10, 20],
        'n_estimators': [100, 500],
    }

    grid_search = GridSearchCV(xgb_model, param_grid, cv=10)
    start = time.time()
    grid_search.fit(X_train, y_train)
    end = time.time()

    execution_time = end - start

    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    class_report = classification_report(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)

    if debug == 1:
        # Visualizando resultados
        print("XGB")
        print("-" * 40)
        print('Tempo de execução:', execution_time)
        print('Acurácia:', accuracy)
        print('best_estimator_', grid_search.best_estimator_)
        print("\nMatris de Confusão:\n", confusion)
        print("\nRelatório de Classificação:\n", class_report)

In [7]:
df = pd.read_csv("../datasets/transformed_dataset.csv", sep=";")

X = df.drop(["target"], axis=1)
y = df["target"]

scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
logistic_regression_binary(X_train, X_test, y_train, y_test)

Logistic Regression
----------------------------------------
Tempo de execução: 1054.701627254486
Acurácia: 0.8561121418264276
best_estimator_ LogisticRegression(C=0.1, penalty='l1', random_state=42, solver='saga')

Matris de Confusão:
 [[1865  398]
 [ 300 2288]]

Relatório de Classificação:
               precision    recall  f1-score   support

           0       0.86      0.82      0.84      2263
           1       0.85      0.88      0.87      2588

    accuracy                           0.86      4851
   macro avg       0.86      0.85      0.86      4851
weighted avg       0.86      0.86      0.86      4851



In [9]:
random_forest(X_train, X_test, y_train, y_test)

Random Forest
----------------------------------------
Tempo de execução: 3073.408132791519
Acurácia: 0.8270459699031127
best_estimator_ RandomForestClassifier(min_samples_split=5, n_estimators=500, random_state=42)

Matris de Confusão:
 [[1780  483]
 [ 356 2232]]

Relatório de Classificação:
               precision    recall  f1-score   support

           0       0.83      0.79      0.81      2263
           1       0.82      0.86      0.84      2588

    accuracy                           0.83      4851
   macro avg       0.83      0.82      0.83      4851
weighted avg       0.83      0.83      0.83      4851



In [10]:
xgb(X_train, X_test, y_train, y_test)

XGB
----------------------------------------
Tempo de execução: 1957.0745494365692
Acurácia: 0.8538445681302824
best_estimator_ XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=3, max_leaves=None,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)

Matris de Confusão:
 [[1885  378]
 [ 331 2257]]

Relatório de Classificação:
               precision    recall  f1-score   su