In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (ConfusionMatrixDisplay, confusion_matrix,
                             f1_score, precision_score, recall_score)
from sklearn.model_selection import (GridSearchCV, LeaveOneOut, LeavePOut,
                                     StratifiedKFold, cross_val_predict)
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (LabelEncoder, MaxAbsScaler, MinMaxScaler,
                                   Normalizer, PowerTransformer,
                                   QuantileTransformer, RobustScaler,
                                   StandardScaler)
from sklearn.svm import SVC

In [None]:
from resolve_path import ajuste_path

In [None]:
# Configuração para não exibir os warnings
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Resolve the data directory through the project helper; `path` is reused
# further down when the tuning results are written to disk.
path = ajuste_path("data/util/")

# Load the training dataset and print its column/dtype summary.
df = pd.read_csv(filepath_or_buffer=path + "dataset_treinamento.csv")
df.info()

## Preparando o dataframe para o modelo

### Encoding do local

In [None]:
label_encoder = LabelEncoder()
location_column = "Local de instalação"

# Number of distinct locations before encoding.
print(df[location_column].nunique())

# Store the integer code of each location in a new column, then discard the
# original text column.
df["local encoded"] = label_encoder.fit_transform(df[location_column])
df = df.drop(columns=[location_column])

# Number of distinct codes after encoding.
print(df["local encoded"].nunique())

### Escolhendo as colunas utilizadas

In [None]:
# Class balance of the binary target, followed by the total accident count.
print(df["binario acidentes"].value_counts())
print(df["Quantidade de Acidentes"].sum())

# Keep only the columns used from here on (features plus the binary target).
columns = [
    "Ano",
    "Mes",
    "HH total",
    "local encoded",
    "binario acidentes",
]
df = df.loc[:, columns]

## Matriz de correlação entre as colunas escolhidas

In [None]:
# Pairwise correlation of every remaining column except the year.
corr = df.drop(columns="Ano").corr()

# The colour range is pinned to [-1, 1] so the diverging palette is centred
# on zero correlation.
sns.heatmap(data=corr, vmin=-1, vmax=1, cmap='RdBu', annot=True)
plt.xticks(rotation=45)
plt.title("Matriz de Correlação")

plt.show()

## Definindo os datasets

### Escolhendo as features e o target

In [None]:
# Name of the binary target column and of the feature columns fed to the models.
y_column = "binario acidentes"
X_columns = ["Ano", "Mes", "HH total", "local encoded"]

# Feature matrix and target vector used by every model below.
X, y = df[X_columns], df[y_column]

## Modelo

### Modelos candidatos e grades de hiperparâmetros

In [None]:
# Seed shared by every stochastic component below, so that repeated runs of
# the notebook select the same hyper-parameters and yield the same predictions.
RANDOM_STATE = 42

# Candidate transformers for the 'scaler' pipeline step; None disables scaling.
scalers = [None, StandardScaler(), MinMaxScaler(), MaxAbsScaler(),
           RobustScaler(), Normalizer(),
           QuantileTransformer(random_state=RANDOM_STATE), PowerTransformer()]


def _model_entry(classifier, clf_grid):
    """Build the bookkeeping record kept for one candidate model.

    Parameters
    ----------
    classifier : estimator
        Unfitted classifier placed in the 'clf' step of the pipeline.
    clf_grid : dict
        Hyper-parameter grid of the classifier, keyed with the ``clf__``
        prefix expected by the pipeline.

    Returns
    -------
    dict
        ``model``: two-step pipeline (scaler slot + classifier);
        ``params``: search grid (every scaler candidate plus ``clf_grid``);
        ``best_params``: empty until the grid search fills it;
        ``probs`` / ``preds``: empty placeholders filled after tuning.
    """
    return {
        "model": Pipeline([
            ('scaler', None),
            ('clf', classifier)
        ]),
        "probs": [],
        "preds": [],
        "params": {'scaler': scalers, **clf_grid},
        "best_params": {},
    }


# One record per candidate model, keyed by the display name used in the
# printed reports and plot titles.
models = {
    "Logistic Regression": _model_entry(
        LogisticRegression(),
        {
            'clf__C': [0.01, 0.05, 0.1, 1, 5],
            'clf__max_iter': [100, 1000, 10000],
            'clf__class_weight': ["balanced", None],
        },
    ),
    "Linear Discriminant Analysis": _model_entry(
        LinearDiscriminantAnalysis(),
        {
            'clf__solver': ["svd", "lsqr", "eigen"],
            'clf__n_components': [None, 1],
        },
    ),
    # probability=True is required because the predictions are later obtained
    # through predict_proba; its internal calibration is random, hence the seed.
    "SVC": _model_entry(
        SVC(probability=True, random_state=RANDOM_STATE),
        {
            'clf__class_weight': ["balanced", None],
            'clf__C': [0.01, 0.1, 1],
            'clf__kernel': ["linear", "poly", "rbf", "sigmoid"],
        },
    ),
    "Naive Bayes": _model_entry(
        GaussianNB(),
        {
            'clf__var_smoothing': [1e-9, 1e-8, 1e-7],
        },
    ),
    "GBM": _model_entry(
        GradientBoostingClassifier(random_state=RANDOM_STATE),
        {
            'clf__n_estimators': [50, 100, 150, 200, 250],
            'clf__learning_rate': [0.01, 0.1, 1],
            'clf__max_depth': [1, 3, 5, 7, None],
        },
    ),
    "Random Forest": _model_entry(
        RandomForestClassifier(random_state=RANDOM_STATE),
        {
            'clf__n_estimators': [50, 100, 150, 200, 250],
            'clf__max_depth': [1, 3, 5, 7, None],
            'clf__class_weight': ["balanced", None],
        },
    ),
}

### Cross validation

In [None]:
# Shared splitter: 10 shuffled, class-stratified folds with a fixed seed, used
# both for the grid search and for the cross-validated predictions.
skfold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

### GridSearch

In [None]:
# Empty summary table; the grid-search loop appends one row per tuned model.
results = pd.DataFrame(
    columns=[
        "Model",
        "Best Params",
        "F1",
        "Precision",
        "Recall",
    ],
)

In [None]:
# Decision threshold applied to the positive-class probability when the
# cross-validated probabilities are converted into labels further down.
threshold = 0.5

for model_name, model in models.items():
    # Already tuned in a previous run of this cell: skip the (expensive)
    # search and avoid appending a duplicate row to `results`.
    if model["best_params"]:
        continue

    print(f"GridSearch: {model_name}")
    # Precision and recall are scored alongside F1 so the summary table can
    # report them; refit="f1" keeps F1 as the only selection criterion.
    grid_search = GridSearchCV(
        model["model"], model["params"], cv=skfold,
        scoring=["f1", "precision", "recall"], refit="f1", n_jobs=-1)
    grid_search.fit(X, y)
    model["best_params"] = grid_search.best_params_

    # Row of cv_results_ that corresponds to the selected parameter set.
    best_index = grid_search.best_index_
    cv_results = grid_search.cv_results_

    result = {
        "Model": model_name,
        "Best Params": grid_search.best_params_,
        "F1": grid_search.best_score_,
        "Precision": cv_results["mean_test_precision"][best_index],
        "Recall": cv_results["mean_test_recall"][best_index],
    }

    # Appended inside the loop so an interrupted run keeps the rows of the
    # models already tuned; ignore_index gives each row its own index label.
    results = pd.concat([results, pd.DataFrame([result])], ignore_index=True)

    print("Model: ", model_name)
    print("Best Params:", model["best_params"])

    print("-"*50)

In [None]:
# Write the tuning summary, without the index column, under the resolved
# data directory.
results.to_csv(path_or_buf=path + "predicao/results.csv", index=False)

### Treino e Predição

In [None]:
for model_name, model in models.items():
    print(f"Training {model_name}")

    # Configure the pipeline with the parameters chosen by the grid search.
    pipeline = model["model"]
    pipeline.set_params(**model["best_params"])
    print(pipeline.get_params())

    # Out-of-fold class probabilities; only the second column is kept
    # (presumably the positive class of the binary target - verify).
    fold_probabilities = cross_val_predict(
        pipeline, X, y, cv=skfold, method="predict_proba")
    model["probs"] = fold_probabilities[:, 1]

    # Probabilities at or above the threshold become label 1, the rest 0.
    model["preds"] = (model["probs"] >= threshold).astype(int)

### Distribuição de frequência das probabilidades

In [None]:
# Fixed 0.01-wide bins spanning the whole probability range [0, 1]. The last
# edge is exactly 1.0, so no probability can fall outside the bins and be
# silently dropped by a rounding error in the edge computation.
prob_bins = np.linspace(0.0, 1.0, 101)

for model_name, m in models.items():
    print(f"Model: {model_name}")

    # Count how many predicted probabilities fall into each bin.
    # (No variable named `max` here: that would shadow the builtin for every
    # cell executed afterwards in the same kernel.)
    freq_probs = pd.Series(m["probs"]).value_counts(
        bins=prob_bins).sort_index()

    # Only the non-empty bins are reported.
    print(freq_probs[freq_probs > 0])
    print("\n")

### Avaliação das métricas básicas

In [None]:
for model_name, m in models.items():
    predictions = m["preds"]

    # Headline metrics of the thresholded cross-validated predictions.
    print(f"Model: {model_name}")
    print(f"Precision: {precision_score(y, predictions)}")
    print(f"Recall: {recall_score(y, predictions)}")
    print(f"F1 Score: {f1_score(y, predictions)}")

    # Confusion matrix of the same predictions, drawn as one figure per model.
    cm = confusion_matrix(y_true=y, y_pred=predictions)
    cm_disp = ConfusionMatrixDisplay(
        confusion_matrix=cm,
        display_labels=["Negativo", "Positivo"],
    )
    cm_disp.plot(cmap="Blues")

    plt.title(model_name)
    plt.show()