In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                            confusion_matrix, ConfusionMatrixDisplay,
                            make_scorer)
from sklearn.model_selection import KFold, LeaveOneOut, LeavePOut, cross_val_predict, cross_val_score
from sklearn.preprocessing import LabelEncoder


In [None]:
from resolve_path import ajuste_path

In [None]:
# Hand the relative data directory to the project helper `ajuste_path`
# (imported from resolve_path) and keep whatever it returns as the base path.
# NOTE(review): the file name below is appended by plain concatenation, so the
# helper is assumed to return a string ending in a path separator — confirm.
path = ajuste_path("data/util/")

# Read the training dataset and print its column / dtype summary.
df = pd.read_csv(filepath_or_buffer=path + "dataset_treinamento.csv")
df.info()

## Preparando o dataframe para o modelo

### Encoding do local

In [None]:
# Encoder that turns each distinct installation site into an integer code.
# NOTE(review): the resulting codes are later used as a numeric feature; the
# integer order carries no meaning for a nominal site name — confirm this is
# acceptable for the linear models in this notebook.
label_encoder = LabelEncoder()

# Number of distinct sites before encoding.
print(df["Local de instalação"].nunique())

# Append the encoded column and remove the original text column in one chain.
df = df.assign(
    **{"local encoded": label_encoder.fit_transform(df["Local de instalação"])}
).drop(columns=["Local de instalação"])

# Number of distinct codes after encoding (printed for comparison with the
# count above).
print(df["local encoded"].nunique())

### Escolhendo as colunas utilizadas

In [None]:
# Disabled: rename of the high-potential accident flag column.
# df.rename(columns={"Binario Acidentes de Alto Potencial": "Alto Potencial"}, inplace=True)

# Sanity check: value counts of the binary accident flag and the total of the
# accident-count column, printed before that column is discarded below.
print(df["binario acidentes"].value_counts())
print(df["Quantidade de Acidentes"].sum())

# Restrict the frame to the columns used from here on.
columns = ["Ano", "Mes", "HH total", "local encoded", "binario acidentes"]
df = df.loc[:, columns]

# print(df.info())

## Matriz de correlação entre as colunas escolhidas

In [None]:
# Pairwise correlation of every remaining column except the year.
corr = df.drop("Ano", axis=1).corr()

# Available colormap names can be listed with: list(plt.colormaps)
# The colour scale is pinned to [-1, 1] so that zero correlation sits at the
# centre of the diverging palette.
sns.heatmap(data=corr, vmin=-1, vmax=1, annot=True, cmap="RdBu")
plt.gca().set_title("Matriz de Correlação")
plt.xticks(rotation=45)

plt.show()

## Definindo os datasets

### Escolhendo as features e o target

In [None]:
# Names of the predictor columns and of the binary target column.
X_columns = ["Ano", "Mes", "HH total", "local encoded"]
y_column = "binario acidentes"

# Feature frame and target series, both sliced from df.
X, y = df.loc[:, X_columns], df.loc[:, y_column]

## Modelo

### Modelos

In [None]:
# Registry of candidate classifiers, keyed by display name.
# Each entry holds:
#   "model": the unfitted estimator,
#   "probs": placeholder for the positive-class probabilities,
#   "preds": placeholder for the thresholded class predictions.
# Both placeholders start as empty lists and are overwritten by the
# training cell.
#
# Every estimator with internal randomness is seeded with 42 so the notebook
# gives the same numbers on each Restart & Run All:
#   - SVC: probability=True fits an internal cross-validated calibration whose
#     data shuffling is controlled by random_state.
#   - GradientBoostingClassifier: random_state seeds the per-stage trees.
#   - RandomForestClassifier: already seeded.
# The dict comprehension gives each entry its own fresh pair of lists.
models = {
    name: {"model": estimator, "probs": [], "preds": []}
    for name, estimator in {
        "Logistic Regression": LogisticRegression(),
        "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
        "SVC": SVC(probability=True, class_weight="balanced", random_state=42),
        "Naive Bayes": GaussianNB(),
        "GBM": GradientBoostingClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(
            n_estimators=100, random_state=42, class_weight="balanced"
        ),
    }.items()
}

### Cross validation

In [None]:
# Shuffled 10-fold splitter with a fixed seed; this is the splitter passed to
# cross_val_predict in the training cell.
kfold = KFold(n_splits=10, random_state=42, shuffle=True)

# Alternative splitters kept available: leave-one-out and leave-5-out.
loo, lpo = LeaveOneOut(), LeavePOut(p=5)

### Treino e Predição

In [None]:
# Cutoff applied to the predicted probability to obtain a class label.
# It is the same for every model, so it is set once outside the loop.
threshold = 0.5

# The year is not used as a model input. errors="ignore" keeps this cell
# re-runnable: on a second execution "Ano" is already gone from X and a plain
# drop would raise KeyError.
X = X.drop(columns=["Ano"], errors="ignore")

for model_name, m in models.items():
    model = m["model"]

    print(f"Model: {model_name}")
    # Cross-validated probabilities over the kfold splits. Column 1 of
    # predict_proba is kept; this is the positive class when the target is
    # coded 0/1 — NOTE(review): confirm the target coding.
    m["probs"] = cross_val_predict(model, X, y, cv=kfold, method="predict_proba")[:, 1]
    # Hard labels: 1 when the probability reaches the threshold, else 0.
    m["preds"] = (m["probs"] >= threshold).astype(int)

### Distribuição de frequência das probabilidades

In [None]:
# For each model, print how many predicted probabilities fall into each
# 0.01-wide bin, showing only the non-empty bins.
for model_name, m in models.items():
    print(f"Model: {model_name}")

    # Named max_prob rather than `max` so the builtin max() is not shadowed
    # for the remainder of the kernel session.
    max_prob = m["probs"].max()
    # Bin edges from 0 up to just past the largest probability, in 0.01 steps.
    bin_edges = np.arange(0, max_prob + 0.01, 0.01)
    freq_probs = pd.Series(m["probs"]).value_counts(bins=bin_edges).sort_index()
    print(freq_probs[freq_probs > 0])
    print("\n")

### Avaliação das métricas básicas

In [None]:
# NOTE(review): this whole cell is disabled (commented out). As written it
# iterates a `y_preds` mapping and scores against `y_test`; neither name is
# defined in the visible cells of this notebook, where predictions are stored
# under models[name]["preds"] and the ground truth is `y`. It would need to be
# adapted to those names before being re-enabled.
# Intended flow: per model, compute accuracy / precision / recall / F1, print
# the last three, and plot the confusion matrix titled with the model name.
# for model, y_pred in y_preds.items():
#     accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
#     precision = precision_score(y_true=y_test, y_pred=y_pred, average="binary")
#     recall = recall_score(y_true=y_test, y_pred=y_pred, average="binary")
#     f1 = f1_score(y_true=y_test, y_pred=y_pred, average="binary")

#     cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

#     # print(f"Accuracy: {accuracy:.2f}")
#     print(f"Precision: {precision:.2f}")
#     print(f"Recall: {recall:.2f}")
#     print(f"F1-Score: {f1:.2f}")

#     cm_disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Negativo", "Positivo"])
#     cm_disp.plot(cmap="Blues")

#     plt.title(model)
#     plt.show()
