In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                            confusion_matrix, ConfusionMatrixDisplay)
from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression


In [None]:
from resolve_path import ajuste_path

In [None]:
# Resolve the data directory through the project helper, then load the
# training dataset that lives inside it.
path = ajuste_path("data/util/")

df = pd.read_csv(f"{path}dataset_treinamento.csv")

## Preparando o dataframe para o modelo

### Escolhendo as colunas utilizadas

In [None]:
# Shorten the target column's name and keep only the columns used by the
# rest of the notebook. The rename is chained (no inplace mutation), so the
# statement builds a fresh frame from its input.
columns = ["Ano", "Mes", "Local de instalação", "HH por mes", "Alto Potencial"]
df = df.rename(columns={"Binario Acidentes de Alto Potencial": "Alto Potencial"})[columns]

# DataFrame.info() writes its report to stdout and returns None, so it must
# not be wrapped in print() (that would emit a stray "None" line).
df.info()

# Class balance of the target.
print(df["Alto Potencial"].value_counts())

### Encoding do local

In [None]:
label_encoder = LabelEncoder()

# Number of distinct locations before encoding.
local_raw = df["Local de instalação"]
print(local_raw.nunique())

# Swap the text column for its integer codes.
local_codes = label_encoder.fit_transform(local_raw)
df = df.drop(columns=["Local de instalação"]).assign(**{"local encoded": local_codes})

# Number of distinct codes after encoding.
print(df["local encoded"].nunique())

# Fix the column order used from here on (target last).
columns = ["Ano", "Mes", "HH por mes", "local encoded", "Alto Potencial"]
df = df[columns]

## Matriz de correlação entre as colunas escolhidas

In [None]:
# Pairwise correlation of every column except the year.
# NOTE(review): "local encoded" holds arbitrary integer codes assigned by
# LabelEncoder, so its coefficients carry no ordinal meaning.
sem_ano = df.drop(columns="Ano")
corr = sem_ano.corr()

# Diverging colormap pinned to [-1, 1] so that zero correlation sits at the
# neutral midpoint; list(plt.colormaps) shows the available alternatives.
ax = sns.heatmap(corr, cmap="RdBu", annot=True, vmin=-1, vmax=1)
ax.set_title("Matriz de Correlação")
plt.xticks(rotation=45)

plt.show()

## Definindo o treino e teste

In [None]:
X_columns = ["Ano", "Mes", "HH por mes", "local encoded"]
y_column = "Alto Potencial"

X = df[X_columns]
y = df[y_column]

# Temporal hold-out. Train on 2020 through June 2023; test on July 2023 onwards.
TRAIN_FIRST_YEAR = 2020
SPLIT_YEAR = 2023
LAST_TRAIN_MONTH = 6

# Each mask is built exactly once and reused for both X and y, so features and
# target can never be filtered by two slightly different conditions.
ano, mes = X["Ano"], X["Mes"]
train_mask = ((ano >= TRAIN_FIRST_YEAR) & (ano <= SPLIT_YEAR - 1)) | (
    (ano == SPLIT_YEAR) & (mes <= LAST_TRAIN_MONTH)
)
test_mask = (ano >= SPLIT_YEAR + 1) | ((ano == SPLIT_YEAR) & (mes > LAST_TRAIN_MONTH))

# The two masks are not complements of each other: rows dated before
# TRAIN_FIRST_YEAR (or with a missing year/month) match neither and are left
# out of both sets. Report them instead of dropping them silently.
n_unused = int((~(train_mask | test_mask)).sum())
if n_unused:
    print(f"{n_unused} rows match neither the train nor the test period and are not used")

# "Ano" is only needed to build the split; it is not a model feature.
X_train = X.loc[train_mask].drop(columns=["Ano"])
y_train = y.loc[train_mask]

print(y_train.value_counts())

X_test = X.loc[test_mask].drop(columns=["Ano"])
y_test = y.loc[test_mask]

print(y_test.value_counts())

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

## Modelo

### Treino e Predição

In [None]:
# Candidate classifiers, keyed by the display name used in logs and plot titles.
models = {
    "Logistic Regression": LogisticRegression(),
    "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
}

# Test-set predictions of each classifier, keyed by the same display name.
y_preds = {}

for model_name in models:
    estimator = models[model_name]

    print(f"Training {model_name}")
    estimator.fit(X_train, y_train)

    print(f"Testing {model_name}")
    y_preds[model_name] = estimator.predict(X_test)

### Avaliação das métricas básicas

In [None]:
# Score every model's test-set predictions, draw one confusion matrix per
# model, and finish with a single table of the basic metrics.
metric_rows = {}

for model_name, y_pred in y_preds.items():
    # zero_division=0 keeps the score at 0 when a model predicts no positive
    # at all, without emitting UndefinedMetricWarning.
    metric_rows[model_name] = {
        "Accuracy": accuracy_score(y_true=y_test, y_pred=y_pred),
        "Precision": precision_score(y_true=y_test, y_pred=y_pred, average="binary", zero_division=0),
        "Recall": recall_score(y_true=y_test, y_pred=y_pred, average="binary", zero_division=0),
        "F1-Score": f1_score(y_true=y_test, y_pred=y_pred, average="binary", zero_division=0),
    }

    # NOTE(review): display_labels assumes a 2x2 matrix, i.e. both classes
    # occur in y_test or y_pred - confirm for very small test sets.
    cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

    cm_disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Negativo", "Positivo"])
    cm_disp.plot(cmap="Blues")
    cm_disp.ax_.set_title(model_name)
    plt.show()

# One row per model, one column per metric.
metrics_table = pd.DataFrame.from_dict(metric_rows, orient="index")
print(metrics_table.round(2))
