In [17]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import mlflow
import mlflow.sklearn
from sklearn.model_selection import GridSearchCV

import warnings

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)


In [18]:
# Leitura do dataset refinado
diretorio_acidentes = 'data/refined/acidentes_recife_normalizados.csv'
df_acidentes = pd.read_csv(diretorio_acidentes, sep=',', parse_dates=['data'])
       

# Escolhendo coluna para Classificação

In [19]:

# Preparação dos dados para um modelo de classificação (natureza_acidente)
df_classificacao = df_acidentes.copy()

# Selecionando as colunas relevantes para classificação
columns_to_drop_classificacao = [
    'Unnamed: 0', 'data', 'hora', 'endereco', 'complemento', 'bairro_cruzamento',
    'descricao', 'Protocolo', 'estacao'
]
#adicionar  flag is_feriado no  pré-processamento

df_classificacao = df_classificacao.drop(columns=columns_to_drop_classificacao)

# Remover linhas com valores ausentes na coluna alvo
df_classificacao = df_classificacao.dropna(subset=['natureza_acidente'])

# Tratamento dos valores ausentes (NaNs) nas features
for column in df_classificacao.columns:
    if df_classificacao[column].dtype == 'object':
        df_classificacao[column].fillna(df_classificacao[column].mode()[0], inplace=True)
    else:
        df_classificacao[column].fillna(df_classificacao[column].mean(), inplace=True)

# Convertendo colunas categóricas para numéricas usando Label Encoding
categorical_columns = df_classificacao.select_dtypes(include=['object']).columns
for column in categorical_columns:
    df_classificacao[column] = df_classificacao[column].astype('category').cat.codes


#Estudar trocar Label Encoding por outra técnica - Target Encoding - Olhar a cardinalidade das variáveis categóricas. Se tiver baixa cardinalidade, one hot encoding.   Se fora alta, target encode

# Separando dados de treinamento, validação e teste.

In [20]:
# Separando os dados em features e target
X_classificacao = df_classificacao.drop(columns=['natureza_acidente'])
y_classificacao = df_classificacao['natureza_acidente']

# Separando os dados em treinamento, validação e teste
X_train, X_temp, y_train, y_temp = train_test_split(X_classificacao, y_classificacao, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)



# Treinando 4 modelos de classificação.

In [21]:
# Padronizar os dados
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

#verificar um possível vazamento de dados

### Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

with mlflow.start_run(run_name="Logistic Regression"):
    # Definindo o modelo
    model_lr = LogisticRegression()

    # Treinamento
    model_lr.fit(X_train, y_train)

    # Previsões
    y_pred_val_lr = model_lr.predict(X_val)

    # Avaliação
    accuracy_lr = accuracy_score(y_val, y_pred_val_lr)
    report_lr = classification_report(y_val, y_pred_val_lr, output_dict=True)

    # Log metrics
    mlflow.log_metric("accuracy", accuracy_lr)

    # Log model
    mlflow.sklearn.log_model(model_lr, "model")


In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Definir os parâmetros de busca para Logistic Regression
param_grid_lr = {
    'C': [0.1, 1, 10],
    'penalty': ['l2']
}

# Executar GridSearchCV para Logistic Regression
grid_search_lr = GridSearchCV(LogisticRegression(), param_grid_lr, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_lr.fit(X_train, y_train)

best_model_lr = grid_search_lr.best_estimator_
print(f"Best Logistic Regression model: {best_model_lr}")
print(f"Best parameters: {grid_search_lr.best_params_}")
print(f"Best score: {grid_search_lr.best_score_}")

# Avaliar o melhor modelo no conjunto de teste
y_pred_test_lr = best_model_lr.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_test_lr)
print(f"Test accuracy for Logistic Regression: {accuracy_lr}")

# Registrar o melhor modelo no MLflow
with mlflow.start_run(run_name="Best Model - Logistic Regression"):
    mlflow.log_metric("test_accuracy", accuracy_lr)
    mlflow.sklearn.log_model(best_model_lr, "best_model")


Best Logistic Regression model: LogisticRegression(C=1)
Best parameters: {'C': 1, 'penalty': 'l2'}
Best score: 0.8795576837856312
Test accuracy for Logistic Regression: 0.8764167690837089


### Decision Tree

In [24]:
with mlflow.start_run(run_name="Decision Tree"):
    # Definindo o modelo
    model_dt = DecisionTreeClassifier()

    # Treinamento
    model_dt.fit(X_train, y_train)

    # Previsões
    y_pred_val_dt = model_dt.predict(X_val)

    # Avaliação
    accuracy_dt = accuracy_score(y_val, y_pred_val_dt)
    report_dt = classification_report(y_val, y_pred_val_dt, output_dict=True)

    # Log metrics
    mlflow.log_metric("accuracy", accuracy_dt)
    
    # Log classification report
    mlflow.log_metrics({
        'precision': report_dt['weighted avg']['precision'],
        'recall': report_dt['weighted avg']['recall'],
        'f1-score': report_dt['weighted avg']['f1-score']
    })

    # Log model
    mlflow.sklearn.log_model(model_dt, "model")


In [28]:
# Definir os parâmetros de busca para Decision Tree
param_grid_dt = {
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Executar GridSearchCV para Decision Tree
grid_search_dt = GridSearchCV(DecisionTreeClassifier(), param_grid_dt, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_dt.fit(X_train, y_train)

best_model_dt = grid_search_dt.best_estimator_
print(f"Best Decision Tree model: {best_model_dt}")
print(f"Best parameters: {grid_search_dt.best_params_}")
print(f"Best score: {grid_search_dt.best_score_}")

# Avaliar o melhor modelo no conjunto de teste
y_pred_test_dt = best_model_dt.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_test_dt)
print(f"Test accuracy for Decision Tree: {accuracy_dt}")

# Registrar o melhor modelo no MLflow
with mlflow.start_run(run_name="Best Model - Decision Tree"):
    mlflow.log_metric("test_accuracy", accuracy_dt)
    mlflow.sklearn.log_model(best_model_dt, "best_model")


Best Decision Tree model: DecisionTreeClassifier(max_depth=10, min_samples_split=10)
Best parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best score: 0.959761479682629
Test accuracy for Decision Tree: 0.9586235149528881


### Random Forest

In [25]:
with mlflow.start_run(run_name="Random Forest"):
    # Definindo o modelo
    model_rf = RandomForestClassifier()

    # Treinamento
    model_rf.fit(X_train, y_train)

    # Previsões
    y_pred_val_rf = model_rf.predict(X_val)

    # Avaliação
    accuracy_rf = accuracy_score(y_val, y_pred_val_rf)
    report_rf = classification_report(y_val, y_pred_val_rf, output_dict=True)

    # Log metrics
    mlflow.log_metric("accuracy", accuracy_rf)
    
    # Log classification report
    mlflow.log_metrics({
        'precision': report_rf['weighted avg']['precision'],
        'recall': report_rf['weighted avg']['recall'],
        'f1-score': report_rf['weighted avg']['f1-score']
    })

    # Log model
    mlflow.sklearn.log_model(model_rf, "model")



In [29]:
# Definir os parâmetros de busca para Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Executar GridSearchCV para Random Forest
grid_search_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

best_model_rf = grid_search_rf.best_estimator_
print(f"Best Random Forest model: {best_model_rf}")
print(f"Best parameters: {grid_search_rf.best_params_}")
print(f"Best score: {grid_search_rf.best_score_}")

# Avaliar o melhor modelo no conjunto de teste
y_pred_test_rf = best_model_rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_test_rf)
print(f"Test accuracy for Random Forest: {accuracy_rf}")

# Registrar o melhor modelo no MLflow
with mlflow.start_run(run_name="Best Model - Random Forest"):
    mlflow.log_metric("test_accuracy", accuracy_rf)
    mlflow.sklearn.log_model(best_model_rf, "best_model")


Best Random Forest model: RandomForestClassifier(max_depth=20, min_samples_split=10)
Best parameters: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 100}
Best score: 0.9614002435496285
Test accuracy for Random Forest: 0.9597159634029769


### Support Vector Machine

In [26]:
with mlflow.start_run(run_name="Support Vector Machine"):
    # Definindo o modelo
    model_svm = SVC()

    # Treinamento
    model_svm.fit(X_train, y_train)

    # Previsões
    y_pred_val_svm = model_svm.predict(X_val)

    # Avaliação
    accuracy_svm = accuracy_score(y_val, y_pred_val_svm)
    report_svm = classification_report(y_val, y_pred_val_svm, output_dict=True)

    # Log metrics
    mlflow.log_metric("accuracy", accuracy_svm)
    
    # Log classification report metrics
    mlflow.log_metrics({
        'precision_weighted': report_svm['weighted avg']['precision'],
        'recall_weighted': report_svm['weighted avg']['recall'],
        'f1_score_weighted': report_svm['weighted avg']['f1-score'],
        'precision_macro': report_svm['macro avg']['precision'],
        'recall_macro': report_svm['macro avg']['recall'],
        'f1_score_macro': report_svm['macro avg']['f1-score']
    })

    # Log model
    mlflow.sklearn.log_model(model_svm, "model")


In [30]:
# Definir os parâmetros de busca para Support Vector Machine
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Executar GridSearchCV para Support Vector Machine
grid_search_svm = GridSearchCV(SVC(), param_grid_svm, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_svm.fit(X_train, y_train)

best_model_svm = grid_search_svm.best_estimator_
print(f"Best Support Vector Machine model: {best_model_svm}")
print(f"Best parameters: {grid_search_svm.best_params_}")
print(f"Best score: {grid_search_svm.best_score_}")

# Avaliar o melhor modelo no conjunto de teste
y_pred_test_svm = best_model_svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_test_svm)
print(f"Test accuracy for Support Vector Machine: {accuracy_svm}")

# Registrar o melhor modelo no MLflow
with mlflow.start_run(run_name="Best Model - Support Vector Machine"):
    mlflow.log_metric("test_accuracy", accuracy_svm)
    mlflow.sklearn.log_model(best_model_svm, "best_model")


KeyboardInterrupt: 

In [31]:
#comando feito para inicializar o mlflow no navegador. Descomente quando quiser visualizar.
#http://localhost:5000/


!mlflow ui


^C
