In [None]:
# ============================
# 📦 IMPORTS
# ============================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
    f1_score,
    roc_auc_score,
)
from sklearn.impute import SimpleImputer

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Apply seaborn's "whitegrid" style globally, so it affects the figures drawn in later cells.
sns.set(style="whitegrid")


In [None]:
# ============================
# 📥 LOAD SPREADSHEET
# ============================

# Read the project spreadsheet into the working frame. The path is relative,
# so it is resolved against the kernel's current working directory.
caminho_planilha = "projetos_completos_final.xlsx"
df = pd.read_excel(caminho_planilha)

# Show the first rows as the cell output.
df.head()

In [None]:
# ============================
# 🎯 CREATE TARGET VARIABLE
# ============================

# Project statuses that count as "at risk" (exact, case-sensitive match).
categorias_risco = ["Suspenso", "Paralisado", "Cancelado"]

# Binary label: 1 when the status text, converted to str and stripped of
# surrounding whitespace, is one of the risk categories; 0 otherwise.
# A missing status stringifies to something outside the list, so it becomes 0.
status_limpo = df["status_atual_do_projeto"].map(lambda valor: str(valor).strip())
df["risco"] = status_limpo.isin(categorias_risco).astype("int64")

# Class balance of the new label, shown as the cell output.
df["risco"].value_counts()

In [None]:
# Count plot of the binary target: how many projects fall into each class.
# An explicit Axes is created and passed to seaborn so the title is attached to
# exactly the axes that was drawn on.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(data=df, x="risco", ax=ax)
ax.set_title("Distribuição da variável RISCO")
plt.show()


In [None]:
# Grouped bar chart: number of projects per sector, one bar per risk label.
fig, ax = plt.subplots(figsize=(12, 5))

# Rows = sector, columns = risk label. fill_value=0 makes sector/label
# combinations that have no rows show up as 0 instead of NaN.
contagem_por_setor = df.groupby(["setor", "risco"]).size().unstack(fill_value=0)

# Draw on the axes created above. Without ax= pandas opens its own figure and
# the one created beforehand is rendered as an empty plot.
contagem_por_setor.plot(kind="bar", ax=ax)
ax.set_title("Projetos em risco por setor")
ax.set_ylabel("Quantidade")
plt.show()


In [None]:
# Box plot of the estimated cost for each class of the binary target.
# An explicit Axes is created and passed to seaborn so the title is attached to
# exactly the axes that was drawn on.
fig, ax = plt.subplots(figsize=(7, 5))
sns.boxplot(data=df, x="risco", y="custo_estimado", ax=ax)
ax.set_title("Relação entre custo estimado e risco")
plt.show()

In [None]:
# ============================
# 📦 FEATURE SELECTION
# ============================

# Numeric inputs.
num_features = ["custo_estimado", "latitude", "longitude"]

# Categorical inputs.
# NOTE(review): "status_do_contrato" and "status_atividade" may be closely tied
# to "status_atual_do_projeto", from which the label is derived — confirm they
# do not leak the target.
cat_features = [
    "tipo_projeto",
    "setor",
    "subsetor",
    "organizacao",
    "status_do_contrato",
    "status_atividade",
    "arranjo_contratual",
]

# Keep only the modelling columns plus the label, leaving out highly textual
# variables / IDs; copy so later changes do not touch the original frame.
colunas_modelo = [*num_features, *cat_features, "risco"]
df_model = df[colunas_modelo].copy()

In [None]:
# Split the modelling frame into the target vector and the feature matrix
# (every column except the label).
y = df_model["risco"]
X = df_model.drop(columns=["risco"])

In [None]:
# ============================
# 🧹 PREPROCESSOR
# ============================

# Numeric branch: fill missing values with the column median, then standardise.
# Scaling puts the numeric columns on a scale comparable to the 0/1 one-hot
# columns, which the logistic-regression solver and its regularisation need;
# it is applied to all three candidate models.
numeric_steps = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. Categories unseen during fit are encoded as all zeros instead
# of raising.
categorical_steps = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

# Transformer names ("num", "cat") and step names ("imp", "ohe") are kept
# stable because later cells look them up on the fitted pipeline.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_steps, num_features),
        ("cat", categorical_steps, cat_features),
    ]
)

In [None]:
# Candidate classifiers compared by cross-validation, keyed by display name.
# Both ensemble models get the same fixed seed so the comparison is reproducible
# across kernel restarts.
models = {
    "Logistic Regression": LogisticRegression(max_iter=300),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

In [None]:
# ============================
# 🤖 TRAIN + CV EVALUATION
# ============================

# Mean cross-validated F1 per candidate, keyed by the model's display name.
results = {}

for name, model in models.items():
    # Preprocessing lives inside the pipeline, so it is re-fitted on the
    # training part of every fold.
    pipe = Pipeline([("prep", preprocess), ("model", model)])

    # 5-fold cross-validation scored with F1 on the positive class (risco == 1).
    # NOTE(review): an integer cv does not shuffle rows — confirm the
    # spreadsheet is not ordered in a way that biases the folds.
    scores = cross_val_score(pipe, X, y, cv=5, scoring="f1")

    mean_f1 = scores.mean()
    results[name] = mean_f1
    print(f"{name} → F1-score médio: {mean_f1:.4f}")

In [None]:
# ============================
# 🏆 TRAIN FINAL MODEL
# ============================

# Final pipeline: shared preprocessing followed by a random forest. The forest
# is seeded, matching the seed used for the train/test split below, so the
# report and the importances derived from this model are reproducible.
# NOTE(review): the random forest is hard-coded here rather than chosen from
# the cross-validation `results` — confirm it is the intended winner.
best_model = Pipeline([
    ("prep", preprocess),
    ("model", RandomForestClassifier(n_estimators=300, random_state=42)),
])

# Hold out 20% of the rows for testing; stratify so both splits keep the same
# class proportions as the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

best_model.fit(X_train, y_train)

# Predicted labels for the held-out rows.
preds = best_model.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_test, preds))

In [None]:
# Confusion matrix of the final model on the held-out test split.
# ConfusionMatrixDisplay comes from the imports cell at the top of the notebook,
# alongside the other sklearn.metrics names.
fig, ax = plt.subplots(figsize=(5, 4))
ConfusionMatrixDisplay.from_predictions(y_test, preds, ax=ax)
ax.set_title("Matriz de confusão — modelo final")
plt.show()

In [None]:
# ============================
# 🔍 FEATURE IMPORTANCE
# ============================

# The forest is trained on the one-hot-expanded columns, so the importance
# vector has to be labelled with the encoder's output names rather than the
# original categorical column names.
etapas_ajustadas = best_model.named_steps
model_rf = etapas_ajustadas["model"]
ramo_categorico = etapas_ajustadas["prep"].named_transformers_["cat"]
ohe = ramo_categorico.named_steps["ohe"]

# Column order follows the ColumnTransformer: numeric features first, then the
# one-hot columns generated from the categorical features.
feature_names = [*num_features, *ohe.get_feature_names_out(cat_features)]

importances = pd.Series(model_rf.feature_importances_, index=feature_names)

# Twenty most important features, highest first, shown as the cell output.
importances.sort_values(ascending=False).iloc[:20]