# Import Libraries


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Load Data


In [None]:
# Read the raw survey export into memory.
df = pd.read_csv(filepath_or_buffer="data/sods.csv")

# Print column names, non-null counts and dtypes.
df.info()

In [None]:
# Keep only the rows where 'cargo' is present.
df = df.loc[df["cargo"].notna()]

# Count the NaN values left in 'cargo'.
df["cargo"].isna().sum()

# Pre-process


In [None]:
# Work on an independent copy so the loaded `df` stays untouched.
pp_df = df.copy(deep=True)

# Print column names, non-null counts and dtypes of the copy.
pp_df.info()

In [None]:
# Per-column count of missing values.
pp_df.isna().sum()

In [None]:
# Impute missing 'idade' with the column median, then store it as integers.
idade_mediana = pp_df["idade"].median()
pp_df["idade"] = pp_df["idade"].fillna(value=idade_mediana).astype(int)
pp_df["idade"].isna().sum()

In [None]:
# Impute missing 'genero' with the most frequent value
# (first entry of the mode when there are ties).
genero_moda = pp_df["genero"].mode().iloc[0]
pp_df["genero"] = pp_df["genero"].fillna(value=genero_moda)
pp_df["genero"].isna().sum()

In [None]:
# Missing 'formacao' is recorded as the explicit category 'nenhuma'.
pp_df["formacao"] = pp_df["formacao"].where(pp_df["formacao"].notna(), "nenhuma")
pp_df["formacao"].isna().sum()

In [None]:
# Remove the columns considered too sparse to impute.
# NOTE(review): the "bancos_de_dados" transformation further down this file
# reads tt_df["bancos_de_dados"], which no longer exists once it is dropped here.
pp_df = pp_df.drop(["estado_moradia", "bancos_de_dados"], axis=1)

# Discard every row that still has at least one missing value.
pp_df = pp_df.dropna(axis=0, how="any")

# Per-column count of missing values after the clean-up.
pp_df.isna().sum()

# Transformation


In [None]:
# The transformation steps operate on their own copy of the pre-processed data.
tt_df = pp_df.copy(deep=True)

# Print column names, non-null counts and dtypes of the copy.
tt_df.info()

## Normalize Columns


In [None]:
# Normalize columns to lower case, strip whitespace, and remove accents
def normalize_column(column):
    """Return a text Series lower-cased, trimmed and reduced to plain ASCII.

    Accented characters are decomposed with NFKD; the non-ASCII code points
    (such as the combining marks) are then discarded by the ASCII encode
    step, so only the base letters remain.
    """
    cleaned = column.str.lower().str.strip()
    decomposed = cleaned.str.normalize("NFKD")
    ascii_bytes = decomposed.str.encode("ascii", errors="ignore")
    return ascii_bytes.str.decode("utf-8")


# Normalise the text columns, selected by position: columns 1-3 and every
# column from position 5 onwards. Positions 0 and 4 are left untouched.
# NOTE(review): presumably the skipped positions hold non-text columns —
# confirm against the column order of the CSV.
tt_df.iloc[:, 1:4] = tt_df.iloc[:, 1:4].apply(normalize_column)
tt_df.iloc[:, 5:] = tt_df.iloc[:, 5:].apply(normalize_column)

tt_df

## "nivel_ensino" Transformation


In [None]:
# Map 'nivel_ensino' values to integers
nivel_ensino_map = {
    "estudante de graduacao": 1,
    "graduacao/bacharelado": 2,
    "pos-graduacao": 3,
    "mestrado": 4,
    "doutorado ou phd": 5,
    "nao tenho graduacao formal": 0,
    "prefiro nao informar": 0,
}

# `replace` on a text column only yields a numeric dtype through pandas'
# implicit downcasting, which is deprecated. `infer_objects()` makes the
# conversion explicit so the column is not picked up as "object" by the
# `select_dtypes` call in the one-hot encoding step and encoded as a category.
# Values missing from the map are kept unchanged, as before.
tt_df["nivel_ensino"] = (
    tt_df["nivel_ensino"].replace(nivel_ensino_map).infer_objects()
)
tt_df["nivel_ensino"].value_counts(dropna=False)

## "tempo_experiencia_dados" Transformation


In [None]:
# Map 'tempo_experiencia_dados' values to floating-point numbers
tempo_experiencia_dados_map = {
    "menos de 1 ano": 0.5,
    "de 1 a 2 anos": 1.5,
    "de 3 a 4 anos": 3.5,
    "de 4 a 6 anos": 5.0,
    "de 7 a 10 anos": 8.0,
    "mais de 10 anos": 12.0,
    "nao tenho experiencia na area de dados": 0,
}

# `replace` on a text column only yields a numeric dtype through pandas'
# implicit downcasting, which is deprecated. `infer_objects()` makes the
# conversion explicit so the column is not picked up as "object" by the
# `select_dtypes` call in the one-hot encoding step and encoded as a category.
# Values missing from the map are kept unchanged, as before.
tt_df["tempo_experiencia_dados"] = (
    tt_df["tempo_experiencia_dados"]
    .replace(tempo_experiencia_dados_map)
    .infer_objects()
)
tt_df["tempo_experiencia_dados"].value_counts(dropna=False)

## "linguagens_preferidas" Transformation


In [None]:
# Collapse spelling variants and the different "none" answers of
# 'linguagens_preferidas' into a common vocabulary.
linguagens_preferidas_map = {
    "pyspark": "spark",
    "xlsx": "excel",
    "m language": "m",
    "aql": "sql",
    "sql postegres": "sql",
    "nao sei": "nenhuma",
    "nao utilizo": "nenhuma",
    "softwares estatisticos como spss": "nenhuma",
    "nao atuo com programacao": "nenhuma",
    "nenhum": "nenhuma",
    "nao uso": "nenhuma",
}
lp_harmonizada = tt_df["linguagens_preferidas"].replace(linguagens_preferidas_map)
tt_df["linguagens_preferidas"] = lp_harmonizada

# Values seen fewer than `lp_min_freq` times are grouped under "outro".
# NOTE(review): explode() only changes anything if cells hold list-like
# values — confirm whether it is needed for this column.
lp_min_freq = 5
lp_counts = tt_df["linguagens_preferidas"].explode().value_counts()
common_lp = lp_counts.loc[lp_counts.ge(lp_min_freq)].index

tt_df["linguagens_preferidas"] = tt_df["linguagens_preferidas"].map(
    lambda linguagem: "outro" if linguagem not in common_lp else linguagem
)
tt_df["linguagens_preferidas"].value_counts(dropna=False)

## "bancos_de_dados" Transformation


In [None]:
# Split 'bancos_de_dados' into a list of lowercased values.
# The pre-processing step drops 'bancos_de_dados' from pp_df (and therefore
# from tt_df), so reading it directly raises KeyError. Restore it from the
# loaded `df`, aligned on the row labels that are still present in tt_df.
if "bancos_de_dados" not in tt_df.columns:
    tt_df["bancos_de_dados"] = df.loc[tt_df.index, "bancos_de_dados"]

# Missing answers become "" so every row yields a list: the following cells
# iterate over each cell's items and already skip empty strings.
tt_df["bancos_de_dados_split"] = (
    tt_df["bancos_de_dados"].fillna("").str.lower().str.split(",")
)
tt_df["bancos_de_dados_split"]

In [None]:
from collections import Counter

# Count how often each (trimmed, non-empty) database name occurs over all rows.
contagem = Counter(
    nome
    for lista in tt_df["bancos_de_dados_split"]
    for nome in map(str.strip, lista)
    if nome != ""
)

# Sort by frequency, most common first
top_bancos = pd.Series(contagem).sort_values(ascending=False)
print(top_bancos.head(20))

In [None]:
# Names of the `top_n` most frequent databases; everything else is grouped later.
top_n = 20
bancos_top = set(top_bancos.index[:top_n])


def simplificar(lista):
    """Trim each entry, drop empty ones and replace uncommon names by "outros".

    An entry is kept as-is (after trimming) when it belongs to the
    module-level `bancos_top` set; any other non-empty entry becomes "outros".
    """
    resultado = []
    for bruto in lista:
        nome = bruto.strip()
        if nome == "":
            continue
        resultado.append(nome if nome in bancos_top else "outros")
    return resultado


# Reduce every row's database list to the frequent names plus "outros".
bancos_split = tt_df["bancos_de_dados_split"]
tt_df["bancos_de_dados_simplificado"] = bancos_split.apply(simplificar)
tt_df["bancos_de_dados_simplificado"]

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Multi-hot encode the simplified database lists: one 0/1 column per label.
mlb = MultiLabelBinarizer()
bds_encoded = mlb.fit_transform(tt_df["bancos_de_dados_simplificado"])

# Column names are "bds_<label>" with spaces turned into underscores; the
# frame reuses tt_df's index so the concat below lines up row by row.
bds_columns = ["bds_" + label.replace(" ", "_") for label in mlb.classes_]
bds_df = pd.DataFrame(bds_encoded, index=tt_df.index, columns=bds_columns)

# Append the indicator columns, then discard the raw and intermediate ones.
tt_df = pd.concat([tt_df, bds_df], axis=1)
tt_df = tt_df.drop(
    columns=["bancos_de_dados", "bancos_de_dados_split", "bancos_de_dados_simplificado"]
)
tt_df

## One Hot Encoder


In [None]:
from sklearn.preprocessing import OneHotEncoder

# Every object/bool column except the target 'cargo' is one-hot encoded.
object_or_bool = tt_df.select_dtypes(include=["object", "bool"])
categorical_columns = object_or_bool.columns.drop("cargo")

ohe_encoder = OneHotEncoder(sparse_output=False)

encoded_data = ohe_encoder.fit_transform(tt_df[categorical_columns])
new_columns = ohe_encoder.get_feature_names_out(categorical_columns)
df_ohe = pd.DataFrame(data=encoded_data, columns=new_columns)

# df_ohe carries a fresh default index, so the remaining columns get their
# index reset as well before both halves are joined side by side.
df_not_phe = tt_df.drop(columns=categorical_columns).reset_index(drop=True)
tt_df = pd.concat(objs=[df_not_phe, df_ohe], axis=1)

tt_df

## Label Encoder


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the target 'cargo' as integer class labels.
label_encoder = LabelEncoder()
encoded_label = label_encoder.fit_transform(tt_df["cargo"])
tt_df["cargo_label_encoded"] = encoded_label

# Preview the first 100 encoded labels.
tt_df["cargo_label_encoded"].head(100)

## Drop "cargo" column and reset DataFrame index


In [None]:
# The textual target is no longer needed once its encoded version exists.
tt_df = tt_df.drop("cargo", axis=1).reset_index(drop=True)
tt_df

# Training


In [None]:
from sklearn.model_selection import train_test_split

# Separate the encoded target from the feature matrix.
target_column = "cargo_label_encoded"
y = tt_df[target_column]
X = tt_df.drop(columns=[target_column])

# 70/30 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

for split_name, split_data in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{split_name} shape:{split_data.shape}")

In [None]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Send all runs to the MLflow tracking server on localhost, port 5000.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")


def _weighted_scores(y_true, y_pred, prefix):
    """Return accuracy and weighted precision/recall/F1 keyed '<prefix>_<metric>'."""
    return {
        f"{prefix}_accuracy": accuracy_score(y_true, y_pred),
        f"{prefix}_precision": precision_score(
            y_true, y_pred, average="weighted", zero_division=0
        ),
        f"{prefix}_recall": recall_score(
            y_true, y_pred, average="weighted", zero_division=0
        ),
        f"{prefix}_f1": f1_score(y_true, y_pred, average="weighted", zero_division=0),
    }


def run_experiment(model, param_grid, experiment_name="ML_Models"):
    """Tune `model` with a randomized search and record the result in MLflow.

    The search is fitted on the module-level `X_train`/`y_train`; the refitted
    best estimator is then scored on both the training data and the
    module-level `X_test`/`y_test`.

    Args:
        model: Unfitted scikit-learn estimator to tune.
        param_grid: Parameter distributions passed to `RandomizedSearchCV`.
        experiment_name: MLflow experiment that receives the run.

    Returns:
        None. The best parameters, the train/test metrics and the best
        estimator are logged to MLflow, and a one-line summary is printed.
    """
    mlflow.set_experiment(experiment_name)
    model_name = model.__class__.__name__

    search = RandomizedSearchCV(
        model, param_grid, cv=3, n_iter=50, random_state=42, n_jobs=-1
    )

    with mlflow.start_run(run_name=model_name):
        search.fit(X_train, y_train)

        # Score on the training data as well, so the gap to the test scores
        # is visible next to each other in the tracking UI.
        train_scores = _weighted_scores(y_train, search.predict(X_train), "train")
        test_scores = _weighted_scores(y_test, search.predict(X_test), "test")

        # Log the best hyper-parameters
        mlflow.log_params(search.best_params_)

        # Log training and test metrics in a single call
        mlflow.log_metrics({**train_scores, **test_scores})

        # Save the trained model
        mlflow.sklearn.log_model(search.best_estimator_, name=model_name)

        print(
            f"{model_name} → Best Params: {search.best_params_}, "
            f"Train Acc={train_scores['train_accuracy']:.4f}, "
            f"Test Acc={test_scores['test_accuracy']:.4f}, "
            f"Test Precision={test_scores['test_precision']:.4f}, "
            f"Test Recall={test_scores['test_recall']:.4f}, "
            f"Test F1={test_scores['test_f1']:.4f}"
        )


# 1. Logistic Regression
# The grid is split per solver: 'lbfgs' does not support the L1 penalty, so a
# single combined grid produces candidates whose fits fail. Each sub-grid
# below only contains combinations the solver accepts.
param_lr = [
    {
        "solver": ["liblinear"],
        "penalty": ["l1", "l2"],
        "C": [0.01, 0.1, 1, 10],
        "max_iter": [300, 500],
    },
    {
        "solver": ["lbfgs"],
        "penalty": ["l2"],
        "C": [0.01, 0.1, 1, 10],
        "max_iter": [300, 500],
    },
]
run_experiment(
    LogisticRegression(random_state=42),
    param_lr,
    experiment_name="LogisticRegression",
)

# 2. Decision Tree
# Search over split criterion, depth limit and minimum split/leaf sizes.
param_dt = dict(
    criterion=["gini", "entropy"],
    max_depth=[5, 10, 15, None],
    min_samples_split=[5, 10],
    min_samples_leaf=[2, 5, 10],
)
run_experiment(
    model=DecisionTreeClassifier(random_state=42),
    param_grid=param_dt,
    experiment_name="DecisionTree",
)

# 3. Random Forest
# Same tree-shape parameters as the decision tree, plus forest size and
# per-split feature sampling; class weights are always "balanced".
param_rf = dict(
    n_estimators=[100, 200],
    max_depth=[5, 10, 15, None],
    min_samples_split=[5, 10],
    min_samples_leaf=[2, 5, 10],
    max_features=["sqrt", "log2"],
    class_weight=["balanced"],
)
run_experiment(
    model=RandomForestClassifier(random_state=42),
    param_grid=param_rf,
    experiment_name="RandomForest",
)

# 4. Support Vector Machine (SVM)
# Search over regularisation strength, kernel type and kernel coefficient.
param_svm = dict(
    C=[0.1, 1, 10],
    kernel=["linear", "rbf", "poly", "sigmoid"],
    gamma=["scale", "auto"],
)
run_experiment(
    model=SVC(probability=True, random_state=42),
    param_grid=param_svm,
    experiment_name="SVM",
)

# 5. Neural network (MLPClassifier)
# Search over layer layout, activation, optimiser, initial learning rate,
# L2 penalty (alpha) and iteration budget.
param_mlp = dict(
    hidden_layer_sizes=[(50,), (100,), (100, 50)],
    activation=["relu", "tanh"],
    solver=["adam", "lbfgs"],
    learning_rate_init=[0.001, 0.01],
    alpha=[0.0001, 0.001, 0.01],
    max_iter=[300, 500],
)
run_experiment(
    model=MLPClassifier(random_state=42),
    param_grid=param_mlp,
    experiment_name="MLPClassifier",
)