## XGBoost otimizado


In [None]:
# Imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score

import xgboost as xgb

# Single seed reused by every stochastic step of the notebook
# (data split, CV folds, model sampling, randomized search).
RSEED = 42

# Also seed NumPy's legacy global generator with the same value.
np.random.seed(seed=RSEED)


**1. Carregar dados preparados**

In [2]:
# Read the already-prepared train and test tables from the working directory.
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train_data_prepared.csv", "test_data_prepared.csv")
)

# Report (rows, columns) for each table.
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# Last expression of the cell: preview the first rows of the training table.
train_df.head()


Train shape: (6812, 33)
Test shape: (1500, 32)


Unnamed: 0,AVERAGE_FREE_FLOW_SPEED,AVERAGE_TIME_DIFF,AVERAGE_FREE_FLOW_TIME,LUMINOSITY,AVERAGE_TEMPERATURE,AVERAGE_ATMOSP_PRESSURE,AVERAGE_HUMIDITY,AVERAGE_WIND_SPEED,AVERAGE_CLOUDINESS,RAIN_INTENSITY,...,supermarket_peak_level,periodo_aulas,fim_de_semana,hour_sin,hour_cos,dow_sin,dow_cos,month_sin,month_cos,congestion_ratio
0,41.5,11.5,71.4,2,15.0,1019.0,100.0,3.0,0,0,...,0,0,0,0.9659258,-0.258819,0.433884,-0.900969,-0.8660254,-0.5,0.161064
1,41.7,48.3,87.4,2,21.0,1021.0,53.0,5.0,1,0,...,1,0,0,-0.5,-0.866025,-0.433884,-0.900969,-0.8660254,-0.5,0.552632
2,38.6,38.4,85.2,2,26.0,1014.0,61.0,4.0,0,0,...,1,0,1,-0.8660254,-0.5,-0.781831,0.62349,-1.0,-1.83697e-16,0.450704
3,37.4,61.0,94.1,2,18.0,1025.0,48.0,4.0,1,0,...,0,1,0,0.258819,-0.965926,0.781831,0.62349,0.8660254,0.5,0.648247
4,41.6,50.4,77.0,2,15.0,1008.0,82.0,10.0,0,0,...,1,1,0,1.224647e-16,-1.0,0.433884,-0.900969,1.224647e-16,-1.0,0.654545


**Separar features e target**

In [None]:
# Name of the label column (already encoded as the integers 0-4).
TARGET = "AVERAGE_SPEED_DIFF"

# Labels as plain integers; the features are every remaining column.
y = train_df[TARGET].astype(int)
X = train_df.drop(TARGET, axis=1)

# Sanity output: feature matrix size and number of rows per class.
print("X shape:", X.shape)
print("y distribution:", y.value_counts().sort_index())


X shape: (6812, 32)
y distribution: AVERAGE_SPEED_DIFF
0    2200
1    1419
2    1651
3    1063
4     479
Name: count, dtype: int64


**2. Baseline XGBoost com early stopping**

Serve como ponto de comparação antes de fazer o tuning.

In [4]:
# Stratified 80/20 hold-out split. The 20 % part serves both as the early
# stopping monitor and as the set the printed accuracy is computed on.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RSEED
)

# Hyperparameters of the untuned reference model.
baseline_params = dict(
    objective="multi:softprob",
    num_class=5,
    n_estimators=2000,          # upper bound; early stopping may end training sooner
    learning_rate=0.03,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0,
    reg_alpha=0.5,
    reg_lambda=1,
    eval_metric="mlogloss",     # metric evaluated on eval_set during training
    random_state=RSEED,
    n_jobs=-1,
    tree_method="hist",
    early_stopping_rounds=50,
)
baseline_xgb = xgb.XGBClassifier(**baseline_params)

# Train while monitoring the hold-out set; per-round logging is silenced.
baseline_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

# Predicted class = column with the highest predicted probability per row.
val_proba = baseline_xgb.predict_proba(X_val)
y_val_pred = val_proba.argmax(axis=1)
print("Baseline validation accuracy:", accuracy_score(y_val, y_val_pred))


Baseline validation accuracy: 0.8158473954512105


**3. Tuning com RandomizedSearchCV**

Exploramos apenas os hiperparâmetros com efeito mais forte e fixamos o resto. 

In [5]:
# Discrete search space: 3 * 2 * 3 * 2 * 2 = 72 combinations in total.
param_grid = {
    "max_depth": [3, 5, 7],
    "learning_rate": [0.03, 0.05],
    "n_estimators": [300, 500, 700],
    "min_child_weight": [1, 3],
    "reg_alpha": [0, 0.5]
}

# Shuffled, stratified 5-fold split with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RSEED)

# Estimator template: every setting below stays fixed for all candidates;
# only the keys of `param_grid` vary.
xgb_base = xgb.XGBClassifier(
    objective="multi:softprob",
    num_class=5,
    eval_metric="mlogloss",
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0,
    reg_lambda=1,
    random_state=RSEED,
    # One thread per model: the search below already runs the fits in
    # parallel (n_jobs=-1). Letting every XGBoost fit also claim all cores
    # oversubscribes the CPU and slows both levels down.
    n_jobs=1,
    tree_method="hist"
)

# Sample 50 of the 72 combinations and score each by mean 5-fold accuracy.
search = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_grid,
    n_iter=50,
    scoring="accuracy",
    cv=cv,
    verbose=1,
    n_jobs=-1,              # parallelism lives here, across candidates and folds
    random_state=RSEED
)

# Cross-validate on the full training set (the folds supply the validation data).
search.fit(X, y)

print("Best params:", search.best_params_)
print("Best CV accuracy:", search.best_score_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best params: {'reg_alpha': 0, 'n_estimators': 700, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.03}
Best CV accuracy: 0.8137107938672898


**4. Treinar modelo final com todos os dados**

In [6]:
# Hyperparameters selected by the randomized search.
best_params = search.best_params_

# Settings that are not part of the search space and are set explicitly here.
fixed_params = dict(
    objective="multi:softprob",
    num_class=5,
    eval_metric="mlogloss",
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0,
    reg_lambda=1,
    random_state=RSEED,
    n_jobs=-1,
    tree_method="hist",
)

# Double unpacking raises TypeError if a tuned key ever duplicates a fixed one.
final_xgb = xgb.XGBClassifier(**fixed_params, **best_params)

# Fit on the complete training set (no hold-out part).
final_xgb.fit(X, y)

print("Modelo final treinado.")


Modelo final treinado.


**5. Prever para o dataset de teste**

In [7]:
# Predicted class = column with the highest predicted probability per row.
test_proba = final_xgb.predict_proba(test_df)
test_pred = test_proba.argmax(axis=1)

# Cell output: the first ten predictions and the shape of the prediction array.
test_pred[:10], test_pred.shape


(array([0, 1, 0, 3, 1, 2, 2, 2, 1, 2]), (1500,))

**6. Criar ficheiro de submissão Kaggle**

In [8]:
# Convert the integer class codes back to the original category names.
reverse_mapping = dict(enumerate(["None", "Low", "Medium", "High", "Very_High"]))

y_test_labels = pd.Series(test_pred).map(reverse_mapping)

# One row per test sample; RowId is 1-based.
row_ids = range(1, len(test_df) + 1)
submission = pd.DataFrame({"RowId": row_ids, "Speed_Diff": y_test_labels})

# Write the submission file without the DataFrame index column.
submission.to_csv("submission.csv", index=False)
print("\nO ficheiro submission.csv foi criado com sucesso")


O ficheiro submission.csv foi criado com sucesso


Accuracy no Kaggle: 0.83777