In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, label_binarize
from scipy.stats import uniform, randint


# Classification / classificação

In [None]:
# Load the Excel workbook into `df`, then preview its first rows and its schema.
try:
    df = pd.read_excel('NAME.xlsx')
    print(
        "NAME.xlsx file loaded successfully!",
        "First 5 rows of the DataFrame:",
        sep="\n",
    )
    display(df.head())
    print("\nDataFrame Information:")
    df.info()
except FileNotFoundError:
    # Reported rather than re-raised; `df` is not assigned by this cell in that case.
    print(
        "Error: The file 'NAME.xlsx' was not found. "
        "Please ensure the file name is correct and it is in the correct directory."
    )
except Exception as exc:
    print(f"An error occurred while loading the file: {exc}")

In [None]:
# Load the CSV file into `df`, then preview its first rows and its schema.
try:
    df = pd.read_csv('NAME.csv')
    print("NAME.csv file loaded successfully!")
    print("First 5 rows of the DataFrame:")
    display(df.head())
    print("\nDataFrame Information:")
    df.info()
except FileNotFoundError:
    # The message names the CSV file this cell actually tries to read.
    print("Error: The file 'NAME.csv' was not found. Please ensure the file name is correct and it is in the correct directory.")
except Exception as e:
    # Any other read/preview failure is reported rather than re-raised.
    print(f"An error occurred while loading the file: {e}")

In [None]:
# Position of the target column inside `df`; replace 0 with the index of your own target.
target_column_index = 0
target_column_name = df.columns[target_column_index]

# The target series (y) is that column; the feature matrix (X) is every other column.
y = df[target_column_name]
X = df.drop(labels=[target_column_name], axis=1)

In [None]:
# Label-encode the target so XGBoost receives integer class ids even when the
# original classes are non-numeric.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 80/20 train/test split, stratified on the encoded target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

In [None]:
# Build the XGBoost classifier (one output class per label found by the encoder)
# and fit it on the training split.
model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(label_encoder.classes_),
    eval_metric='mlogloss',
    random_state=42,
)
model.fit(X_train, y_train)

In [None]:
# Predict the encoded class ids for the test split.
y_pred = model.predict(X_test)

# Explicit label ids and string names for the report and the confusion matrix:
# - classification_report calls len() on every target name, so non-string classes
#   (e.g. integer labels) must be converted to str to avoid a TypeError;
# - passing `labels` keeps the report rows and the matrix axes aligned with all
#   encoder classes even if one of them is absent from y_test / y_pred.
class_ids = np.arange(len(label_encoder.classes_))
class_names = [str(c) for c in label_encoder.classes_]

# Model evaluation: accuracy, per-class report and confusion matrix.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, labels=class_ids, target_names=class_names)
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)

print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report)

# Confusion matrix as a heatmap (rows = true classes, columns = predicted classes).
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predictions')
plt.ylabel('True Values')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Optional: rank the feature columns by the importance scores of the fitted classifier.
features = X.columns
feature_importances = model.feature_importances_

importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

print("\nFeature Importance (top 10):")
print(importance_df.head(10))

In [None]:
# Stratified K-fold cross-validation of the XGBoost classifier on the full data set.
n_splits = 5  # Number of folds; change as needed

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Fixed label ids and string names shared by every fold:
# - classification_report calls len() on each target name, so non-string classes
#   (e.g. integer labels) would raise TypeError;
# - without explicit `labels`, a fold that lacks one class makes target_names
#   mismatch the labels found in that fold (ValueError) and changes the shape of
#   the confusion matrix.
class_ids = np.arange(len(label_encoder.classes_))
class_names = [str(c) for c in label_encoder.classes_]

# Per-fold results, in fold order.
accuracy_scores = []
reports = []
confusion_matrices = []

print(f"Starting K-Fold Cross Validation with {n_splits} folds...")

for fold, (train_index, test_index) in enumerate(skf.split(X, y_encoded)):
    print(f"\n--- Fold {fold+1}/{n_splits} ---")
    # X is indexed by position; y_encoded is indexed directly with the fold indices.
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y_encoded[train_index], y_encoded[test_index]

    # A fresh model per fold, configured like the single-split classifier.
    model_kf = xgb.XGBClassifier(objective='multi:softmax', num_class=len(label_encoder.classes_), eval_metric='mlogloss', random_state=42)
    model_kf.fit(X_train_fold, y_train_fold)

    y_pred_fold = model_kf.predict(X_test_fold)

    accuracy_fold = accuracy_score(y_test_fold, y_pred_fold)
    accuracy_scores.append(accuracy_fold)

    report_fold = classification_report(y_test_fold, y_pred_fold, labels=class_ids, target_names=class_names)
    reports.append(report_fold)

    conf_matrix_fold = confusion_matrix(y_test_fold, y_pred_fold, labels=class_ids)
    confusion_matrices.append(conf_matrix_fold)

    print(f"Accuracy of Fold {fold+1}: {accuracy_fold:.4f}")

# Summary: mean and standard deviation of the per-fold accuracies.
print(f"\n--- Final Results of K-Fold Cross Validation (Average of {n_splits} Folds) ---")
print(f"Average Accuracy: {np.mean(accuracy_scores):.4f}")
print(f"Standard Deviation of Accuracy: {np.std(accuracy_scores):.4f}")

In [None]:
# Class-membership probabilities predicted for the test split (one column per class).
y_proba = model.predict_proba(X_test)

# Number of classes known to the label encoder.
n_classes = len(label_encoder.classes_)

# One-vs-rest binarisation of the true labels.
# NOTE: for binary classification label_binarize might return a single column, so the
#       plotting loop below builds each one-vs-rest target directly from y_test
#       instead of indexing into this array.
y_test_binarized = label_binarize(y_test, classes=range(n_classes))

# One ROC curve per class: samples of class i are the positives, all others the negatives.
plt.figure(figsize=(10, 8))
for i, class_label in enumerate(label_encoder.classes_):
    is_class_i = (y_test == i).astype(int)
    fpr, tpr, _ = roc_curve(is_class_i, y_proba[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=2, label=f'ROC curve of class {class_label} (area = {roc_auc:.2f})')

# Dashed diagonal: reference line of a random classifier.
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve - One-vs-Rest')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

In [None]:
# Macro-average ROC curve. Relies on y_test, y_proba and n_classes from the cells above.

# Step 1: one-vs-rest FPR/TPR (and AUC) for every class, keyed by class id.
fpr, tpr = {}, {}
roc_auc_per_class = []
for i in range(n_classes):
    # Positives are the samples whose encoded label equals i.
    fpr[i], tpr[i], _ = roc_curve((y_test == i).astype(int), y_proba[:, i])
    roc_auc_per_class.append(auc(fpr[i], tpr[i]))

# Step 2: the common grid is the sorted union of every class's FPR points.
all_fpr = np.unique(np.concatenate(list(fpr.values())))

# Step 3: interpolate each class curve on that grid and average the TPRs.
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes

# Step 4: area under the averaged curve.
fpr_macro, tpr_macro = all_fpr, mean_tpr
roc_auc_macro = auc(fpr_macro, tpr_macro)

plt.figure(figsize=(10, 8))
plt.plot(
    fpr_macro,
    tpr_macro,
    lw=2,
    label=f'Macro-average ROC curve (area = {roc_auc_macro:.2f})',
    color='navy',
    linestyle='--',
)

# Dashed diagonal: reference line of a random classifier.
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve - Macro-Average')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

# Regression / regressão

## sem label / without label (a primeira coluna é o alvo / the first column is the target)

In [None]:
# Load the Excel workbook into `df` and show its first rows.
try:
    df = pd.read_excel('NAME.xlsx')
    display(df.head())
except FileNotFoundError:
    # Reported rather than re-raised; upload code could be added here if needed.
    print(
        "Error: The file 'NAME.xlsx' was not found. "
        "Make sure it is in the correct directory."
    )
except Exception as exc:
    print(f"An error occurred while loading the file: {exc}")

In [None]:
# Load the CSV file into `df` and show its first rows.
try:
    df = pd.read_csv('NAME.csv')
    display(df.head())
except FileNotFoundError:
    # Reported rather than re-raised; upload code could be added here if needed.
    print(
        "Error: The file 'NAME.csv' was not found. "
        "Make sure it is in the correct directory."
    )
except Exception as exc:
    print(f"An error occurred while loading the file: {exc}")

In [None]:
# Assumes the first column is the target (y) and all remaining columns are the features (X).
# Explicit copies are taken so the conversions below never write into `df`
# (and never trigger pandas' SettingWithCopy warnings).
X = df.iloc[:, 1:].copy()  # All columns except the first
y = df.iloc[:, 0].copy()   # The first column

print(f"Target column name: {y.name}")

# Try to convert object columns of X to numeric. Columns that cannot be converted are
# dropped, as the warning announces: left in place as object dtype they would make the
# XGBoost fit in the following cells fail.
non_numeric_columns = []
for col in X.select_dtypes(include=['object']).columns:
    try:
        X[col] = pd.to_numeric(X[col])
    except (ValueError, TypeError):
        print(f"Warning: Column '{col}' in X could not be converted to numeric and will be ignored or require specific pre-processing.")
        non_numeric_columns.append(col)
X = X.drop(columns=non_numeric_columns)

# Convert y to numeric if it is stored as object.
if y.dtype == 'object':
    try:
        y = pd.to_numeric(y)
    except ValueError:
        print("Warning: The target variable (y) could not be converted to numeric. XGBoost requires a numeric target for regression.")
        print("Please check if your target column contains only numeric values or if it needs different pre-processing.")

# Cleaning step: remove rows with NaN or infinite values from X and y together,
# so features and target stay aligned row by row.
initial_rows = X.shape[0]

# Treat +/-inf as missing so a single NaN mask covers both cases.
X = X.replace([np.inf, -np.inf], np.nan)
y = y.replace([np.inf, -np.inf], np.nan)

# Rows with any NaN in X, and rows with NaN in y.
nan_mask_X = X.isnull().any(axis=1)
nan_mask_y = y.isnull()

# Keep only the rows where neither X nor y has a NaN.
valid_rows_mask = ~(nan_mask_X | nan_mask_y)

X = X[valid_rows_mask]
y = y[valid_rows_mask]

if X.shape[0] < initial_rows:
    print(f"Warning: {initial_rows - X.shape[0]} rows were removed due to NaN or infinite values in X or y.")

# Checked unconditionally: the data can also be empty because the input had no rows
# or because every feature column was dropped above.
if X.empty or y.empty:
    raise ValueError("After cleaning, X or y are empty. Cannot continue with training.")


# Split the data into training and test sets (80/20, seeded for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
# Parameter evaluation: sweep the number of trees and compare training vs. held-out RMSE.
# This is regression, so y_train / y_test are used directly (no argmax step).
# Note: the curve labelled 'Validation' is computed on the test split (X_test, y_test).

n_estimators_range = range(1, 101, 5)  # 1, 6, ..., 96
train_rmse_scores = []
test_rmse_scores = []

for n_est in n_estimators_range:
    # A fresh regressor for every candidate tree count.
    model_eval = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=n_est,
        learning_rate=0.1,
        random_state=42,
    )
    model_eval.fit(X_train, y_train)

    # Predictions on both splits.
    y_train_pred = model_eval.predict(X_train)
    y_test_pred = model_eval.predict(X_test)

    # RMSE is the square root of the mean squared error.
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    train_rmse_scores.append(train_rmse)
    test_rmse_scores.append(test_rmse)

# RMSE of both splits as a function of the number of trees.
plt.figure(figsize=(10, 6))
plt.plot(n_estimators_range, train_rmse_scores, label='Training', marker='o')
plt.plot(n_estimators_range, test_rmse_scores, label='Validation', marker='o')
plt.xlabel('Number of Estimators')
plt.ylabel('RMSE')
plt.title('XGBoost Regression: RMSE vs. Number of Estimators')
plt.legend()
plt.grid(True)
plt.show()

# First position of the lowest held-out RMSE, and the tree count that produced it.
min_rmse_index = int(np.argmin(test_rmse_scores))
min_rmse = test_rmse_scores[min_rmse_index]
best_n_estimators = n_estimators_range[min_rmse_index]

print(f"Number of Estimators with the minimum validation RMSE: {best_n_estimators}")
print(f"Minimum validation RMSE: {min_rmse:.4f}")

In [None]:
# Initialize the final XGBoost regressor.
# The tree count comes from the n_estimators sweep in the previous cell
# (best_n_estimators) instead of a number copied by hand from an earlier run, so it
# stays correct when the data changes. Assign a different integer here to override it.
xgb_reg = xgb.XGBRegressor(
    objective='reg:squarederror',    # Objective for regression problems
    n_estimators=best_n_estimators,  # Number of trees selected by the sweep
    learning_rate=0.1,               # Same learning rate the sweep used
    random_state=42
)

# Train the model on the training split.
xgb_reg.fit(X_train, y_train)

print("XGBoost model trained successfully!")

In [None]:
# Predict on the test split with the fitted regressor.
y_pred = xgb_reg.predict(X_test)

# Error metrics: MSE, its square root (RMSE) and the coefficient of determination.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R²): {r2:.4f}")

# Actual vs. predicted scatter; the dashed red diagonal marks perfect predictions.
actual_range = [y_test.min(), y_test.max()]

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.7)
plt.plot(actual_range, actual_range, 'r--', lw=2)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual vs. Predicted Values (XGBoost Regression)")
plt.grid(True)
plt.show()

In [None]:
# Base regressor whose hyperparameters are sampled by the randomized search.
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Sampling distributions, one per hyperparameter.
param_dist = dict(
    n_estimators=randint(50, 200),       # number of boosting rounds
    learning_rate=uniform(0.01, 0.2),    # step-size shrinkage
    max_depth=randint(3, 10),            # maximum tree depth
    subsample=uniform(0.6, 0.4),         # row subsample ratio per tree
    colsample_bytree=uniform(0.6, 0.4),  # column subsample ratio per tree
    gamma=uniform(0, 0.5),               # minimum loss reduction required to split a leaf
)

# Randomized search settings:
#   n_iter  - number of sampled parameter settings (more is better but slower)
#   cv      - number of cross-validation folds
#   scoring - negated MSE, so that a larger score is better
random_search = RandomizedSearchCV(
    model,
    param_dist,
    n_iter=100,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,  # use all available cores
    random_state=42,
)

# Run the search on the training split only.
print("Starting RandomizedSearchCV...")
random_search.fit(X_train, y_train)

# best_score_ is a negated MSE, hence the sign flip before taking the square root.
print("\nBest parameters found: ", random_search.best_params_)
print("Best RMSE found: ", np.sqrt(-random_search.best_score_))

# Best estimator found by the search.
best_model = random_search.best_estimator_

# Score that estimator on the test split.
y_pred_best = best_model.predict(X_test)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
r2_best = r2_score(y_test, y_pred_best)

print(f"\nRMSE of the best model on the test set: {rmse_best:.4f}")
print(f"R-squared of the best model on the test set: {r2_best:.4f}")


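## com label / with label (a primeira coluna é descartada e a segunda é o alvo / the first column is dropped and the second is the target)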

In [None]:
# Load the Excel workbook into `df` and show its first rows.
try:
    df = pd.read_excel('NAME.xlsx')
    display(df.head())
except FileNotFoundError:
    # Reported rather than re-raised; upload code could be added here if needed.
    print(
        "Error: The file 'NAME.xlsx' was not found. "
        "Make sure it is in the correct directory."
    )
except Exception as exc:
    print(f"An error occurred while loading the file: {exc}")

In [None]:
# Load the CSV file into `df` (no preview is shown in this cell).
try:
    df = pd.read_csv('NAME.csv')
except FileNotFoundError:
    # Reported rather than re-raised; upload code could be added here if needed.
    print(
        "Error: The file 'NAME.csv' was not found. "
        "Make sure it is in the correct directory."
    )
except Exception as exc:
    print(f"An error occurred while loading the file: {exc}")

In [None]:
# Assumes the first column of `df` must be excluded; after dropping it, the new first
# column is the target (y) and the remaining columns are the features (X).
first_column = df.columns[0]
df_processed = df.drop(columns=[first_column])
df_processed.head()

In [None]:
# Target = the new first column of df_processed; features = every other column.
# Explicit copies are taken so the conversions below never write into `df_processed`
# (and never trigger pandas' SettingWithCopy warnings).
y = df_processed.iloc[:, 0].copy()   # The new first column
X = df_processed.iloc[:, 1:].copy()  # All other columns

print(f"Target column name: {y.name}")

# Try to convert object columns of X to numeric. Columns that cannot be converted are
# dropped, as the warning announces: left in place as object dtype they would make the
# XGBoost fit in the following cells fail.
non_numeric_columns = []
for col in X.select_dtypes(include=['object']).columns:
    try:
        X[col] = pd.to_numeric(X[col])
    except (ValueError, TypeError):
        print(f"Warning: Column '{col}' in X could not be converted to numeric and will be ignored or require specific pre-processing.")
        non_numeric_columns.append(col)
X = X.drop(columns=non_numeric_columns)

# Convert y to numeric if it is stored as object.
if y.dtype == 'object':
    try:
        y = pd.to_numeric(y)
    except ValueError:
        print("Warning: The target variable (y) could not be converted to numeric. XGBoost requires a numeric target for regression.")
        print("Please check if your target column contains only numeric values or if it needs different pre-processing.")

# Cleaning step: remove rows with NaN or infinite values from X and y together,
# so features and target stay aligned row by row.
initial_rows = X.shape[0]

# Treat +/-inf as missing so a single NaN mask covers both cases.
X = X.replace([np.inf, -np.inf], np.nan)
y = y.replace([np.inf, -np.inf], np.nan)

# Rows with any NaN in X, and rows with NaN in y.
nan_mask_X = X.isnull().any(axis=1)
nan_mask_y = y.isnull()

# Keep only the rows where neither X nor y has a NaN.
valid_rows_mask = ~(nan_mask_X | nan_mask_y)

X = X[valid_rows_mask]
y = y[valid_rows_mask]

if X.shape[0] < initial_rows:
    print(f"Warning: {initial_rows - X.shape[0]} rows were removed due to NaN or infinite values in X or y.")

# Checked unconditionally: the data can also be empty because the input had no rows
# or because every feature column was dropped above.
if X.empty or y.empty:
    raise ValueError("After cleaning, X or y are empty. Cannot continue with training.")


# Split the data into training and test sets (80/20, seeded for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
# Parameter evaluation: sweep the number of trees and compare training vs. held-out RMSE.
# This is regression, so y_train / y_test are used directly (no argmax step).
# Note: the curve labelled 'Validation' is computed on the test split (X_test, y_test).

n_estimators_range = range(1, 101, 5)  # 1, 6, ..., 96
train_rmse_scores = []
test_rmse_scores = []

for n_est in n_estimators_range:
    # A fresh regressor for every candidate tree count.
    model_eval = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=n_est,
        learning_rate=0.1,
        random_state=42,
    )
    model_eval.fit(X_train, y_train)

    # Predictions on both splits.
    y_train_pred = model_eval.predict(X_train)
    y_test_pred = model_eval.predict(X_test)

    # RMSE is the square root of the mean squared error.
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    train_rmse_scores.append(train_rmse)
    test_rmse_scores.append(test_rmse)

# RMSE of both splits as a function of the number of trees.
plt.figure(figsize=(10, 6))
plt.plot(n_estimators_range, train_rmse_scores, label='Training', marker='o')
plt.plot(n_estimators_range, test_rmse_scores, label='Validation', marker='o')
plt.xlabel('Number of Estimators')
plt.ylabel('RMSE')
plt.title('XGBoost Regression: RMSE vs. Number of Estimators')
plt.legend()
plt.grid(True)
plt.show()

# First position of the lowest held-out RMSE, and the tree count that produced it.
min_rmse_index = int(np.argmin(test_rmse_scores))
min_rmse = test_rmse_scores[min_rmse_index]
best_n_estimators = n_estimators_range[min_rmse_index]

print(f"Number of Estimators with the minimum validation RMSE: {best_n_estimators}")
print(f"Minimum validation RMSE: {min_rmse:.4f}")

In [None]:
# Initialize the final XGBoost regressor.
# The tree count comes from the n_estimators sweep in the previous cell
# (best_n_estimators) instead of a number copied by hand from an earlier run, so it
# stays correct when the data changes. Assign a different integer here to override it.
xgb_reg = xgb.XGBRegressor(
    objective='reg:squarederror',    # Objective for regression problems
    n_estimators=best_n_estimators,  # Number of trees selected by the sweep
    learning_rate=0.1,               # Same learning rate the sweep used
    random_state=42
)

# Train the model on the training split.
xgb_reg.fit(X_train, y_train)

print("XGBoost model trained successfully!")

In [None]:
# Predict on the test split with the fitted regressor.
y_pred = xgb_reg.predict(X_test)

# Error metrics: MSE, its square root (RMSE) and the coefficient of determination.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R²): {r2:.4f}")

# Actual vs. predicted scatter; the dashed red diagonal marks perfect predictions.
actual_range = [y_test.min(), y_test.max()]

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.7)
plt.plot(actual_range, actual_range, 'r--', lw=2)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual vs. Predicted Values (XGBoost Regression)")
plt.grid(True)
plt.show()

In [None]:
# Base regressor whose hyperparameters are sampled by the randomized search.
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Sampling distributions, one per hyperparameter.
param_dist = dict(
    n_estimators=randint(50, 200),       # number of boosting rounds
    learning_rate=uniform(0.01, 0.2),    # step-size shrinkage
    max_depth=randint(3, 10),            # maximum tree depth
    subsample=uniform(0.6, 0.4),         # row subsample ratio per tree
    colsample_bytree=uniform(0.6, 0.4),  # column subsample ratio per tree
    gamma=uniform(0, 0.5),               # minimum loss reduction required to split a leaf
)

# Randomized search settings:
#   n_iter  - number of sampled parameter settings (more is better but slower)
#   cv      - number of cross-validation folds
#   scoring - negated MSE, so that a larger score is better
random_search = RandomizedSearchCV(
    model,
    param_dist,
    n_iter=100,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,  # use all available cores
    random_state=42,
)

# Run the search on the training split only.
print("Starting RandomizedSearchCV...")
random_search.fit(X_train, y_train)

# best_score_ is a negated MSE, hence the sign flip before taking the square root.
print("\nBest parameters found: ", random_search.best_params_)
print("Best RMSE found: ", np.sqrt(-random_search.best_score_))

# Best estimator found by the search.
best_model = random_search.best_estimator_

# Score that estimator on the test split.
y_pred_best = best_model.predict(X_test)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
r2_best = r2_score(y_test, y_pred_best)

print(f"\nRMSE of the best model on the test set: {rmse_best:.4f}")
print(f"R-squared of the best model on the test set: {r2_best:.4f}")
