In [16]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the data
df = pd.read_csv('data/BRAZIL_CITIES_REV2022_CLEANED.csv')
df.head()

Unnamed: 0,STATE,IBGE_RES_POP,IBGE_RES_POP_BRAS,IBGE_DU,IBGE_DU_URBAN,IBGE_DU_RURAL,IBGE_POP,IBGE_1,IBGE_1-4,IBGE_5-9,...,Pu_Agencies,Pr_Bank,Pu_Bank,Cars,Motorcycles,Wheeled_tractor,UBER,MAC,WAL-MART,POST_OFFICES
0,16,5098,5098,1536,1100,436,3594,46,198,265,...,0,0,0,282,1185,0,0,0,0,1
1,10,2709,2706,875,364,511,1099,13,40,69,...,0,0,0,479,332,0,0,0,0,1
2,14,5239,5239,1567,639,928,2062,40,132,162,...,0,0,0,297,274,0,0,0,0,1
3,14,2366,2366,676,353,323,1231,22,113,119,...,0,0,0,198,104,0,0,0,0,1
4,9,11063,11063,2632,1118,1515,3154,31,204,316,...,0,0,0,196,1008,0,0,0,0,1


In [17]:
# Split the data into features and target
X = df.drop(columns=['IDHM'])
y = df['IDHM']

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split dados

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

# Normalização de Escalas 

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Definir o grid de hiperparâmetros para Random Forest
param_grid_rf = {
    'n_estimators': [45, 50, 65, 100, 150, 200, 250, 300],  # Número de árvores na floresta
    'max_depth': [5, 10, 20, 30, None],  # Profundidade máxima de cada árvore
    'min_samples_split': [2, 5, 10, 20],  # Número mínimo de amostras para dividir um nó
    'min_samples_leaf': [1, 5, 10, 25, 50],  # Número mínimo de amostras em cada folha
    'max_features': ['sqrt', 'log2', None],  # Número de recursos a serem considerados para a melhor divisão
    'random_state': [66]  # Garantir reprodutibilidade
}

# Configurar o GridSearch para RandomForest com um valor fixo de cv
grid_search_rf = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid_rf, cv=5, scoring='neg_mean_squared_error')

# Treinar e buscar os melhores hiperparâmetros
grid_search_rf.fit(X_train, y_train)

# Avaliar o modelo
y_pred = grid_search_rf.predict(X_test)
r2 = r2_score(y_test, y_pred)

# Exibir os melhores hiperparâmetros + melhor score
print("Melhor score:", r2)
print("Melhores hiperparâmetros para Random Forest:", grid_search_rf.best_params_)

Melhor score: 0.5430954301660416
Melhores hiperparâmetros para Random Forest: {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300, 'random_state': 66}


In [24]:
# Treinar o modelo final com os melhores hiperparâmetros
final_rf_model = RandomForestRegressor(
    n_estimators=grid_search_rf.best_params_['n_estimators'],
    max_depth=grid_search_rf.best_params_['max_depth'],
    min_samples_split=grid_search_rf.best_params_['min_samples_split'],
    min_samples_leaf=grid_search_rf.best_params_['min_samples_leaf'],
    max_features=grid_search_rf.best_params_['max_features'],
    random_state=grid_search_rf.best_params_['random_state']
)

# Printing best parameters
print("Melhores hiperparâmetros para Random Forest:", grid_search_rf.best_params_)

grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid_rf, cv=5, scoring='neg_mean_squared_error')

# Ajustar o modelo final com os dados de treino
final_rf_model.fit(X_train, y_train)

# Fazer previsões com o conjunto de teste
y_pred_rf = final_rf_model.predict(X_test)

# Avaliar o desempenho do modelo usando RMSE e R²
rmse_rf = mean_squared_error(y_test, y_pred_rf, squared=False)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"RMSE do modelo Random Forest: {rmse_rf}")
print(f"R² do modelo Random Forest: {r2_rf}")
print(f"Número de k-folds: {scores.index(max(scores)) + 2}")


Melhores hiperparâmetros para Random Forest: {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300, 'random_state': 66}
RMSE do modelo Random Forest: 0.02481367990680944
R² do modelo Random Forest: 0.5430954301660416
Número de k-folds: 3


