# Regressão Bagging and Bossting

In [40]:
#importando as bibliotecas necessarias para realizar a regressão
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.datasets import make_regression

In [41]:
#lendo a base de dados
df = pd.read_csv(r'C:\Users\Gabriel Makhoul\Desktop\ML 2\projetoML2\data\diamonds.csv')

In [42]:
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [43]:
#jogando fora as colunas x,y e z pelo fato de serem muito coorelacionadas com o carat, e a primeira coluna que é o index
df = df.drop(['x','y','z','Unnamed: 0'], axis=1)

In [44]:
#separando o data frame em x e y
X = df.drop('price', axis=1)
y = df['price']

In [45]:
#separando o dado
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
#criando o pipeline de regressão de random forest
pipeline_bagging = Pipeline([
    ('scaler', OrdinalEncoder()),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [47]:
# Hyperparameter do bagging
param_grid_bagging = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [5, 10, None]
}

In [48]:
#criando o pipeline de regressão de random forest
pipeline_boosting = Pipeline([
    ('scaler', OrdinalEncoder()),   
    ('regressor', GradientBoostingRegressor(random_state=42))
])

In [49]:
# Hyperparameter do boosting
param_grid_boosting = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__max_depth': [3, 5, 10]
}

In [50]:
# Grid do Bagging
grid_search_bagging = GridSearchCV(
    pipeline_bagging,
    param_grid_bagging,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

In [51]:
grid_search_bagging.fit(X_train, y_train)



In [52]:
# Grid do bossting
grid_search_boosting = GridSearchCV(
    pipeline_boosting,
    param_grid_boosting,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

In [53]:
grid_search_boosting.fit(X_train, y_train)

 nan nan nan nan nan nan nan nan nan]


In [54]:
best_bagging_model = grid_search_bagging.best_estimator_
best_boosting_model = grid_search_boosting.best_estimator_

In [55]:
# Predictions and evaluation
bagging_predictions = best_bagging_model.predict(X_test)
boosting_predictions = best_boosting_model.predict(X_test)

ValueError: Found unknown categories [2.71, 3.5, 3.51, 4.0, 4.01] in column 0 during transform

In [None]:
bagging_mse = mean_squared_error(y_test, bagging_predictions)
boosting_mse = mean_squared_error(y_test, boosting_predictions)

In [None]:
print(f"Bagging Best Params: {grid_search_bagging.best_params_}")
print(f"Boosting Best Params: {grid_search_boosting.best_params_}")
print(f"Bagging Test MSE: {bagging_mse}")
print(f"Boosting Test MSE: {boosting_mse}")