# Aula 2

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.tree import DecisionTreeRegressor

In [2]:
def process_data(csv_path='house_prices.csv', test_size=0.3, random_state=42):
    """Load the house-price table, split it, and build the preprocessing transformer.

    Args:
        csv_path: CSV file to read. It must contain the "SalePrice" and "Id"
            columns: "SalePrice" becomes the target and both columns are
            removed from the feature frame.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        ``(X_train, X_test, y_train, y_test, pre_processor)``. The splits are
        returned untransformed; ``pre_processor`` is a ``ColumnTransformer``
        that has already been fitted on ``X_train``.
    """
    df = pd.read_csv(csv_path)
    X = df.drop(columns=["SalePrice", "Id"])
    y = df["SalePrice"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Column groups are derived from the training split's dtypes
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_features = X_train.select_dtypes(exclude=['object']).columns.tolist()

    # Numeric columns: mean imputation, then standardisation
    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('std_scaler', StandardScaler())
    ])

    # Categorical columns: fill gaps with the literal 'unknown', then one-hot encode
    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Single transformer that applies each sub-pipeline to its column group
    pre_processor = ColumnTransformer([
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

    # Fit on the training split so the returned transformer is in the same
    # (fitted) state as before. The transformed train/test matrices used to be
    # computed here and then discarded, so they are no longer materialised.
    pre_processor.fit(X_train)

    return X_train, X_test, y_train, y_test, pre_processor

X_train, X_test, y_train, y_test, pre_processor = process_data()

# Pipeline: shared preprocessing followed by a support-vector regressor
pipe_svr = Pipeline([('pre_process', pre_processor), ("svr", SVR())])

# The pipeline is not fitted on its own before the search: GridSearchCV fits
# clones of it for every candidate and refits the winner, so a standalone
# pipe_svr.fit(...) only added an extra full training run.

# Search space; the "svr__" prefix routes each entry to the "svr" pipeline step
param_grid = {
    'svr__kernel': ['linear', 'poly', 'rbf'],
    'svr__C': [0.1, 1, 10],
    'svr__gamma': [0.1, 1, 'auto'],
    'svr__epsilon': [0.1, 0.2, 0.3]
}

# Configure and run the grid search (3 folds, default scoring, all cores)
grid_search = GridSearchCV(estimator=pipe_svr, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best hyper-parameters and the corresponding refitted pipeline
best_params_svr = grid_search.best_params_
best_model_svr = grid_search.best_estimator_

# Evaluate the selected model on the held-out test split
test_score = best_model_svr.score(X_test, y_test)
print(f"Melhores Parâmetros: {best_params_svr}")
print(f"Score no conjunto de teste: {test_score}")

y_pred_train = best_model_svr.predict(X_train)
y_pred_test = best_model_svr.predict(X_test)


# Report R2 / MAE / RMSE for both splits
print("Resultados do conjunto de treino:")
print(f"R2: { r2_score(y_train, y_pred_train):.2f}")
print(f"MAE: { mean_absolute_error(y_train, y_pred_train):.2f}")
print(f"RMSE: { np.sqrt(mean_squared_error(y_train, y_pred_train)):.2f}")

print("\nResultados do conjunto de teste:")
print(f"R2: { r2_score(y_test, y_pred_test):.2f}")
print(f"MAE: { mean_absolute_error(y_test, y_pred_test):.2f}")
print(f"RMSE: { np.sqrt(mean_squared_error(y_test, y_pred_test)):.2f}")

Melhores Parâmetros: {'svr__C': 1, 'svr__epsilon': 0.3, 'svr__gamma': 1, 'svr__kernel': 'poly'}
Score no conjunto de teste: 0.8529538929631331
Resultados do conjunto de treino:
R2: 1.00
MAE: 354.60
RMSE: 4363.80

Resultados do conjunto de teste:
R2: 0.85
MAE: 16703.23
RMSE: 32032.82


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score , classification_report, ConfusionMatrixDisplay,precision_score,recall_score, f1_score,roc_auc_score,roc_curve
from sklearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split

In [4]:
def process_data(csv_path='house_prices.csv', test_size=0.3, random_state=42):
    """Load the house-price table, split it, and build the preprocessing transformer.

    Args:
        csv_path: CSV file to read. It must contain the "SalePrice" and "Id"
            columns: "SalePrice" becomes the target and both columns are
            removed from the feature frame.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        ``(X_train, X_test, y_train, y_test, pre_processor)``. The splits are
        returned untransformed; ``pre_processor`` is a ``ColumnTransformer``
        that has already been fitted on ``X_train``.
    """
    df = pd.read_csv(csv_path)
    X = df.drop(columns=["SalePrice", "Id"])
    y = df["SalePrice"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Column groups are derived from the training split's dtypes
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_features = X_train.select_dtypes(exclude=['object']).columns.tolist()

    # Numeric columns: mean imputation, then standardisation
    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('std_scaler', StandardScaler())
    ])

    # Categorical columns: fill gaps with the literal 'unknown', then one-hot encode
    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Single transformer that applies each sub-pipeline to its column group
    pre_processor = ColumnTransformer([
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

    # Fit on the training split so the returned transformer is in the same
    # (fitted) state as before. The transformed train/test matrices used to be
    # computed here and then discarded, so they are no longer materialised.
    pre_processor.fit(X_train)

    return X_train, X_test, y_train, y_test, pre_processor

X_train, X_test, y_train, y_test, pre_processor = process_data()

# Search space; the "rf__" prefix routes each entry to the "rf" pipeline step
parametros_rf = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 5, 10, 15],
}

# Pipeline: shared preprocessing followed by a random-forest regressor
rf = RandomForestRegressor(random_state=42)
pipe_rf = Pipeline(steps=[('pre_process', pre_processor), ("rf", rf)])

# 5-fold grid search ranked by negative mean squared error, using all cores
grid_search_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid=parametros_rf,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search_rf.fit(X_train, y_train)

# Refitted winning pipeline and the hyper-parameters that produced it
best_model_rf = grid_search_rf.best_estimator_
best_params_rf = grid_search_rf.best_params_

# Evaluate the selected model on the held-out test split
test_score = best_model_rf.score(X_test, y_test)
print(f"Melhores Parâmetros: {best_params_rf}")
print(f"Score no conjunto de teste: {test_score}")

y_pred_train = best_model_rf.predict(X_train)
y_pred_test = best_model_rf.predict(X_test)

# Report R2 / MAE / RMSE for the training split, one value per line
print(
    "Resultados do conjunto de treino:",
    f"R2: { r2_score(y_train, y_pred_train):.2f}",
    f"MAE: { mean_absolute_error(y_train, y_pred_train):.2f}",
    f"RMSE: { np.sqrt(mean_squared_error(y_train, y_pred_train)):.2f}",
    sep="\n",
)

# Same three metrics for the test split
print(
    "\nResultados do conjunto de teste:",
    f"R2: { r2_score(y_test, y_pred_test):.2f}",
    f"MAE: { mean_absolute_error(y_test, y_pred_test):.2f}",
    f"RMSE: { np.sqrt(mean_squared_error(y_test, y_pred_test)):.2f}",
    sep="\n",
)

Melhores Parâmetros: {'rf__max_depth': 15, 'rf__n_estimators': 100}
Score no conjunto de teste: 0.8967964246960485
Resultados do conjunto de treino:
R2: 0.98
MAE: 6920.81
RMSE: 12002.27

Resultados do conjunto de teste:
R2: 0.90
MAE: 16811.30
RMSE: 26835.88


In [5]:
from sklearn.tree import DecisionTreeRegressor

def process_data(csv_path='house_prices.csv', test_size=0.3, random_state=42):
    """Load the house-price table, split it, and build the preprocessing transformer.

    Args:
        csv_path: CSV file to read. It must contain the "SalePrice" and "Id"
            columns: "SalePrice" becomes the target and both columns are
            removed from the feature frame.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        ``(X_train, X_test, y_train, y_test, pre_processor)``. The splits are
        returned untransformed; ``pre_processor`` is a ``ColumnTransformer``
        that has already been fitted on ``X_train``.
    """
    df = pd.read_csv(csv_path)
    X = df.drop(columns=["SalePrice", "Id"])
    y = df["SalePrice"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Column groups are derived from the training split's dtypes
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_features = X_train.select_dtypes(exclude=['object']).columns.tolist()

    # Numeric columns: mean imputation, then standardisation
    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('std_scaler', StandardScaler())
    ])

    # Categorical columns: fill gaps with the literal 'unknown', then one-hot encode
    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Single transformer that applies each sub-pipeline to its column group
    pre_processor = ColumnTransformer([
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

    # Fit on the training split so the returned transformer is in the same
    # (fitted) state as before. The transformed train/test matrices used to be
    # computed here and then discarded, so they are no longer materialised.
    pre_processor.fit(X_train)

    return X_train, X_test, y_train, y_test, pre_processor

X_train, X_test, y_train, y_test, pre_processor = process_data()

# Search space; the "dt__" prefix routes each entry to the "dt" pipeline step
parametros_dt = {
    'dt__max_depth': [None, 5, 10, 15],
    'dt__min_samples_split': [2, 5, 10],
    'dt__min_samples_leaf': [1, 2, 4],
}

# Pipeline: shared preprocessing followed by a decision-tree regressor
dt = DecisionTreeRegressor(random_state=42)
pipe_dt = Pipeline(steps=[('pre_process', pre_processor), ("dt", dt)])

# 5-fold grid search ranked by negative mean squared error, using all cores
grid_search_dt = GridSearchCV(
    estimator=pipe_dt,
    param_grid=parametros_dt,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search_dt.fit(X_train, y_train)

# Refitted winning pipeline and the hyper-parameters that produced it
best_model_dt = grid_search_dt.best_estimator_
best_params_dt = grid_search_dt.best_params_

# Evaluate the selected model on the held-out test split
test_score = best_model_dt.score(X_test, y_test)
print(f"Melhores Parâmetros: {best_params_dt}")
print(f"Score no conjunto de teste: {test_score}")

y_pred_train = best_model_dt.predict(X_train)
y_pred_test = best_model_dt.predict(X_test)

# Report R2 / MAE / RMSE for the training split, one value per line
print(
    "Resultados do conjunto de treino:",
    f"R2: { r2_score(y_train, y_pred_train):.2f}",
    f"MAE: { mean_absolute_error(y_train, y_pred_train):.2f}",
    f"RMSE: { np.sqrt(mean_squared_error(y_train, y_pred_train)):.2f}",
    sep="\n",
)

# Same three metrics for the test split
print(
    "\nResultados do conjunto de teste:",
    f"R2: { r2_score(y_test, y_pred_test):.2f}",
    f"MAE: { mean_absolute_error(y_test, y_pred_test):.2f}",
    f"RMSE: { np.sqrt(mean_squared_error(y_test, y_pred_test)):.2f}",
    sep="\n",
)

Melhores Parâmetros: {'dt__max_depth': 5, 'dt__min_samples_leaf': 2, 'dt__min_samples_split': 2}
Score no conjunto de teste: 0.7743361226474328
Resultados do conjunto de treino:
R2: 0.85
MAE: 22303.28
RMSE: 30399.22

Resultados do conjunto de teste:
R2: 0.77
MAE: 27370.83
RMSE: 39682.54


In [6]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import  KFold

# Data-loading helper
def process_data():
    """Read 'house_prices.csv' and separate the features from the target.

    Returns:
        ``(X, y)``: ``X`` is the table without the "SalePrice" and "Id"
        columns, and ``y`` is the "SalePrice" column.
    """
    houses = pd.read_csv('house_prices.csv')
    target = houses["SalePrice"]
    features = houses.drop(columns=["SalePrice", "Id"])
    return features, target

X, y = process_data()

# Hold out 30% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Column groups are derived from the training split's dtypes
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_features = X_train.select_dtypes(exclude=['object']).columns.tolist()

# Numeric columns: mean imputation, then standardisation
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('std_scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the literal 'unknown', then one-hot encode
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

pre_processor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])

# Pipeline: preprocessing followed by a k-nearest-neighbours regressor
knn = KNeighborsRegressor()
pipe_knn = Pipeline(steps=[('pre_process', pre_processor), ("knn", knn)])

# Search space; the "knn__" prefix routes each entry to the "knn" pipeline step
parametros_knn = {
    'knn__n_neighbors': range(2, 150),  # neighbour counts 2..149
    'knn__weights': ['uniform', 'distance'],  # neighbour weighting scheme
    'knn__p': [1, 2],  # Minkowski power: 1 = Manhattan, 2 = Euclidean
}

# Shuffled 5-fold splitter with a fixed seed; candidates are ranked by negative MSE
metric = 'neg_mean_squared_error'
splitter = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search_knn = GridSearchCV(
    estimator=pipe_knn,
    param_grid=parametros_knn,
    cv=splitter,
    scoring=metric,
    n_jobs=-1,
)
grid_search_knn.fit(X_train, y_train)

best_params_knn = grid_search_knn.best_params_
best_model_knn = grid_search_knn.best_estimator_

print(f' Melhor parametro K {best_params_knn}')

y_pred_train = best_model_knn.predict(X_train)
y_pred_test = best_model_knn.predict(X_test)

# Training-split metrics
mse_train_knn_reg = mean_squared_error(y_train, y_pred_train)
r2_train_knn_reg = r2_score(y_train, y_pred_train)
mae_train_knn_reg = mean_absolute_error(y_train, y_pred_train)

print(
    "Métricas para o conjunto de treino:",
    f"MSE: {mse_train_knn_reg:.2f}",
    f"R²: {r2_train_knn_reg:.2f}",
    f"MAE: {mae_train_knn_reg:.2f}",
    sep="\n",
)

# Test-split metrics
mse_test_knn_reg = mean_squared_error(y_test, y_pred_test)
r2_test_knn_reg = r2_score(y_test, y_pred_test)
mae_test_knn_reg = mean_absolute_error(y_test, y_pred_test)

print(
    "\nMétricas para o conjunto de teste:",
    f"MSE: {mse_test_knn_reg:.2f}",
    f"R²: {r2_test_knn_reg:.2f}",
    f"MAE: {mae_test_knn_reg:.2f}",
    sep="\n",
)


 Melhor parametro K {'knn__n_neighbors': 5, 'knn__p': 1, 'knn__weights': 'distance'}
Métricas para o conjunto de treino:
MSE: 0.00
R²: 1.00
MAE: 0.00

Métricas para o conjunto de teste:
MSE: 1151717957.43
R²: 0.83
MAE: 20117.01


In [7]:
def process_data(csv_path='house_prices.csv', test_size=0.3, random_state=42):
    """Load the house-price table, split it, and build the preprocessing transformer.

    Args:
        csv_path: CSV file to read. It must contain the "SalePrice" and "Id"
            columns: "SalePrice" becomes the target and both columns are
            removed from the feature frame.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        ``(X_train, X_test, y_train, y_test, pre_processor)``. The splits are
        returned untransformed; ``pre_processor`` is a ``ColumnTransformer``
        that has already been fitted on ``X_train``.
    """
    df = pd.read_csv(csv_path)
    X = df.drop(columns=["SalePrice", "Id"])
    y = df["SalePrice"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Column groups are derived from the training split's dtypes
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_features = X_train.select_dtypes(exclude=['object']).columns.tolist()

    # Numeric columns: mean imputation, then standardisation
    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('std_scaler', StandardScaler())
    ])

    # Categorical columns: fill gaps with the literal 'unknown', then one-hot encode
    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Single transformer that applies each sub-pipeline to its column group
    pre_processor = ColumnTransformer([
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

    # Fit on the training split so the returned transformer is in the same
    # (fitted) state as before. The transformed train/test matrices used to be
    # computed here and then discarded, so they are no longer materialised.
    pre_processor.fit(X_train)

    return X_train, X_test, y_train, y_test, pre_processor

from sklearn.ensemble import AdaBoostRegressor


X_train, X_test, y_train, y_test, pre_processor = process_data()

# Pipeline: shared preprocessing followed by an AdaBoost regressor (fixed seed)
pipe_ab = Pipeline([('pre_process', pre_processor),
                    ("ab", AdaBoostRegressor(random_state = 55) )])

# The pipeline is not fitted on its own before the search: GridSearchCV fits
# clones of it for every candidate and refits the winner, so a standalone
# pipe_ab.fit(...) only added an extra full training run.

# Search space; the "ab__" prefix routes each entry to the "ab" pipeline step
param_grid_ab = {
    'ab__n_estimators': [50, 100, 200],
    'ab__learning_rate': [0.01, 0.1, 1],
    'ab__loss': ['linear', 'square', 'exponential']
}

# 5-fold grid search with the default scoring, using all cores
grid_search_ab = GridSearchCV(pipe_ab,
                               param_grid_ab,
                               cv=5,
                               n_jobs=-1)


grid_search_ab.fit(X_train, y_train)

best_parameters_ab = grid_search_ab.best_params_

# Predictions come from the search object itself, i.e. from its refitted best pipeline
y_pred_train_ab = grid_search_ab.predict(X_train)
y_pred_test_ab = grid_search_ab.predict(X_test)

# Training-split metrics
mse_train_ab = mean_squared_error(y_train, y_pred_train_ab)
mae_train_ab = mean_absolute_error(y_train, y_pred_train_ab)
r2_train_ab = r2_score(y_train, y_pred_train_ab)

# Test-split metrics
mse_test_ab = mean_squared_error(y_test, y_pred_test_ab)
mae_test_ab = mean_absolute_error(y_test, y_pred_test_ab)
r2_test_ab = r2_score(y_test, y_pred_test_ab)

print(f'Best parameters for AdaBoost: {best_parameters_ab}')
print("Metrics for training set:")
print(f"MSE: {mse_train_ab}")
print(f"MAE: {mae_train_ab}")
print(f"R-squared: {r2_train_ab}")

print("\nMetrics for test set:")
print(f"MSE: {mse_test_ab}")
print(f"MAE: {mae_test_ab}")
print(f"R-squared: {r2_test_ab}")

Best parameters for AdaBoost: {'ab__learning_rate': 1, 'ab__loss': 'linear', 'ab__n_estimators': 50}
Metrics for training set:
MSE: 772574105.8224336
MAE: 21617.452000844165
R-squared: 0.871635055514137

Metrics for test set:
MSE: 1118729721.8424554
MAE: 24093.27092394054
R-squared: 0.8396797520685841


In [9]:
from sklearn.ensemble import  BaggingRegressor


def process_data():
    """Read 'house_prices.csv' and return a seeded 70/30 train/test split.

    "SalePrice" is the target; "SalePrice" and "Id" are removed from the
    feature frame.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``train_test_split`` with ``test_size=0.3`` and ``random_state=42``.
    """
    houses = pd.read_csv('house_prices.csv')
    target = houses["SalePrice"]
    features = houses.drop(columns=["SalePrice", "Id"])

    splits = train_test_split(features, target, test_size=0.3, random_state=42)
    X_train, X_test, y_train, y_test = splits
    return X_train, X_test, y_train, y_test

def test_ensemble_models(X_train, X_test, y_train, y_test, best_params):
    """Print test-set scores of AdaBoost and Bagging built on each tuned regressor.

    The features are preprocessed once (imputation, scaling, one-hot encoding),
    with the transformer fitted on ``X_train`` only. For every base regressor an
    ``AdaBoostRegressor`` and a ``BaggingRegressor`` (both ``random_state=42``)
    are trained and their ``score`` on the test split is printed.

    Args:
        X_train: Training features (needs ``select_dtypes``, e.g. a DataFrame).
        X_test: Test features with the same columns as ``X_train``.
        y_train: Training target.
        y_test: Test target.
        best_params: Mapping with the keys "knn", "random_forest",
            "decision_tree" and "svr"; each value is a dict of keyword
            arguments for the corresponding regressor's constructor.

    Returns:
        None. Results are only printed.
    """
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_features = X_train.select_dtypes(exclude=['object']).columns.tolist()

    # Numeric columns: mean imputation, then standardisation
    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('std_scaler', StandardScaler())
    ])

    # Categorical columns: fill gaps with the literal 'unknown', then one-hot encode
    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    pre_processor = ColumnTransformer([
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

    # Fit on the training split only; the test split is just transformed
    X_train_processed = pre_processor.fit_transform(X_train)
    X_test_processed = pre_processor.transform(X_test)

    estimators = [
        ("KNN", KNeighborsRegressor(**best_params["knn"])),
        ("RandomForest", RandomForestRegressor(**best_params["random_forest"])),
        ("DecisionTree", DecisionTreeRegressor(**best_params["decision_tree"])),
        ("SVR", SVR(**best_params["svr"]))
    ]

    # Printed label -> ensemble class, in the order the results are reported
    ensembles = [("AdaBoost", AdaBoostRegressor), ("Bagging", BaggingRegressor)]

    for name, estimator in estimators:
        for label, ensemble_cls in ensembles:
            # The base regressor is passed positionally: the keyword was renamed
            # from `base_estimator` to `estimator` in scikit-learn 1.2 and the
            # old spelling was removed in 1.4, while the first positional slot
            # is the base regressor on both sides of that rename.
            ensemble = ensemble_cls(estimator, random_state=42)
            ensemble.fit(X_train_processed, y_train)
            ensemble_score = ensemble.score(X_test_processed, y_test)
            print(f"{label} with {name}: {ensemble_score}")

# Hand-copied hyper-parameters for each base regressor, keyed by model.
# NOTE(review): the grid-search outputs recorded in this notebook show
# n_estimators=100 for the random forest and min_samples_split=2 for the
# decision tree, which differ from the values below - confirm which are intended.
best_params = dict(
    knn=dict(n_neighbors=5, p=1, weights='distance'),
    random_forest=dict(max_depth=15, n_estimators=300),
    decision_tree=dict(max_depth=5, min_samples_leaf=2, min_samples_split=5),
    svr=dict(C=1, epsilon=0.3, gamma=1, kernel='poly'),
)

# Load the data and split it into train/test
X_train, X_test, y_train, y_test = process_data()

# Train and score the AdaBoost/Bagging ensembles with those settings
test_ensemble_models(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    best_params=best_params,
)



AdaBoost with KNN: 0.8575839833916596




Bagging with KNN: 0.8412667586319754




AdaBoost with RandomForest: 0.9127299221110883




Bagging with RandomForest: 0.8916003419982743




AdaBoost with DecisionTree: 0.8926093459152553




Bagging with DecisionTree: 0.8646024293920559




AdaBoost with SVR: 0.9002477895892588




Bagging with SVR: 0.9160423839432946
