In [1]:
import pandas as pd
import numpy as np
from sklearn import (
    datasets
)
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV
)
from sklearn.linear_model import (
    LinearRegression
)
from sklearn.metrics import (
    mean_squared_error
)
from sklearn.svm import (
    SVR
)
from xgboost import (
    XGBRegressor
)

In [2]:
class DataModeling():
    def __init__(self) -> None:
        self.sobre_o_df = datasets.load_boston()
        self.df = pd.DataFrame(
            data=self.sobre_o_df.data,
            columns=self.sobre_o_df.feature_names
        )

    def dataframe(self):
        df = self.df.head(5)
        display(df)

    def chaves_do_df(self):
        df = self.sobre_o_df.keys()
        print(df)

    def descrição_do_df(self):
        df = self.sobre_o_df.DESCR
        print(df)

    def variavel_targat(self):
        df = self.sobre_o_df.target
        print(df)

    def separação_de_train_e_test(self):
        """
            Separação de Train/Test dataset padrão com 
            20% de massa para teste via metodo SKLEARN
        """
        x_dados = self.sobre_o_df.data
        y_objetivo = self.sobre_o_df.target
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            x_dados,
            y_objetivo,
            test_size=0.2,
            random_state=42
        )
    
    def modelo_de_regreção_linear(self):
        linear_regression = LinearRegression()

        treinamento = (
            linear_regression
            .fit(X=self.x_train, y=self.y_train)
        )

        y_predict_linear = treinamento.predict(X=self.x_test)

       # Avaliação do modelo:
        mse = mean_squared_error(self.y_test, y_predict_linear)
        rmse = np.sqrt(mse)

        print(f'Previsão: \n {y_predict_linear}')
        print(f'MSE: {mse}')
        print(f'RmSE: {rmse}')

    def modelo_svr(self):
        svr = SVR()

        treinamento = (
            svr
            .fit(self.x_train, self.y_train)
        )

        y_svr = treinamento.predict(self.x_test)
        
        # Avaliação do modelo:
        mse = mean_squared_error(self.y_test, y_svr)
        rmse = np.sqrt(mse)

        print(f'Previsão: \n {y_svr}')
        print(f'MSE: {mse}')
        print(f'RmSE: {rmse}')

    def modelo_xgb_regressor(self):
        xgb_regressor = XGBRegressor()

        treinamento = (
            xgb_regressor
            .fit(self.x_train, self.y_train)
        )

        y_xgb_regressor = treinamento.predict(self.x_test)
        
        # Avaliação do modelo:
        mse = mean_squared_error(self.y_test, y_xgb_regressor)
        rmse = np.sqrt(mse)

        print(f'Previsão: \n {y_xgb_regressor}')
        print(f'MSE: {mse}')
        print(f'RmSE: {rmse}')

    def modelo_xgb_regressor_otimizado(self):
        
        parameters = {
            "max_depth": [5, 6, 7],
            "learning_rate": [0.1, 0.2,0.3],
            "objective": ['reg:squarederror'],
            "booster": ['gbtree'],
            "n_jobs": [5],
            "gamma": [0, 1],
            "min_child_weight": [1,3],
            "max_delta_step": [0,1],
            "subsample": [0.5, 1]
        }

        xgb_regressor = XGBRegressor()

        xgb = GridSearchCV(
            estimator=xgb_regressor,
            param_grid=parameters,
            refit='neg_mean_squared_error',
            verbose=True
        )

        treinamento = (
            xgb
            .fit(self.x_train, self.y_train)
        )

        keys = treinamento.get_params().keys()

        y_xgb_regressor = treinamento.predict(self.x_test)
        
        # Avaliação do modelo:
        mse = mean_squared_error(self.y_test, y_xgb_regressor)
        rmse = np.sqrt(mse)

        print(f'Parametros: {keys}')
        print(f'Previsão: \n {y_xgb_regressor}')
        print(f'MSE: {mse}')
        print(f'RmSE: {rmse}')

if __name__ == '__main__':
    data_modeling = DataModeling()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

# Iniciando Código:

In [3]:
data_modeling.dataframe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [4]:
data_modeling.chaves_do_df()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename', 'data_module'])


In [5]:
data_modeling.descrição_do_df()

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [6]:
data_modeling.variavel_targat()

[24.  21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 15.  18.9 21.7 20.4
 18.2 19.9 23.1 17.5 20.2 18.2 13.6 19.6 15.2 14.5 15.6 13.9 16.6 14.8
 18.4 21.  12.7 14.5 13.2 13.1 13.5 18.9 20.  21.  24.7 30.8 34.9 26.6
 25.3 24.7 21.2 19.3 20.  16.6 14.4 19.4 19.7 20.5 25.  23.4 18.9 35.4
 24.7 31.6 23.3 19.6 18.7 16.  22.2 25.  33.  23.5 19.4 22.  17.4 20.9
 24.2 21.7 22.8 23.4 24.1 21.4 20.  20.8 21.2 20.3 28.  23.9 24.8 22.9
 23.9 26.6 22.5 22.2 23.6 28.7 22.6 22.  22.9 25.  20.6 28.4 21.4 38.7
 43.8 33.2 27.5 26.5 18.6 19.3 20.1 19.5 19.5 20.4 19.8 19.4 21.7 22.8
 18.8 18.7 18.5 18.3 21.2 19.2 20.4 19.3 22.  20.3 20.5 17.3 18.8 21.4
 15.7 16.2 18.  14.3 19.2 19.6 23.  18.4 15.6 18.1 17.4 17.1 13.3 17.8
 14.  14.4 13.4 15.6 11.8 13.8 15.6 14.6 17.8 15.4 21.5 19.6 15.3 19.4
 17.  15.6 13.1 41.3 24.3 23.3 27.  50.  50.  50.  22.7 25.  50.  23.8
 23.8 22.3 17.4 19.1 23.1 23.6 22.6 29.4 23.2 24.6 29.9 37.2 39.8 36.2
 37.9 32.5 26.4 29.6 50.  32.  29.8 34.9 37.  30.5 36.4 31.1 29.1 50.
 33.3 3

In [7]:
data_modeling.separação_de_train_e_test()

In [8]:
data_modeling.modelo_de_regreção_linear()

Previsão: 
 [28.99672362 36.02556534 14.81694405 25.03197915 18.76987992 23.25442929
 17.66253818 14.34119    23.01320703 20.63245597 24.90850512 18.63883645
 -6.08842184 21.75834668 19.23922576 26.19319733 20.64773313  5.79472718
 40.50033966 17.61289074 27.24909479 30.06625441 11.34179277 24.16077616
 17.86058499 15.83609765 22.78148106 14.57704449 22.43626052 19.19631835
 22.43383455 25.21979081 25.93909562 17.70162434 16.76911711 16.95125411
 31.23340153 20.13246729 23.76579011 24.6322925  13.94204955 32.25576301
 42.67251161 17.32745046 27.27618614 16.99310991 14.07009109 25.90341861
 20.29485982 29.95339638 21.28860173 34.34451856 16.04739105 26.22562412
 39.53939798 22.57950697 18.84531367 32.72531661 25.0673037  12.88628956
 22.68221908 30.48287757 31.52626806 15.90148607 20.22094826 16.71089812
 20.52384893 25.96356264 30.61607978 11.59783023 20.51232627 27.48111878
 11.01962332 15.68096344 23.79316251  6.19929359 21.6039073  41.41377225
 18.76548695  8.87931901 20.83076916 13

In [9]:
data_modeling.modelo_svr()

Previsão: 
 [22.5061509  24.44637611 15.48342923 23.47861138 15.85575665 20.41359379
 22.22729603 19.30750829 16.00684204 20.69013802 22.5948442  22.17932651
 13.22283187 20.88634606 22.74343046 16.11962825 23.6578174  15.50113391
 24.80649803 15.86119502 24.85903189 24.42774747 21.65632951 23.06890895
 15.18170664 15.47518332 21.28809591 13.27760452 22.58811671 20.84375287
 22.39598564 23.17949952 15.87553566 15.81646768 15.26751873 18.53736675
 23.75510385 24.20893309 23.03137207 23.35824097 20.74014355 24.26097498
 25.29050354 22.45914878 23.21111935 15.87406945 21.6649305  23.42123755
 16.08170391 23.06164042 22.35542867 24.67123285 22.08392839 21.02076269
 23.7698124  16.48115196 15.85681759 24.99519292 23.87040225 22.19275612
 23.61440768 25.38561307 23.18766117 21.82639671 21.46613519 22.43757636
 15.82206507 23.86467246 24.9310726  13.3354447  24.23171731 16.22743397
 14.35580648 23.47121935 21.03977262 15.66672407 20.63701249 25.28324943
 15.81293928 15.0221233  22.55700153 15

In [10]:
data_modeling.modelo_xgb_regressor()

Previsão: 
 [23.25328   30.024755  15.632249  23.313478  17.775118  21.142563
 20.19583   15.010124  21.23614   22.242369  20.457346  19.209145
  8.551788  21.210636  20.696491  26.74365   18.824339  10.525872
 45.68885   14.116162  26.618996  24.94542   13.3510275 20.87231
 15.400073  15.636547  22.324673  12.777009  20.726126  22.56401
 20.346395  22.303246  18.523277  21.764612  15.568828  15.683646
 33.073547  19.115112  21.955132  22.399914  18.998787  31.328337
 43.464993  18.20766   22.09233   14.353467  14.607512  22.716745
 19.700527  27.072327  22.579268  35.133675  16.241447  25.214682
 46.013332  21.89786   15.043295  32.93268   20.53731   16.568089
 24.07178   34.34796   28.542194  16.977676  25.867334  15.649837
 13.039615  23.00082   27.26897   15.414835  21.546648  31.72919
 10.665012  20.770847  21.848396   6.475782  20.939093  46.59454
 12.456056   8.739085  22.215406  13.390212  20.454681  10.45914
 19.722834  27.327946  16.254663  23.860172  25.414312  17.06042
 22.

In [11]:
data_modeling.modelo_xgb_regressor_otimizado()

Fitting 5 folds for each of 144 candidates, totalling 720 fits
Parametros: dict_keys(['cv', 'error_score', 'estimator__objective', 'estimator__base_score', 'estimator__booster', 'estimator__callbacks', 'estimator__colsample_bylevel', 'estimator__colsample_bynode', 'estimator__colsample_bytree', 'estimator__early_stopping_rounds', 'estimator__enable_categorical', 'estimator__eval_metric', 'estimator__feature_types', 'estimator__gamma', 'estimator__gpu_id', 'estimator__grow_policy', 'estimator__importance_type', 'estimator__interaction_constraints', 'estimator__learning_rate', 'estimator__max_bin', 'estimator__max_cat_threshold', 'estimator__max_cat_to_onehot', 'estimator__max_delta_step', 'estimator__max_depth', 'estimator__max_leaves', 'estimator__min_child_weight', 'estimator__missing', 'estimator__monotone_constraints', 'estimator__n_estimators', 'estimator__n_jobs', 'estimator__num_parallel_tree', 'estimator__predictor', 'estimator__random_state', 'estimator__reg_alpha', 'estimator_