# Seleção do Modelo

Utilizando o conjunto de dados do covid-19 realize as seguintes atividades:
<ol>
    <li>Apresentar a "redução" do rmse com os diferentes regressores (LinearRegressor, DecisionTreeRegressor, RandonForestRegressor) e utilizando o GridSearchCV para definir o melhor set de parâmetros para o RandonForest </li>
    <li>Fazer uma função para realizar a seleção das características, conforme a ordem de importância (min(modelo_reg.feature_importances_ )) do melhor regressor e ir removendo as características até impactar no rmse do conjunto de teste</li>
    <li>Investigar utilizar Support Vector Regressor (sklearn.svm.SVR) variando automaticamente os hiperparâmetros (kernel e C) e apresentar o RMSE

In [89]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import itertools


## Carregando os pickles

In [61]:
def read_pickle(name):
    with (open(name, 'rb')) as openfile:
        while True:
            try:
                one_instance = pickle.load(openfile)
            except EOFError:
                break
    one_instance = np.asanyarray(one_instance)
    return one_instance


In [62]:
X_train = read_pickle('X_train.pickle')
X_test = read_pickle('X_test.pickle')
y_train = read_pickle('y_train.pickle')
y_test = read_pickle('y_test.pickle')


In [63]:
print(y_test.shape, y_train.shape, X_test.shape, X_train.shape)


(45,) (179,) (45, 7) (179, 7)


# #1 LinearRegressor

### Treinando o modelo

In [64]:
lin_reg = LinearRegression() 
lin_reg.fit(X_train, y_train) # y = ax + b


LinearRegression()

### Avaliando o modelo

In [65]:
predictions = lin_reg.predict(X_test)

In [66]:
print(predictions.shape)
print(predictions)

(45,)
[ 4.99255588e+00  1.45305904e+01  2.34490151e+01  5.77313937e+00
  3.31027064e+00  8.40255470e+00  5.86748796e+00 -5.08200007e-01
  9.77495527e+00  5.96162380e+00  2.40430019e+02  5.74509738e+00
  2.72821761e+00  5.93373579e+01  3.25664381e+01  8.12309505e+00
  1.27713950e+00  1.93651823e+00  4.00603186e+00  3.52493159e+01
  1.48997735e+01  1.22106862e+01  1.72248171e+01  1.79036466e+00
  8.95685191e+01  2.69712846e+00  6.51638884e+00  1.87900115e+01
  6.82933993e+01  7.72626581e+00  1.30685686e+02  4.47808753e+02
  2.69445830e+00  1.24569812e+01  5.16700124e+00  1.14614313e+00
  4.46685327e+00  4.65536141e+00  4.50264479e+00 -3.42994916e+00
  6.22391039e+01 -1.59564054e+00  7.28890571e-02  8.43171491e+00
  2.02255274e+01]


In [67]:
lin_mse = mean_squared_error(y_test, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse


12.333727282130322

# #2 Regressor DecisionTree

### Treinando o modelo

In [68]:
dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train,y_train)

DecisionTreeRegressor()

### Avaliando o modelo

In [69]:
predictions = dt_reg.predict(X_test)


In [70]:
print(predictions.shape)
print(predictions)

(45,)
[  5.  14.  13.  12.   1.  10.   6.   2.  14.  15. 179.   7.   5.  39.
  27.   9.   3.   2.   3.  37.  11.  19.  31.   5. 105.   5.   9.  19.
  38.  10. 167. 209.   9.  15.   8.   5.  12.  10.   2.   3.  31.   4.
   8.  12.  30.]


In [71]:
dt_mse = mean_squared_error(y_test,predictions)
dt_rmse = np.sqrt(dt_mse)
dt_rmse

42.270294271246534

# #3 Random Forest Regressor

### Treinando o modelo

In [72]:
rf_reg = RandomForestRegressor()
rf_reg.fit(X_train,y_train)

RandomForestRegressor()

### Avaliando o modelo

In [14]:
rf_predictions = rf_reg.predict(X_test)


In [73]:
print(predictions.shape)
print(predictions)


(45,)
[  5.  14.  13.  12.   1.  10.   6.   2.  14.  15. 179.   7.   5.  39.
  27.   9.   3.   2.   3.  37.  11.  19.  31.   5. 105.   5.   9.  19.
  38.  10. 167. 209.   9.  15.   8.   5.  12.  10.   2.   3.  31.   4.
   8.  12.  30.]


In [74]:
rf_mse = mean_squared_error(y_test,rf_predictions)
rf_rmse = np.sqrt(rf_mse)
rf_rmse


45.78550799349312

# Ajuste dos hiperparâmetros do modelo

In [17]:
lin_reg = LinearRegression()

In [18]:
lin_reg.get_params()

{'copy_X': True,
 'fit_intercept': True,
 'n_jobs': None,
 'normalize': 'deprecated',
 'positive': False}

In [75]:
param_grid = [ 
    {'fit_intercept': [False, True], 'n_jobs': [-1]}
]


In [76]:
lin_reg_best = LinearRegression()

In [77]:
grid_search = GridSearchCV(lin_reg_best, param_grid, cv=5, scoring='neg_root_mean_squared_error')

In [78]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LinearRegression(),
             param_grid=[{'fit_intercept': [False, True], 'n_jobs': [-1]}],
             scoring='neg_root_mean_squared_error')

In [23]:
grid_search.best_params_

{'fit_intercept': True, 'n_jobs': -1}

In [79]:
lin_reg_best = LinearRegression(n_jobs=-1)
lin_reg_best.fit(X_train, y_train)

LinearRegression(n_jobs=-1)

In [80]:
predictions = lin_reg_best.predict(X_test)


In [81]:
lin_mse = mean_squared_error(y_test, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

12.333727282130322

# #Ponto 2

Fazer uma função para realizar a seleção das características, conforme a ordem de importância (min(modelo_reg.feature_importances_ )) do melhor regressor e ir removendo as características até impactar no rmse do conjunto de teste

In [27]:
# lrb = LinearRegression(n_jobs=-1)
# lrb.fit(X_train, y_train)
# importante = list(lrb.coef_)
# importante = list(map(abs, importante))
# print(importante.index(min(importante)))



# predictions = lrb.predict(X_test)
# lin_mse = mean_squared_error(y_test, predictions)
# lin_rmse = np.sqrt(lin_mse)


def lSC(rmse, X_train, y_train, X_test, y_test):
    lrb = LinearRegression(n_jobs=-1)
    lrb.fit(X_train, y_train)
    importante = list(lrb.coef_)
    importante = list(map(abs, importante))
    i = importante.index(min(importante))

    X_train_feat_select = np.delete(X_train, i, 1)
    X_test_feat_select = np.delete(X_test, i, 1)

    lrb.fit(X_train_feat_select, y_train)
    predictions = lrb.predict(X_test_feat_select)
    lin_mse = mean_squared_error(y_test, predictions)
    lin_rmse = np.sqrt(lin_mse)

    if(lin_rmse < rmse):
        return lSC(lin_rmse, X_train_feat_select, y_train, X_test_feat_select, y_test)
    else:
        return rmse

In [28]:
print(lSC(lin_rmse, X_train, y_train, X_test, y_test))

12.322002298288908


# #Ponto 3

In [94]:
kernels = ["rbf", "linear", "poly"]
c = [0.1, 0.5, 1, 10, 100]


In [95]:
for k, c in itertools.product(kernels, c):
    svr = SVR(kernel=k, C=c)
    prediction = svr.fit(X_train, y_train).predict(X_test)
    mse = mean_squared_error(y_test, prediction)
    rmse = np.sqrt(mse)
    print(c, k, rmse)


0.1 rbf 77.77663407331384
0.5 rbf 77.0863599554768
1 rbf 76.51146926165018
10 rbf 68.96267981910991
100 rbf 54.75810545542386
0.1 linear 73.47658788986281
0.5 linear 60.901630650467624
1 linear 48.29936517388608
10 linear 12.295417221136136
100 linear 13.554044602427627
0.1 poly 76.00262738698635
0.5 poly 75.34858066257054
1 poly 74.68607656067178
10 poly 66.7751510882104
100 poly 35.67846343344034
