## ITA 2021

<br>

Dicionário de Dados:

* n: número de agentes
* p: fração de traders
* f: grau de interesse dos traders
* x, y, z: dimensões do espaço aéreo
* a1, a2: média e desvio padrão do coeficiente do preço fundamental dos consumidores
* a3, a4: idem para os traders
* b1, b2: média e desvio padrão do coeficiente do preço de mercado dos consumidores
* b3, b4: idem para os traders
* c1, c2: média e desvio padrão do coeficiente do preço aleatório dos consumidores
* c3, c4: idem para os traders
* g1, g2: média e desvio padrão do grau de agressividade dos consumidores
* l1, l2: média e desvio padrão do coeficiente de desvalorização para os consumidores
* l3, l4: idem para os traders
* e1, e2: variabilidade no preço fundamental dos consumidores e traders, respectivamente
* cent_price_cor: correlação entre o preço final e centralidade das permissões de vôo
* cent_trans_cor: idem para o número de transações

In [None]:
import sys
!{sys.executable} -m pip install jupyternotify

In [1]:
%load_ext jupyternotify

<IPython.core.display.Javascript object>

In [1]:
# Importando Ferramentas Básicas
import pandas                  as pd
import matplotlib.pyplot       as plt
import numpy                   as np
import                            os
from   datetime            import datetime

In [2]:
# Importando Ferramentas de Limpeza
from sklearn.decomposition    import PCA
from sklearn.preprocessing    import StandardScaler
from sklearn.pipeline         import make_pipeline, Pipeline

In [3]:
# Importando Ferramentas de Modelo
from sklearn.svm              import SVR
from xgboost                  import XGBRegressor
from sklearn.model_selection  import train_test_split
from sklearn.model_selection  import GridSearchCV, RandomizedSearchCV
from sklearn.metrics          import accuracy_score, mean_absolute_error
from sklearn.linear_model     import LinearRegression, LogisticRegression, Lasso
from sklearn.base             import BaseEstimator

In [63]:
# Importando os dados
train = pd.read_csv('./../Dados/train.csv')
test = pd.read_csv('./../Dados/test.csv')

In [64]:
# Criando Features
dataframes = [train, test]

for df in dataframes:
    df['volume']  = df.x * df.y * df.z
    df['densidade'] = df.volume / df.n

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.isna().sum()/len(train)

In [None]:
test.isna().sum()/len(train)

In [None]:
train.cent_price_cor.describe()

In [None]:
train.cent_trans_cor.describe()

In [None]:
train.corr()["cent_price_cor"].abs().sort_values(ascending = True)

In [None]:
train.corr()["cent_trans_cor"].abs().sort_values(ascending = True)

In [None]:
y_1_pred = pipe_1.predict(X_test)

In [None]:
mean_absolute_error(y_1_test, y_1_pred)

## PCA

In [27]:
X = train.drop(columns = ['cent_price_cor', 'cent_trans_cor'], axis = 1)

scaler = StandardScaler()

transf_X_train = scaler.fit_transform(X_train)
transf_X_test = scaler.fit_transform(X_test)

In [28]:
y_price = train.cent_price_cor
y_trans = train.cent_trans_cor

X_train, X_test, y_price_train, y_price_test = train_test_split(X,y_price,
                                                    test_size = 0.25,
                                                    random_state = 0)

X_train, X_test, y_trans_train, y_trans_test = train_test_split(X,y_trans,
                                                    test_size = 0.25,
                                                    random_state = 0)

In [29]:
models = {'Linear Regression': LinearRegression(n_jobs = -1),
          'SVR': SVR(epsilon=0.2),
          'Lasso': Lasso(),
          'XGBoostRegressor': XGBRegressor()}

In [67]:
def fit_score_PCA(models,X_train,y_train,X_test,y_test,components):

    # Make a dict to keep model scores
    model_scores = {}
    
    for i in components:
        
        pca = PCA(n_components = i)
        X_train_PCA = pca.fit_transform(X_train)
        X_test_PCA = pca.transform(X_test)
        
        print(X_train_PCA.shape)
        print(X_test_PCA.shape)

        # Loop through models
        for name, model in models.items():

            # Fit the model to the data
            model.fit(X_train_PCA,y_train)
        
            y_pred = model.predict(X_test_PCA)

            #Evaluates the model and append its score to model_scores
            model_scores[name + '_' + str(i)] = mean_absolute_error(y_test, y_pred)

    return model_scores

In [68]:
model_scores_trans = fit_score_PCA(models,X_train,y_trans_train,X_test,y_trans_test, [0.95])
model_scores_price = fit_score_PCA(models,X_train,y_price_train,X_test,y_price_test, [0.95])

(8955, 1)
(2985, 1)
(8955, 1)
(2985, 1)


In [None]:
# Melhores scores para 0.95 (sem scaling)
0.0941312117033256 + 0.090252152275057

In [61]:
model_scores_trans = fit_score_PCA(models,transf_X_train,y_trans_train,transf_X_test,y_trans_test, [0.2])
model_scores_price = fit_score_PCA(models,transf_X_train,y_price_train,transf_X_test,y_price_test, [1])

(8955, 4)
(8955, 1)


In [None]:
# Melhores scores para 0.95 (com scaling)
0.0941305091243686 + 0.09025552474334281

In [74]:
model_scores_trans = fit_score_PCA(models,transf_X_train,y_trans_train,transf_X_test,y_trans_test, [0.1])
model_scores_price = fit_score_PCA(models,transf_X_train,y_price_train,transf_X_test,y_price_test, [1])

(8955, 2)
(2985, 2)
(8955, 1)
(2985, 1)


In [89]:
model_scores_trans = fit_score_PCA(models,transf_X_train,y_trans_train,transf_X_test,y_trans_test, [0.8,0.85,0.9,0.95])
#model_scores_price = fit_score_PCA(models,transf_X_train,y_price_train,transf_X_test,y_price_test, [0.91,0.95,1])

(8955, 21)
(2985, 21)
(8955, 22)
(2985, 22)
(8955, 24)
(2985, 24)
(8955, 25)
(2985, 25)


In [17]:
#'Linear Regression_0.95': 0.09020619481829613

sorted(model_scores_trans, key = model_scores_trans.get)

['Linear Regression_0.95',
 'Linear Regression_0.91',
 'Lasso_0.91',
 'Lasso_0.95',
 'Lasso_1',
 'Linear Regression_1',
 'SVR_1',
 'XGBoostRegressor_1',
 'XGBoostRegressor_0.91',
 'XGBoostRegressor_0.95',
 'SVR_0.91',
 'SVR_0.95']

In [18]:
# 'Lasso_0.91': 0.09411110306791731,
sorted(model_scores_price, key = model_scores_price.get)

['Lasso_0.91',
 'Lasso_0.95',
 'Lasso_1',
 'Linear Regression_1',
 'Linear Regression_0.95',
 'Linear Regression_0.91',
 'SVR_1',
 'XGBoostRegressor_1',
 'SVR_0.95',
 'SVR_0.91',
 'XGBoostRegressor_0.95',
 'XGBoostRegressor_0.91']

In [None]:
model_scores_trans = fit_score_PCA(models,transf_X_train,y_trans_train,transf_X_test,y_trans_test, [0.8,0.85,0.9,0.95])
model_scores_price = fit_score_PCA(models,transf_X_train,y_price_train,transf_X_test,y_price_test, [0.8,0.85,0.9,0.95])

In [19]:
{k: v for k, v in sorted(model_scores_price.items(), key=lambda item: item[1])}

{'Lasso_0.91': 0.09411110306791731,
 'Lasso_0.95': 0.09411110306791731,
 'Lasso_1': 0.09411110306791731,
 'Linear Regression_1': 0.09413049877635919,
 'Linear Regression_0.95': 0.09417792238315124,
 'Linear Regression_0.91': 0.09417835915003633,
 'SVR_1': 0.09434654907849423,
 'XGBoostRegressor_1': 0.09705142031202166,
 'SVR_0.95': 0.09830355128110657,
 'SVR_0.91': 0.09886755546696292,
 'XGBoostRegressor_0.95': 0.10000986378128596,
 'XGBoostRegressor_0.91': 0.10054150387570636}

In [20]:
{k: v for k, v in sorted(model_scores_trans.items(), key=lambda item: item[1])}

{'Linear Regression_0.95': 0.09020619481829613,
 'Linear Regression_0.91': 0.09021593345191478,
 'Lasso_0.91': 0.09025387237696017,
 'Lasso_0.95': 0.09025387237696017,
 'Lasso_1': 0.09025387237696017,
 'Linear Regression_1': 0.09025552931887561,
 'SVR_1': 0.0931376154572009,
 'XGBoostRegressor_1': 0.09335223579586452,
 'XGBoostRegressor_0.91': 0.09602769537295529,
 'XGBoostRegressor_0.95': 0.09618455696513305,
 'SVR_0.91': 0.10137796527210743,
 'SVR_0.95': 0.10148067436569513}

## GridSearch

In [4]:
# Execucao do programa

# Importando os dados
train = pd.read_csv('./../Dados/train.csv')
test = pd.read_csv('./../Dados/test.csv')

dataframes = [train, test]

for df in dataframes:
    df['volume']  = df.x * df.y * df.z
    df['densidade'] = df.volume / df.n
    
X = train.drop(columns = ['cent_price_cor', 'cent_trans_cor'], axis = 1)

In [5]:
y_price = train.cent_price_cor
y_trans = train.cent_trans_cor

X_train, X_test, y_price_train, y_price_test = train_test_split(X,y_price,
                                                    test_size = 0.25,
                                                    random_state = 0)

X_train, X_test, y_trans_train, y_trans_test = train_test_split(X,y_trans,
                                                    test_size = 0.25,
                                                    random_state = 0)

In [6]:
scaler = StandardScaler()

transf_X = scaler.fit_transform(X)
transf_X_test = scaler.fit_transform(X_test)

In [9]:
# Coletanea de parametros para o GridSearch
params_grid = [

#Linear Regression
#{'normalize': ['True', 'False'],
#'fit_intercept': ['True', 'False']},
 
#SVR teste
{'C': [0.1], 
 'coef0': [0.01], 
 'degree': [3], 
 'epsilon': [0.1], 
 'gamma': ['auto'], 
 'kernel': ['rbf']}
    
 #SVR RBF
# {'kernel': ['rbf'],
# 'C':[0.1, 0.5, 1, 5, 10],
# 'degree': [3,8],
# 'coef0': [0.01,10,0.5],
 #'gamma': ('auto','scale'),
 #'epsilon': [0.1,0.2]}
    
 #SVR POLY
 #,{'kernel': ['poly'],
# 'C':[0.1, 0.5, 1, 5, 10],
 #'degree': [3,8],
 #'coef0': [0.01,10,0.5],
# 'gamma': ('auto','scale'),
 #'epsilon': [0.1,0.2]}
    
#Lasso
#,{'alpha':[0.02, 0.024, 0.025, 0.026, 0.03]}  
    
# #XGBoost
# ,{'nthread':[4], #when use hyperthread, xgboost may become slower
# 'objective':['reg:linear'],
# 'learning_rate': [.03, 0.05, .07], #so called `eta` value
# 'max_depth': [5, 6, 7],
# 'min_child_weight': [4],
# 'silent': [1],
# 'subsample': [0.7],
# 'colsample_bytree': [0.7],
# 'n_estimators': [500]}
 ]

In [None]:
def prever(X_train, X_test, y_train, y_test, target_name, components = [20,21,22,23,24,25,26,27,28]):
    
    lista_scores = []
    lista_PCA = []
    lista_params = []
    lista_models = []
    
    models = [
        #LinearRegression(),
        SVR(),
        #SVR(),
        #Lasso()
        #XGBRegressor()
        ]
     
    for n in components:
        
        pca = PCA(n_components = n)
        X_train_PCA = pca.fit_transform(X_train)
        X_test_PCA = pca.transform(X_test)
            
        for i, model in enumerate(models):

            print(f"\n\nModelo: {model}\nComponent: {n}\n\n" + str(X_train_PCA.shape) + str(X_test_PCA.shape))

            clf = GridSearchCV(model, param_grid = params_grid[i],
                               scoring = 'neg_mean_absolute_error', #destaque Ã  mÃ©trica pedida
                               n_jobs=-1, refit=True, cv=5, verbose=5,
                               pre_dispatch='2*n_jobs', error_score='raise', 
                               return_train_score=True)
            
            clf.fit(X_train_PCA, y_train)

            pred_cv = clf.predict(X_test_PCA)
            score_cv = mean_absolute_error(y_test, pred_cv)
            print(f"Melhores parametros: {clf.best_params_}")
            print(f"\nScore Grid: {score_cv}")
            
            lista_params.append(clf.best_params_)
            lista_models.append(model)
            lista_scores.append(round(score_cv,15))
            lista_PCA.append(n)

    print("Exportando DataFrame de Scores\n")

    df_scores = pd.DataFrame()
    
    df_scores.insert(loc=0, column='PCA', value= pd.Series(lista_PCA))
    df_scores.insert(loc=0, column='Scores', value= pd.Series(lista_scores))
    df_scores.insert(loc=0, column='Params', value= pd.Series(lista_params))
    df_scores.insert(loc=0, column='Model', value= pd.Series(lista_models))
    df_scores.to_csv(f"./../Resultados/{target_name}_scores_"+"{}.csv".format(datetime.now().strftime("%d-%m-%Y_%Hh%Mm%Ss")))
            
    return df_scores

In [37]:
df_scores_trans = prever(X, X_test, y_trans, y_trans_test, "trans")



Modelo: SVR()
Component: 20

(11940, 20)(2985, 20)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   32.0s finished


Melhores parametros: {'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilon': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}

Score Grid: 0.07128245446971172


Modelo: SVR()
Component: 21

(11940, 21)(2985, 21)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   34.1s finished


Melhores parametros: {'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilon': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}

Score Grid: 0.07128272031290148


Modelo: SVR()
Component: 22

(11940, 22)(2985, 22)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   30.2s finished


Melhores parametros: {'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilon': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}

Score Grid: 0.07128472659045745


Modelo: SVR()
Component: 23

(11940, 23)(2985, 23)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   30.6s finished


Melhores parametros: {'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilon': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}

Score Grid: 0.07128433503560794


Modelo: SVR()
Component: 24

(11940, 24)(2985, 24)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   30.0s finished


Melhores parametros: {'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilon': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}

Score Grid: 0.07128793312845744


Modelo: SVR()
Component: 25

(11940, 25)(2985, 25)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   29.9s finished


Melhores parametros: {'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilon': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}

Score Grid: 0.0712895330005062


Modelo: SVR()
Component: 26

(11940, 26)(2985, 26)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   31.7s finished


Melhores parametros: {'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilon': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}

Score Grid: 0.0712918767151264


Modelo: SVR()
Component: 27

(11940, 27)(2985, 27)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   30.9s finished


Melhores parametros: {'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilon': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}

Score Grid: 0.07129374706004338


Modelo: SVR()
Component: 28

(11940, 28)(2985, 28)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   31.7s finished


Melhores parametros: {'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilon': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}

Score Grid: 0.07129807710730442
Exportando DataFrame de Scores



In [14]:
df_scores_price = prever(X, X_test, y_price, y_price_test, "price")



Modelo: SVR()
Component: 20

(11940, 20)(2985, 20)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   32.5s finished


Melhores parametros: {'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilon': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}

Score Grid: 0.07340112996089755


Modelo: SVR()
Component: 21

(11940, 21)(2985, 21)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   33.0s finished


Melhores parametros: {'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilon': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}

Score Grid: 0.07340545735792878


Modelo: SVR()
Component: 22

(11940, 22)(2985, 22)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   36.9s finished


Melhores parametros: {'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilon': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}

Score Grid: 0.07340937641843025


Modelo: SVR()
Component: 23

(11940, 23)(2985, 23)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   36.2s finished


Melhores parametros: {'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilon': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}

Score Grid: 0.0734123720169559


Modelo: SVR()
Component: 24

(11940, 24)(2985, 24)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   34.8s finished


Melhores parametros: {'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilon': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}

Score Grid: 0.07341799510140862


Modelo: SVR()
Component: 25

(11940, 25)(2985, 25)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   34.3s finished


Melhores parametros: {'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilon': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}

Score Grid: 0.07342286630702807


Modelo: SVR()
Component: 26

(11940, 26)(2985, 26)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   34.0s finished


Melhores parametros: {'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilon': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}

Score Grid: 0.0734280944597349


Modelo: SVR()
Component: 27

(11940, 27)(2985, 27)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   35.8s finished


Melhores parametros: {'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilon': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}

Score Grid: 0.0734339078476797


Modelo: SVR()
Component: 28

(11940, 28)(2985, 28)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   35.1s finished


Melhores parametros: {'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilon': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}

Score Grid: 0.0734392225418337
Exportando DataFrame de Scores



In [16]:
df_scores_price.Scores.min()

0.073401129960898

In [18]:
df_scores_price[df_scores_price.Scores == 0.073401129960898]

Unnamed: 0,Model,Params,Scores,PCA
0,SVR(),"{'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilo...",0.073401,20


In [38]:
df_scores_trans.Scores.min()

0.071282454469712

In [16]:
df_scores_trans[df_scores_trans.Scores == 0.089959673362208]

Unnamed: 0,Model,Params,Scores,PCA
16,LinearRegression(),"{'fit_intercept': 'True', 'normalize': 'True'}",0.08996,28


In [17]:
# Resultado sem scaling

0.089947941686274 + 0.093949935453886
0.089959673362208 + 0.093937671406177

0.18389734476838498

## Melhores resultados:

Trans = 0.089947941686274 (com scaling) <br><br>
``LinearRegression()	{'fit_intercept': 'True', 'normalize': 'True'}	PCA = 25``

<br><br><br>
Price = 0.093937671406177 (sem scaling) <br><br>
``LinearRegression()	{'fit_intercept': 'True', 'normalize': 'True'}  PCA = 26``


In [46]:
0.089947941686274  + 0.093937671406177

0.183885613092451

In [17]:
0.071282454469712 + 0.073401129960898

0.14468358443061

In [41]:
def geral_resultados_submissao(test_price_pca, test_trans_pca, clf_price, clf_trans):

    # pca_price = PCA(n_components = n_price)
    # pca_trans = PCA(n_components = n_price)

    # test_price_PCA = pca_price.fit_transform(test.drop("id", axis=1))
    # test_trans_PCA = pca_trans.fit_transform(test.drop("id", axis=1))
    
    cent_price_cor = clf_price.predict(test_price_PCA)
    cent_trans_cor = clf_trans.predict(test_trans_PCA)


    df_sub = pd.DataFrame({"cent_price_cor": cent_price_cor, "cent_trans_cor": cent_trans_cor})
    
    df_sub.to_csv("./../Submissoes/df_sub_SVR_{}.csv".format(datetime.now().strftime("%d-%m-%Y_%Hh%Mm%Ss")), index=False)

    return df_sub

In [42]:
pca = PCA(n_components = 20)

train_PCA = pca.fit_transform(X)

clf_price = SVR(kernel = 'rbf',
                C = 0.1,
                coef0 = 0.01,
                degree = 3,
                epsilon = 0.1,
                gamma = 'auto')

clf_price.fit(train_PCA, y_price)

clf_trans = SVR(kernel = 'rbf',
                C = 0.1,
                coef0 = 0.01,
                degree = 3,
                epsilon = 0.1,
                gamma = 'auto')

clf_trans.fit(train_PCA, y_trans)

test_price_PCA = pca.fit_transform(test.drop("id", axis=1))

test_trans_PCA = pca.fit_transform(test.drop("id", axis=1))


df_sub = geral_resultados_submissao(test_price_PCA, test_trans_PCA, clf_price, clf_trans)

print(df_sub)

print("\nPrograma executado com sucesso \n")

      cent_price_cor  cent_trans_cor
0          -0.200193        0.358372
1          -0.200313        0.358371
2          -0.200313        0.358368
3          -0.200313        0.358370
4          -0.200313        0.358371
...              ...             ...
8054       -0.200313        0.358370
8055       -0.200313        0.358389
8056       -0.200313        0.358371
8057       -0.200313        0.358371
8058       -0.200313        0.358371

[8059 rows x 2 columns]

Programa executado com sucesso 



In [23]:
pca_price = PCA(n_components = 26)
pca_trans = PCA(n_components = 25)

X_trans = scaler.fit_transform(X)

train_price_PCA = pca_price.fit_transform(X)
train_trans_PCA = pca_trans.fit_transform(X_trans)

clf_price = LinearRegression({'fit_intercept': 'True', 'normalize': 'True'})
clf_price.fit(train_price_PCA, y_price)

clf_trans = LinearRegression({'fit_intercept': 'True', 'normalize': 'True'})
clf_trans.fit(train_trans_PCA, y_trans)

test_price_PCA = pca_price.fit_transform(test.drop("id", axis=1))

test_trans = scaler.fit_transform(test.drop("id", axis=1))
test_trans_PCA = pca_trans.fit_transform(test)


# PRICE ==> PCA = 26 // Linear Reg

# TRANS ==> PCA = 25 // Linear Reg - com scaling

df_sub = geral_resultados_submissao(test_price_PCA, test_trans_PCA, clf_price, clf_trans)

print(df_sub)

print("\nPrograma executado com sucesso \n")

NameError: name 'scaler' is not defined

In [36]:
df_scores_price_scaling.Scores.min()

0.093949935453886

In [13]:
df_scores_trans_scaling.Scores.min()

0.071282454469712

In [38]:
0.089947941686274 + 0.093949935453886

0.18389787714015998

In [44]:
df_scores_price_scaling[df_scores_price_scaling.Scores == 0.093949935453886]

Unnamed: 0,Model,Params,Scores,PCA
2,LinearRegression(),"{'fit_intercept': 'True', 'normalize': 'True'}",0.09395,21


In [15]:
df_scores_trans_scaling[df_scores_trans_scaling.Scores == 0.071282454469712]

Unnamed: 0,Model,Params,Scores,PCA
0,SVR(),"{'C': 0.1, 'coef0': 0.01, 'degree': 3, 'epsilo...",0.071282,20


In [60]:
test.head()

Unnamed: 0,id,n,p,f,x,y,z,a1,a2,a3,...,g1,g2,l1,l2,l3,l4,e1,e2,volume,densidade
0,1,558,0.5,0.2,28,21,2,1.0,0.29,1.3,...,0.198,1.8,0.149,2.0,0.078,1.7,0.5,2.9,1176,2.107527
1,2,910,0.9,0.3,28,19,9,2.2,0.14,1.6,...,0.172,1.0,0.086,0.8,0.03,0.8,0.2,1.6,4788,5.261538
2,3,213,0.8,0.5,21,15,8,2.2,0.22,2.6,...,0.083,0.7,0.019,1.4,0.016,1.2,2.6,2.4,2520,11.830986
3,4,654,0.7,0.5,14,15,2,2.2,0.12,3.8,...,0.185,1.3,0.11,0.8,0.006,1.7,0.2,1.7,420,0.642202
4,5,672,0.7,0.5,24,10,5,3.7,0.2,3.7,...,0.158,0.9,0.148,1.9,0.038,1.3,1.1,2.8,1200,1.785714
