## ITA 2021

<br>

Dicionário de Dados:

* n: número de agentes
* p: fração de traders
* f: grau de interesse dos traders
* x, y, z: dimensões do espaço aéreo
* a1, a2: média e desvio padrão do coeficiente do preço fundamental dos consumidores
* a3, a4: idem para os traders
* b1, b2: média e desvio padrão do coeficiente do preço de mercado dos consumidores
* b3, b4: idem para os traders
* c1, c2: média e desvio padrão do coeficiente do preço aleatório dos consumidores
* c3, c4: idem para os traders
* g1, g2: média e desvio padrão do grau de agressividade dos consumidores
* l1, l2: média e desvio padrão do coeficiente de desvalorização para os consumidores
* l3, l4: idem para os traders
* e1, e2: variabilidade no preço fundamental dos consumidores e traders, respectivamente
* cent_price_cor: correlação entre o preço final e centralidade das permissões de vôo
* cent_trans_cor: idem para o número de transações

In [1]:
# Importando Ferramentas Básicas
import pandas                  as pd
import matplotlib.pyplot       as plt
import numpy                   as np
import                            os
from   datetime            import datetime

In [2]:
# Importando Ferramentas de Limpeza
from sklearn.decomposition    import PCA
from sklearn.preprocessing    import StandardScaler

In [3]:
# Importando Ferramentas de Modelo
from sklearn.svm              import SVR
from xgboost                  import XGBRegressor
from sklearn.model_selection  import train_test_split
from sklearn.model_selection  import GridSearchCV, RandomizedSearchCV
from sklearn.metrics          import accuracy_score, mean_absolute_error
from sklearn.linear_model     import LinearRegression, LogisticRegression, Lasso

In [4]:
def importa_dados():
    # Importando os dados
    train = pd.read_csv('./../Dados/train.csv')
    test = pd.read_csv('./../Dados/test.csv')

    # Criando features
    dataframes = [train, test]

    # Criação de features
    for df in dataframes:
        df['volume']  = df.x * df.y * df.z
        df['densidade'] = df.volume / df.n
    
    return train, test

In [5]:
def prepara_fit(train, test):
    
    X = train.drop(columns = ['cent_price_cor', 'cent_trans_cor'], axis = 1)

    y_price = train["cent_price_cor"]
    y_trans = train["cent_trans_cor"]

    return X, y_price, y_trans

In [6]:
def prever(X, y, target_name, components = [20,21,22,23,24,25,26,27,28]):
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                        test_size = 0.25,
                                                        random_state = 0)


    params_grid = [ #Linear Regression
                    {'normalize': ['True', 'False'],
                    'fit_intercept': ['True', 'False']}

                    #Lasso
                    #,{'alpha':[0.02, 0.024, 0.025, 0.026, 0.03]} 
                    ]
    
    lista_scores = []
    lista_pca = []
    lista_params = []
    lista_models = []
    
    models = [
        LinearRegression(),
        #SVR(),
        #SVR(),
        #Lasso()
        #XGBRegressor()
        ]
     
    for n in components:
        
        pca = PCA(n_components = n)
        X_pca = pca.fit_transform(X)
        X_train_pca = pca.fit_transform(X_train)
        X_test_pca = pca.transform(X_test)
            
        for i, model in enumerate(models):

            print(f"\n\nModelo: {model}\nComponent: {n}\n\n" + str(X_pca.shape) + str(X_pca.shape))

            clf = GridSearchCV(model, param_grid = params_grid[i],
                               scoring = 'neg_mean_absolute_error', #destaque a  metrica pedida
                               n_jobs=-1, refit=True, cv=5, verbose=4,
                               pre_dispatch='2*n_jobs', error_score='raise', 
                               return_train_score=True)
            
            clf.fit(X_train_pca, y_train)

            pred_cv = clf.predict(X_test_pca)
            score_cv = mean_absolute_error(y_test, pred_cv)

            print(f"Melhores parametros: {clf.best_params_}")
            print(f"\nScore Grid: {score_cv}")
            
            lista_params.append(clf.best_params_)
            lista_models.append(model)
            lista_scores.append(round(score_cv,15))
            lista_pca.append(n)

    print("Exportando DataFrame de Scores\n")

    df_scores = pd.DataFrame()
    
    df_scores.insert(loc=0, column='PCA', value= pd.Series(lista_pca))
    df_scores.insert(loc=0, column='Scores', value= pd.Series(lista_scores))
    df_scores.insert(loc=0, column='Params', value= pd.Series(lista_params))
    df_scores.insert(loc=0, column='Model', value= pd.Series(lista_models))
    df_scores.to_csv(f"./../Resultados/{target_name}_scores_"+"{}.csv".format(datetime.now().strftime("%d-%m-%Y_%Hh%Mm%Ss")))
            
    return df_scores

In [7]:
def gera_modelo(pca_price_n , pca_trans_n):
    
    pca_price = PCA(n_components = pca_price_n)
    pca_trans = PCA(n_components = pca_trans_n)

    train_price_pca = pca_price.fit_transform(X)
    train_trans_pca = pca_trans.fit_transform(X)

    clf_price = LinearRegression({'fit_intercept': 'True', 'normalize': 'True'})
    clf_price.fit(train_price_pca, y_price)

    clf_trans = LinearRegression({'fit_intercept': 'True', 'normalize': 'True'})
    clf_trans.fit(train_trans_pca, y_trans)

    test_price_pca = pca_price.fit_transform(test.drop("id", axis=1))
    test_trans_pca = pca_trans.fit_transform(test.drop("id", axis=1))
    
    return test_price_pca, test_trans_pca, clf_trans, clf_price

In [8]:
def geral_resultados_submissao(test_price_pca, test_trans_pca, clf_price, clf_trans):
    
    cent_price_cor = clf_price.predict(test_price_pca)
    cent_trans_cor = clf_trans.predict(test_trans_pca)


    df_sub = pd.DataFrame({"cent_price_cor": cent_price_cor, "cent_trans_cor": cent_trans_cor})
    
    df_sub.to_csv("./../Submissoes/df_sub_{}.csv".format(datetime.now().strftime("%d-%m-%Y_%Hh%Mm%Ss")), index=False)

    return df_sub

In [9]:
train, test = importa_dados()

X, y_price, y_trans = prepara_fit(train, test)

df_scores_price = prever(X, y_price, "price", components = [20,21,22,23,24,25,26,27,28])
df_scores_trans = prever(X, y_trans, "trans", components = [20,21,22,23,24,25,26,27,28])

pca_price_n = len(X.columns)
pca_trans_n = len(X.columns)

test_price_pca, test_trans_pca, clf_trans, clf_price = gera_modelo(pca_price_n, pca_trans_n)

df_sub = geral_resultados_submissao(test_price_pca, test_trans_pca, clf_price, clf_trans)

print(df_sub)

print("\nPrograma executado com sucesso \n")



Modelo: LinearRegression()
Component: 20

(11940, 20)(11940, 20)
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:    1.8s remaining:   10.5s
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    1.8s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:    1.8s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.9s finished


Melhores parametros: {'fit_intercept': 'True', 'normalize': 'True'}

Score Grid: 0.09413476980710093


Modelo: LinearRegression()
Component: 21

(11940, 21)(11940, 21)
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished


Melhores parametros: {'fit_intercept': 'True', 'normalize': 'True'}

Score Grid: 0.09410039775303798


Modelo: LinearRegression()
Component: 22

(11940, 22)(11940, 22)
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Melhores parametros: {'fit_intercept': 'True', 'normalize': 'True'}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:    0.0s remaining:    0.0s




Score Grid: 0.09409673294073545


Modelo: LinearRegression()
Component: 23

(11940, 23)(11940, 23)
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Melhores parametros: {'fit_intercept': 'True', 'normalize': 'True'}

[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    0.0s remaining:    0.0s




Score Grid: 0.09409868059838686


Modelo: LinearRegression()
Component: 24

(11940, 24)(11940, 24)
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Melhores parametros: {'fit_intercept': 'True', 'normalize': 'True'}

Score Grid: 0.09410742449209838


Modelo: LinearRegression()
Component: 25

(11940, 25)(11940, 25)

[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished



Fitting 5 folds for each of 4 candidates, totalling 20 fits
Melhores parametros: {'fit_intercept': 'True', 'normalize': 'True'}

Score Grid: 0.09409465774545293


Modelo: LinearRegression()
Component: 26

(11940, 26)(11940, 26)
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished


Melhores parametros: {'fit_intercept': 'True', 'normalize': 'True'}

Score Grid: 0.09410477665296128


Modelo: LinearRegression()
Component: 27

(11940, 27)(11940, 27)
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Melhores parametros: {'fit_intercept': 'True', 'normalize': 'True'}

Score Grid: 0.0941433494480374


Modelo: LinearRegression()
Component: 28

(11940, 28)(11940, 28)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:    0.0s remaining:    0.0s



Fitting 5 folds for each of 4 candidates, totalling 20 fits
Melhores parametros: {'fit_intercept': 'True', 'normalize': 'True'}

Score Grid: 0.09416169456648882
Exportando DataFrame de Scores



[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished




Modelo: LinearRegression()
Component: 20

(11940, 20)(11940, 20)
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Melhores parametros: {'fit_intercept': 'True', 'normalize': 'True'}

Score Grid: 0.09019866134152262


Modelo: LinearRegression()
Component: 21

(11940, 21)(11940, 21)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished



Fitting 5 folds for each of 4 candidates, totalling 20 fits
Melhores parametros: {'fit_intercept': 'True', 'normalize': 'True'}

Score Grid: 0.09017688937548382


Modelo: LinearRegression()
Component: 22

(11940, 22)(11940, 22)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished



Fitting 5 folds for each of 4 candidates, totalling 20 fits
Melhores parametros: {'fit_intercept': 'True', 'normalize': 'True'}

Score Grid: 0.09019862919544662


Modelo: LinearRegression()
Component: 23

(11940, 23)(11940, 23)
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    0.0

Melhores parametros: {'fit_intercept': 'True', 'normalize': 'True'}

Score Grid: 0.09021656621319836


Modelo: LinearRegression()
Component: 24

(11940, 24)(11940, 24)
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Melhores parametros: {'fit_intercept': 'True', 'normalize': 'True'}

Score Grid: 0.0902181122466885


Modelo: LinearRegression()
Component: 25

(11940, 25)(11940, 25)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished



Fitting 5 folds for each of 4 candidates, totalling 20 fits
Melhores parametros: {'fit_intercept': 'True', 'normalize': 'True'}

Score Grid: 0.09018590753154558


Modelo: LinearRegression()
Component: 26

(11940, 26)(11940, 26)
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Melhores parametros: {'fit_intercept': 'True', 'normalize': 'True'}

Score Grid: 0.09021740652190091


Modelo: LinearRegression()
Component: 27

(11940, 27)(11940, 27)
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    0.0

Melhores parametros: {'fit_intercept': 'True', 'normalize': 'True'}

Score Grid: 0.09021596915816536


Modelo: LinearRegression()
Component: 28

(11940, 28)(11940, 28)
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Melhores parametros: {'fit_intercept': 'True', 'normalize': 'True'}

Score Grid: 0.09023259792525386
Exportando DataFrame de Scores

      cent_price_cor  cent_trans_cor
0          -0.205584        0.374052
1          -0.198976        0.358398
2          -0.196557        0.356928
3          -0.210584        0.362855
4          -0.205494        0.367040
...              ...             ...
8054       -0.206812        0.362487
8055       -0.201517        0.362278
8056       -0.195008        0.353092
8057       -0.208837        0.365683
8058       -0.200655        0.366247

[8059 rows x 2 columns]

Programa executado com sucesso 





In [10]:
# Coletanea de parametros para o GridSearch

# params_grid = [

# #Linear Regression
# {'normalize': ['True', 'False'],
# 'fit_intercept': ['True', 'False']},
    
# #SVR RBF
# {'kernel': ['rbf'],
# 'C':[0.1, 0.5, 1, 5, 10],
# 'degree': [3,8],
# 'coef0': [0.01,10,0.5],
# 'gamma': ('auto','scale'),
# 'epsilon': [0.1,0.2]},
    
# #SVR POLY
# {'kernel': ['poly'],
# 'C':[0.1, 0.5, 1, 5, 10],
# 'degree': [3,8],
# 'coef0': [0.01,10,0.5],
# 'gamma': ('auto','scale'),
# 'epsilon': [0.1,0.2]},
    
# #Lasso
# {'alpha':[0.02, 0.024, 0.025, 0.026, 0.03],
# 'fit_alpha':[0.005, 0.02, 0.03, 0.05, 0.06]},  
    
# #XGBoost
# {'nthread':[4], #when use hyperthread, xgboost may become slower
# 'objective':['reg:linear'],
# 'learning_rate': [.03, 0.05, .07], #so called `eta` value
# 'max_depth': [5, 6, 7],
# 'min_child_weight': [4],
# 'silent': [1],
# 'subsample': [0.7],
# 'colsample_bytree': [0.7],
# 'n_estimators': [500]}
# ]