## ITA 2021

<br>

Dicionário de Dados:

* n: número de agentes
* p: fração de traders
* f: grau de interesse dos traders
* x, y, z: dimensões do espaço aéreo
* a1, a2: média e desvio padrão do coeficiente do preço fundamental dos consumidores
* a3, a4: idem para os traders
* b1, b2: média e desvio padrão do coeficiente do preço de mercado dos consumidores
* b3, b4: idem para os traders
* c1, c2: média e desvio padrão do coeficiente do preço aleatório dos consumidores
* c3, c4: idem para os traders
* g1, g2: média e desvio padrão do grau de agressividade dos consumidores
* l1, l2: média e desvio padrão do coeficiente de desvalorização para os consumidores
* l3, l4: idem para os traders
* e1, e2: variabilidade no preço fundamental dos consumidores e traders, respectivamente
* cent_price_cor: correlação entre o preço final e a centralidade das permissões de voo
* cent_trans_cor: idem para o número de transações

In [1]:
# Importando Ferramentas básicas
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# importando Ferramentas de Limpeza
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline      import make_pipeline, Pipeline
from sklearn.metrics       import mean_absolute_error


In [31]:
# Importando Ferramentas de Modelo
from sklearn.linear_model     import LogisticRegression
from sklearn.linear_model     import LinearRegression
from sklearn.svm              import SVR
from sklearn.model_selection  import train_test_split
from sklearn.model_selection  import cross_val_score
from sklearn.model_selection  import RepeatedKFold
from sklearn.metrics          import accuracy_score
from xgboost                  import XGBRegressor

In [4]:
# Load the training and test sets from their relative CSV paths.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./../Dados/train.csv', './../Dados/test.csv')
)

In [5]:
# Preview the first five rows of the training set.
train.head(n=5)

Unnamed: 0,n,p,f,x,y,z,a1,a2,a3,a4,...,g1,g2,l1,l2,l3,l4,e1,e2,cent_price_cor,cent_trans_cor
0,709,0.7,0.2,16,12,7,3.8,0.24,2.3,0.28,...,0.186,0.5,0.147,1.5,0.089,1.6,1.5,2.6,-0.169,0.375
1,921,0.6,0.3,19,17,10,3.0,0.12,2.4,0.19,...,0.079,1.2,0.186,1.6,0.076,1.8,0.6,0.4,-0.075,0.234
2,177,0.8,0.4,14,12,5,3.2,0.17,1.8,0.18,...,0.036,1.4,0.048,0.7,0.073,0.6,2.7,0.3,-0.177,0.389
3,415,0.7,0.5,24,11,2,1.3,0.17,1.5,0.18,...,0.063,1.1,0.151,0.8,0.022,1.1,0.5,0.1,-0.102,0.358
4,802,0.8,0.4,21,10,3,4.4,0.15,2.6,0.13,...,0.044,1.9,0.123,1.9,0.046,1.1,2.0,0.6,-0.034,0.18


In [6]:
# Preview the first five rows of the test set.
test.head(n=5)

Unnamed: 0,id,n,p,f,x,y,z,a1,a2,a3,...,c3,c4,g1,g2,l1,l2,l3,l4,e1,e2
0,1,558,0.5,0.2,28,21,2,1.0,0.29,1.3,...,3.1,0.27,0.198,1.8,0.149,2.0,0.078,1.7,0.5,2.9
1,2,910,0.9,0.3,28,19,9,2.2,0.14,1.6,...,2.2,0.16,0.172,1.0,0.086,0.8,0.03,0.8,0.2,1.6
2,3,213,0.8,0.5,21,15,8,2.2,0.22,2.6,...,1.6,0.2,0.083,0.7,0.019,1.4,0.016,1.2,2.6,2.4
3,4,654,0.7,0.5,14,15,2,2.2,0.12,3.8,...,4.1,0.11,0.185,1.3,0.11,0.8,0.006,1.7,0.2,1.7
4,5,672,0.7,0.5,24,10,5,3.7,0.2,3.7,...,2.6,0.27,0.158,0.9,0.148,1.9,0.038,1.3,1.1,2.8


In [7]:
# Fraction of missing values per training column (mean of the boolean NaN mask).
train.isna().mean()

n                 0.0
p                 0.0
f                 0.0
x                 0.0
y                 0.0
z                 0.0
a1                0.0
a2                0.0
a3                0.0
a4                0.0
b1                0.0
b2                0.0
b3                0.0
b4                0.0
c1                0.0
c2                0.0
c3                0.0
c4                0.0
g1                0.0
g2                0.0
l1                0.0
l2                0.0
l3                0.0
l4                0.0
e1                0.0
e2                0.0
cent_price_cor    0.0
cent_trans_cor    0.0
dtype: float64

In [8]:
# Fraction of missing values per test column. The ratio is taken over the test
# set's own row count (mean of the boolean NaN mask) rather than len(train),
# which would give a wrong fraction whenever the two sets differ in size.
test.isna().mean()

id    0.0
n     0.0
p     0.0
f     0.0
x     0.0
y     0.0
z     0.0
a1    0.0
a2    0.0
a3    0.0
a4    0.0
b1    0.0
b2    0.0
b3    0.0
b4    0.0
c1    0.0
c2    0.0
c3    0.0
c4    0.0
g1    0.0
g2    0.0
l1    0.0
l2    0.0
l3    0.0
l4    0.0
e1    0.0
e2    0.0
dtype: float64

In [9]:
# Summary statistics of the cent_price_cor target.
train['cent_price_cor'].describe()

count    11940.000000
mean        -0.202591
std          0.116617
min         -0.613000
25%         -0.284000
50%         -0.205000
75%         -0.122000
max          0.266000
Name: cent_price_cor, dtype: float64

In [10]:
# Summary statistics of the cent_trans_cor target.
train['cent_trans_cor'].describe()

count    11940.000000
mean         0.362586
std          0.115221
min         -0.399000
25%          0.294000
50%          0.372000
75%          0.443000
max          0.717000
Name: cent_trans_cor, dtype: float64

In [11]:
# Derived feature: airspace volume as the product of its three dimensions,
# added to both the training and the test frame.
for frame in (train, test):
    frame['volume'] = frame['x'] * frame['y'] * frame['z']

In [12]:
# Correlation of every column with cent_price_cor, largest value first.
train.corr().loc[:, "cent_price_cor"].sort_values(ascending=False)

cent_price_cor    1.000000
g2                0.021134
l1                0.017993
c3                0.016272
y                 0.009405
volume            0.006063
n                 0.006058
f                 0.006039
l2                0.004810
c1                0.004732
x                 0.004589
e2                0.003390
z                 0.003054
a2                0.000586
a4               -0.000458
l3               -0.000939
p                -0.001929
b3               -0.002147
b4               -0.002732
l4               -0.003316
b1               -0.004355
g1               -0.005781
c4               -0.005893
a1               -0.006230
c2               -0.006745
a3               -0.014581
e1               -0.018194
b2               -0.021684
cent_trans_cor   -0.664482
Name: cent_price_cor, dtype: float64

In [13]:
# Correlation of every column with cent_trans_cor, largest value first.
train.corr().loc[:, "cent_trans_cor"].sort_values(ascending=False)

cent_trans_cor    1.000000
b2                0.018168
b1                0.015772
g1                0.013346
e1                0.005901
c1                0.004218
n                 0.003236
p                 0.002209
l3                0.002189
y                 0.002117
b4                0.001637
c2                0.001442
x                 0.001371
c4                0.000903
b3                0.000284
a1                0.000264
a3                0.000141
a2               -0.000042
e2               -0.000337
l4               -0.000921
volume           -0.004703
z                -0.005470
l2               -0.005727
a4               -0.007882
f                -0.009006
c3               -0.015315
l1               -0.020887
g2               -0.026342
cent_price_cor   -0.664482
Name: cent_trans_cor, dtype: float64

In [14]:
# Derived feature: 'densidade' is computed as volume divided by the number of
# agents (i.e. airspace volume per agent), for both the training and test frame.
for frame in (train, test):
    frame['densidade'] = frame['volume'] / frame['n']

In [50]:
# Features are all columns except the two correlation targets.
X = train.drop(columns=['cent_price_cor', 'cent_trans_cor'])
y_price = train['cent_price_cor']

# Hold out 25% of the rows for evaluation; the seed is fixed so the split is reproducible.
X_train, X_test, y_price_train, y_price_test = train_test_split(
    X, y_price, test_size=0.25, random_state=0
)

# Standardize the features, then fit an SVR (epsilon=0.2) on the price target.
# make_pipeline names the steps 'standardscaler' and 'svr'.
pipe_1 = make_pipeline(StandardScaler(), SVR(epsilon=0.2))
pipe_1.fit(X_train, y_price_train)

# Mean absolute error on the held-out split.
y_price_pred = pipe_1.predict(X_test)
mean_absolute_error(y_price_test, y_price_pred)

0.09784398363826143

In [51]:
# Features are all columns except the two correlation targets.
X = train.drop(columns=['cent_price_cor', 'cent_trans_cor'])
y_trans = train['cent_trans_cor']

# Hold out 25% of the rows for evaluation; the seed is fixed so the split is reproducible.
X_train, X_test, y_trans_train, y_trans_test = train_test_split(
    X, y_trans, test_size=0.25, random_state=0
)

# Standardize the features, then fit an SVR with default hyperparameters.
pipe_2 = Pipeline(steps=[('standardscaler', StandardScaler()),
                         ('svr', SVR())])

# Fit on the training target produced by the split above. The previous
# `y_2_train` is never defined in this notebook and raised a NameError on a
# fresh kernel.
pipe_2.fit(X_train, y_trans_train)

# Mean absolute error on the held-out split.
y_trans_pred = pipe_2.predict(X_test)
mean_absolute_error(y_trans_test, y_trans_pred)

0.09950724288212077

## Standard scaler separadamente

In [22]:
# Learn the scaling statistics from the training split only, then apply those
# same statistics to the held-out split. Refitting the scaler on X_test would
# put the two splits on different scales and leak held-out statistics into the
# evaluation.
scaler = StandardScaler()

transf_X_train = scaler.fit_transform(X_train)
transf_X_test = scaler.transform(X_test)


In [52]:
# SVR on the separately scaled features.
# NOTE(review): the inputs were already standardized by `scaler`, so the
# pipeline's own StandardScaler step should be close to a no-op here; it is kept
# so that pipe_2 has the same structure as before.
pipe_2 = Pipeline(steps=[('standardscaler', StandardScaler()),
                         ('svr', SVR())])

pipe_2.fit(transf_X_train, y_trans_train)

y_2_pred = pipe_2.predict(transf_X_test)

# Score the predictions made in this cell. The previous code passed the older
# `y_trans_pred`, so the reported error never reflected this model.
mean_absolute_error(y_trans_test, y_2_pred)

0.09950724288212077

## Cross Validation

In [53]:
# Repeated k-fold splitter: 10 folds, repeated 3 times, with a fixed seed.
cv = RepeatedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=1,
)

## XGBRegressor

In [56]:
# XGBoost regressor with default hyperparameters, cross-validated on the
# scaled training features against the price target.
xgb_model = XGBRegressor()

xgb_scores_price = cross_val_score(
    xgb_model,
    transf_X_train,
    y_price_train,
    scoring='neg_mean_absolute_error',
    cv=cv,
)

# Scores are negated MAE values, so a mean closer to zero is better.
xgb_scores_price.mean()

In [60]:
# Same cross-validation as for the price target, now against the trans target.
xgb_scores_trans = cross_val_score(
    xgb_model,
    transf_X_train,
    y_trans_train,
    scoring='neg_mean_absolute_error',
    cv=cv,
)

# Scores are negated MAE values, so a mean closer to zero is better.
xgb_scores_trans.mean()

-0.0982081147821315