## ITA 2021

<br>

Dicionário de Dados:

* n: número de agentes
* p: fração de traders
* f: grau de interesse dos traders
* x, y, z: dimensões do espaço aéreo
* a1, a2: média e desvio padrão do coeficiente do preço fundamental dos consumidores
* a3, a4: idem para os traders
* b1, b2: média e desvio padrão do coeficiente do preço de mercado dos consumidores
* b3, b4: idem para os traders
* c1, c2: média e desvio padrão do coeficiente do preço aleatório dos consumidores
* c3, c4: idem para os traders
* g1, g2: média e desvio padrão do grau de agressividade dos consumidores
* l1, l2: média e desvio padrão do coeficiente de desvalorização para os consumidores
* l3, l4: idem para os traders
* e1, e2: variabilidade no preço fundamental dos consumidores e traders, respectivamente
* cent_price_cor: correlação entre o preço final e centralidade das permissões de vôo
* cent_trans_cor: idem para o número de transações

In [5]:
# Importando Ferramentas básicas
import pandas                  as pd
import matplotlib.pyplot       as plt
import numpy                   as np

In [16]:
# importando Ferramentas de Limpeza
from sklearn.preprocessing    import StandardScaler
from sklearn.pipeline         import make_pipeline, Pipeline
from sklearn.decomposition    import PCA

In [64]:
# Importando Ferramentas de Modelo
from sklearn.svm              import SVR
from sklearn.model_selection  import train_test_split
from sklearn.metrics          import accuracy_score, mean_absolute_error
from sklearn.linear_model     import LinearRegression, SGDRegressor

In [8]:
# Importando os dados
train = pd.read_csv('./../Dados/train.csv')
test = pd.read_csv('./../Dados/test.csv')

In [15]:
# Criando Features
dataframes = [train, test]

for df in dataframes:
    df['volume']  = df.x * df.y * df.z
    df['densidade'] = df.volume / df.n

In [23]:
train.head()

Unnamed: 0,n,p,f,x,y,z,a1,a2,a3,a4,...,l1,l2,l3,l4,e1,e2,cent_price_cor,cent_trans_cor,volume,densidade
0,709,0.7,0.2,16,12,7,3.8,0.24,2.3,0.28,...,0.147,1.5,0.089,1.6,1.5,2.6,-0.169,0.375,1344,1.895628
1,921,0.6,0.3,19,17,10,3.0,0.12,2.4,0.19,...,0.186,1.6,0.076,1.8,0.6,0.4,-0.075,0.234,3230,3.507058
2,177,0.8,0.4,14,12,5,3.2,0.17,1.8,0.18,...,0.048,0.7,0.073,0.6,2.7,0.3,-0.177,0.389,840,4.745763
3,415,0.7,0.5,24,11,2,1.3,0.17,1.5,0.18,...,0.151,0.8,0.022,1.1,0.5,0.1,-0.102,0.358,528,1.272289
4,802,0.8,0.4,21,10,3,4.4,0.15,2.6,0.13,...,0.123,1.9,0.046,1.1,2.0,0.6,-0.034,0.18,630,0.785536


In [24]:
test.head()

Unnamed: 0,id,n,p,f,x,y,z,a1,a2,a3,...,g1,g2,l1,l2,l3,l4,e1,e2,volume,densidade
0,1,558,0.5,0.2,28,21,2,1.0,0.29,1.3,...,0.198,1.8,0.149,2.0,0.078,1.7,0.5,2.9,1176,2.107527
1,2,910,0.9,0.3,28,19,9,2.2,0.14,1.6,...,0.172,1.0,0.086,0.8,0.03,0.8,0.2,1.6,4788,5.261538
2,3,213,0.8,0.5,21,15,8,2.2,0.22,2.6,...,0.083,0.7,0.019,1.4,0.016,1.2,2.6,2.4,2520,11.830986
3,4,654,0.7,0.5,14,15,2,2.2,0.12,3.8,...,0.185,1.3,0.11,0.8,0.006,1.7,0.2,1.7,420,0.642202
4,5,672,0.7,0.5,24,10,5,3.7,0.2,3.7,...,0.158,0.9,0.148,1.9,0.038,1.3,1.1,2.8,1200,1.785714


In [25]:
train.isna().sum()/len(train)

n                 0.0
p                 0.0
f                 0.0
x                 0.0
y                 0.0
z                 0.0
a1                0.0
a2                0.0
a3                0.0
a4                0.0
b1                0.0
b2                0.0
b3                0.0
b4                0.0
c1                0.0
c2                0.0
c3                0.0
c4                0.0
g1                0.0
g2                0.0
l1                0.0
l2                0.0
l3                0.0
l4                0.0
e1                0.0
e2                0.0
cent_price_cor    0.0
cent_trans_cor    0.0
volume            0.0
densidade         0.0
dtype: float64

In [26]:
test.isna().sum()/len(train)

id           0.0
n            0.0
p            0.0
f            0.0
x            0.0
y            0.0
z            0.0
a1           0.0
a2           0.0
a3           0.0
a4           0.0
b1           0.0
b2           0.0
b3           0.0
b4           0.0
c1           0.0
c2           0.0
c3           0.0
c4           0.0
g1           0.0
g2           0.0
l1           0.0
l2           0.0
l3           0.0
l4           0.0
e1           0.0
e2           0.0
volume       0.0
densidade    0.0
dtype: float64

In [27]:
train.cent_price_cor.describe()

count    11940.000000
mean        -0.202591
std          0.116617
min         -0.613000
25%         -0.284000
50%         -0.205000
75%         -0.122000
max          0.266000
Name: cent_price_cor, dtype: float64

In [8]:
train.cent_trans_cor.describe()

count    11940.000000
mean         0.362586
std          0.115221
min         -0.399000
25%          0.294000
50%          0.372000
75%          0.443000
max          0.717000
Name: cent_trans_cor, dtype: float64

In [13]:
train.corr()["cent_price_cor"].abs().sort_values(ascending = True)

a4                0.000458
a2                0.000586
l3                0.000939
p                 0.001929
b3                0.002147
b4                0.002732
z                 0.003054
l4                0.003316
e2                0.003390
b1                0.004355
x                 0.004589
c1                0.004732
densidade         0.004775
l2                0.004810
g1                0.005781
c4                0.005893
f                 0.006039
n                 0.006058
volume            0.006063
a1                0.006230
c2                0.006745
y                 0.009405
a3                0.014581
c3                0.016272
l1                0.017993
e1                0.018194
g2                0.021134
b2                0.021684
cent_trans_cor    0.664482
cent_price_cor    1.000000
Name: cent_price_cor, dtype: float64

In [14]:
train.corr()["cent_trans_cor"].abs().sort_values(ascending = True)

a2                0.000042
a3                0.000141
a1                0.000264
b3                0.000284
e2                0.000337
c4                0.000903
l4                0.000921
x                 0.001371
c2                0.001442
b4                0.001637
y                 0.002117
l3                0.002189
p                 0.002209
n                 0.003236
c1                0.004218
volume            0.004703
z                 0.005470
l2                0.005727
e1                0.005901
a4                0.007882
densidade         0.007989
f                 0.009006
g1                0.013346
c3                0.015315
b1                0.015772
b2                0.018168
l1                0.020887
g2                0.026342
cent_price_cor    0.664482
cent_trans_cor    1.000000
Name: cent_trans_cor, dtype: float64

In [28]:
X = train.drop(columns = ['cent_price_cor', 'cent_trans_cor'], axis = 1)
y_1 = train.cent_price_cor

X_train, X_test, y_1_train, y_1_test = train_test_split(X,y_1,
                                                    test_size = 0.25,
                                                    random_state = 0)


#regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
#regr.fit(X, y)

pipe_1 = Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(epsilon=0.2))])

pipe_1.fit(X_train,y_1_train)

#pipe.score(X_test, y_test)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(epsilon=0.2))])

In [29]:
y_1_pred = pipe_1.predict(X_test)

In [30]:
mean_absolute_error(y_1_test, y_1_pred)

0.09784398363826143

In [19]:
X = train.drop(columns = ['cent_price_cor', 'cent_trans_cor'], axis = 1)
y_2 = train.cent_trans_cor

X_train, X_test, y_2_train, y_2_test = train_test_split(X,y_2,
                                                    test_size = 0.25,
                                                    random_state = 0)


#regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
#regr.fit(X, y)

pipe_2 = Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(epsilon=0.2))])

pipe_2.fit(X_train,y_2_train)

y_2_pred = pipe_2.predict(X_test)

mean_absolute_error(y_2_test, y_2_pred)

0.10156486690287644

## PCA

In [138]:
X = train.drop(columns = ['cent_price_cor', 'cent_trans_cor'], axis = 1)

scaler = StandardScaler()

transf_X_train = scaler.fit_transform(X_train)
transf_X_test = scaler.fit_transform(X_test)

In [139]:
y_price = train.cent_price_cor
y_trans = train.cent_trans_cor

X_train, X_test, y_price_train, y_price_test = train_test_split(X,y_price,
                                                    test_size = 0.25,
                                                    random_state = 0)

X_train, X_test, y_trans_train, y_trans_test = train_test_split(X,y_trans,
                                                    test_size = 0.25,
                                                    random_state = 0)

In [74]:
models = {'Linear Regression': LinearRegression(n_jobs = -1),
          'SVR': SVR(epsilon=0.2)}

In [177]:
def fit_score_PCA(models,X_train,y_train,X_test,y_test,components):

    for i in [components]:
        
        pca = PCA(n_components = i)
        X_train_PCA = pca.fit_transform(X_train)
        X_test_PCA = pca.transform(X_test)

        # Make a dict to keep model scores
        model_scores = {}

        # Loop through models
        for name, model in models.items():

            # Fit the model to the data
            model.fit(X_train_PCA,y_train)
        
            y_pred = model.predict(X_test_PCA)

            #Evaluates the model and append its score to model_scores
            model_scores[name + '_' + str(i)] = mean_absolute_error(y_test, y_pred)

    return model_scores

In [163]:
model_scores_trans = fit_score_PCA(models,X_train,y_trans_train,X_test,y_trans_test, 0.95)
model_scores_price = fit_score_PCA(models,X_train,y_price_train,X_test,y_price_test, 0.95)

In [164]:
model_scores_trans

{'Linear Regression_0.95': 0.090252152275057, 'SVR_0.95': 0.09325218584031905}

In [165]:
model_scores_price

{'Linear Regression_0.95': 0.0941312117033256, 'SVR_0.95': 0.0943234245525932}

In [129]:
# Melhores scores para 0.95 (sem scaling)
0.0941312117033256 + 0.090252152275057

0.1843833639783826

In [181]:
model_scores_trans = fit_score_PCA(models,transf_X_train,y_trans_train,transf_X_test,y_trans_test, 1)
model_scores_price = fit_score_PCA(models,transf_X_train,y_price_train,transf_X_test,y_price_test, 1)

In [182]:
model_scores_trans

{'Linear Regression_1': 0.09025552474334281, 'SVR_1': 0.09314800378339827}

In [183]:
model_scores_price

{'Linear Regression_1': 0.0941305091243686, 'SVR_1': 0.09434737464316084}

In [184]:
# Melhores scores para 0.95 (com scaling)
0.0941305091243686 + 0.09025552474334281

0.1843860338677114