In [1]:
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import TimeSeriesSplit, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, make_scorer


# models
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor

# Seed both global random generators (NumPy's global state and the stdlib
# `random` module) with the same fixed value so repeated runs of the notebook
# start from the same random state.
np.random.seed(42)
random.seed(42)

In [2]:
# Load the raw dataset; the relative path is resolved against the kernel's
# current working directory.
df = pd.read_csv('dataset.csv')

In [3]:
# Preview the first rows to check how each column was parsed.
df.head()

Unnamed: 0,Mes,Ano,Data,Cambio (R$ / U$$),IPCA_BASE_FIXA (1993 = 100),IPCA_MES (% a.m.),ProdIndustrial,Selic_OVER (% a.a.),Base_monetaria(M1) (R$ Bln),CPI US (% a.m.),FED INTEREST RATE (% a.a.),Produção industrial US,Base_monetaria(M1) (U$$ Bln)
0,1,1999,01/01/1999,150,146841,7,,"29,54%",3911,"0,18%","4,63%",876357,"1.098,10"
1,2,1999,01/02/1999,191,148383,105,,"32,61%",3918,"0,00%","4,76%",881298,"1.096,70"
2,3,1999,01/03/1999,190,150015,11,,"48,16%",3762,"0,06%","4,81%",882987,"1.096,60"
3,4,1999,01/04/1999,169,150855,56,,"32,15%",3646,"0,67%","4,74%",884947,"1.101,60"
4,5,1999,01/05/1999,168,151308,3,,"27,12%",3551,"0,06%","4,74%",890447,"1.103,80"


In [4]:
# List the column labels; later cells select and drop columns by these exact names.
df.columns

Index(['Mes', 'Ano', 'Data', 'Cambio (R$ / U$$)',
       'IPCA_BASE_FIXA (1993 = 100)', 'IPCA_MES (% a.m.)', 'ProdIndustrial',
       'Selic_OVER (% a.a.)', 'Base_monetaria(M1) (R$ Bln)', 'CPI US (% a.m.)',
       'FED INTEREST RATE (% a.a.)', 'Produção industrial US',
       'Base_monetaria(M1) (U$$ Bln)'],
      dtype='object')

In [5]:
def clean_numeric_data(x):
    """Convert a locale-formatted numeric value to a ``float``.

    String inputs are expected to use '.' as the thousands separator and ','
    as the decimal separator, optionally with a '%' sign, e.g. ``'1.098,10'``
    or ``'29,54%'``. The percent sign is only removed; the number is not
    divided by 100.

    Parameters
    ----------
    x : str or number
        Raw cell value. Anything that is not a string (an already-parsed
        number, or the float NaN pandas uses for a missing cell) is handed
        directly to ``float``.

    Returns
    -------
    float
        The parsed value (NaN stays NaN).

    Raises
    ------
    ValueError
        If a string input is still not a valid float after the separators
        and percent sign have been normalised.
    """
    if not isinstance(x, str):
        # Non-string cells have no .replace(); converting them directly keeps
        # numeric and missing values from crashing the whole column clean-up.
        return float(x)

    # Order matters: drop thousands dots first, then turn the decimal comma
    # into a dot, otherwise the decimal point would be removed as well.
    x = x.replace('.', '').replace(',', '.').replace('%', '')
    return float(x)

def train_model(model, train_data, train_label):
    """Fit ``model`` on the training set and hand the same object back.

    Parameters
    ----------
    model : object exposing ``fit(X, y)``
        Estimator to train; it is fitted in place.
    train_data : array-like
        Training features, forwarded unchanged to ``fit``.
    train_label : array-like
        Training targets, forwarded unchanged to ``fit``.

    Returns
    -------
    The very object that was passed in as ``model`` (whatever ``fit`` itself
    returns is ignored).
    """
    fitted = model
    fitted.fit(train_data, train_label)
    return fitted

def perform_evaluation(model, test_data, test_label):
    """Score a fitted model on held-out data.

    Parameters
    ----------
    model : object exposing ``predict(X)``
        Already-fitted estimator.
    test_data : array-like
        Features to predict on.
    test_label : array-like
        True targets that the predictions are compared against.

    Returns
    -------
    dict
        ``{'mse': ..., 'mae': ...}`` holding the mean squared error and the
        mean absolute error of the predictions.
    """
    predicted = model.predict(test_data)
    # Keep MSE first so the returned dict has the same key order as before.
    scorers = (('mse', mean_squared_error), ('mae', mean_absolute_error))
    return {label: scorer(test_label, predicted) for label, scorer in scorers}

def build_metrics(model_name, results, metrics):
    """Append one evaluation's scores to the running log for ``model_name``.

    Parameters
    ----------
    model_name : hashable
        Key of the model inside ``metrics``.
    results : dict
        Must contain the keys ``'mse'`` and ``'mae'``.
    metrics : dict
        Maps each model name to ``{'mse': list, 'mae': list}``; the two lists
        are extended in place. Nothing is returned.
    """
    # MSE is recorded before MAE, mirroring the order of the score lists.
    for key in ('mse', 'mae'):
        metrics[model_name][key].append(results[key])

In [9]:
# "Hardcore" way: drive the time-series cross-validation by hand, fitting and
# scoring every model on each split.

# Features are every column except the target, 'ProdIndustrial' and the
# 'Data' date string; the target is the exchange-rate column.
Y = df['Cambio (R$ / U$$)']
X = df.drop(['Cambio (R$ / U$$)', 'ProdIndustrial', 'Data'], axis=1)

# Text columns hold locale-formatted numbers; parse them into floats and leave
# the columns pandas already read as numeric untouched.
for col in X.columns:
    if X[col].dtype != 'object':
        continue
    X[col] = X[col].apply(clean_numeric_data)

# The target is parsed unconditionally (a log transform was tried and is
# currently disabled).
Y = Y.apply(clean_numeric_data)

# Each model is keyed by its class name. LinearRegression() and MLPRegressor()
# are currently left out of the comparison.
# NOTE(review): SVR and KNeighborsRegressor receive unscaled features here --
# confirm that this is intended.
models = {
    estimator.__class__.__name__: estimator
    for estimator in [
        SVR(),
        RandomForestRegressor(random_state=42),
        KNeighborsRegressor(),
        DecisionTreeRegressor(random_state=42),
    ]
}

# 20 splits with a single test row each.
tscv = TimeSeriesSplit(n_splits=20, test_size=1)

# One pair of per-split score lists for every model.
metrics = {name: {'mse': [], 'mae': []} for name in models}

for train_index, test_index in tscv.split(X):
    X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
    X_test, Y_test = X.iloc[test_index], Y.iloc[test_index]

    for name, model in models.items():
        # The same estimator object is re-fitted on every split.
        model = train_model(model, X_train.values, Y_train.values)
        results = perform_evaluation(model, X_test.values, Y_test.values)
        build_metrics(name, results, metrics)



In [11]:
# Collapse the per-split score lists into one summary per model. RMSE is the
# square root of the mean MSE, not the mean of per-split roots.
summary = {}
for name in models:
    fold_scores = metrics[name]
    mean_mse = np.mean(fold_scores['mse'])
    summary[name] = {
        'mse': mean_mse,
        'mae': np.mean(fold_scores['mae']),
        'rmse': np.sqrt(mean_mse),
    }
# Rebind `metrics` to the aggregated view, as the following cell expects.
metrics = summary

In [12]:
# Tabulate the aggregated scores: one column per model, one row per metric.
metric_df = pd.DataFrame(metrics)
metric_df

Unnamed: 0,SVR,RandomForestRegressor,KNeighborsRegressor,DecisionTreeRegressor
mse,0.324113,0.086745,0.106954,0.088285
mae,0.304012,0.231855,0.2354,0.2445
rmse,0.569309,0.294525,0.327038,0.297128


In [13]:
# "Easy" way: let scikit-learn's cross_validate run the same evaluation
# (20 time-series splits, one test row each) instead of a hand-written loop.
#
# NOTE: this cell rebinds `models` (to a list) and `metrics` (to the scorer
# mapping). The aggregation cell above calls `models.items()`, so it cannot be
# re-run after this cell without first re-running the "hardcore way" cell.
models = [RandomForestRegressor(random_state=42),
          SVR(),
          KNeighborsRegressor(),
          DecisionTreeRegressor(random_state=42)]
metrics = {
    'mae': make_scorer(mean_absolute_error),
    'mse': make_scorer(mean_squared_error),
}

# The splitter, feature matrix and target do not depend on the model being
# evaluated, so they are built once instead of once per model.
tscv = TimeSeriesSplit(n_splits=20, test_size=1)

X = df.drop(['Cambio (R$ / U$$)', 'ProdIndustrial', 'Data'], axis=1)
Y = df['Cambio (R$ / U$$)']
for col in X.columns:
    # Text columns hold locale-formatted numbers; parse them into floats.
    if X[col].dtype == 'object':
        X[col] = X[col].apply(clean_numeric_data)
Y = Y.apply(clean_numeric_data)

for model in models:
    scores = cross_validate(model, X.values, Y.values, cv=tscv, scoring=metrics, return_train_score=False)

    print(model.__class__.__name__)
    # RMSE is the square root of the mean MSE across splits.
    print('MSE: {:.3f} | MAE: {:.3f} | RMSE: {:.3f}'.format(
        scores['test_mse'].mean(),
        scores['test_mae'].mean(),
        np.sqrt(scores['test_mse'].mean()),
    ))


RandomForestRegressor
MSE: 0.087 | MAE: 0.232 | RMSE: 0.295
SVR
MSE: 0.324 | MAE: 0.304 | RMSE: 0.569
KNeighborsRegressor
MSE: 0.107 | MAE: 0.235 | RMSE: 0.327
DecisionTreeRegressor
MSE: 0.088 | MAE: 0.244 | RMSE: 0.297


In [None]:
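# Predict a single instance (disabled sketch: `rf` is not defined in the cells above, so a fitted model must be bound to that name before uncommenting).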

#print(rf.predict([X_test.iloc[0]]))
#print(Y_test.iloc[0])
#X_test.iloc[0]