In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle
import seaborn as sns

# Notebook-wide display settings: show every DataFrame column, use the
# FiveThirtyEight plot theme, and render figures at 150 dpi.
pd.set_option("display.max_columns", None)
plt.style.use("fivethirtyeight")
plt.rcParams.update({"figure.dpi": 150})

# Load the pickled dataset. The context manager guarantees the file handle is
# closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
# X_ivs is unpacked here but is not used by the cells shown in this notebook.
with open("../data/drd2_data.pickle", "rb") as data_file:
    X_train, X_ivs, y_train, col_names = pickle.load(data_file)

X_train = pd.DataFrame(X_train, columns=col_names)

# Flag repeated feature rows once (the first occurrence is kept) and apply the
# same mask to both the features and the targets so they stay aligned.
# Filtering with the mask a single time avoids the "Boolean Series key will be
# reindexed" warning caused by masking an already de-duplicated frame.
dup_mask = X_train.duplicated()
X_train = X_train.loc[~dup_mask]

# Use a plain boolean array so the targets are selected positionally.
y_train = y_train[~dup_mask.to_numpy()]

assert len(X_train) == len(y_train), "features and targets are misaligned"

  X_train = X_train[~dup_mask]


In [3]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    make_scorer,
    r2_score,
    mean_squared_error,
    explained_variance_score,
)
from sklearn.svm import SVR

# Columns whose name contains a capital "D" are min-max scaled; every other
# column is passed through to the model untouched.
D_cols = [name for name in X_train.columns if "D" in name]

ct = ColumnTransformer(
    transformers=[("scale", MinMaxScaler(), D_cols)],
    remainder="passthrough",
)
pipe = Pipeline(steps=[("ct", ct), ("svm", SVR())])

# Scorers evaluated for every candidate.
# NOTE(review): make_scorer defaults to greater_is_better=True, so "mse" is
# reported as a raw positive value and its rank_test_mse column favours larger
# errors. Model selection is unaffected because refit uses "evs".
metrics = dict(
    r2=make_scorer(r2_score),
    mse=make_scorer(mean_squared_error),
    evs=make_scorer(explained_variance_score),
)

# RBF-kernel search over the regularisation strength C and the kernel width gamma.
param_grid = dict(
    [
        ("svm__C", [0.1, 1, 10, 100]),
        ("svm__gamma", [0.1, 1, 10, 100]),
        ("svm__kernel", ["rbf"]),
    ]
)

# 5-fold search using all cores; the best candidate by explained variance is
# refit on the full training data, and train-fold scores are kept as well.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring=metrics,
    refit="evs",
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)

gs_results = gs.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [4]:
# Turn the cross-validation report into a table (one row per candidate) and
# write it to disk.
gs_df = pd.DataFrame(data=gs_results.cv_results_)
gs_df.to_csv(path_or_buf="../data/svm_gridsearch_results.csv")

In [9]:
# Display only the searched hyper-parameters and the mean test/train value of
# each metric for every candidate.
gs_df.loc[
    :,
    [
        "param_svm__C",
        "param_svm__gamma",
        "param_svm__kernel",
        "mean_test_evs",
        "mean_train_evs",
        "mean_test_r2",
        "mean_train_r2",
        "mean_test_mse",
        "mean_train_mse",
    ],
]

Unnamed: 0,param_svm__C,param_svm__gamma,param_svm__kernel,mean_test_evs,mean_train_evs,mean_test_r2,mean_train_r2,mean_test_mse,mean_train_mse
0,0.1,0.1,rbf,0.097616,0.505805,0.095099,0.504659,0.0693,0.037959
1,0.1,1.0,rbf,0.013837,0.452277,0.011518,0.451369,0.075689,0.042043
2,0.1,10.0,rbf,0.010616,0.450922,0.008254,0.450006,0.075939,0.042147
3,0.1,100.0,rbf,0.004281,0.449002,0.0019,0.448129,0.076424,0.042291
4,1.0,0.1,rbf,0.183497,0.893726,0.182134,0.893706,0.062635,0.008145
5,1.0,1.0,rbf,0.022519,0.892419,0.020947,0.892346,0.074961,0.008249
6,1.0,10.0,rbf,0.018122,0.893656,0.016549,0.893582,0.075299,0.008155
7,1.0,100.0,rbf,0.009663,0.894227,0.008145,0.894154,0.075943,0.008111
8,10.0,0.1,rbf,0.183611,0.894235,0.182239,0.894215,0.062627,0.008106
9,10.0,1.0,rbf,0.022707,0.893834,0.021102,0.893757,0.074949,0.008141


O melhor valor de `gamma` é 0.1: para cada valor de C mostrado na tabela, é o que obtém o maior EVS de teste.

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    make_scorer,
    r2_score,
    mean_squared_error,
    explained_variance_score,
)
from sklearn.svm import SVR

# Same search set-up as the earlier grid search, but the smallest gamma
# candidate is 0.01 instead of 0.1.
# Columns whose name contains a capital "D" are min-max scaled; every other
# column is passed through to the model untouched.
D_cols = [name for name in X_train.columns if "D" in name]

ct = ColumnTransformer(
    transformers=[("scale", MinMaxScaler(), D_cols)],
    remainder="passthrough",
)
pipe = Pipeline(steps=[("ct", ct), ("svm", SVR())])

# Scorers evaluated for every candidate.
# NOTE(review): make_scorer defaults to greater_is_better=True, so "mse" is
# reported as a raw positive value and its rank_test_mse column favours larger
# errors. Model selection is unaffected because refit uses "evs".
metrics = dict(
    r2=make_scorer(r2_score),
    mse=make_scorer(mean_squared_error),
    evs=make_scorer(explained_variance_score),
)

# RBF-kernel search over C and gamma, with a lower gamma candidate.
param_grid = dict(
    [
        ("svm__C", [0.1, 1, 10, 100]),
        ("svm__gamma", [0.01, 1, 10, 100]),
        ("svm__kernel", ["rbf"]),
    ]
)

# 5-fold search using all cores; the best candidate by explained variance is
# refit on the full training data, and train-fold scores are kept as well.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring=metrics,
    refit="evs",
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)

gs_results = gs.fit(X=X_train, y=y_train)

In [11]:
from sklearn.model_selection import cross_val_score

# Baseline: RBF-kernel SVR with its remaining hyper-parameters left at the
# library defaults, scored by explained variance over 5 folds.
pipe = Pipeline(steps=[("ct", ct), ("svm", SVR(kernel="rbf"))])

scores = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring="explained_variance",
    cv=5,
)

In [13]:
# Per-fold explained-variance scores of the RBF-kernel pipeline.
scores

array([0.64579916, 0.65668423, 0.6552194 , 0.66708453, 0.63679223])

In [21]:
# Same evaluation as the RBF baseline, but with a linear kernel.
# Note that this rebinds `pipe`, so later cells see the linear pipeline.
pipe = Pipeline(steps=[("ct", ct), ("svm", SVR(kernel="linear"))])

scores_lin = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring="explained_variance",
    cv=5,
)

In [22]:
# Per-fold explained-variance scores of the linear-kernel pipeline.
scores_lin

array([0.3353282 , 0.37365456, 0.43913217, 0.4047731 , 0.36088806])

O kernel linear não é muito bom: o EVS fica entre 0.34 e 0.44 nos cinco folds, contra cerca de 0.65 com o kernel RBF.

In [29]:
# Search over the epsilon-tube width and two gamma settings for the RBF kernel.
# `pipe` is whatever pipeline was defined last; its kernel is overridden by the
# "svm__kernel" entry below, so every candidate uses the RBF kernel.
# NOTE(review): epsilon values of 1 and above may be wider than the spread of
# the target (test MSE was about 0.076 where test R2 was near 0 in the earlier
# search) -- confirm the range is meaningful.
param_grid = dict(
    [
        ("svm__epsilon", [0.1, 1, 10, 100]),
        ("svm__gamma", [0.1, "scale"]),
        ("svm__kernel", ["rbf"]),
    ]
)

# 5-fold search using all cores; the best candidate by explained variance is
# refit on the full training data, and train-fold scores are kept as well.
gs_rbf = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring=metrics,
    refit="evs",
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)

gs_rbf_results = gs_rbf.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [None]:
# Turn the epsilon/gamma search report into a table (one row per candidate)
# and write it to disk.
svm_rbf = pd.DataFrame(data=gs_rbf_results.cv_results_)
svm_rbf.to_csv(path_or_buf="../data/svm_rbf_gridsearch_results.csv")