# 🚀 Fast NuSVM Regression with Intel(R) Extension for Scikit-learn


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the competition files: training data, test data and the submission template.
train = pd.read_csv('../input/tabular-playground-series-jul-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-jul-2021/test.csv')
sample_submission = pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv')

# The three columns to predict; everything else except the timestamp is a feature.
target_columns = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
non_feature_columns = ['date_time'] + target_columns

y_train = train[target_columns]
x_train = train.drop(columns=non_feature_columns)
x_test = test.drop(columns=['date_time'])

# Displayed as the cell output: (train features, test features, train targets) shapes.
x_train.shape, x_test.shape, y_train.shape

# 🔨 Installing Intel(R) Extension for Scikit-learn

Use the [Intel(R) Extension for Scikit-learn](https://github.com/intel/scikit-learn-intelex) to speed up Scikit-learn estimators.

In [None]:
!pip install scikit-learn-intelex -q --progress-bar off

In [None]:
# Enable the Intel(R) Extension for Scikit-learn by patching scikit-learn.
# NOTE(review): this cell runs before every `sklearn` import in the notebook;
# presumably that ordering is required for the patched estimators to be picked
# up — confirm against the sklearnex documentation before reordering cells.
from sklearnex import patch_sklearn
patch_sklearn()

# ✏️ Preprocessing

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Features: fit the min-max scaler on the training features only, then apply
# the same fitted scaler to the test features.
# NOTE: x_train / x_test / y_train are rebound to their scaled versions, so
# re-running this cell without re-running the loading cell scales them twice.
transformer = MinMaxScaler()
x_train = transformer.fit_transform(x_train)
x_test = transformer.transform(x_test)

# Targets: scale without centering (with_mean=False). `scaler` is kept so the
# predictions can be mapped back with inverse_transform later.
scaler = StandardScaler(with_mean=False)
y_train = scaler.fit_transform(y_train)

# 🔍 Defining the model and the parameter space for the optimal-model search

Since the search turned out to be very large, it is not run here: the `study.optimize(...)` call below is commented out.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the (already scaled) training data for validating the
# hyperparameter search; the fixed random_state keeps the split reproducible.
x_train_sub, x_val, y_train_sub, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

def get_loss(y1, y2):
    """Return the mean squared error between two sets of values.

    Both arguments are forwarded positionally, in this order, to
    ``sklearn.metrics.mean_squared_error``.

    Args:
        y1: First array-like of values (passed as the first argument).
        y2: Second array-like of values (passed as the second argument).

    Returns:
        Whatever ``mean_squared_error(y1, y2)`` returns.
    """
    # Imported lazily, as in the rest of this notebook's helper functions.
    from sklearn import metrics

    mse = metrics.mean_squared_error(y1, y2)
    return mse

def objective(trial):
    """Optuna objective: validation loss of a NuSVR regressor chain.

    Samples ``C``, ``nu`` and ``kernel`` from ``trial``, fits a
    ``RegressorChain`` of ``NuSVR`` models on the module-level
    ``x_train_sub`` / ``y_train_sub`` split and scores it on
    ``x_val`` / ``y_val`` with ``get_loss``.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The loss computed by ``get_loss(y_val, predictions)``; the study is
        expected to minimize it.
    """
    from sklearn.multioutput import RegressorChain
    from sklearn.svm import NuSVR

    params_svm = {
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform; the sampled range is unchanged.
        'C': trial.suggest_float('C', 1e-3, 1e3, log=True),
        # NOTE(review): NuSVR documents nu as lying in (0, 1]; the lower bound
        # of 0.0 is kept so earlier search results stay reproducible, but an
        # exact 0.0 sample would be rejected by the estimator.
        'nu': trial.suggest_float('nu', 0.0, 1.0),
        'kernel': trial.suggest_categorical("kernel", ["rbf"]),
    }

    chain = RegressorChain(NuSVR(**params_svm), random_state=34)
    chain.fit(x_train_sub, y_train_sub)
    val_pred = chain.predict(x_val)
    return get_loss(y_val, val_pred)

import optuna

# Seeded TPE sampler so the sequence of suggested hyperparameters is repeatable.
tpe_sampler = optuna.samplers.TPESampler(seed=123)
# NOTE(review): `objective` never reports intermediate values, so presumably
# this pruner never actually stops a trial early — confirm before relying on it.
hyperband_pruner = optuna.pruners.HyperbandPruner()

study = optuna.create_study(
    direction="minimize",
    sampler=tpe_sampler,
    pruner=hyperband_pruner,
)
# The search itself is left disabled; uncomment to run it.
# study.optimize(objective, n_trials=40, show_progress_bar=True)

# 🔥 NuSVM with best parameters

In [None]:
%%time
from sklearn.multioutput import RegressorChain
from sklearn.svm import NuSVR

params_svm = {
    'C': 15.094374246471325,
    'nu':  0.28613933495037946,
    'kernel': 'rbf',
}

clf = RegressorChain(NuSVR(**params_svm), random_state=34).fit(x_train, y_train)
y_pred = clf.predict(x_test)

# 🎯 Submit result

In [None]:
# Map the predictions back to the original target scale and replace negative
# values with 0.
# NOTE: y_pred is rebound here, so re-running this cell on its own applies the
# inverse transform a second time.
restored = scaler.inverse_transform(y_pred)
y_pred = np.where(restored < 0, 0, restored)

In [None]:
# Column i of y_pred holds the predictions for the i-th target listed here.
submission_targets = ('target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides')
for col_idx, col_name in enumerate(submission_targets):
    sample_submission[col_name] = y_pred[:, col_idx]

# Write the submission file without the index and preview the first rows.
sample_submission.to_csv('submission_base.csv', index=False)
sample_submission.head()