In [1]:
import pandas as pd
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
import numpy as np
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import KernelPCA

test = pd.read_csv('data/test.csv')
target_ids = np.array(test['ID']).flatten()

X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv').squeeze()
X_test = pd.read_csv('X_test.csv')

###########################################################
#apply boxcox to y_train
#box-cox for target skewness (svr will benefit from it), first we make the data positive
if (y_train.min() <= 0):
    y_shift = y_train - y_train.min() + 1  # Shift to make all values positive
else:
    y_shift = y_train

y_train_boxcox, lambda_bc = boxcox(y_shift)

In [2]:
from sklearn.utils import resample
X_train_sample = resample(X_train, n_samples=10000, random_state=123)
kpca = KernelPCA(kernel='rbf', n_components=25)
kpca.fit(X_train_sample)
X_train_kpca = kpca.transform(X_train)
X_test_kpca = kpca.transform(X_test)

print('KPCA tranformed')

KPCA tranformed


In [6]:
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'C': uniform(0.01, 10),         # Random values between 0.1 and 1000
    'gamma': ['auto'],      # Random values between 0.001 and 1
    'epsilon': uniform(0.001, 0.1),   # Random values between 0.01 and 0.5
    'kernel': ['rbf']
}

# Define the model and RandomizedSearchCV
svr = SVR()
random_search = RandomizedSearchCV(
    estimator=svr,
    param_distributions=param_distributions,
    n_iter=8,                        # Number of parameter combinations to try
    scoring='r2', # Replace with your desired metric
    cv=5,                             # 5-fold cross-validation
    verbose=3,
    random_state=69,
    n_jobs=-1
)
# Fit to training data
random_search.fit(X_train_kpca, y_train_boxcox)

# Retrieve the best parameters, best score, and best model
best_params = random_search.best_params_
best_score = random_search.best_score_  # This is the mean CV score of the best model
best_model = random_search.best_estimator_

print("Best Parameters:", best_params)
print("Best Cross-Validation Score (r2):", best_score)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 3/5] END C=3.512525252234114, epsilon=0.07994092564308926, gamma=auto, kernel=rbf;, score=0.684 total time= 1.0min
[CV 2/5] END C=3.512525252234114, epsilon=0.07994092564308926, gamma=auto, kernel=rbf;, score=0.678 total time= 1.0min
[CV 5/5] END C=2.9724916167243354, epsilon=0.08190677156733267, gamma=auto, kernel=rbf;, score=0.692 total time= 1.0min
[CV 4/5] END C=2.9724916167243354, epsilon=0.08190677156733267, gamma=auto, kernel=rbf;, score=0.676 total time= 1.0min
[CV 3/5] END C=2.9724916167243354, epsilon=0.08190677156733267, gamma=auto, kernel=rbf;, score=0.682 total time= 1.0min
[CV 1/5] END C=3.512525252234114, epsilon=0.07994092564308926, gamma=auto, kernel=rbf;, score=0.697 total time= 1.1min
[CV 1/5] END C=2.9724916167243354, epsilon=0.08190677156733267, gamma=auto, kernel=rbf;, score=0.696 total time= 1.1min
[CV 2/5] END C=2.9724916167243354, epsilon=0.08190677156733267, gamma=auto, kernel=rbf;, score=0.677 to