In [3]:
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures


def load_data(filename):
    """
    Load an array with ``np.load`` and pair it with its element-wise square.

    :param filename: file path (or file-like object) passed straight to ``np.load``
    :return: data, labels -- the loaded array and ``np.square`` of that array
    """
    samples = np.load(filename)
    targets = np.square(samples)
    return samples, targets


def mse_relative(y_true, y_pred):
    """
    Average per-sample relative squared error.

    For every sample ``i`` (a slice along the last axis) the score is
    ``||y_true[i] - y_pred[i]||**2 / ||y_true[i]||**2``; the returned value is
    the mean of these ratios.  Samples whose target has zero norm are skipped
    because the ratio is undefined for them.

    :param y_true: array-like of targets; 1-D input is treated as one sample
    :param y_pred: array-like of predictions with the same shape as ``y_true``
    :return: average relative mse (``nan`` when every target has zero norm)
    """
    y_true = np.atleast_2d(y_true)
    y_pred = np.atleast_2d(y_pred)

    # Mask whole samples, not individual entries: an element-wise boolean mask
    # flattens the arrays, which collapses the per-sample norms into a single
    # global ratio and ignores the error wherever a target entry is zero.
    true_norms = np.linalg.norm(y_true, axis=-1)
    valid = true_norms > 0
    if not np.any(valid):
        # No sample has a usable denominator.
        return np.nan

    error_norms = np.linalg.norm(y_true - y_pred, axis=-1)
    return np.mean(np.square(error_norms[valid] / true_norms[valid]))


# Load the toy data set and hold out 20% of the samples for the final test.
X, Y = load_data('toy_quadratic_data_iter_10000_order_10.npy')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Framework Pipeline: scaling -> PCA -> degree-2 polynomial features -> ridge.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.999)),
    ('poly_features', PolynomialFeatures(degree=2)),
    ('regression', Ridge()),
])

# Parameters for Grid Search: only the ridge penalty is tuned.
param_grid = {'regression__alpha': [1, 2]}

# Custom scorer for grid search. greater_is_better=False makes scikit-learn
# negate the metric, which is why best_score_ is negated again when printed.
custom_scorer = make_scorer(mse_relative, greater_is_better=False)

# Grid search CV with 5 folds on the training split.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=custom_scorer,
    cv=5,
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation relative MSE score:", -grid_search.best_score_)

# Evaluate on test set using the best model (refit by GridSearchCV).
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
relative_mse = mse_relative(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
print(f"Relative Mean Squared Error: {relative_mse}")
print(f"Test Mean Squared Error: {mse}")

# Plot the true and predicted values of test sample 100 on one figure.
plt.figure(figsize=(10, 5))
plt.plot(y_test[100], label='True Values')
plt.plot(predictions[100], linestyle='--', label='Predicted Values')
plt.legend()
plt.title("Best Polynomial Regression with PCA from Grid Search")
plt.show()

# NOTE(review): x_train_scaled, x_test_scaled, y_train_scaled and scaler_y are
# not defined in this cell; presumably they come from an earlier notebook
# cell -- confirm before running on a fresh kernel.

# Median Euclidean norm of the scaled training samples; the gamma search range
# below is expressed in terms of this value.
median_dist = np.median(np.sqrt(np.sum(np.square(x_train_scaled), axis=1)))

# Define parameter space for the RBF kernel.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], so:
#   alpha is drawn from [0.01, 10.01]
#   gamma is drawn from [1/(5*m)**2, 1/(5*m)**2 + 1/m**2] with m = median_dist
param_dist = {
    'alpha': stats.uniform(0.01, 10),
    'gamma': stats.uniform((1/(5*median_dist)**2), 1/(median_dist)**2),  # Relevant for RBF kernel
}

# RandomizedSearchCV for hyperparameter tuning.
# The scorer must wrap the metric *function* mse_relative: the name
# `relative_mse` was bound to the float test score earlier in this cell, and
# passing that float to make_scorer makes scoring fail as soon as it is called.
rbf_krr = RandomizedSearchCV(KernelRidge(kernel='rbf'), param_distributions=param_dist,
                             n_iter=20, cv=10, scoring=make_scorer(mse_relative, greater_is_better=False))
rbf_krr.fit(x_train_scaled, y_train_scaled)
# Map the predictions back to the original target scale before plotting.
rbf_pred = scaler_y.inverse_transform(rbf_krr.predict(x_test_scaled))

print("Best RBF parameters:", rbf_krr.best_params_)

# Visualizing the results
idx = 0  # Change this index to visualize different samples
plt.figure(figsize=(12, 5))
plt.plot(y_test[idx], label='Actual at t=1', linestyle='-', marker='o')
plt.plot(rbf_pred[idx], label='RBF Predicted at t=1', linestyle='--', marker='x')
plt.title('Comparison of Actual vs. Predicted at t=1')
plt.xlabel('Spatial Dimension')
plt.ylabel('Solution Value')
plt.legend()
plt.show()


(8000, 1000)
(8000, 1000)


KeyboardInterrupt: 