In [1]:
import json

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [2]:
def _read_json(path):
    """Parse the JSON document stored at ``path`` and return the decoded object."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Three parallel JSON files from the same directory. The filtering cell further
# down indexes all three with the same positions, so entry i of each file is
# expected to describe the same sample.
x_vector = _read_json('/storage/taeun8991/descriptor_model/L2M3_revision_ML/x_vector_density.json')
y_vector = _read_json('/storage/taeun8991/descriptor_model/L2M3_revision_ML/y_vector_density.json')
refcode_list = _read_json('/storage/taeun8991/descriptor_model/L2M3_revision_ML/refcode_list_density.json')

In [3]:
# Largest magnitude a feature may have and still be kept (float32 range).
# Computed once instead of once per value.
_FLOAT32_MAX = np.finfo(np.float32).max


def _is_usable_feature(value):
    """Return True if ``value`` converts to a finite float within float32 range.

    Entries that cannot be converted at all (None, nested lists, arbitrary
    text) and entries that convert to NaN or +/-inf (including the strings
    'nan' / 'inf') are rejected. Numeric strings such as '1.5' are accepted
    because the later ``astype(float)`` conversion handles them.
    """
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return False
    return bool(np.isfinite(number)) and abs(number) <= _FLOAT32_MAX


# Positions of the samples whose feature vector contains only usable values.
# The previous check compared `value == 'nan'` only after confirming the value
# was an int/float, so that comparison could never succeed and string 'nan'
# (or None) entries were silently kept.
filtered_indices = [
    i for i, sublist in enumerate(x_vector)
    if all(_is_usable_feature(value) for value in sublist)
]

# Apply the same selection to features, targets and identifiers so the three
# lists stay aligned.
# NOTE(review): only the feature vectors are validated; y_vector is not
# checked for NaN/inf here — confirm the targets are clean.
x_vector_filtered = [x_vector[i] for i in filtered_indices]
y_vector_filtered = [y_vector[i] for i in filtered_indices]
refcode_list_filtered = [refcode_list[i] for i in filtered_indices]

In [4]:
# Hold out a quarter of the filtered samples for testing; the fixed seed keeps
# the partition identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_vector_filtered,
    y_vector_filtered,
    test_size=0.25,
    random_state=42,
)


In [5]:
# Hyperparameter search for the SVR.
#
# SVR is sensitive to feature scale, and the final model further down is
# trained on standardised features. Scaling is therefore part of the searched
# estimator: inside a Pipeline the scaler is re-fitted on the training portion
# of every CV fold, so the hyperparameters are tuned on the same feature space
# the final model uses and no validation-fold statistics leak into the scaler.
#
# Grid keys carry the `svr__` prefix because they address the 'svr' step of the
# pipeline; `best_params_` reports them with the same prefix.
param_grid = {
    'svr__C': [0.1, 1, 10, 100],
    'svr__epsilon': [0.001, 0.01, 0.1, 1],
    'svr__gamma': ['scale', 'auto', 0.01, 0.1],
    'svr__kernel': ['rbf', 'poly'],
}

svm_model = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR()),
])

# 4 * 4 * 4 * 2 = 128 candidates, each evaluated with 5-fold CV on R^2.
grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1,
)

# Fit the GridSearchCV on the (unscaled) training data; the pipeline scales it.
grid_search.fit(x_train, y_train)

# Best pipeline (scaler + SVR) refitted on the whole training split. It expects
# raw, unscaled features in predict().
best_svr_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)


Fitting 5 folds for each of 128 candidates, totalling 640 fits


: 

In [None]:
# Convert every split from a Python list to a float ndarray. The right-hand
# tuple is built from the current values before any name is rebound.
x_train, y_train, x_test, y_test = (
    np.array(split).astype(float)
    for split in (x_train, y_train, x_test, y_test)
)

In [None]:
def _report_metrics(label, y_true, y_pred):
    """Print and return the regression metrics for one data split.

    Args:
        label: Prefix for the printed lines (e.g. "Train" or "Test").
        y_true: Reference target values.
        y_pred: Model predictions for the same samples.

    Returns:
        Tuple ``(mse, r2, mae)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    print(f"{label} Mean Squared Error:", mse)
    print(f"{label} R-squared (R2):", r2)
    print(f"{label} Mean Absolute Error (MAE):", mae)
    return mse, r2, mae


# Standardise the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split. The splits are named x_train /
# x_test (lower case); the previous X_train / X_test were never defined and
# raised NameError.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)

# Fixed hyperparameters.
# NOTE(review): these are hard-coded rather than read from
# grid_search.best_params_ — presumably copied from an earlier search run;
# confirm they are still the intended values.
svm_model_exp = SVR(kernel='rbf', C=10, epsilon=0.01, gamma='auto')

# Train on the scaled training set.
svm_model_exp.fit(X_train_scaled, y_train)

# Predict and evaluate on the training set.
y_train_pred_exp = svm_model_exp.predict(X_train_scaled)
train_mse, train_r2, train_mae = _report_metrics("Train", y_train, y_train_pred_exp)

# Predict and evaluate on the held-out test set.
y_test_pred_exp = svm_model_exp.predict(X_test_scaled)
test_mse, test_r2, test_mae = _report_metrics("Test", y_test, y_test_pred_exp)
