In [1]:
from json import dump, load
import numpy as np
import pandas as pd
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge

X = read_csv('features-bulk.csv')
y = read_csv('target-bulk.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_standardized = scaler.fit_transform(X_train)
X_test_standardized = scaler.transform(X_test)

def calculate_r2(y_true, pred_y):
    
    #Calculate the R-squared score.
    mean_y_true = sum(y_true) / len(y_true)
    total_sum_of_squares = 0
    for y_i in y_true:
       total_sum_of_squares += (y_i - mean_y_true) ** 2
    # Check if total_sum_of_squares is zero
    if total_sum_of_squares == 0:
        r2 = 0  # or set to another appropriate value
    else:
        residual_sum_of_squares = 0
        for y_j, y_pred_j in zip(y_true, pred_y):
            residual_sum_of_squares +=((y_j - y_pred_j) ** 2 )
            r2 = 1 - (residual_sum_of_squares / total_sum_of_squares)
    return r2



calculating the R-squared (linear_r2) using predictions on both the training and test sets helps in assessing the model's overall performance, including its ability to generalize to new, unseen data. In practice, it's common to evaluate models on both training and test sets to understand their behavior and identify potential issues like overfitting or underfitting.
 
The coefficient of determination (R-squared) is a measure that indicates the proportion of the variance in the dependent variable that is predictable from the independent variables. It ranges from 0 to 1, where:

R-squared close to 1 indicates that a large proportion of the variance in the dependent variable is explained by the independent variables, suggesting a good fit.
R-squared close to 0 indicates that the model does not explain much of the variance in the dependent variable, suggesting a poor fit.
However, there isn't a universally agreed-upon threshold for what constitutes a "good" R-squared value, as it depends on the context and the nature of the data. In general terms:

0.7 to 1.0: A high R-squared value. The model is a good fit, and a large proportion of the variance is explained.
0.5 to 0.7: A moderate R-squared value. The model explains a moderate amount of the variance.
Below 0.5: A low R-squared value. The model might not be explaining much of the variance, and its predictive power might be limited.
However, it's crucial to interpret R-squared in the context of the specific problem and the nature of the data. R-squared alone doesn't provide information about the correctness of the model specification, the significance of individual predictors, or the presence of outliers.

Always consider other metrics and aspects of model performance in conjunction with R-squared, and be cautious about overfitting or relying solely on one metric for model evaluation. Additionally, when comparing models, it's often helpful to use domain-specific knowledge and assess the model's performance against a baseline or other relevant benchmarks.

# Linear Least Square Model


In [2]:
standardized_model = LinearRegression()
standardized_model.fit(X_train_standardized, y_train)
standardized_pred_y = standardized_model.predict(X_train_standardized)
'''evaluate the model's performance on the training data itself, 
providing insights into how well the model fits the training set.'''
standardized_y_true = y_test['K_VRH']
Standardized_linear_r2 = calculate_r2(standardized_y_true, standardized_pred_y)
print("Standardized Linear Regression R-squared:", Standardized_linear_r2[0])

non_standardized_model = LinearRegression()
non_standardized_model.fit(X_train, y_train)
non_standardized_pred_y = non_standardized_model.predict(X_train)
'''evaluate the model's performance on the training data itself, 
providing insights into how well the model fits the training set.'''
non_standardized_y_true = y_test['K_VRH']
Non_standardized_linear_r2 = calculate_r2(non_standardized_y_true, non_standardized_pred_y)
print("Non-standardized Linear Regression R-squared:", Non_standardized_linear_r2[0])

Standardized Linear Regression R-squared: -0.806748500561179
Non-standardized Linear Regression R-squared: -0.7975119361750378


# Lasso Regression:
## Objective Function: 
Minimizes the sum of squared differences with the addition of the absolute values of the coefficients multiplied by a regularization parameter (L1 regularization).
## Regularization: 
Encourages sparsity in the coefficient values, leading to some coefficients being exactly zero.
## Outcome: 
Can be effective in feature selection, setting some coefficients to zero and thus excluding irrelevant features.
## Resulting Coefficients: 
Tends to yield sparse coefficient vectors.

In [11]:
lasso_params = {'alpha': [0.01, 0.1, 1, 10]}
standardized_lasso_grid = GridSearchCV(Lasso(max_iter=100000), lasso_params, cv=5)
standardized_lasso_grid.fit(X_train_standardized, y_train)
y_true = y_test['K_VRH']
standardized_lasso_y_pred = standardized_lasso_grid.predict(X_train_standardized)
Standardized_lasso_r2 = calculate_r2(y_true, standardized_lasso_y_pred)
print("Standardized Lasso Regression R-squared:", Standardized_lasso_r2)
standardized_lasso_selector = SelectFromModel(standardized_lasso_grid.best_estimator_, max_features=10)
standardized_lasso_selector.fit(X_train_standardized, y_train)
standardized_lasso_important_features = X.columns[standardized_lasso_selector.get_support()]
print("lasso_important_features for standardized data",standardized_lasso_important_features)

Standardized Lasso Regression R-squared: -0.7758675145187031
lasso_important_features for standardized data Index(['MagpieData mean Number', 'MagpieData mean MeltingT',
       'MagpieData mean CovalentRadius', 'MagpieData mean NfValence',
       'MagpieData mean NValence', 'MagpieData minimum GSvolume_pa',
       'MagpieData mean GSvolume_pa', 'MagpieData mean GSmagmom', 'density',
       'packing fraction'],
      dtype='object')


In [12]:
non_standardized_lasso_grid = GridSearchCV(Lasso(max_iter=100000), lasso_params, cv=5)
non_standardized_lasso_grid.fit(X_train, y_train)
non_standardized_lasso_y_pred = non_standardized_lasso_grid.predict(X_train)
Non_standardized_lasso_r2 = calculate_r2(y_true, non_standardized_lasso_y_pred)
print("Non-standardized Lasso Regression R-squared:", Non_standardized_lasso_r2)
non_standardized_lasso_selector = SelectFromModel(non_standardized_lasso_grid.best_estimator_, max_features=10)
non_standardized_lasso_selector.fit(X_train, y_train)
non_standardized_lasso_important_features = X.columns[non_standardized_lasso_selector.get_support()]
print("lasso_important_features for Non-standardized data",non_standardized_lasso_important_features)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Non-standardized Lasso Regression R-squared: -0.7892461765408068
lasso_important_features for Non-standardized data Index(['MagpieData mean Number', 'MagpieData mean Row',
       'MagpieData mean Electronegativity', 'MagpieData minimum NpValence',
       'MagpieData mean NpValence', 'MagpieData minimum GSmagmom',
       'MagpieData mean GSmagmom', 'MagpieData avg_dev GSmagmom', 'density',
       'packing fraction'],
      dtype='object')


  model = cd_fast.enet_coordinate_descent(


# Ridge Regression:

## Objective Function: 
Minimizes the sum of squared differences with the addition of the squared values of the coefficients multiplied by a regularization parameter (L2 regularization).
## Regularization: 
Encourages smaller but non-zero coefficients for all features, helping to mitigate multicollinearity issues.
## Outcome: 
May not result in exact feature selection, but it can be more stable when there are highly correlated features.
## Resulting Coefficients: 
Tends to produce non-sparse coefficient vectors.

In [13]:
ridge_params = {'alpha': [0.01, 0.1, 1, 10]}
standardized_ridge_grid = GridSearchCV(Ridge(max_iter=100000), ridge_params, cv=5)
standardized_ridge_grid.fit(X_train_standardized, y_train)
standardized_ridge_y_pred = standardized_ridge_grid.predict(X_train_standardized)
Standardized_ridge_r2 = calculate_r2(y_true, standardized_ridge_y_pred)
print("Standardized Ridge Regression R-squared:", Standardized_ridge_r2[0])
standardized_ridge_selector = SelectFromModel(standardized_ridge_grid.best_estimator_, max_features=10)
standardized_ridge_selector.fit(X_train_standardized, y_train)
standardized_ridge_important_features = X.columns[standardized_ridge_selector.get_support()]
print("ridge_important_features for standardized data",standardized_ridge_important_features)

Standardized Ridge Regression R-squared: -0.7907004182189743
ridge_important_features for standardized data Index(['MagpieData maximum Number', 'MagpieData mean Number',
       'MagpieData avg_dev Number', 'MagpieData maximum AtomicWeight',
       'MagpieData mean AtomicWeight', 'MagpieData avg_dev AtomicWeight',
       'MagpieData mean Row', 'MagpieData mean CovalentRadius',
       'MagpieData mean NpValence', 'density'],
      dtype='object')


In [14]:
non_standardized_ridge_grid = GridSearchCV(Ridge(max_iter=100000), ridge_params, cv=5)
non_standardized_ridge_grid.fit(X_train, y_train)
non_standardized_ridge_y_pred = non_standardized_ridge_grid.predict(X_train)
Non_standardized_rifge_ridge_r2 = calculate_r2(y_true, non_standardized_ridge_y_pred)
print("Non-standardized Ridge Regression R-squared:", Non_standardized_rifge_ridge_r2[0])
non_standardized_ridge_selector = SelectFromModel(non_standardized_ridge_grid.best_estimator_, max_features=10)
non_standardized_ridge_selector.fit(X_train, y_train)
non_standardized_ridge_important_features = X.columns[non_standardized_ridge_selector.get_support()]
print("ridge_important_features for Non-standardized data",non_standardized_ridge_important_features)

Non-standardized Ridge Regression R-squared: -0.7898305765276765
ridge_important_features for Non-standardized data Index(['MagpieData mean Number', 'MagpieData minimum Row',
       'MagpieData maximum Row', 'MagpieData mean Row',
       'MagpieData minimum NpValence', 'MagpieData mean NpValence',
       'MagpieData mean GSmagmom', 'MagpieData avg_dev GSmagmom', 'density',
       'packing fraction'],
      dtype='object')


In [16]:
# Apply polynomial transformation to the standardized features
poly_order = 2
Standardized_poly = PolynomialFeatures(degree=poly_order)
X_poly_train = Standardized_poly.fit_transform(X_train_standardized)
X_poly_test = Standardized_poly.fit_transform(X_test_standardized)
#print(X_poly_test,X_poly_train)
print("Shape of Standardized_X_poly_train:", X_poly_train.shape)
print("Shape of Standardized_X_poly_test:", X_poly_test.shape)
a = read_csv('features-bulk.csv')
print("Shape of feature data",a.shape)

Shape of X_poly_train: (944, 10011)
Shape of X_poly_test: (237, 10011)
Shape of feature data (1181, 140)


## Standardized Polynomial features
### Lasso Polynomial Regression:

In [17]:
# Fit Lasso regression on the polynomial features
Standardized_lasso_grid_poly = GridSearchCV(Lasso(max_iter=100000), lasso_params, cv=5)
Standardized_lasso_grid_poly.fit(X_poly_train, y_train)
print(Standardized_lasso_grid_poly)
# Perform feature selection on the polynomial features
lasso_selector_poly = SelectFromModel(Standardized_lasso_grid_poly.best_estimator_, max_features=10)
lasso_selector_poly.fit(X_poly_train, y_train)
#print(lasso_selector_poly)

selected_features_mask = lasso_selector_poly.get_support()
Standardized_lasso_selected_features = X_poly_train[:, selected_features_mask]

print("Selected features based on the best estimator for Lasso Regression:")
print(Standardized_lasso_selected_features)

GridSearchCV(cv=5, estimator=Lasso(max_iter=100000),
             param_grid={'alpha': [0.01, 0.1, 1, 10]})
Selected features based on the best estimator for Lasso Regression:
[[ 0.69062311 -0.16435287  2.48523132 ...  0.03415912 -0.01154667
   0.15102762]
 [ 0.54431316 -0.61606118  2.64093718 ...  0.03845224 -0.02114642
   0.1913755 ]
 [-1.0523973   0.89507876 -1.1207621  ... -0.0587838   0.00457194
   0.44725836]
 ...
 [ 0.22835418 -0.53682844 -0.39703961 ...  0.08811707 -0.06909946
   1.00499341]
 [ 0.30366614  0.06436021  0.59346923 ...  0.0343216  -0.01670287
   0.15246784]
 [-0.1279605   1.04946008 -1.35980568 ... -0.04212789  0.06128207
   0.22971144]]


In [25]:
Standardized_lasso_poly_selected_features_indices = lasso_selector_poly.get_support(indices=True)
Standardized_lasso_poly_feature_names = Standardized_poly.get_feature_names_out(X.columns)
print("Standardized_lasso_poly_feature_names",Standardized_lasso_poly_feature_names)
Standardized_lasso_poly_selected_features_indices_str = [str(i) for i in Standardized_lasso_poly_selected_features_indices]
print("Standardized_lasso_poly_selected_features_indices_str", Standardized_lasso_poly_selected_features_indices_str)
# Map the selected feature indices back to the original feature names
Standardized_lasso_poly_selected_features_indices_original = [int(x) for x in Standardized_lasso_poly_selected_features_indices_str if int(x) < len(Standardized_lasso_poly_feature_names)]
print("Standardized_lasso_poly_selected_features_indices_original", Standardized_lasso_poly_selected_features_indices_original)
# Extract the corresponding feature names
Standardized_lasso_poly_important_features = np.array(Standardized_lasso_poly_feature_names[Standardized_lasso_poly_selected_features_indices_original]).flatten()
# Print the important features
print("Standardized_lasso_poly_important_features", Standardized_lasso_poly_important_features)

Standardized_lasso_poly_feature_names ['1' 'space_group' 'MagpieData minimum Number' ... 'vpa^2'
 'vpa packing fraction' 'packing fraction^2']
Standardized_lasso_poly_selected_features_indices_str ['23', '110', '138', '139', '3106', '5638', '9064', '9709', '9710', '10008']
Standardized_lasso_poly_selected_features_indices_original [23, 110, 138, 139, 3106, 5638, 9064, 9709, 9710, 10008]
Standardized_lasso_poly_important_features ['MagpieData mean MeltingT' 'MagpieData minimum GSvolume_pa' 'density'
 'vpa' 'MagpieData mean MeltingT vpa'
 'MagpieData mean Electronegativity vpa'
 'MagpieData mode NdUnfilled packing fraction'
 'MagpieData minimum GSbandgap vpa'
 'MagpieData minimum GSbandgap packing fraction' 'vpa^2']


### Ridge Polynomial Regression:

In [26]:
# Fit Ridge regression on the polynomial features
Standardized_poly_ridge_grid = GridSearchCV(Ridge(max_iter=100000), ridge_params, cv=5)
Standardized_poly_ridge_grid.fit(X_poly_train, y_train)
print(Standardized_poly_ridge_grid)
# Perform feature selection on the polynomial features
Standardized_poly_ridge_selector = SelectFromModel(Standardized_poly_ridge_grid.best_estimator_, max_features=10)
Standardized_poly_ridge_selector.fit(X_poly_train, y_train)
#print(ridge_selector_poly)

Standardized_poly_ridge_selected_features_mask = Standardized_poly_ridge_selector.get_support()
Standardized_poly_ridge_selected_features = X_poly_train[:, Standardized_poly_ridge_selected_features_mask]

print("Selected features based on the best estimator for Ridge Regression:")
print(Standardized_poly_ridge_selected_features)

Standardized_poly_ridge_selected_features_indices = Standardized_poly_ridge_selector.get_support(indices=True)
Standardized_ridge_poly_feature_names = Standardized_poly.get_feature_names_out(X.columns)
print("Standardized_ridge_poly_feature_names", Standardized_ridge_poly_feature_names)
Standardized_ridge_poly_selected_features_indices_str = [str(i) for i in Standardized_poly_ridge_selected_features_indices]
print("Standardized_ridge_poly_selected_features_indices_str", Standardized_ridge_poly_selected_features_indices_str)
# Map the selected feature indices back to the original feature names
Standardized_ridge_poly_selected_features_indices_original = [int(x) for x in Standardized_ridge_poly_selected_features_indices_str if int(x) < len(Standardized_ridge_poly_feature_names)]
print("Standardized_ridge_poly_selected_features_indices_original", Standardized_ridge_poly_selected_features_indices_original)
# Extract the corresponding feature names
Standardized_ridge_poly_important_features = np.array(Standardized_ridge_poly_feature_names[Standardized_ridge_poly_selected_features_indices_original]).flatten()
# Print the important features
print("Standardized_ridge_poly_important_features", Standardized_ridge_poly_important_features)


GridSearchCV(cv=5, estimator=Ridge(max_iter=100000),
             param_grid={'alpha': [0.01, 0.1, 1, 10]})
Selected features based on the best estimator for Ridge Regression:
[[ 0.10402081  0.69062311  0.73293489 ...  0.03415912 -0.01154667
   0.14723193]
 [-0.06304014  0.54431316  0.73293489 ...  0.03845224 -0.02114642
   0.16573605]
 [-1.38253848 -1.0523973  -0.58237206 ... -0.0587838   0.00457194
  -0.25336876]
 ...
 [-0.33486813  0.22835418  0.31613877 ...  0.08811707 -0.06909946
  -0.23428519]
 [-0.31475238  0.30366614  0.48202221 ...  0.0343216  -0.01670287
   0.06095542]
 [-0.48116444 -0.1279605   0.31613877 ... -0.04212789  0.06128207
   0.20987216]]
Standardized_ridge_poly_feature_names ['1' 'space_group' 'MagpieData minimum Number' ... 'vpa^2'
 'vpa packing fraction' 'packing fraction^2']
Standardized_ridge_poly_selected_features_indices_str ['21', '23', '25', '138', '139', '140', '9708', '9709', '9710', '9778']
Standardized_ridge_poly_selected_features_indices_original [21,

# Decision tree regressor
Three common methods for modification or boosting are Adaboost, Gradient Boosting, and Hist-Boost (a variant of Gradient Boosting). Each method has its characteristics, and the choice depends on the specific requirements of your task. Below, I'll provide a brief explanation of each and discuss the rationale for choosing one over the others:

## Adaboost (Adaptive Boosting):

### Explanation: 
Adaboost is an ensemble learning technique that focuses on improving the performance of weak learners (e.g., shallow decision trees) by assigning more weight to misclassified instances. It sequentially trains a series of weak models and gives more weight to misclassified instances in each iteration.
### Rationale: 
Adaboost is versatile and often effective in improving the accuracy of weak learners. It adapts well to noisy data and can be less prone to overfitting.

## Gradient Boosting:
### Explanation: 
Gradient Boosting builds an ensemble of decision trees sequentially, where each subsequent tree corrects the errors made by the previous ones. It minimizes a loss function (e.g., mean squared error for regression) using gradient descent.
### Rationale: 
Gradient Boosting is powerful and widely used. It often achieves high accuracy and is suitable for a variety of tasks. It allows fine-tuning of hyperparameters to balance model complexity and performance.
# Hist-Boost (Histogram-Based Boosting):

### Explanation: 
ist-Boost is a variant of Gradient Boosting that utilizes histogram-based techniques to speed up the training process. It constructs histograms of feature values to efficiently find the best splits during tree building.
### Rationale: 
Hist-Boost can be faster than traditional Gradient Boosting, especially when dealing with large datasets. It is efficient for datasets with a large number of samples and features.

# Choice:

If computational efficiency is a priority and you are dealing with a large dataset, Hist-Boost might be a good choice due to its faster training times.
If you prioritize interpretability and simplicity or have a smaller dataset, Adaboost or traditional Gradient Boosting could be suitable.
Consider trying all three methods and comparing their performance on your specific dataset through cross-validation.

In [None]:
tree_reg = DecisionTreeRegressor(random_state=42)

# Find optimal tree depth using GridSearchCV
param_grid = {'max_depth': np.arange(1, 21)}
grid_search_tree = GridSearchCV(tree_reg, param_grid, cv=5, scoring='r2')
grid_search_tree.fit(X_train, y_train)
optimal_tree_depth = grid_search_tree.best_params_['max_depth']

# Train the Decision Tree Regressor with the optimal depth
final_tree_reg = DecisionTreeRegressor(max_depth=optimal_tree_depth, random_state=42)
final_tree_reg.fit(X_train, y_train)

# Task 2.4.2: Train Gradient Boosted Decision Tree
gradient_boost_reg = GradientBoostingRegressor(random_state=42)

# Find optimal parameters using GridSearchCV
param_grid_gb = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}
grid_search_gb = GridSearchCV(gradient_boost_reg, param_grid_gb, cv=5, scoring='r2')
grid_search_gb.fit(X_train, y_train)

optimal_n_estimators = grid_search_gb.best_params_['n_estimators']
optimal_learning_rate = grid_search_gb.best_params_['learning_rate']

# Train the Gradient Boosted Decision Tree with optimal parameters
final_gb_reg = GradientBoostingRegressor(n_estimators=optimal_n_estimators, learning_rate=optimal_learning_rate, random_state=42)
final_gb_reg.fit(X_train, y_train)

# Store the ten most important features for both models
tree_importances = final_tree_reg.feature_importances_
gb_importances = final_gb_reg.feature_importances_

top_10_tree_features = np.argsort(tree_importances)[-10:][::-1]
top_10_gb_features = np.argsort(gb_importances)[-10:][::-1]

In [38]:
print("important_features based on DecisionTreeRegressor",X.columns[top_10_tree_features], "important_features based on GradientBoostingRegressor",X.columns[top_10_gb_features])

important_features based on DecisionTreeRegressor Index(['MagpieData mean MeltingT', 'vpa', 'MagpieData minimum MeltingT',
       'MagpieData maximum GSvolume_pa', 'MagpieData maximum MeltingT',
       'MagpieData mode NValence', 'packing fraction',
       'MagpieData mean Electronegativity', 'MagpieData mode GSvolume_pa',
       'MagpieData minimum MendeleevNumber'],
      dtype='object') important_features based on GradientBoostingRegressor Index(['MagpieData mean MeltingT', 'vpa', 'density',
       'MagpieData minimum MeltingT', 'MagpieData maximum MeltingT',
       'MagpieData minimum Column', 'MagpieData mode Column',
       'MagpieData mode GSvolume_pa', 'MagpieData maximum GSvolume_pa',
       'MagpieData mean GSvolume_pa'],
      dtype='object')


# Kernel Ridge Regressor:

In [52]:
kernel_ridge = KernelRidge()
param_grid_kernel = {'alpha': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': [0.1, 1, 10]}
# GridSearchCV to find optimal parameters
kernel_standardized = GridSearchCV(kernel_ridge, param_grid_kernel, cv=5)
kernel_standardized.fit(X_train_standardized, y_train)
kernel_pred_X = kernel_standardized.predict(X_test_standardized)
kernel_y_true = y_test['K_VRH']
kernel_y_true = kernel_standardized.predict(X_train_standardized)
kernel_r2 = calculate_r2(kernel_y_true, kernel_y_true)
print("Standardized Kernel Ridge Regression R-squared:", kernel_r2[0])
# Get the optimal hyperparameters
optimal_alpha = kernel_standardized.best_params_['alpha']
print("optimal regularization parameter =",optimal_alpha)
optimal_kernel = kernel_standardized.best_params_['kernel']
print("optimal kernel is",optimal_kernel)
optimal_gamma = kernel_standardized.best_params_['gamma']
print("optimal gamma =",optimal_gamma)


#final_kernel_reg = KernelRidge(alpha=optimal_alpha, kernel=optimal_kernel, gamma=optimal_gamma)
#final_kernel_reg.fit(X_train_standardized, y_train)

Standardized Kernel Ridge Regression R-squared: 1.0
optimal regularization parameter = 10
optimal kernel is poly
optimal gamma = 0.1


'final_kernel_reg = KernelRidge(alpha=optimal_alpha, kernel=optimal_kernel, gamma=optimal_gamma)\nfinal_kernel_reg.fit(X_train_standardized, y_train)'

In [53]:
best_score = grid_search_kernel.best_score_
print("Best Cross-Validated R-squared Score:", best_score)

Best Cross-Validated R-squared Score: 0.8934173898090536


In [50]:
kernel_non_standardized = GridSearchCV(kernel_ridge, param_grid_kernel, cv=5, scoring='r2')
kernel_non_standardized.fit(X_train, y_train)
kernel_pred_X = kernel_non_standardized.predict(X_test)
kernel_y_true = y_test['K_VRH']
kernel_y_true = kernel_non_standardized.predict(X_train)
kernel_r2 = calculate_r2(kernel_y_true, kernel_y_true)
print("Non-Standardized Kernel Ridge Regression R-squared:", kernel_r2[0])
# Get the optimal hyperparameters
optimal_alpha = kernel_non_standardized.best_params_['alpha']
print("Non-Standardized optimal regularization parameter =",optimal_alpha)
optimal_kernel = kernel_non_standardized.best_params_['kernel']
print("Non-Standardized optimal kernel is",optimal_kernel)
optimal_gamma = kernel_non_standardized.best_params_['gamma']
print("Non-Standardized optimal gamma =",optimal_gamma)


  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos",

Non-Standardized Kernel Ridge Regression R-squared: 1.0
Non-Standardized optimal regularization parameter = 1
Non-Standardized optimal kernel is linear
Non-Standardized optimal gamma = 0.1


The warning messages you're seeing are related to the ill-conditioned matrix, and they indicate that the matrix being inverted in the Ridge regression is close to being singular, which can lead to numerical instability. This can happen when the features are highly correlated or when there are redundant features.

To address this issue, you can try the following approaches:

Feature Scaling:

Ensure that your features are scaled appropriately. Standardize the features (subtract mean and divide by standard deviation) before applying Ridge regression. This can sometimes help with numerical stability.
Feature Selection:

Consider performing feature selection to remove highly correlated or redundant features before applying Ridge regression. This can help improve the condition of the matrix.
Regularization Parameter Adjustment:

Experiment with different values of the regularization parameter (alpha). A smaller alpha might help with numerical stability. You can try values like 0.01, 0.1, or other small positive values.