In [1]:
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
import matplotlib.pyplot as plt
import seaborn as sns

# Define function to select top 30 features based on Pearson correlation with age
def select_top_features(data, age_column='age', top_n=30):
    correlations = data.corr()[age_column].abs().sort_values(ascending=False)
    top_features = correlations.index[1:top_n+1].tolist()  # Exclude the age column itself
    return top_features

# Define function to convert age to age group
def age_to_group(age):
    return str((age // 10) * 10)

# Prepare data using top features
def prepare_data(data, top_features):
    X = np.ascontiguousarray(data[top_features].to_numpy(dtype=np.float32))
    y = data['age'].to_numpy(dtype=np.float32)
    return X, y

# Train models using 5-fold cross-validation
def run_cross_validation(data, top_features, n_splits=5, random_state=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    all_predictions = []
    all_true_values = []

    for fold, (train_index, test_index) in enumerate(kf.split(data)):
        print(f"Processing fold {fold+1}...")

        train_df, test_df = data.iloc[train_index], data.iloc[test_index]
        
        X_train, y_train = prepare_data(train_df, top_features)
        X_test, y_test = prepare_data(test_df, top_features)
        
        # Define the kernel: Constant * RBF (Radial Basis Function)
        kernel = C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2))
        model = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=random_state)
        
        model.fit(X_train, y_train)
        
        # Save the model
        model_filename = os.path.join(model_save_dir, f'GPR_model_fold_{fold}.joblib')
        joblib.dump(model, model_filename)
        
        # Predict on the test data
        predictions = model.predict(X_test)
        
        all_predictions.extend(predictions)
        all_true_values.extend(y_test)

    return np.array(all_true_values), np.array(all_predictions)

# Ensure the model save directory exists
model_save_dir = 'models'
os.makedirs(model_save_dir, exist_ok=True)

# Read the dataset
data = pd.read_csv('imputed_data.csv', index_col=0)

# Select top 30 features based on Pearson correlation with age
top_features = select_top_features(data, age_column='age', top_n=30)

# Run cross-validation
y_true, y_pred = run_cross_validation(data, top_features)

# Calculate errors
mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

print(f"Overall Root Mean Squared Error (RMSE): {mse**0.5:.4f}")
print(f"Overall Mean Absolute Error (MAE): {mae:.4f}")

# Calculate prediction error
error_table = pd.DataFrame({'Actual Age': y_true, 'Predicted Age': y_pred})
error_table['Prediction Error'] = error_table['Predicted Age'] - error_table['Actual Age']


Processing fold 1...
Processing fold 2...
Processing fold 3...


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


Processing fold 4...
Processing fold 5...


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


Overall Root Mean Squared Error (RMSE): 8.0822
Overall Mean Absolute Error (MAE): 4.8943
