In [1]:
import pandas as pd
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
# Load the top 30 features for each age group
top_30_features = pd.read_csv('top_30_features_all_age_group.csv', index_col=0).to_dict(orient='list')

# Ensure the age group keys are strings
top_30_features = {str(key): value for key, value in top_30_features.items()}
def split_data_by_age_cv(df, age_column='age', n_splits=5, random_state=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    age_groups = df.groupby(df[age_column] // 10 * 10)
    
    fold_data = []

    for train_index, test_index in kf.split(df):
        train_list = []
        test_list = []
        
        for _, group in age_groups:
            group_train_index = [idx for idx in train_index if idx in group.index]
            group_test_index = [idx for idx in test_index if idx in group.index]
            
            train = group.loc[group_train_index]
            test = group.loc[group_test_index]
            
            train_list.append(train)
            test_list.append(test)
        
        train_df = pd.concat(train_list, ignore_index=True)
        test_df = pd.concat(test_list, ignore_index=True)
        
        fold_data.append((train_df, test_df))
    
    return fold_data

# Example usage
imputed_data = pd.read_csv('imputed_data.csv',index_col =0)  # Replace with your actual data
fold_data = split_data_by_age_cv(imputed_data)

# Plotting the age distribution for the first fold's train and test sets


In [2]:

train_data, test_data = fold_data[0]

In [3]:
import os
import numpy as np
import pandas as pd
import faiss
from sklearn.metrics import accuracy_score, classification_report
# Define function to convert age to age group
def age_to_group(age):
    return (age // 10) * 10

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

# Prepare Test Data using top 30 features
def prepare_data(data):
    age_groups = data['age'] // 10 * 10
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features =  group_data.drop(['age','age_group'], axis=1)
        X.append(features.to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy(dtype=np.int32))
    return np.concatenate(X), np.concatenate(y)




X_train, y_train = prepare_data(train_data)
X_test, y_test = prepare_data(test_data)


# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
y_pred = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    y_pred.append(unique[np.argmax(counts)])

# Evaluate Predictions
y_pred = np.array(y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Classification Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display Prediction Errors
error_table = pd.DataFrame({'Actual Age Group': y_test, 'Predicted Age Group': y_pred})
print(error_table)

Classification Accuracy: 0.6411
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       158
          10       0.98      0.98      0.98       408
          20       0.68      0.70      0.69       139
          30       0.53      0.48      0.50       229
          40       0.55      0.68      0.61       433
          50       0.51      0.56      0.53       467
          60       0.50      0.46      0.48       360
          70       0.54      0.28      0.37       143
          80       0.40      0.05      0.09        38
          90       0.50      0.17      0.25         6
         100       0.00      0.00      0.00         1

    accuracy                           0.64      2382
   macro avg       0.56      0.49      0.50      2382
weighted avg       0.64      0.64      0.63      2382

      Actual Age Group  Predicted Age Group
0                    0                    0
1                    0                    0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_squared_error
from interpret.glassbox import ExplainableBoostingRegressor

# Load the top 30 features for each age group
top_30_features = pd.read_csv('top_30_features_all_age_group.csv', index_col=0).to_dict(orient='list')
top_30_features = {str(key): value for key, value in top_30_features.items()}

# Ensure the model save directory exists
model_save_dir = 'models'
os.makedirs(model_save_dir, exist_ok=True)

train_df, test_df = train_data, test_data

age_column = 'age'
age_groups = train_df[age_column] // 10 * 10
unique_age_groups = age_groups.unique()

models = {}
feature_importance_matrix = pd.DataFrame()

for age_group in unique_age_groups:
    group_data = train_df[age_groups == age_group]
    
    # Select the top 30 features for the current age group
    top_features = top_30_features[str(age_group)]
    X_train = group_data[top_features]
    print(X_train.shape)
    y_train = group_data[age_column]
    
    model = ExplainableBoostingRegressor(interactions=5,greedy_ratio=0,inner_bags=14)
    model.fit(X_train, y_train)
    
    # Calculate the training loss
    predictions = model.predict(X_train)
    mse = mean_squared_error(y_train, predictions)
    
    models[age_group] = model

    # Save the model
    model_filename = os.path.join(model_save_dir, f'Ideal_EBM_model_age_group_{age_group}.joblib')
    joblib.dump(model, model_filename)
    
    print(f"Age Group: {age_group}-{age_group+9}, MSE: {mse}, Model saved as: {model_filename}")

    # Save feature importances
    feature_importance = model.explain_global().data()
    feature_importance_df = pd.DataFrame(feature_importance['scores'][:30], columns=[f'Age Group {age_group}'])
    feature_importance_matrix = pd.concat([feature_importance_matrix, feature_importance_df], axis=1)



# Print the feature importance matrix
print(feature_importance_matrix)


(566, 30)
Age Group: 0.0-9.0, MSE: 0.0545810401538887, Model saved as: models\Ideal_EBM_model_age_group_0.0.joblib
(1709, 30)
Age Group: 10.0-19.0, MSE: 0.10429694656555655, Model saved as: models\Ideal_EBM_model_age_group_10.0.joblib
(586, 30)
Age Group: 20.0-29.0, MSE: 2.5203286411127617, Model saved as: models\Ideal_EBM_model_age_group_20.0.joblib
(874, 30)
Age Group: 30.0-39.0, MSE: 3.2319877928342104, Model saved as: models\Ideal_EBM_model_age_group_30.0.joblib
(1748, 30)
Age Group: 40.0-49.0, MSE: 4.188095087019111, Model saved as: models\Ideal_EBM_model_age_group_40.0.joblib
(1945, 30)
Age Group: 50.0-59.0, MSE: 5.376526024245087, Model saved as: models\Ideal_EBM_model_age_group_50.0.joblib
(1368, 30)
Age Group: 60.0-69.0, MSE: 4.982540509435378, Model saved as: models\Ideal_EBM_model_age_group_60.0.joblib
(573, 30)
Age Group: 70.0-79.0, MSE: 4.403285354980591, Model saved as: models\Ideal_EBM_model_age_group_70.0.joblib
(131, 30)
Age Group: 80.0-89.0, MSE: 2.7844903375047907, M

In [5]:
import os
import numpy as np
import pandas as pd
import joblib
import faiss
from sklearn.metrics import mean_squared_error, mean_absolute_error
train_data = train_df
test_data = test_df
# Load the top 30 features for each age group
top_30_features = pd.read_csv('top_30_features_all_age_group.csv', index_col=0).to_dict(orient='list')

# Ensure the age group keys are strings
top_30_features = {str(key): value for key, value in top_30_features.items()}

# Define function to convert age to age group
def age_to_group(age):
    return str((age // 10) * 10)

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

# Prepare Train and Test Data using top 30 features
def prepare_data(data, top_features):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = top_features[age_group]
        X.append(group_data[features].to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy())
    return np.concatenate(X), np.concatenate(y)

X_train, y_train = prepare_data(train_df, top_30_features)
X_test, y_test = prepare_data(test_df, top_30_features)

# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
predicted_age_groups = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    predicted_age_groups.append(str(unique[np.argmax(counts)]))

# Load the saved models
model_save_dir = 'models'
age_group_models = {}
age_groups = ['0.0', '10.0', '20.0', '30.0', '40.0', '50.0', '60.0', '70.0', '80.0', '90.0', '100.0']

for age_group in age_groups:
    model_filename = os.path.join(model_save_dir, f'Ideal_EBM_model_age_group_{age_group}.joblib')
    if os.path.exists(model_filename):
        age_group_models[age_group] = joblib.load(model_filename)

# Use the predicted age group to load the corresponding model and make final predictions
final_predictions = []
for i, age_group in enumerate(predicted_age_groups):
    if age_group in age_group_models:
        model = age_group_models[age_group]
        test_sample = X_test[i].reshape(1, -1)
        prediction = model.predict(test_sample)[0]
        final_predictions.append(prediction)
    else:
        final_predictions.append(np.nan)  # Handle cases where there is no model for the predicted age group

# Calculate errors
y_true = test_data['age'].to_numpy(dtype=np.float32)
y_pred = np.array(final_predictions)

mse = mean_squared_error(y_true, y_pred, squared=True)
mae = mean_absolute_error(y_true, y_pred)

print(f"Root Mean Squared Error (MSE) on Test Data: {mse**0.5:.4f}")
print(f"Mean Absolute Error (MAE) on Test Data: {mae:.4f}")

Root Mean Squared Error (MSE) on Test Data: 2.8632
Mean Absolute Error (MAE) on Test Data: 1.5932




In [2]:

train_data, test_data = fold_data[1]

In [3]:
import os
import numpy as np
import pandas as pd
import faiss
from sklearn.metrics import accuracy_score, classification_report
# Define function to convert age to age group
def age_to_group(age):
    return (age // 10) * 10

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

# Prepare Test Data using top 30 features
def prepare_data(data):
    age_groups = data['age'] // 10 * 10
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features =  group_data.drop(['age','age_group'], axis=1)
        X.append(features.to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy(dtype=np.int32))
    return np.concatenate(X), np.concatenate(y)




X_train, y_train = prepare_data(train_data)
X_test, y_test = prepare_data(test_data)


# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
y_pred = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    y_pred.append(unique[np.argmax(counts)])

# Evaluate Predictions
y_pred = np.array(y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Classification Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display Prediction Errors
error_table = pd.DataFrame({'Actual Age Group': y_test, 'Predicted Age Group': y_pred})
print(error_table)

Classification Accuracy: 0.6746
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       143
          10       0.96      0.95      0.96       458
          20       0.65      0.73      0.69       146
          30       0.61      0.48      0.54       220
          40       0.57      0.72      0.64       422
          50       0.57      0.61      0.59       465
          60       0.55      0.53      0.54       335
          70       0.54      0.31      0.40       150
          80       0.33      0.03      0.05        34
          90       1.00      0.25      0.40         8
         100       0.00      0.00      0.00         1

    accuracy                           0.67      2382
   macro avg       0.62      0.51      0.53      2382
weighted avg       0.67      0.67      0.67      2382

      Actual Age Group  Predicted Age Group
0                    0                    0
1                    0                    0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_squared_error
from interpret.glassbox import ExplainableBoostingRegressor

# Load the top 30 features for each age group
top_30_features = pd.read_csv('top_30_features_all_age_group.csv', index_col=0).to_dict(orient='list')
top_30_features = {str(key): value for key, value in top_30_features.items()}

# Ensure the model save directory exists
model_save_dir = 'models'
os.makedirs(model_save_dir, exist_ok=True)

train_df, test_df = train_data, test_data

age_column = 'age'
age_groups = train_df[age_column] // 10 * 10
unique_age_groups = age_groups.unique()

models = {}
feature_importance_matrix = pd.DataFrame()

for age_group in unique_age_groups:
    group_data = train_df[age_groups == age_group]
    
    # Select the top 30 features for the current age group
    top_features = top_30_features[str(age_group)]
    X_train = group_data[top_features]
    print(X_train.shape)
    y_train = group_data[age_column]
    
    model = ExplainableBoostingRegressor(interactions=5,greedy_ratio=0,inner_bags=14)
    model.fit(X_train, y_train)
    
    # Calculate the training loss
    predictions = model.predict(X_train)
    mse = mean_squared_error(y_train, predictions)
    
    models[age_group] = model

    # Save the model
    model_filename = os.path.join(model_save_dir, f'Ideal_EBM_model_age_group_{age_group}.joblib')
    joblib.dump(model, model_filename)
    
    print(f"Age Group: {age_group}-{age_group+9}, MSE: {mse}, Model saved as: {model_filename}")

    # Save feature importances
    feature_importance = model.explain_global().data()
    feature_importance_df = pd.DataFrame(feature_importance['scores'][:30], columns=[f'Age Group {age_group}'])
    feature_importance_matrix = pd.concat([feature_importance_matrix, feature_importance_df], axis=1)



# Print the feature importance matrix
print(feature_importance_matrix)


(581, 30)
Age Group: 0.0-9.0, MSE: 0.06602269322060604, Model saved as: models\Ideal_EBM_model_age_group_0.0.joblib
(1659, 30)
Age Group: 10.0-19.0, MSE: 0.09512053998898681, Model saved as: models\Ideal_EBM_model_age_group_10.0.joblib
(579, 30)
Age Group: 20.0-29.0, MSE: 2.4967405305797254, Model saved as: models\Ideal_EBM_model_age_group_20.0.joblib
(883, 30)
Age Group: 30.0-39.0, MSE: 3.5524457014326747, Model saved as: models\Ideal_EBM_model_age_group_30.0.joblib
(1759, 30)
Age Group: 40.0-49.0, MSE: 4.0147832700806845, Model saved as: models\Ideal_EBM_model_age_group_40.0.joblib
(1947, 30)
Age Group: 50.0-59.0, MSE: 5.613695670340837, Model saved as: models\Ideal_EBM_model_age_group_50.0.joblib
(1393, 30)
Age Group: 60.0-69.0, MSE: 5.04257782997572, Model saved as: models\Ideal_EBM_model_age_group_60.0.joblib
(566, 30)
Age Group: 70.0-79.0, MSE: 3.841320710284491, Model saved as: models\Ideal_EBM_model_age_group_70.0.joblib
(135, 30)
Age Group: 80.0-89.0, MSE: 1.2171679011484988, 

In [5]:
import os
import numpy as np
import pandas as pd
import joblib
import faiss
from sklearn.metrics import mean_squared_error, mean_absolute_error
train_data = train_df
test_data = test_df
# Load the top 30 features for each age group
top_30_features = pd.read_csv('top_30_features_all_age_group.csv', index_col=0).to_dict(orient='list')

# Ensure the age group keys are strings
top_30_features = {str(key): value for key, value in top_30_features.items()}

# Define function to convert age to age group
def age_to_group(age):
    return str((age // 10) * 10)

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

# Prepare Train and Test Data using top 30 features
def prepare_data(data, top_features):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = top_features[age_group]
        X.append(group_data[features].to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy())
    return np.concatenate(X), np.concatenate(y)

X_train, y_train = prepare_data(train_df, top_30_features)
X_test, y_test = prepare_data(test_df, top_30_features)

# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
predicted_age_groups = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    predicted_age_groups.append(str(unique[np.argmax(counts)]))

# Load the saved models
model_save_dir = 'models'
age_group_models = {}
age_groups = ['0.0', '10.0', '20.0', '30.0', '40.0', '50.0', '60.0', '70.0', '80.0', '90.0', '100.0']

for age_group in age_groups:
    model_filename = os.path.join(model_save_dir, f'Ideal_EBM_model_age_group_{age_group}.joblib')
    if os.path.exists(model_filename):
        age_group_models[age_group] = joblib.load(model_filename)

# Use the predicted age group to load the corresponding model and make final predictions
final_predictions = []
for i, age_group in enumerate(predicted_age_groups):
    if age_group in age_group_models:
        model = age_group_models[age_group]
        test_sample = X_test[i].reshape(1, -1)
        prediction = model.predict(test_sample)[0]
        final_predictions.append(prediction)
    else:
        final_predictions.append(np.nan)  # Handle cases where there is no model for the predicted age group

# Calculate errors
y_true = test_data['age'].to_numpy(dtype=np.float32)
y_pred = np.array(final_predictions)

mse = mean_squared_error(y_true, y_pred, squared=True)
mae = mean_absolute_error(y_true, y_pred)

print(f"Root Mean Squared Error (MSE) on Test Data: {mse**0.5:.4f}")
print(f"Mean Absolute Error (MAE) on Test Data: {mae:.4f}")

Root Mean Squared Error (MSE) on Test Data: 2.9176
Mean Absolute Error (MAE) on Test Data: 1.5725




In [2]:

train_data, test_data = fold_data[2]

In [3]:
import os
import numpy as np
import pandas as pd
import faiss
from sklearn.metrics import accuracy_score, classification_report
# Define function to convert age to age group
def age_to_group(age):
    return (age // 10) * 10

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

# Prepare Test Data using top 30 features
def prepare_data(data):
    age_groups = data['age'] // 10 * 10
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features =  group_data.drop(['age','age_group'], axis=1)
        X.append(features.to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy(dtype=np.int32))
    return np.concatenate(X), np.concatenate(y)




X_train, y_train = prepare_data(train_data)
X_test, y_test = prepare_data(test_data)


# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
y_pred = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    y_pred.append(unique[np.argmax(counts)])

# Evaluate Predictions
y_pred = np.array(y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Classification Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display Prediction Errors
error_table = pd.DataFrame({'Actual Age Group': y_test, 'Predicted Age Group': y_pred})
print(error_table)

Classification Accuracy: 0.6763
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       144
          10       0.96      0.97      0.97       439
          20       0.73      0.79      0.75       135
          30       0.63      0.51      0.57       220
          40       0.59      0.71      0.64       441
          50       0.55      0.62      0.58       478
          60       0.55      0.48      0.51       348
          70       0.49      0.34      0.40       137
          80       0.50      0.09      0.15        34
          90       0.50      0.17      0.25         6

    accuracy                           0.68      2382
   macro avg       0.65      0.57      0.58      2382
weighted avg       0.67      0.68      0.67      2382

      Actual Age Group  Predicted Age Group
0                    0                    0
1                    0                    0
2                    0                    0
3         

In [4]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_squared_error
from interpret.glassbox import ExplainableBoostingRegressor

# Load the top 30 features for each age group
top_30_features = pd.read_csv('top_30_features_all_age_group.csv', index_col=0).to_dict(orient='list')
top_30_features = {str(key): value for key, value in top_30_features.items()}

# Ensure the model save directory exists
model_save_dir = 'models'
os.makedirs(model_save_dir, exist_ok=True)

train_df, test_df = train_data, test_data

age_column = 'age'
age_groups = train_df[age_column] // 10 * 10
unique_age_groups = age_groups.unique()

models = {}
feature_importance_matrix = pd.DataFrame()

for age_group in unique_age_groups:
    group_data = train_df[age_groups == age_group]
    
    # Select the top 30 features for the current age group
    top_features = top_30_features[str(age_group)]
    X_train = group_data[top_features]
    print(X_train.shape)
    y_train = group_data[age_column]
    
    model = ExplainableBoostingRegressor(interactions=5,greedy_ratio=0,inner_bags=14)
    model.fit(X_train, y_train)
    
    # Calculate the training loss
    predictions = model.predict(X_train)
    mse = mean_squared_error(y_train, predictions)
    
    models[age_group] = model

    # Save the model
    model_filename = os.path.join(model_save_dir, f'Ideal_EBM_model_age_group_{age_group}.joblib')
    joblib.dump(model, model_filename)
    
    print(f"Age Group: {age_group}-{age_group+9}, MSE: {mse}, Model saved as: {model_filename}")

    # Save feature importances
    feature_importance = model.explain_global().data()
    feature_importance_df = pd.DataFrame(feature_importance['scores'][:30], columns=[f'Age Group {age_group}'])
    feature_importance_matrix = pd.concat([feature_importance_matrix, feature_importance_df], axis=1)



# Print the feature importance matrix
print(feature_importance_matrix)


(580, 30)
Age Group: 0.0-9.0, MSE: 0.042341642761241194, Model saved as: models\Ideal_EBM_model_age_group_0.0.joblib
(1678, 30)
Age Group: 10.0-19.0, MSE: 0.10885931919553064, Model saved as: models\Ideal_EBM_model_age_group_10.0.joblib
(590, 30)
Age Group: 20.0-29.0, MSE: 2.331794442086963, Model saved as: models\Ideal_EBM_model_age_group_20.0.joblib
(883, 30)
Age Group: 30.0-39.0, MSE: 3.6748995289001707, Model saved as: models\Ideal_EBM_model_age_group_30.0.joblib
(1740, 30)
Age Group: 40.0-49.0, MSE: 4.058645864204199, Model saved as: models\Ideal_EBM_model_age_group_40.0.joblib
(1934, 30)
Age Group: 50.0-59.0, MSE: 5.447007897244662, Model saved as: models\Ideal_EBM_model_age_group_50.0.joblib
(1380, 30)
Age Group: 60.0-69.0, MSE: 5.06237274529167, Model saved as: models\Ideal_EBM_model_age_group_60.0.joblib
(579, 30)
Age Group: 70.0-79.0, MSE: 3.64156622804192, Model saved as: models\Ideal_EBM_model_age_group_70.0.joblib
(135, 30)
Age Group: 80.0-89.0, MSE: 2.1349249607809737, Mo

In [5]:
import os
import numpy as np
import pandas as pd
import joblib
import faiss
from sklearn.metrics import mean_squared_error, mean_absolute_error
train_data = train_df
test_data = test_df
# Load the top 30 features for each age group
top_30_features = pd.read_csv('top_30_features_all_age_group.csv', index_col=0).to_dict(orient='list')

# Ensure the age group keys are strings
top_30_features = {str(key): value for key, value in top_30_features.items()}

# Define function to convert age to age group
def age_to_group(age):
    return str((age // 10) * 10)

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

# Prepare Train and Test Data using top 30 features
def prepare_data(data, top_features):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = top_features[age_group]
        X.append(group_data[features].to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy())
    return np.concatenate(X), np.concatenate(y)

X_train, y_train = prepare_data(train_df, top_30_features)
X_test, y_test = prepare_data(test_df, top_30_features)

# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
predicted_age_groups = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    predicted_age_groups.append(str(unique[np.argmax(counts)]))

# Load the saved models
model_save_dir = 'models'
age_group_models = {}
age_groups = ['0.0', '10.0', '20.0', '30.0', '40.0', '50.0', '60.0', '70.0', '80.0', '90.0', '100.0']

for age_group in age_groups:
    model_filename = os.path.join(model_save_dir, f'Ideal_EBM_model_age_group_{age_group}.joblib')
    if os.path.exists(model_filename):
        age_group_models[age_group] = joblib.load(model_filename)

# Use the predicted age group to load the corresponding model and make final predictions
final_predictions = []
for i, age_group in enumerate(predicted_age_groups):
    if age_group in age_group_models:
        model = age_group_models[age_group]
        test_sample = X_test[i].reshape(1, -1)
        prediction = model.predict(test_sample)[0]
        final_predictions.append(prediction)
    else:
        final_predictions.append(np.nan)  # Handle cases where there is no model for the predicted age group

# Calculate errors
y_true = test_data['age'].to_numpy(dtype=np.float32)
y_pred = np.array(final_predictions)

mse = mean_squared_error(y_true, y_pred, squared=True)
mae = mean_absolute_error(y_true, y_pred)

print(f"Root Mean Squared Error (MSE) on Test Data: {mse**0.5:.4f}")
print(f"Mean Absolute Error (MAE) on Test Data: {mae:.4f}")

Root Mean Squared Error (MSE) on Test Data: 2.1248
Mean Absolute Error (MAE) on Test Data: 1.5517




In [2]:

train_data, test_data = fold_data[3]

In [3]:
import os
import numpy as np
import pandas as pd
import faiss
from sklearn.metrics import accuracy_score, classification_report
# Define function to convert age to age group
def age_to_group(age):
    return (age // 10) * 10

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

# Prepare Test Data using top 30 features
def prepare_data(data):
    age_groups = data['age'] // 10 * 10
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features =  group_data.drop(['age','age_group'], axis=1)
        X.append(features.to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy(dtype=np.int32))
    return np.concatenate(X), np.concatenate(y)




X_train, y_train = prepare_data(train_data)
X_test, y_test = prepare_data(test_data)


# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
y_pred = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    y_pred.append(unique[np.argmax(counts)])

# Evaluate Predictions
y_pred = np.array(y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Classification Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display Prediction Errors
error_table = pd.DataFrame({'Actual Age Group': y_test, 'Predicted Age Group': y_pred})
print(error_table)

Classification Accuracy: 0.6511
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       135
          10       0.93      0.96      0.95       399
          20       0.71      0.64      0.67       154
          30       0.53      0.48      0.50       208
          40       0.56      0.70      0.62       445
          50       0.55      0.60      0.58       514
          60       0.55      0.49      0.52       349
          70       0.51      0.27      0.35       138
          80       1.00      0.09      0.17        33
          90       0.67      0.33      0.44         6
         100       0.00      0.00      0.00         1

    accuracy                           0.65      2382
   macro avg       0.64      0.51      0.53      2382
weighted avg       0.65      0.65      0.64      2382

      Actual Age Group  Predicted Age Group
0                    0                    0
1                    0                    0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_squared_error
from interpret.glassbox import ExplainableBoostingRegressor

# Load the top 30 features for each age group
top_30_features = pd.read_csv('top_30_features_all_age_group.csv', index_col=0).to_dict(orient='list')
top_30_features = {str(key): value for key, value in top_30_features.items()}

# Ensure the model save directory exists
model_save_dir = 'models'
os.makedirs(model_save_dir, exist_ok=True)

train_df, test_df = train_data, test_data

age_column = 'age'
age_groups = train_df[age_column] // 10 * 10
unique_age_groups = age_groups.unique()

models = {}
feature_importance_matrix = pd.DataFrame()

for age_group in unique_age_groups:
    group_data = train_df[age_groups == age_group]
    
    # Select the top 30 features for the current age group
    top_features = top_30_features[str(age_group)]
    X_train = group_data[top_features]
    print(X_train.shape)
    y_train = group_data[age_column]
    
    model = ExplainableBoostingRegressor(interactions=5,greedy_ratio=0,inner_bags=14)
    model.fit(X_train, y_train)
    
    # Calculate the training loss
    predictions = model.predict(X_train)
    mse = mean_squared_error(y_train, predictions)
    
    models[age_group] = model

    # Save the model
    model_filename = os.path.join(model_save_dir, f'Ideal_EBM_model_age_group_{age_group}.joblib')
    joblib.dump(model, model_filename)
    
    print(f"Age Group: {age_group}-{age_group+9}, MSE: {mse}, Model saved as: {model_filename}")

    # Save feature importances
    feature_importance = model.explain_global().data()
    feature_importance_df = pd.DataFrame(feature_importance['scores'][:30], columns=[f'Age Group {age_group}'])
    feature_importance_matrix = pd.concat([feature_importance_matrix, feature_importance_df], axis=1)



# Print the feature importance matrix
print(feature_importance_matrix)


(589, 30)
Age Group: 0.0-9.0, MSE: 0.07738011757171012, Model saved as: models\Ideal_EBM_model_age_group_0.0.joblib
(1718, 30)
Age Group: 10.0-19.0, MSE: 0.09798220634201169, Model saved as: models\Ideal_EBM_model_age_group_10.0.joblib
(571, 30)
Age Group: 20.0-29.0, MSE: 2.497704711720544, Model saved as: models\Ideal_EBM_model_age_group_20.0.joblib
(895, 30)
Age Group: 30.0-39.0, MSE: 3.4647387492757304, Model saved as: models\Ideal_EBM_model_age_group_30.0.joblib
(1736, 30)
Age Group: 40.0-49.0, MSE: 4.139902011352934, Model saved as: models\Ideal_EBM_model_age_group_40.0.joblib
(1898, 30)
Age Group: 50.0-59.0, MSE: 5.456337039752191, Model saved as: models\Ideal_EBM_model_age_group_50.0.joblib
(1379, 30)
Age Group: 60.0-69.0, MSE: 5.141985591529512, Model saved as: models\Ideal_EBM_model_age_group_60.0.joblib
(578, 30)
Age Group: 70.0-79.0, MSE: 3.7757103540006773, Model saved as: models\Ideal_EBM_model_age_group_70.0.joblib
(136, 30)
Age Group: 80.0-89.0, MSE: 1.618447461319488, M

In [5]:
import os
import numpy as np
import pandas as pd
import joblib
import faiss
from sklearn.metrics import mean_squared_error, mean_absolute_error
train_data = train_df
test_data = test_df
# Load the top 30 features for each age group
top_30_features = pd.read_csv('top_30_features_all_age_group.csv', index_col=0).to_dict(orient='list')

# Ensure the age group keys are strings
top_30_features = {str(key): value for key, value in top_30_features.items()}

# Define function to convert age to age group
def age_to_group(age):
    return str((age // 10) * 10)

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

# Prepare Train and Test Data using top 30 features
def prepare_data(data, top_features):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = top_features[age_group]
        X.append(group_data[features].to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy())
    return np.concatenate(X), np.concatenate(y)

X_train, y_train = prepare_data(train_df, top_30_features)
X_test, y_test = prepare_data(test_df, top_30_features)

# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
predicted_age_groups = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    predicted_age_groups.append(str(unique[np.argmax(counts)]))

# Load the saved models
model_save_dir = 'models'
age_group_models = {}
age_groups = ['0.0', '10.0', '20.0', '30.0', '40.0', '50.0', '60.0', '70.0', '80.0', '90.0', '100.0']

for age_group in age_groups:
    model_filename = os.path.join(model_save_dir, f'Ideal_EBM_model_age_group_{age_group}.joblib')
    if os.path.exists(model_filename):
        age_group_models[age_group] = joblib.load(model_filename)

# Use the predicted age group to load the corresponding model and make final predictions
final_predictions = []
for i, age_group in enumerate(predicted_age_groups):
    if age_group in age_group_models:
        model = age_group_models[age_group]
        test_sample = X_test[i].reshape(1, -1)
        prediction = model.predict(test_sample)[0]
        final_predictions.append(prediction)
    else:
        final_predictions.append(np.nan)  # Handle cases where there is no model for the predicted age group

# Calculate errors
y_true = test_data['age'].to_numpy(dtype=np.float32)
y_pred = np.array(final_predictions)

mse = mean_squared_error(y_true, y_pred, squared=True)
mae = mean_absolute_error(y_true, y_pred)

print(f"Root Mean Squared Error (MSE) on Test Data: {mse**0.5:.4f}")
print(f"Mean Absolute Error (MAE) on Test Data: {mae:.4f}")

Root Mean Squared Error (MSE) on Test Data: 2.8982
Mean Absolute Error (MAE) on Test Data: 1.6045




In [2]:

train_data, test_data = fold_data[4]

In [3]:
import os
import numpy as np
import pandas as pd
import faiss
from sklearn.metrics import accuracy_score, classification_report
# Define function to convert age to age group
def age_to_group(age):
    return (age // 10) * 10

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

# Prepare Test Data using top 30 features
def prepare_data(data):
    age_groups = data['age'] // 10 * 10
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features =  group_data.drop(['age','age_group'], axis=1)
        X.append(features.to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy(dtype=np.int32))
    return np.concatenate(X), np.concatenate(y)




X_train, y_train = prepare_data(train_data)
X_test, y_test = prepare_data(test_data)


# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
y_pred = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    y_pred.append(unique[np.argmax(counts)])

# Evaluate Predictions
y_pred = np.array(y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Classification Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display Prediction Errors
error_table = pd.DataFrame({'Actual Age Group': y_test, 'Predicted Age Group': y_pred})
print(error_table)

Classification Accuracy: 0.6520
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       144
          10       0.95      0.98      0.96       413
          20       0.70      0.70      0.70       151
          30       0.57      0.50      0.53       226
          40       0.55      0.68      0.61       440
          50       0.53      0.57      0.55       488
          60       0.53      0.48      0.51       336
          70       0.51      0.28      0.36       148
          80       0.80      0.13      0.23        30
          90       0.50      0.17      0.25         6

    accuracy                           0.65      2382
   macro avg       0.66      0.55      0.57      2382
weighted avg       0.65      0.65      0.64      2382

      Actual Age Group  Predicted Age Group
0                    0                    0
1                    0                    0
2                    0                    0
3         

In [4]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_squared_error
from interpret.glassbox import ExplainableBoostingRegressor

# Load the top 30 features for each age group
top_30_features = pd.read_csv('top_30_features_all_age_group.csv', index_col=0).to_dict(orient='list')
top_30_features = {str(key): value for key, value in top_30_features.items()}

# Ensure the model save directory exists
model_save_dir = 'models'
os.makedirs(model_save_dir, exist_ok=True)

train_df, test_df = train_data, test_data

age_column = 'age'
age_groups = train_df[age_column] // 10 * 10
unique_age_groups = age_groups.unique()

models = {}
feature_importance_matrix = pd.DataFrame()

for age_group in unique_age_groups:
    group_data = train_df[age_groups == age_group]
    
    # Select the top 30 features for the current age group
    top_features = top_30_features[str(age_group)]
    X_train = group_data[top_features]
    print(X_train.shape)
    y_train = group_data[age_column]
    
    model = ExplainableBoostingRegressor(interactions=5,greedy_ratio=0,inner_bags=14)
    model.fit(X_train, y_train)
    
    # Calculate the training loss
    predictions = model.predict(X_train)
    mse = mean_squared_error(y_train, predictions)
    
    models[age_group] = model

    # Save the model
    model_filename = os.path.join(model_save_dir, f'Ideal_EBM_model_age_group_{age_group}.joblib')
    joblib.dump(model, model_filename)
    
    print(f"Age Group: {age_group}-{age_group+9}, MSE: {mse}, Model saved as: {model_filename}")

    # Save feature importances
    feature_importance = model.explain_global().data()
    feature_importance_df = pd.DataFrame(feature_importance['scores'][:30], columns=[f'Age Group {age_group}'])
    feature_importance_matrix = pd.concat([feature_importance_matrix, feature_importance_df], axis=1)



# Print the feature importance matrix
print(feature_importance_matrix)


(580, 30)
Age Group: 0.0-9.0, MSE: 0.06115656607487444, Model saved as: models\Ideal_EBM_model_age_group_0.0.joblib
(1704, 30)
Age Group: 10.0-19.0, MSE: 0.13035405091561272, Model saved as: models\Ideal_EBM_model_age_group_10.0.joblib
(574, 30)
Age Group: 20.0-29.0, MSE: 2.8261296512623084, Model saved as: models\Ideal_EBM_model_age_group_20.0.joblib
(877, 30)
Age Group: 30.0-39.0, MSE: 3.089915243482211, Model saved as: models\Ideal_EBM_model_age_group_30.0.joblib
(1741, 30)
Age Group: 40.0-49.0, MSE: 4.161589466250099, Model saved as: models\Ideal_EBM_model_age_group_40.0.joblib
(1924, 30)
Age Group: 50.0-59.0, MSE: 5.3932709817306, Model saved as: models\Ideal_EBM_model_age_group_50.0.joblib
(1392, 30)
Age Group: 60.0-69.0, MSE: 4.7428774316270275, Model saved as: models\Ideal_EBM_model_age_group_60.0.joblib
(568, 30)
Age Group: 70.0-79.0, MSE: 4.211032339370485, Model saved as: models\Ideal_EBM_model_age_group_70.0.joblib
(139, 30)
Age Group: 80.0-89.0, MSE: 1.5930211403185164, Mo

In [5]:
import os
import numpy as np
import pandas as pd
import joblib
import faiss
from sklearn.metrics import mean_squared_error, mean_absolute_error
train_data = train_df
test_data = test_df
# Load the top 30 features for each age group
top_30_features = pd.read_csv('top_30_features_all_age_group.csv', index_col=0).to_dict(orient='list')

# Ensure the age group keys are strings
top_30_features = {str(key): value for key, value in top_30_features.items()}

# Define function to convert age to age group
def age_to_group(age):
    return str((age // 10) * 10)

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

# Prepare Train and Test Data using top 30 features
def prepare_data(data, top_features):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = top_features[age_group]
        X.append(group_data[features].to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy())
    return np.concatenate(X), np.concatenate(y)

X_train, y_train = prepare_data(train_df, top_30_features)
X_test, y_test = prepare_data(test_df, top_30_features)

# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
predicted_age_groups = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    predicted_age_groups.append(str(unique[np.argmax(counts)]))

# Load the saved models
model_save_dir = 'models'
age_group_models = {}
age_groups = ['0.0', '10.0', '20.0', '30.0', '40.0', '50.0', '60.0', '70.0', '80.0', '90.0', '100.0']

for age_group in age_groups:
    model_filename = os.path.join(model_save_dir, f'Ideal_EBM_model_age_group_{age_group}.joblib')
    if os.path.exists(model_filename):
        age_group_models[age_group] = joblib.load(model_filename)

# Use the predicted age group to load the corresponding model and make final predictions
final_predictions = []
for i, age_group in enumerate(predicted_age_groups):
    if age_group in age_group_models:
        model = age_group_models[age_group]
        test_sample = X_test[i].reshape(1, -1)
        prediction = model.predict(test_sample)[0]
        final_predictions.append(prediction)
    else:
        final_predictions.append(np.nan)  # Handle cases where there is no model for the predicted age group

# Calculate errors
y_true = test_data['age'].to_numpy(dtype=np.float32)
y_pred = np.array(final_predictions)

mse = mean_squared_error(y_true, y_pred, squared=True)
mae = mean_absolute_error(y_true, y_pred)

print(f"Root Mean Squared Error (MSE) on Test Data: {mse**0.5:.4f}")
print(f"Mean Absolute Error (MAE) on Test Data: {mae:.4f}")

Root Mean Squared Error (MSE) on Test Data: 2.1296
Mean Absolute Error (MAE) on Test Data: 1.5501


