In [1]:
import pandas as pd
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

# Function to map ages to custom age groups
def map_age_to_group(age):
    if age < 34:
        return '0-34'
    elif age < 60:
        return '34-60'
    elif age < 78:
        return '60-78'
    else:
        return '78+'

def split_data_by_age_cv(df, age_column='age', n_splits=5, random_state=42):
    # Map each age to the custom age group
    df['age_group'] = df[age_column].apply(map_age_to_group)

    # Group the data by the custom age groups
    age_groups = df.groupby('age_group')
    
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    
    fold_data = []

    for train_index, test_index in kf.split(df):
        train_list = []
        test_list = []
        
        for _, group in age_groups:
            group_train_index = [idx for idx in train_index if idx in group.index]
            group_test_index = [idx for idx in test_index if idx in group.index]
            
            train = group.loc[group_train_index]
            test = group.loc[group_test_index]
            
            train_list.append(train)
            test_list.append(test)
        
        train_df = pd.concat(train_list, ignore_index=True)
        test_df = pd.concat(test_list, ignore_index=True)
        
        fold_data.append((train_df, test_df))
    
    return fold_data




# Example usage
imputed_data = pd.read_csv('imputed_data.csv', index_col=0)  # Replace with your actual data




fold_data = split_data_by_age_cv(imputed_data)

# To verify, print the age groups in the first fold
print(fold_data[0][0]['age_group'].value_counts())


age_group
34-60    4350
0-34     3078
60-78    1881
78+       219
Name: count, dtype: int64


In [2]:
import os
import pandas as pd
import numpy as np
# Load the imputed data
imputed_data = pd.read_csv('imputed_data.csv', index_col=0)

# Define function to map ages to the new age groups
def age_to_group(age):
    if age < 34:
        return '0-34'
    elif age < 60:
        return '34-60'
    elif age < 78:
        return '60-78'
    else:
        return '78+'

# Apply the function to create age groups
imputed_data['age_group'] = imputed_data['age'].apply(age_to_group)

# Define function to select top 30 features based on Pearson correlation with age
def select_top_features(data, age_column='age', top_n=30):
    # Ensure only numeric columns are used for correlation calculation
    numeric_data = data.select_dtypes(include=[np.number])
    
    correlations = numeric_data.corr()[age_column].abs().sort_values(ascending=False)
    top_features = correlations.index[1:top_n+1].tolist()  # Exclude the age column itself
    return top_features

# Directory to save the top features
top_features_dir = '.'
os.makedirs(top_features_dir, exist_ok=True)

# Select and save top 30 features for each age group
age_groups = imputed_data['age_group'].unique()

top_30_features_all_age_group = {}

for age_group in age_groups:
    group_data = imputed_data[imputed_data['age_group'] == age_group]
    
    # Select top 30 features for the current age group
    top_features = select_top_features(group_data)
    top_30_features_all_age_group[age_group] = top_features
    
    # Save top features to a CSV file
    top_features_df = pd.DataFrame(top_features, columns=[f'Top 30 Features for Age Group {age_group}'])


# Save all top features to a single CSV file
top_30_features_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in top_30_features_all_age_group.items()]))


In [3]:
top_30_features_df

Unnamed: 0,0-34,60-78,34-60,78+
0,cg16867657,cg16867657,cg16867657,cg14571574
1,cg05412028,cg22454769,cg22454769,cg24273318
2,cg19283806,cg21572722,cg19283806,cg01763090
3,cg26685941,cg06639320,cg04875128,cg24079702
4,cg04295144,cg04875128,cg06639320,cg10501210
5,cg15951188,cg19283806,cg24724428,cg00292135
6,cg22242842,cg23500537,cg15341124,cg06833647
7,cg07082267,cg25478614,cg21572722,cg09118555
8,cg01282174,cg17110586,cg05404236,cg19890168
9,cg05207048,cg24724428,cg13033938,cg03064228


In [5]:
train_data, test_data = fold_data[0]


In [6]:
import os
import numpy as np
import pandas as pd
import faiss
from sklearn.metrics import accuracy_score, classification_report

# Define function to convert age to the new age groups
def age_to_group(age):
    if age < 34:
        return '0-34'
    elif age < 60:
        return '34-60'
    elif age < 78:
        return '60-78'
    else:
        return '78+'

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

# Prepare Test Data using top 30 features
def prepare_data(data):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = group_data.drop(['age', 'age_group'], axis=1)
        X.append(features.to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy(dtype=str))  # Keeping as string since we have non-numeric age groups
    return np.concatenate(X), np.concatenate(y)

X_train, y_train = prepare_data(train_data)
X_test, y_test = prepare_data(test_data)

# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
y_pred = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    y_pred.append(unique[np.argmax(counts)])

# Evaluate Predictions
y_pred = np.array(y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Classification Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display Prediction Errors
error_table = pd.DataFrame({'Actual Age Group': y_test, 'Predicted Age Group': y_pred})
print(error_table)


Classification Accuracy: 0.8480
Classification Report:
              precision    recall  f1-score   support

        0-34       0.97      0.96      0.97       759
       34-60       0.82      0.93      0.87      1075
       60-78       0.71      0.60      0.65       480
         78+       0.78      0.10      0.18        68

    accuracy                           0.85      2382
   macro avg       0.82      0.65      0.67      2382
weighted avg       0.84      0.85      0.84      2382

     Actual Age Group Predicted Age Group
0                0-34                0-34
1                0-34                0-34
2                0-34                0-34
3                0-34                0-34
4                0-34                0-34
...               ...                 ...
2377              78+               60-78
2378              78+               60-78
2379              78+               60-78
2380              78+               60-78
2381              78+               60-78

[2382

In [15]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_squared_error
from interpret.glassbox import ExplainableBoostingRegressor

# Load the top 30 features for each age group
top_30_features = top_30_features_df
top_30_features = {str(key): value for key, value in top_30_features.items()}

# Ensure the model save directory exists
model_save_dir = 'models'
os.makedirs(model_save_dir, exist_ok=True)

train_df, test_df = train_data, test_data

# Define function to map ages to the new age groups
def age_to_group(age):
    if age < 34:
        return '0-34'
    elif age < 60:
        return '34-60'
    elif age < 78:
        return '60-78'
    else:
        return '78+'

# Apply the function to create age groups
train_df['age_group'] = train_df['age'].apply(age_to_group)
test_df['age_group'] = test_df['age'].apply(age_to_group)

unique_age_groups = train_df['age_group'].unique()

models = {}
feature_importance_matrix = pd.DataFrame()

for age_group in unique_age_groups:
    group_data = train_df[train_df['age_group'] == age_group]
    
    # Select the top 30 features for the current age group
    top_features = top_30_features[age_group]
    X_train = group_data[top_features]
    print(X_train.shape)
    y_train = group_data['age']
    
    model = ExplainableBoostingRegressor(interactions=5, greedy_ratio=0, inner_bags=14)
    model.fit(X_train, y_train)
    
    # Calculate the training loss
    predictions = model.predict(X_train)
    mse = mean_squared_error(y_train, predictions)
    
    models[age_group] = model

    # Save the model
    model_filename = os.path.join(model_save_dir, f'EBM_model_age_group_{age_group}.joblib')
    joblib.dump(model, model_filename)
    
    print(f"Age Group: {age_group}, MSE: {mse}, Model saved as: {model_filename}")


(3078, 30)
Age Group: 0-34, MSE: 3.4423559389911307, Model saved as: models\EBM_model_age_group_0-34.joblib
(4350, 30)
Age Group: 34-60, MSE: 9.877452665791957, Model saved as: models\EBM_model_age_group_34-60.joblib
(1881, 30)
Age Group: 60-78, MSE: 9.420236419424015, Model saved as: models\EBM_model_age_group_60-78.joblib
(219, 30)
Age Group: 78+, MSE: 6.936531124471692, Model saved as: models\EBM_model_age_group_78+.joblib


In [16]:
import os
import numpy as np
import pandas as pd
import joblib
import faiss
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Define function to convert age to the new age groups
def age_to_group(age):
    if age < 34:
        return '0-34'
    elif age < 60:
        return '34-60'
    elif age < 78:
        return '60-78'
    else:
        return '78+'

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

def prepare_data(data):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = group_data.columns.drop(['age', 'age_group'])
        X.append(group_data[features].to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy())
    return np.concatenate(X), np.concatenate(y)

X_train, y_train = prepare_data(train_data)
X_test, y_test = prepare_data(test_data)

# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
predicted_age_groups = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    predicted_age_groups.append(str(unique[np.argmax(counts)]))

# Load the saved models
model_save_dir = 'models'
age_group_models = {}
age_groups = ['0-34', '34-60', '60-78', '78+']

for age_group in age_groups:
    model_filename = os.path.join(model_save_dir, f'EBM_model_age_group_{age_group}.joblib')
    if os.path.exists(model_filename):
        age_group_models[age_group] = joblib.load(model_filename)

# Prepare Train using top 30 features
def prepare_train_data(data, top_features):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = top_features[age_group]
        X.append(group_data[features].to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy())
    return np.concatenate(X), np.concatenate(y)

X_test, y_test = prepare_train_data(test_data, top_30_features)

# Use the predicted age group to load the corresponding model and make final predictions
final_predictions = []
for i, age_group in enumerate(predicted_age_groups):
    if age_group in age_group_models:
        model = age_group_models[age_group]
        test_sample = X_test[i].reshape(1, -1)
        prediction = model.predict(test_sample)[0]
        final_predictions.append(prediction)

# Calculate errors
y_true = test_data['age'].to_numpy(dtype=np.float32)
y_pred = np.array(final_predictions)

mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

print(f"Root Mean Squared Error (RMSE) on Test Data: {mse**0.5:.4f}")
print(f"Mean Absolute Error (MAE) on Test Data: {mae:.4f}")


Root Mean Squared Error (RMSE) on Test Data: 6.5550
Mean Absolute Error (MAE) on Test Data: 4.1045


In [4]:
train_data, test_data = fold_data[1]

In [5]:
import os
import numpy as np
import pandas as pd
import faiss
from sklearn.metrics import accuracy_score, classification_report

# Define function to convert age to the new age groups
def age_to_group(age):
    if age < 34:
        return '0-34'
    elif age < 60:
        return '34-60'
    elif age < 78:
        return '60-78'
    else:
        return '78+'

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

# Prepare Test Data using top 30 features
def prepare_data(data):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = group_data.drop(['age', 'age_group'], axis=1)
        X.append(features.to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy(dtype=str))  # Keeping as string since we have non-numeric age groups
    return np.concatenate(X), np.concatenate(y)

X_train, y_train = prepare_data(train_data)
X_test, y_test = prepare_data(test_data)

# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
y_pred = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    y_pred.append(unique[np.argmax(counts)])

# Evaluate Predictions
y_pred = np.array(y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Classification Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display Prediction Errors
error_table = pd.DataFrame({'Actual Age Group': y_test, 'Predicted Age Group': y_pred})
print(error_table)


Classification Accuracy: 0.8774
Classification Report:
              precision    recall  f1-score   support

        0-34       0.96      0.96      0.96       801
       34-60       0.85      0.93      0.89      1053
       60-78       0.79      0.69      0.74       474
         78+       0.90      0.17      0.28        54

    accuracy                           0.88      2382
   macro avg       0.88      0.69      0.72      2382
weighted avg       0.88      0.88      0.87      2382

     Actual Age Group Predicted Age Group
0                0-34                0-34
1                0-34                0-34
2                0-34                0-34
3                0-34                0-34
4                0-34                0-34
...               ...                 ...
2377              78+               60-78
2378              78+               60-78
2379              78+               60-78
2380              78+               60-78
2381              78+               60-78

[2382

In [6]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_squared_error
from interpret.glassbox import ExplainableBoostingRegressor

# Load the top 30 features for each age group
top_30_features = top_30_features_df
top_30_features = {str(key): value for key, value in top_30_features.items()}

# Ensure the model save directory exists
model_save_dir = 'models'
os.makedirs(model_save_dir, exist_ok=True)

train_df, test_df = train_data, test_data

# Define function to map ages to the new age groups
def age_to_group(age):
    if age < 34:
        return '0-34'
    elif age < 60:
        return '34-60'
    elif age < 78:
        return '60-78'
    else:
        return '78+'

# Apply the function to create age groups
train_df['age_group'] = train_df['age'].apply(age_to_group)
test_df['age_group'] = test_df['age'].apply(age_to_group)

unique_age_groups = train_df['age_group'].unique()

models = {}
feature_importance_matrix = pd.DataFrame()

for age_group in unique_age_groups:
    group_data = train_df[train_df['age_group'] == age_group]
    
    # Select the top 30 features for the current age group
    top_features = top_30_features[age_group]
    X_train = group_data[top_features]
    print(X_train.shape)
    y_train = group_data['age']
    
    model = ExplainableBoostingRegressor(interactions=5, greedy_ratio=0, inner_bags=14)
    model.fit(X_train, y_train)
    
    # Calculate the training loss
    predictions = model.predict(X_train)
    mse = mean_squared_error(y_train, predictions)
    
    models[age_group] = model

    # Save the model
    model_filename = os.path.join(model_save_dir, f'EBM_model_age_group_{age_group}.joblib')
    joblib.dump(model, model_filename)
    
    print(f"Age Group: {age_group}, MSE: {mse}, Model saved as: {model_filename}")

(3036, 30)
Age Group: 0-34, MSE: 2.780860543937726, Model saved as: models\EBM_model_age_group_0-34.joblib
(4372, 30)
Age Group: 34-60, MSE: 9.642502205851583, Model saved as: models\EBM_model_age_group_34-60.joblib
(1887, 30)
Age Group: 60-78, MSE: 9.839534888815793, Model saved as: models\EBM_model_age_group_60-78.joblib
(233, 30)
Age Group: 78+, MSE: 4.3168119568945205, Model saved as: models\EBM_model_age_group_78+.joblib


In [7]:
import os
import numpy as np
import pandas as pd
import joblib
import faiss
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Define function to convert age to the new age groups
def age_to_group(age):
    if age < 34:
        return '0-34'
    elif age < 60:
        return '34-60'
    elif age < 78:
        return '60-78'
    else:
        return '78+'

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

def prepare_data(data):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = group_data.columns.drop(['age', 'age_group'])
        X.append(group_data[features].to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy())
    return np.concatenate(X), np.concatenate(y)

X_train, y_train = prepare_data(train_data)
X_test, y_test = prepare_data(test_data)

# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
predicted_age_groups = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    predicted_age_groups.append(str(unique[np.argmax(counts)]))

# Load the saved models
model_save_dir = 'models'
age_group_models = {}
age_groups = ['0-34', '34-60', '60-78', '78+']

for age_group in age_groups:
    model_filename = os.path.join(model_save_dir, f'EBM_model_age_group_{age_group}.joblib')
    if os.path.exists(model_filename):
        age_group_models[age_group] = joblib.load(model_filename)

# Prepare Train using top 30 features
def prepare_train_data(data, top_features):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = top_features[age_group]
        X.append(group_data[features].to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy())
    return np.concatenate(X), np.concatenate(y)

X_test, y_test = prepare_train_data(test_data, top_30_features)

# Use the predicted age group to load the corresponding model and make final predictions
final_predictions = []
for i, age_group in enumerate(predicted_age_groups):
    if age_group in age_group_models:
        model = age_group_models[age_group]
        test_sample = X_test[i].reshape(1, -1)
        prediction = model.predict(test_sample)[0]
        final_predictions.append(prediction)

# Calculate errors
y_true = test_data['age'].to_numpy(dtype=np.float32)
y_pred = np.array(final_predictions)

mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

print(f"Root Mean Squared Error (RMSE) on Test Data: {mse**0.5:.4f}")
print(f"Mean Absolute Error (MAE) on Test Data: {mae:.4f}")


Root Mean Squared Error (RMSE) on Test Data: 6.3510
Mean Absolute Error (MAE) on Test Data: 3.8388


In [4]:
train_data, test_data = fold_data[2]

In [5]:
import os
import numpy as np
import pandas as pd
import faiss
from sklearn.metrics import accuracy_score, classification_report

# Define function to convert age to the new age groups
def age_to_group(age):
    if age < 34:
        return '0-34'
    elif age < 60:
        return '34-60'
    elif age < 78:
        return '60-78'
    else:
        return '78+'

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

# Prepare Test Data using top 30 features
def prepare_data(data):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = group_data.drop(['age', 'age_group'], axis=1)
        X.append(features.to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy(dtype=str))  # Keeping as string since we have non-numeric age groups
    return np.concatenate(X), np.concatenate(y)

X_train, y_train = prepare_data(train_data)
X_test, y_test = prepare_data(test_data)

# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
y_pred = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    y_pred.append(unique[np.argmax(counts)])

# Evaluate Predictions
y_pred = np.array(y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Classification Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display Prediction Errors
error_table = pd.DataFrame({'Actual Age Group': y_test, 'Predicted Age Group': y_pred})
print(error_table)


Classification Accuracy: 0.8606
Classification Report:
              precision    recall  f1-score   support

        0-34       0.96      0.96      0.96       769
       34-60       0.84      0.92      0.88      1088
       60-78       0.73      0.64      0.68       464
         78+       0.73      0.18      0.29        61

    accuracy                           0.86      2382
   macro avg       0.82      0.68      0.70      2382
weighted avg       0.86      0.86      0.85      2382

     Actual Age Group Predicted Age Group
0                0-34                0-34
1                0-34                0-34
2                0-34                0-34
3                0-34                0-34
4                0-34                0-34
...               ...                 ...
2377              78+               60-78
2378              78+               60-78
2379              78+                 78+
2380              78+                 78+
2381              78+               60-78

[2382

In [6]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_squared_error
from interpret.glassbox import ExplainableBoostingRegressor

# Load the top 30 features for each age group
top_30_features = top_30_features_df
top_30_features = {str(key): value for key, value in top_30_features.items()}

# Ensure the model save directory exists
model_save_dir = 'models'
os.makedirs(model_save_dir, exist_ok=True)

train_df, test_df = train_data, test_data

# Define function to map ages to the new age groups
def age_to_group(age):
    if age < 34:
        return '0-34'
    elif age < 60:
        return '34-60'
    elif age < 78:
        return '60-78'
    else:
        return '78+'

# Apply the function to create age groups
train_df['age_group'] = train_df['age'].apply(age_to_group)
test_df['age_group'] = test_df['age'].apply(age_to_group)

unique_age_groups = train_df['age_group'].unique()

models = {}
feature_importance_matrix = pd.DataFrame()

for age_group in unique_age_groups:
    group_data = train_df[train_df['age_group'] == age_group]
    
    # Select the top 30 features for the current age group
    top_features = top_30_features[age_group]
    X_train = group_data[top_features]
    print(X_train.shape)
    y_train = group_data['age']
    
    model = ExplainableBoostingRegressor(interactions=5, greedy_ratio=0, inner_bags=14)
    model.fit(X_train, y_train)
    
    # Calculate the training loss
    predictions = model.predict(X_train)
    mse = mean_squared_error(y_train, predictions)
    
    models[age_group] = model

    # Save the model
    model_filename = os.path.join(model_save_dir, f'EBM_model_age_group_{age_group}.joblib')
    joblib.dump(model, model_filename)
    
    print(f"Age Group: {age_group}, MSE: {mse}, Model saved as: {model_filename}")

(3068, 30)
Age Group: 0-34, MSE: 3.0017002861063364, Model saved as: models\EBM_model_age_group_0-34.joblib
(4337, 30)
Age Group: 34-60, MSE: 10.10635056197224, Model saved as: models\EBM_model_age_group_34-60.joblib
(1897, 30)
Age Group: 60-78, MSE: 9.187446556824815, Model saved as: models\EBM_model_age_group_60-78.joblib
(226, 30)
Age Group: 78+, MSE: 6.190863325320099, Model saved as: models\EBM_model_age_group_78+.joblib


In [7]:
import os
import numpy as np
import pandas as pd
import joblib
import faiss
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Define function to convert age to the new age groups
def age_to_group(age):
    if age < 34:
        return '0-34'
    elif age < 60:
        return '34-60'
    elif age < 78:
        return '60-78'
    else:
        return '78+'

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

def prepare_data(data):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = group_data.columns.drop(['age', 'age_group'])
        X.append(group_data[features].to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy())
    return np.concatenate(X), np.concatenate(y)

X_train, y_train = prepare_data(train_data)
X_test, y_test = prepare_data(test_data)

# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
predicted_age_groups = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    predicted_age_groups.append(str(unique[np.argmax(counts)]))

# Load the saved models
model_save_dir = 'models'
age_group_models = {}
age_groups = ['0-34', '34-60', '60-78', '78+']

for age_group in age_groups:
    model_filename = os.path.join(model_save_dir, f'EBM_model_age_group_{age_group}.joblib')
    if os.path.exists(model_filename):
        age_group_models[age_group] = joblib.load(model_filename)

# Prepare Train using top 30 features
def prepare_train_data(data, top_features):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = top_features[age_group]
        X.append(group_data[features].to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy())
    return np.concatenate(X), np.concatenate(y)

X_test, y_test = prepare_train_data(test_data, top_30_features)

# Use the predicted age group to load the corresponding model and make final predictions
final_predictions = []
for i, age_group in enumerate(predicted_age_groups):
    if age_group in age_group_models:
        model = age_group_models[age_group]
        test_sample = X_test[i].reshape(1, -1)
        prediction = model.predict(test_sample)[0]
        final_predictions.append(prediction)

# Calculate errors
y_true = test_data['age'].to_numpy(dtype=np.float32)
y_pred = np.array(final_predictions)

mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

print(f"Root Mean Squared Error (RMSE) on Test Data: {mse**0.5:.4f}")
print(f"Mean Absolute Error (MAE) on Test Data: {mae:.4f}")


Root Mean Squared Error (RMSE) on Test Data: 6.0194
Mean Absolute Error (MAE) on Test Data: 3.8643


In [4]:
train_data, test_data = fold_data[3]

In [5]:
import os
import numpy as np
import pandas as pd
import faiss
from sklearn.metrics import accuracy_score, classification_report

# Define function to convert age to the new age groups
def age_to_group(age):
    if age < 34:
        return '0-34'
    elif age < 60:
        return '34-60'
    elif age < 78:
        return '60-78'
    else:
        return '78+'

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

# Prepare Test Data using top 30 features
def prepare_data(data):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = group_data.drop(['age', 'age_group'], axis=1)
        X.append(features.to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy(dtype=str))  # Keeping as string since we have non-numeric age groups
    return np.concatenate(X), np.concatenate(y)

X_train, y_train = prepare_data(train_data)
X_test, y_test = prepare_data(test_data)

# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
y_pred = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    y_pred.append(unique[np.argmax(counts)])

# Evaluate Predictions
y_pred = np.array(y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Classification Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display Prediction Errors
error_table = pd.DataFrame({'Actual Age Group': y_test, 'Predicted Age Group': y_pred})
print(error_table)


Classification Accuracy: 0.8686
Classification Report:
              precision    recall  f1-score   support

        0-34       0.97      0.96      0.96       743
       34-60       0.84      0.94      0.89      1112
       60-78       0.78      0.64      0.70       471
         78+       0.86      0.11      0.19        56

    accuracy                           0.87      2382
   macro avg       0.86      0.66      0.69      2382
weighted avg       0.87      0.87      0.86      2382

     Actual Age Group Predicted Age Group
0                0-34                0-34
1                0-34                0-34
2                0-34                0-34
3                0-34                0-34
4                0-34                0-34
...               ...                 ...
2377              78+               60-78
2378              78+               60-78
2379              78+               34-60
2380              78+               60-78
2381              78+               34-60

[2382

In [6]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_squared_error
from interpret.glassbox import ExplainableBoostingRegressor

# Load the top 30 features for each age group
top_30_features = top_30_features_df
top_30_features = {str(key): value for key, value in top_30_features.items()}

# Ensure the model save directory exists
model_save_dir = 'models'
os.makedirs(model_save_dir, exist_ok=True)

train_df, test_df = train_data, test_data

# Define function to map ages to the new age groups
def age_to_group(age):
    if age < 34:
        return '0-34'
    elif age < 60:
        return '34-60'
    elif age < 78:
        return '60-78'
    else:
        return '78+'

# Apply the function to create age groups
train_df['age_group'] = train_df['age'].apply(age_to_group)
test_df['age_group'] = test_df['age'].apply(age_to_group)

unique_age_groups = train_df['age_group'].unique()

models = {}
feature_importance_matrix = pd.DataFrame()

for age_group in unique_age_groups:
    group_data = train_df[train_df['age_group'] == age_group]
    
    # Select the top 30 features for the current age group
    top_features = top_30_features[age_group]
    X_train = group_data[top_features]
    print(X_train.shape)
    y_train = group_data['age']
    
    model = ExplainableBoostingRegressor(interactions=5, greedy_ratio=0, inner_bags=14)
    model.fit(X_train, y_train)
    
    # Calculate the training loss
    predictions = model.predict(X_train)
    mse = mean_squared_error(y_train, predictions)
    
    models[age_group] = model

    # Save the model
    model_filename = os.path.join(model_save_dir, f'EBM_model_age_group_{age_group}.joblib')
    joblib.dump(model, model_filename)
    
    print(f"Age Group: {age_group}, MSE: {mse}, Model saved as: {model_filename}")

(3094, 30)
Age Group: 0-34, MSE: 3.132372703582305, Model saved as: models\EBM_model_age_group_0-34.joblib
(4313, 30)
Age Group: 34-60, MSE: 10.014699841672208, Model saved as: models\EBM_model_age_group_34-60.joblib
(1890, 30)
Age Group: 60-78, MSE: 9.288893702381664, Model saved as: models\EBM_model_age_group_60-78.joblib
(231, 30)
Age Group: 78+, MSE: 3.885761105388882, Model saved as: models\EBM_model_age_group_78+.joblib


In [7]:
import os
import numpy as np
import pandas as pd
import joblib
import faiss
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Define function to convert age to the new age groups
def age_to_group(age):
    if age < 34:
        return '0-34'
    elif age < 60:
        return '34-60'
    elif age < 78:
        return '60-78'
    else:
        return '78+'

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

def prepare_data(data):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = group_data.columns.drop(['age', 'age_group'])
        X.append(group_data[features].to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy())
    return np.concatenate(X), np.concatenate(y)

X_train, y_train = prepare_data(train_data)
X_test, y_test = prepare_data(test_data)

# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
predicted_age_groups = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    predicted_age_groups.append(str(unique[np.argmax(counts)]))

# Load the saved models
model_save_dir = 'models'
age_group_models = {}
age_groups = ['0-34', '34-60', '60-78', '78+']

for age_group in age_groups:
    model_filename = os.path.join(model_save_dir, f'EBM_model_age_group_{age_group}.joblib')
    if os.path.exists(model_filename):
        age_group_models[age_group] = joblib.load(model_filename)

# Prepare Train using top 30 features
def prepare_train_data(data, top_features):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = top_features[age_group]
        X.append(group_data[features].to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy())
    return np.concatenate(X), np.concatenate(y)

X_test, y_test = prepare_train_data(test_data, top_30_features)

# Use the predicted age group to load the corresponding model and make final predictions
final_predictions = []
for i, age_group in enumerate(predicted_age_groups):
    if age_group in age_group_models:
        model = age_group_models[age_group]
        test_sample = X_test[i].reshape(1, -1)
        prediction = model.predict(test_sample)[0]
        final_predictions.append(prediction)

# Calculate errors
y_true = test_data['age'].to_numpy(dtype=np.float32)
y_pred = np.array(final_predictions)

mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

print(f"Root Mean Squared Error (RMSE) on Test Data: {mse**0.5:.4f}")
print(f"Mean Absolute Error (MAE) on Test Data: {mae:.4f}")


Root Mean Squared Error (RMSE) on Test Data: 6.3435
Mean Absolute Error (MAE) on Test Data: 3.9399


In [4]:
train_data, test_data = fold_data[4]

In [5]:
import os
import numpy as np
import pandas as pd
import faiss
from sklearn.metrics import accuracy_score, classification_report

# Define function to convert age to the new age groups
def age_to_group(age):
    if age < 34:
        return '0-34'
    elif age < 60:
        return '34-60'
    elif age < 78:
        return '60-78'
    else:
        return '78+'

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

# Prepare Test Data using top 30 features
def prepare_data(data):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = group_data.drop(['age', 'age_group'], axis=1)
        X.append(features.to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy(dtype=str))  # Keeping as string since we have non-numeric age groups
    return np.concatenate(X), np.concatenate(y)

X_train, y_train = prepare_data(train_data)
X_test, y_test = prepare_data(test_data)

# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
y_pred = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    y_pred.append(unique[np.argmax(counts)])

# Evaluate Predictions
y_pred = np.array(y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Classification Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display Prediction Errors
error_table = pd.DataFrame({'Actual Age Group': y_test, 'Predicted Age Group': y_pred})
print(error_table)


Classification Accuracy: 0.8728
Classification Report:
              precision    recall  f1-score   support

        0-34       0.96      0.96      0.96       765
       34-60       0.84      0.94      0.89      1097
       60-78       0.80      0.66      0.72       472
         78+       0.75      0.19      0.30        48

    accuracy                           0.87      2382
   macro avg       0.84      0.68      0.72      2382
weighted avg       0.87      0.87      0.87      2382

     Actual Age Group Predicted Age Group
0                0-34                0-34
1                0-34                0-34
2                0-34                0-34
3                0-34                0-34
4                0-34                0-34
...               ...                 ...
2377              78+               60-78
2378              78+               60-78
2379              78+               60-78
2380              78+               34-60
2381              78+               34-60

[2382

In [6]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_squared_error
from interpret.glassbox import ExplainableBoostingRegressor

# Load the top 30 features for each age group
top_30_features = top_30_features_df
top_30_features = {str(key): value for key, value in top_30_features.items()}

# Ensure the model save directory exists
model_save_dir = 'models'
os.makedirs(model_save_dir, exist_ok=True)

train_df, test_df = train_data, test_data

# Define function to map ages to the new age groups
def age_to_group(age):
    if age < 34:
        return '0-34'
    elif age < 60:
        return '34-60'
    elif age < 78:
        return '60-78'
    else:
        return '78+'

# Apply the function to create age groups
train_df['age_group'] = train_df['age'].apply(age_to_group)
test_df['age_group'] = test_df['age'].apply(age_to_group)

unique_age_groups = train_df['age_group'].unique()

models = {}
feature_importance_matrix = pd.DataFrame()

for age_group in unique_age_groups:
    group_data = train_df[train_df['age_group'] == age_group]
    
    # Select the top 30 features for the current age group
    top_features = top_30_features[age_group]
    X_train = group_data[top_features]
    print(X_train.shape)
    y_train = group_data['age']
    
    model = ExplainableBoostingRegressor(interactions=5, greedy_ratio=0, inner_bags=14)
    model.fit(X_train, y_train)
    
    # Calculate the training loss
    predictions = model.predict(X_train)
    mse = mean_squared_error(y_train, predictions)
    
    models[age_group] = model

    # Save the model
    model_filename = os.path.join(model_save_dir, f'EBM_model_age_group_{age_group}.joblib')
    joblib.dump(model, model_filename)
    
    print(f"Age Group: {age_group}, MSE: {mse}, Model saved as: {model_filename}")

(3072, 30)
Age Group: 0-34, MSE: 3.107167509037312, Model saved as: models\EBM_model_age_group_0-34.joblib
(4328, 30)
Age Group: 34-60, MSE: 9.923692672369786, Model saved as: models\EBM_model_age_group_34-60.joblib
(1889, 30)
Age Group: 60-78, MSE: 9.81673643552513, Model saved as: models\EBM_model_age_group_60-78.joblib
(239, 30)
Age Group: 78+, MSE: 5.729575510071648, Model saved as: models\EBM_model_age_group_78+.joblib


In [7]:
import os
import numpy as np
import pandas as pd
import joblib
import faiss
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Define function to convert age to the new age groups
def age_to_group(age):
    if age < 34:
        return '0-34'
    elif age < 60:
        return '34-60'
    elif age < 78:
        return '60-78'
    else:
        return '78+'

# Convert ages to age groups
train_data['age_group'] = train_data['age'].apply(age_to_group)
test_data['age_group'] = test_data['age'].apply(age_to_group)

def prepare_data(data):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = group_data.columns.drop(['age', 'age_group'])
        X.append(group_data[features].to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy())
    return np.concatenate(X), np.concatenate(y)

X_train, y_train = prepare_data(train_data)
X_test, y_test = prepare_data(test_data)

# Initialize FAISS index
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for KNN

# Train FAISS index
index.add(X_train)

# KNN search for Test Data
k = 11  # Number of neighbors
D, I = index.search(X_test, k)  # D is the distances, I is the indices of the nearest neighbors

# Predict age group by majority vote of nearest neighbors
predicted_age_groups = []
for neighbors in I:
    neighbor_labels = y_train[neighbors]
    unique, counts = np.unique(neighbor_labels, return_counts=True)
    predicted_age_groups.append(str(unique[np.argmax(counts)]))

# Load the saved models
model_save_dir = 'models'
age_group_models = {}
age_groups = ['0-34', '34-60', '60-78', '78+']

for age_group in age_groups:
    model_filename = os.path.join(model_save_dir, f'EBM_model_age_group_{age_group}.joblib')
    if os.path.exists(model_filename):
        age_group_models[age_group] = joblib.load(model_filename)

# Prepare Train using top 30 features
def prepare_train_data(data, top_features):
    age_groups = data['age_group']
    X = []
    y = []
    for age_group in age_groups.unique():
        group_data = data[age_groups == age_group]
        features = top_features[age_group]
        X.append(group_data[features].to_numpy(dtype=np.float32))
        y.append(group_data['age_group'].to_numpy())
    return np.concatenate(X), np.concatenate(y)

X_test, y_test = prepare_train_data(test_data, top_30_features)

# Use the predicted age group to load the corresponding model and make final predictions
final_predictions = []
for i, age_group in enumerate(predicted_age_groups):
    if age_group in age_group_models:
        model = age_group_models[age_group]
        test_sample = X_test[i].reshape(1, -1)
        prediction = model.predict(test_sample)[0]
        final_predictions.append(prediction)

# Calculate errors
y_true = test_data['age'].to_numpy(dtype=np.float32)
y_pred = np.array(final_predictions)

mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

print(f"Root Mean Squared Error (RMSE) on Test Data: {mse**0.5:.4f}")
print(f"Mean Absolute Error (MAE) on Test Data: {mae:.4f}")


Root Mean Squared Error (RMSE) on Test Data: 6.0540
Mean Absolute Error (MAE) on Test Data: 3.8422
