In [1]:
import pandas as pd
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
# Read the per-age-group feature table and turn it into a mapping of
# {age-group label: list of the values in that column}. Column labels are
# passed through str() so later lookups can always use string keys.
top_30_features = {
    str(group_label): feature_names
    for group_label, feature_names in pd.read_csv(
        'top_30_features_all_age_group.csv', index_col=0
    ).to_dict(orient='list').items()
}
def split_data_by_age_cv(df, age_column='age', n_splits=5, random_state=42):
    """Build K-fold train/test splits whose rows are arranged by age decade.

    The folds come from a plain shuffled ``KFold`` over all rows of ``df``
    (they are not stratified by age). Inside every fold, the train and test
    frames are ordered decade by decade (``age // 10 * 10`` ascending) while
    keeping the fold's row order within a decade, and are re-indexed from 0.
    Rows whose age is missing are left out, as the previous groupby-based
    implementation did.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain ``age_column``. Any index is accepted.
    age_column : str, default 'age'
        Name of the numeric age column.
    n_splits : int, default 5
        Number of folds passed to ``KFold``.
    random_state : int, default 42
        Seed passed to ``KFold`` for the shuffle.

    Returns
    -------
    list of tuple(pandas.DataFrame, pandas.DataFrame)
        One ``(train_df, test_df)`` pair per fold.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    decade = (df[age_column] // 10 * 10).to_numpy()

    def _rows_by_decade(positions):
        # KFold yields *positional* row numbers, so rows must be taken with
        # iloc. Matching them against index labels (as before) silently
        # drops or mixes up rows whenever df.index is not exactly 0..n-1.
        keys = decade[positions]
        has_age = ~pd.isna(keys)
        positions, keys = positions[has_age], keys[has_age]
        # A stable sort keeps the fold's row order inside each decade.
        order = keys.argsort(kind='stable')
        return df.iloc[positions[order]].reset_index(drop=True)

    return [
        (_rows_by_decade(train_index), _rows_by_decade(test_index))
        for train_index, test_index in kf.split(df)
    ]

# Example usage: read the imputed dataset (replace the path with your actual
# data; the first CSV column becomes the index) and build the folds with the
# default settings.
imputed_data = pd.read_csv('imputed_data.csv', index_col=0)
fold_data = split_data_by_age_cv(df=imputed_data)

# TODO: plot the age distribution for the first fold's train and test sets (no plotting code follows this comment)


In [2]:

# Use the fold at index 4 (the last one when the folds were built with the
# default n_splits=5) as the working train/test split for the cells below.
train_data, test_data = fold_data[4][0], fold_data[4][1]

In [3]:
import os
import numpy as np
import pandas as pd
import faiss
from sklearn.metrics import accuracy_score, classification_report
# Map an age onto the lower bound of its ten-year bracket (e.g. 37 -> 30)
def age_to_group(age):
    """Return the start of the decade that ``age`` falls in."""
    completed_decades = age // 10
    return completed_decades * 10

# Attach each row's decade label to both splits (writes the 'age_group'
# column on the frames in place).
for _split_frame in (train_data, test_data):
    _split_frame['age_group'] = _split_frame['age'].apply(age_to_group)

# Turn a split into a feature matrix and a decade-label vector
def prepare_data(data):
    """Return ``(X, y)`` arrays for ``data``, arranged decade by decade.

    Rows are grouped by ``age // 10 * 10`` in order of first appearance.
    ``X`` holds every column except 'age' and 'age_group' as float32;
    ``y`` holds the 'age_group' column as int32. Both columns must exist.
    """
    decade_of_row = data['age'] // 10 * 10
    feature_blocks, label_blocks = [], []
    for decade in decade_of_row.unique():
        rows = data[decade_of_row == decade]
        feature_blocks.append(
            rows.drop(columns=['age', 'age_group']).to_numpy(dtype=np.float32)
        )
        label_blocks.append(rows['age_group'].to_numpy(dtype=np.int32))
    return np.concatenate(feature_blocks), np.concatenate(label_blocks)




# Feature matrices and decade labels for both splits
X_train, y_train = prepare_data(train_data)
X_test, y_test = prepare_data(test_data)

# L2-distance FAISS index holding every training vector
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(X_train)

# For each test row: distances (D) and training-row positions (I) of its k nearest neighbours
k = 11
D, I = index.search(X_test, k)


def _majority_label(labels):
    """Most frequent label; ties go to the smallest one (np.unique sorts, argmax takes the first maximum)."""
    values, votes = np.unique(labels, return_counts=True)
    return values[np.argmax(votes)]


# Predicted decade = majority vote among the neighbours' labels
y_pred = np.array([_majority_label(y_train[neighbors]) for neighbors in I])

# Report classification quality
accuracy = accuracy_score(y_test, y_pred)
print(f"Classification Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Side-by-side table of true and predicted decades (every test row, not only the misses)
error_table = pd.DataFrame({'Actual Age Group': y_test, 'Predicted Age Group': y_pred})
print(error_table)

Classification Accuracy: 0.6520
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       144
          10       0.95      0.98      0.96       413
          20       0.70      0.70      0.70       151
          30       0.57      0.50      0.53       226
          40       0.55      0.68      0.61       440
          50       0.53      0.57      0.55       488
          60       0.53      0.48      0.51       336
          70       0.51      0.28      0.36       148
          80       0.80      0.13      0.23        30
          90       0.50      0.17      0.25         6

    accuracy                           0.65      2382
   macro avg       0.66      0.55      0.57      2382
weighted avg       0.65      0.65      0.64      2382

      Actual Age Group  Predicted Age Group
0                    0                    0
1                    0                    0
2                    0                    0
3         

In [4]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_squared_error
from interpret.glassbox import ExplainableBoostingRegressor
from interpret import show
import matplotlib.pyplot as plt

# Load the top 30 features for each age group (keys normalised to strings)
top_30_features = pd.read_csv('top_30_features_all_age_group.csv', index_col=0).to_dict(orient='list')
top_30_features = {str(key): value for key, value in top_30_features.items()}

# Ensure the model save directory exists
model_save_dir = 'models'
os.makedirs(model_save_dir, exist_ok=True)

# Ensure the explanation save directory exists
explanation_save_dir = 'explanations'
os.makedirs(explanation_save_dir, exist_ok=True)

train_df, test_df = train_data, test_data

age_column = 'age'
age_groups = train_df[age_column] // 10 * 10
unique_age_groups = age_groups.unique()

models = {}
# One single-column frame per age group; combined once after the loop
# instead of re-concatenating the growing matrix on every iteration.
importance_columns = []

for age_group in unique_age_groups:
    group_data = train_df[age_groups == age_group]

    # Select the top 30 features for the current age group.
    # NOTE(review): the lookup key is str(age_group), e.g. '0.0' when the age
    # column is float - confirm the CSV column labels use the same form.
    top_features = top_30_features[str(age_group)]
    X_train = group_data[top_features]
    print(X_train.shape)
    y_train = group_data[age_column]

    model = ExplainableBoostingRegressor(interactions=5, greedy_ratio=0, inner_bags=14)
    model.fit(X_train, y_train)

    # Training (in-sample) loss - not an estimate of generalisation error
    predictions = model.predict(X_train)
    mse = mean_squared_error(y_train, predictions)

    models[age_group] = model

    # Save the model
    model_filename = os.path.join(model_save_dir, f'EBM_model_age_group_{age_group}.joblib')
    joblib.dump(model, model_filename)

    print(f"Age Group: {age_group}-{age_group+9}, MSE: {mse}, Model saved as: {model_filename}")

    # Build the global explanation once and reuse it for both the importance
    # scores and the saved figure.
    global_explanation = model.explain_global()
    feature_importance = global_explanation.data()
    feature_importance_df = pd.DataFrame(feature_importance['scores'][:30], columns=[f'Age Group {age_group}'])
    importance_columns.append(feature_importance_df)

    # Save global explanation.
    # show() renders through interpret's own visualisation rather than
    # matplotlib, so a following plt.savefig() only writes an empty canvas.
    # Persist the explanation's figure instead; HTML export is used because
    # it needs no extra image-export dependency.
    show(global_explanation)
    global_figure = global_explanation.visualize()
    if global_figure is not None:
        global_explanation_filename = os.path.join(
            explanation_save_dir, f'global_explanation_age_group_{age_group}.html'
        )
        global_figure.write_html(global_explanation_filename)

    # Save local explanation for a sample (the first of the explained rows)
    local_explanation = model.explain_local(X_train.iloc[:5], y_train.iloc[:5])
    show(local_explanation)
    local_figure = local_explanation.visualize(0)
    if local_figure is not None:
        local_explanation_filename = os.path.join(
            explanation_save_dir, f'local_explanation_age_group_{age_group}.html'
        )
        local_figure.write_html(local_explanation_filename)

# Print the feature importance matrix (one column per age group)
feature_importance_matrix = pd.concat(importance_columns, axis=1) if importance_columns else pd.DataFrame()
print(feature_importance_matrix)


(580, 30)
Age Group: 0.0-9.0, MSE: 0.06115656607487444, Model saved as: models\EBM_model_age_group_0.0.joblib


(1704, 30)
Age Group: 10.0-19.0, MSE: 0.13035405091561272, Model saved as: models\EBM_model_age_group_10.0.joblib


(574, 30)
Age Group: 20.0-29.0, MSE: 2.8261296512623084, Model saved as: models\EBM_model_age_group_20.0.joblib


(877, 30)
Age Group: 30.0-39.0, MSE: 3.089915243482211, Model saved as: models\EBM_model_age_group_30.0.joblib


(1741, 30)
Age Group: 40.0-49.0, MSE: 4.161589466250099, Model saved as: models\EBM_model_age_group_40.0.joblib


(1924, 30)
Age Group: 50.0-59.0, MSE: 5.3932709817306, Model saved as: models\EBM_model_age_group_50.0.joblib


(1392, 30)
Age Group: 60.0-69.0, MSE: 4.7428774316270275, Model saved as: models\EBM_model_age_group_60.0.joblib


(568, 30)
Age Group: 70.0-79.0, MSE: 4.211032339370485, Model saved as: models\EBM_model_age_group_70.0.joblib


(139, 30)
Age Group: 80.0-89.0, MSE: 1.5930211403185164, Model saved as: models\EBM_model_age_group_80.0.joblib


(26, 30)
Age Group: 90.0-99.0, MSE: 0.27286454406202937, Model saved as: models\EBM_model_age_group_90.0.joblib


(3, 30)
Age Group: 100.0-109.0, MSE: 1167.6968980975703, Model saved as: models\EBM_model_age_group_100.0.joblib


    Age Group 0.0  Age Group 10.0  Age Group 20.0  Age Group 30.0  \
0        0.094239        0.025142        0.190447        0.423137   
1        0.077629        0.032502        0.143500        0.158028   
2        0.087692        0.007632        0.060307        0.142383   
3        0.085297        0.005184        0.117107        0.176928   
4        0.081918        0.020781        0.074821        0.029513   
5        0.122286        0.019343        0.109038        0.114403   
6        0.074189        0.009641        0.220315        0.184398   
7        0.099230        0.019168        0.068759        0.035670   
8        0.086888        0.022453        0.609557        0.184991   
9        0.077531        0.014310        0.071893        0.092908   
10       0.082840        0.019401        0.071020        0.073473   
11       0.098920        0.017130        0.034790        0.050810   
12       0.060742        0.025398        0.047223        0.050858   
13       0.071635        0.021739 

In [4]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_squared_error
from interpret.glassbox import ExplainableBoostingRegressor

# Per-age-group feature lists, keyed by the age group's string label
top_30_features = {
    str(group_label): feature_names
    for group_label, feature_names in pd.read_csv(
        'top_30_features_all_age_group.csv', index_col=0
    ).to_dict(orient='list').items()
}

# Fitted models are written below this directory
model_save_dir = 'models'
os.makedirs(model_save_dir, exist_ok=True)

train_df = train_data
test_df = test_data

age_column = 'age'
age_groups = train_df[age_column] // 10 * 10
unique_age_groups = age_groups.unique()

models = {}
feature_importance_matrix = pd.DataFrame()

for age_group in unique_age_groups:
    # Training rows whose age falls into this decade
    in_this_group = age_groups == age_group
    group_data = train_df[in_this_group]

    # Restrict the inputs to the feature list stored for this decade
    top_features = top_30_features[str(age_group)]
    X_train = group_data[top_features]
    print(X_train.shape)
    y_train = group_data[age_column]

    model = ExplainableBoostingRegressor(
        interactions=5,
        greedy_ratio=0,
        inner_bags=14,
    )
    model.fit(X_train, y_train)

    # In-sample (training) mean squared error
    predictions = model.predict(X_train)
    mse = mean_squared_error(y_train, predictions)

    models[age_group] = model

    # Persist the fitted model, one file per decade
    model_filename = os.path.join(model_save_dir, f'EBM_model_age_group_{age_group}.joblib')
    joblib.dump(model, model_filename)

    print(f"Age Group: {age_group}-{age_group+9}, MSE: {mse}, Model saved as: {model_filename}")

    # Append this decade's first 30 global-explanation scores as a new column
    feature_importance = model.explain_global().data()
    feature_importance_df = pd.DataFrame(
        feature_importance['scores'][:30],
        columns=[f'Age Group {age_group}'],
    )
    feature_importance_matrix = pd.concat(
        [feature_importance_matrix, feature_importance_df], axis=1
    )

# Show the collected importance scores (one column per decade)
print(feature_importance_matrix)

(580, 30)
Age Group: 0.0-9.0, MSE: 0.06115656607487444, Model saved as: models\EBM_model_age_group_0.0.joblib
(1704, 30)
Age Group: 10.0-19.0, MSE: 0.13035405091561272, Model saved as: models\EBM_model_age_group_10.0.joblib
(574, 30)
Age Group: 20.0-29.0, MSE: 2.8261296512623084, Model saved as: models\EBM_model_age_group_20.0.joblib
(877, 30)
Age Group: 30.0-39.0, MSE: 3.089915243482211, Model saved as: models\EBM_model_age_group_30.0.joblib
(1741, 30)
Age Group: 40.0-49.0, MSE: 4.161589466250099, Model saved as: models\EBM_model_age_group_40.0.joblib
(1924, 30)
Age Group: 50.0-59.0, MSE: 5.3932709817306, Model saved as: models\EBM_model_age_group_50.0.joblib
(1392, 30)
Age Group: 60.0-69.0, MSE: 4.7428774316270275, Model saved as: models\EBM_model_age_group_60.0.joblib
(568, 30)
Age Group: 70.0-79.0, MSE: 4.211032339370485, Model saved as: models\EBM_model_age_group_70.0.joblib
(139, 30)
Age Group: 80.0-89.0, MSE: 1.5930211403185164, Model saved as: models\EBM_model_age_group_80.0.jo

In [None]:
import os
import numpy as np
import pandas as pd
import joblib
import faiss
from sklearn.metrics import mean_squared_error, mean_absolute_error



# Label an age with the lower bound of its decade, as text (e.g. 37.0 -> '30.0')
def age_to_group(age):
    """Return ``str`` of the start of the ten-year bracket containing ``age``."""
    decade_start = (age // 10) * 10
    return str(decade_start)

# Attach each row's decade label (a string here) to both splits, writing the
# 'age_group' column on the frames in place.
for _split_frame in (train_data, test_data):
    _split_frame['age_group'] = _split_frame['age'].apply(age_to_group)


def prepare_data(data):
    """Return ``(X, y)`` arrays for ``data``, arranged group by group.

    Rows are grouped by the 'age_group' column in order of first appearance.
    ``X`` holds every column except 'age' and 'age_group' as float32; ``y``
    holds the matching 'age_group' values without any dtype conversion.
    """
    group_of_row = data['age_group']
    feature_blocks, label_blocks = [], []
    for group_label in group_of_row.unique():
        rows = data[group_of_row == group_label]
        feature_blocks.append(
            rows.drop(columns=['age', 'age_group']).to_numpy(dtype=np.float32)
        )
        label_blocks.append(rows['age_group'].to_numpy())
    return np.concatenate(feature_blocks), np.concatenate(label_blocks)


# Feature matrices and (string) decade labels for both splits
X_train, y_train = prepare_data(train_data)
X_test, y_test = prepare_data(test_data)

# L2-distance FAISS index holding every training vector
dimension = X_train.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(X_train)

# For each test row: distances (D) and training-row positions (I) of its k nearest neighbours
k = 11
D, I = index.search(X_test, k)


def _most_common_label(labels):
    """Most frequent label; ties go to the first one in np.unique's sorted order."""
    values, votes = np.unique(labels, return_counts=True)
    return values[np.argmax(votes)]


# Predicted decade per test row = majority vote among its neighbours' labels
predicted_age_groups = [str(_most_common_label(y_train[neighbors])) for neighbors in I]

# Load whichever per-decade models exist on disk; decades without a saved
# file are simply absent from the mapping.
# NOTE: joblib.load unpickles the file - only load model files you created.
model_save_dir = 'models'
age_groups = ['0.0', '10.0', '20.0', '30.0', '40.0', '50.0', '60.0', '70.0', '80.0', '90.0', '100.0']
age_group_models = {}

for age_group in age_groups:
    model_filename = os.path.join(model_save_dir, f'EBM_model_age_group_{age_group}.joblib')
    if not os.path.exists(model_filename):
        continue
    age_group_models[age_group] = joblib.load(model_filename)





# Build arrays using each age group's own feature list
def prepare_train_data(data, top_features):
    """Return ``(X, y)`` where every group's rows use that group's columns.

    Rows are grouped by the 'age_group' column in order of first appearance.
    For each group the columns listed in ``top_features[group]`` are taken as
    float32, so a given column position of ``X`` can refer to different
    features in different groups. All lists must have the same length for the
    final concatenation to succeed. ``y`` holds the 'age_group' values.
    """
    group_of_row = data['age_group']
    feature_blocks, label_blocks = [], []
    for group_label in group_of_row.unique():
        rows = data[group_of_row == group_label]
        selected_columns = top_features[group_label]
        feature_blocks.append(rows[selected_columns].to_numpy(dtype=np.float32))
        label_blocks.append(rows['age_group'].to_numpy())
    return np.concatenate(feature_blocks), np.concatenate(label_blocks)



# Arrange the test rows exactly as prepare_data() did when the KNN predictions
# were produced (grouped by 'age_group' in order of first appearance), so that
# predicted groups, model inputs and true ages line up row for row instead of
# relying on test_data already being stored in that order.
ordered_test = pd.concat(
    [test_data[test_data['age_group'] == group] for group in test_data['age_group'].unique()],
    ignore_index=True,
)
predicted_groups = np.asarray(predicted_age_groups, dtype=object)
if len(predicted_groups) != len(ordered_test):
    raise ValueError(
        f"Got {len(predicted_groups)} predicted age groups for {len(ordered_test)} test rows"
    )

# Use the predicted age group to pick BOTH the model and its feature list.
# Previously the features came from each row's true age group, which leaked
# the label and handed the wrong columns to the model whenever the predicted
# group differed from the true one.
predictions_by_row = np.full(len(ordered_test), np.nan)
scored = np.zeros(len(ordered_test), dtype=bool)
for age_group in pd.unique(predicted_groups):
    model = age_group_models.get(age_group)
    if model is None:
        # No saved model for this group: leave these rows unscored and report below
        continue
    rows = predicted_groups == age_group
    group_features = ordered_test.loc[rows, top_30_features[age_group]]
    predictions_by_row[rows] = model.predict(group_features)
    scored[rows] = True

skipped = int((~scored).sum())
if skipped:
    # Say so explicitly rather than letting the metric call fail on mismatched lengths
    print(f"Skipped {skipped} test rows whose predicted age group has no saved model")

final_predictions = predictions_by_row[scored].tolist()

# Calculate errors on the rows that received a prediction
y_true = ordered_test.loc[scored, 'age'].to_numpy(dtype=np.float32)
y_pred = np.array(final_predictions)

# mean_squared_error returns the MSE by default; the `squared` argument is
# deprecated in recent scikit-learn releases, so it is no longer passed.
mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

print(f"Root Mean Squared Error (RMSE) on Test Data: {mse**0.5:.4f}")
print(f"Mean Absolute Error (MAE) on Test Data: {mae:.4f}")