In [205]:
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mutual_info_score, accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import ConfusionMatrixDisplay, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_predict
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tqdm.auto import tqdm
from xgboost import XGBClassifier, XGBRegressor

# Project-local helpers stay last so their star-imported names keep the same precedence as before.
from rankers import *
from hyper_param_search import *

In [7]:
# Load the cohort CSV from a path relative to the notebook's working directory
# and preview the first five rows.
data_path = './mena_ml_data.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,person_id,hdl,ldl,trig,sbp,dbp,bmi,HR,egfr,cre_alb,...,no_health_insurance,poverty,vacant_housing,deprivation_index,mi_status,t2d_status,mace_status,hf_status,scd_status,stroke_status
0,2825659,45.0,64.0,156.0,109.0,71.0,25.84,76.0,,,...,7.192906,9.716275,5.911106,0.225707,1,0,1,0,0,0
1,1769136,43.0,77.0,350.0,146.0,86.0,25.299999,72.0,60.0,,...,2.828384,8.649236,6.690513,0.247603,1,0,1,1,0,0
2,2582132,45.0,74.0,171.0,172.0,60.0,30.82,62.0,18.0,,...,6.863952,17.753432,11.598508,0.314289,0,1,0,0,0,0
3,2966613,36.0,115.0,128.0,116.0,86.0,,75.0,99.0,,...,2.948661,9.679384,6.108289,0.251716,0,0,0,0,0,0
4,2323362,43.0,75.0,166.0,133.0,76.0,29.86,55.0,,,...,10.27812,14.373649,6.069094,0.291387,0,1,0,0,0,0


In [198]:
def view_data(df_full):
    """
    Print, for every column of a DataFrame, the percentage of rows that are NaN.

    Parameters:
    df_full (DataFrame): The frame to inspect.

    Returns:
    None: Nothing is returned; one line per column is printed to the console.
    """
    total_rows = len(df_full)

    # Lazily pair each column name with its NaN share, rounded to one decimal.
    report = (
        (name, round(df_full[name].isna().sum() / total_rows * 100, 1))
        for name in df_full.columns
    )

    for name, pct in report:
        print(f'The column {name} has {pct}% NaN values.')


def get_data(df_full, thresh=0.8,
             columns_to_drop=['person_id', 'egfr', 'cre_alb', 'statins_status',
                              'antihts_status', 'race', 'race_concept_id','t2d_status',
                              'mace_status','hf_status','scd_status','stroke_status'],
             labels=['mi_status']):

    # Separate the labels early on
    y = df_full[labels]

    # Drop specified columns
    df_clean = df_full.drop(columns=columns_to_drop + labels)

    # Calculate the threshold for the minimum number of non-missing values
    threshold = thresh * len(df_clean)

    # Drop columns with more than 20% missing values
    df_clean = df_clean.dropna(axis=1, thresh=threshold)

    # Drop rows with any missing values
    combined = pd.concat([df_clean, y], axis=1)  # combine to make sure rows are consistent
    combined_clean = combined.dropna()
    df_clean = combined_clean[df_clean.columns]
    y = combined_clean[labels]

    # Initialize the Label Encoder
    le = LabelEncoder()

    # Identify columns that need to be encoded (those of type 'object')
    columns_to_encode = df_clean.select_dtypes(include=['object', 'string']).columns

    # Apply Label Encoder to each categorical string column
    for column in columns_to_encode:
        df_clean[column] = le.fit_transform(df_clean[column])

    # Separate the cleaned DataFrame into features (X) and labels (y)
    X = df_clean

    return X, y

In [191]:
# Build the feature frame X and label frame y using get_data's default
# drop list and its default 'mi_status' label.
X, y = get_data(df)

In [206]:
# Run the hyper-parameter search helper on the full data set.
# NOTE(review): the recorded log ("Fitting 3 folds for each of 1 candidates")
# suggests the positional 3 is the fold count and 1 the candidate count - confirm
# against the helper's signature.
# NOTE(review): the recorded run ends in
# "NameError: name 'plot_confusion_matrix' is not defined". That name is not used
# in this cell, so it is presumably raised inside the star-imported helper - verify there.
hypers = classification_hyper_param_search(X, y, 3, 1)
hypers

Fitting 3 folds for each of 1 candidates, totalling 3 fits


NameError: name 'plot_confusion_matrix' is not defined

In [21]:
# Take the first row's 'best_params' entry of each score table as a plain dict.
# NOTE(review): rf_scores / xb_scores are not assigned in any earlier cell of this
# notebook; they are only created inside the later per-race loop cell, so this
# cell depends on out-of-order execution - confirm where they should come from.
rf_hyper = dict(rf_scores['best_params'].iloc[0])
xb_hyper = dict(xb_scores['best_params'].iloc[0])
rf_hyper

{'n_estimators': 889, 'max_features': 'log2', 'max_depth': 778}

In [175]:
# Rank the features with the project helper, passing the random-forest and
# XGBoost hyper-parameter dicts chosen above.
# NOTE(review): the helper's exact contract is not visible in this notebook.
ranking = classification_ranking(X, y, rf_hyper, xb_hyper)

Finding C Step:   0%|          | 0/30 [00:00<?, ?it/s]

Ranking l1 Classification:   0%|          | 0/10000 [00:00<?, ?it/s]

In [176]:
# Display the ranking table built in the previous cell.
ranking

Unnamed: 0,RF,Score,XG,Score.1,MI,Score.2,F,Score.3,L1,Score.4
0,age,0.121631,median_income,0.220408,age,0.021554,age,50.090692,age,0.019
1,ldl,0.121471,poverty,0.124953,trig,0.021496,sex_at_birth,19.674834,ldl,0.033
2,trig,0.091473,trig,0.090837,ldl,0.020969,ldl,18.447975,sex_at_birth,0.033
3,sbp,0.085195,ldl,0.081631,sex_at_birth,0.014419,hdl,14.986263,hdl,0.037
4,bmi,0.077955,zip_code,0.069088,hdl,0.011021,trig,14.357639,trig,0.042
5,hdl,0.075046,assisted_income,0.059192,zip_code,0.010966,dbp,6.366412,dbp,0.055
6,dbp,0.068696,deprivation_index,0.055863,median_income,0.00823,HR,3.024215,sbp,0.131
7,HR,0.058193,sbp,0.052083,no_health_insurance,0.006513,sbp,0.439655,zip_code,0.141
8,zip_code,0.043575,high_school_education,0.046118,smoking_status,0.005207,bmi,0.341425,median_income,0.217
9,no_health_insurance,0.038478,HR,0.042916,bmi,0.002352,no_health_insurance,0.279309,HR,0.344


In [None]:
def voting(df):
    """
    Combine per-model feature orderings into one weighted score per feature,
    separately for every target variable.

    Within each ordering the last feature earns 1 point, the one before it 2,
    and so on; the points are multiplied by the model's weight and summed
    across models.

    Parameters:
    df (DataFrame): Must contain 'target_col' plus the ordering columns
        'Lasso order', 'RF order', 'XGB order', 'RFE order' and 'MI order'.

    Returns:
    dict: Maps each target value to a list of (feature, score) tuples sorted by
        score, highest first.
    """
    # Ordering column paired with the weight of the model that produced it.
    weighted_columns = (
        ('Lasso order', 0.3),
        ('RF order', 0.2),
        ('XGB order', 0.2),
        ('RFE order', 0.1),
        ('MI order', 0.1),
    )

    rankings_by_target = {}

    for target in df['target_col'].unique():
        subset = df[df['target_col'] == target]
        tally = {}

        for column_name, weight in weighted_columns:
            ordered_features = subset[column_name].tolist()

            # Walk from worst to best so the points grow with the rank.
            for points, feature in enumerate(reversed(ordered_features), start=1):
                tally[feature] = tally.get(feature, 0) + points * weight

        rankings_by_target[target] = sorted(
            tally.items(), key=lambda pair: pair[1], reverse=True
        )

    return rankings_by_target

In [None]:
# Per-race pipeline: hyper-parameter search -> feature ranking -> weighted vote
# -> one horizontal bar chart per target, saved as a PNG.
# NOTE(review): `races`, `Xs`, `ys` and `model_params` are not defined in any cell
# shown in this notebook - confirm they exist before this cell runs.
# tqdm wraps the iterable itself rather than the enumerate object, so the
# progress bar can show a total when `races` has a length.
for i, race in enumerate(tqdm(races)):
    hypers = hyper_param_search(Xs[i], ys[i], 5, model_params, 20)

    # Split the search results per model and persist them for later runs.
    xb_scores = hypers[hypers['model'] == 'XGBoost'].reset_index(drop=True)
    rf_scores = hypers[hypers['model'] == 'random_forest'].reset_index(drop=True)
    xb_scores.to_csv(f'xb_hypers_social_det_{race}.csv', index=False)
    rf_scores.to_csv(f'rf_hypers_social_det_{race}.csv', index=False)

    ranks = classification_ranking(Xs[i], ys[i], rf_scores, xb_scores, 1)

    scoring = voting(ranks)

    for key, ranked in scoring.items():
        # An empty ranking cannot be unzipped into features/scores; skip it.
        if not ranked:
            continue

        # Split the features and scores
        features, scores = zip(*ranked)
        height_per_feature = 0.5
        fig_height = len(features) * height_per_feature

        # Create a figure and a set of subplots
        fig, ax = plt.subplots(figsize=(10, fig_height))
        fig.patch.set_facecolor('white')

        # Plot a horizontal bar chart
        ax.barh(features, scores, color='blue', alpha=0.6)
        # Invert the y-axis to have the highest score at the top
        ax.invert_yaxis()
        # Set labels for the x-axis and the plot with a white background
        label_opts = {'color': 'black', 'bbox': dict(facecolor='white', edgecolor='none')}
        ax.set_xlabel('Scores', **label_opts)
        ax.set_ylabel('Features', **label_opts)
        ax.set_title(f'Ranking of Features for {key} in {race}')

        # Large black tick marks and tick labels on both axes
        ax.tick_params(axis='both', which='both', labelsize='large', labelcolor='black', colors='black')

        # Save and show the plot with all elements included
        fig.savefig(f'feature_{key}_{race}_rank.png', bbox_inches='tight', transparent=False, dpi=300)
        plt.show()
        # Release the figure; otherwise one open figure per target and race accumulates.
        plt.close(fig)

In [None]:
# 1. Split the data into training and testing sets
# (20% held out for testing; random_state=42 fixes the seed so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def sanitize_column_names(df):
    """
    Replace the characters "[", "]" and "<" in every column name with "_".

    The frame is renamed in place and the same object is returned.
    """
    to_underscore = str.maketrans({"[": "_", "]": "_", "<": "_"})
    df.columns = [name.translate(to_underscore) for name in df.columns]
    return df

# Use the function before training
X_train = sanitize_column_names(X_train)
X_test = sanitize_column_names(X_test)

# XGBClassifier is imported by name at the top of the notebook; no `xgb` module
# alias is imported there, so the class is referenced directly.
# NOTE(review): `use_label_encoder` is reported as deprecated by recent xgboost
# releases - confirm against the installed version before removing it.
cls = XGBClassifier(use_label_encoder=False, eval_metric='error')
cls.fit(X_train, y_train)
y_pred = cls.predict(X_test)

In [None]:
# Ensure both y_test and y_pred are 1D arrays
y_test = np.array(y_test).ravel()
y_pred = np.array(y_pred).ravel()

# Report classification accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.3f}')

# Report other classification metrics (precision, recall, F1-score, etc.)
print(classification_report(y_test, y_pred))

# Take the tick labels from the classes that actually occur, so the axes always
# match the matrix size instead of relying on a hard-coded label list.
class_labels = np.unique(np.concatenate([y_test, y_pred]))
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

# Plot confusion matrix with scikit-learn's display helper (seaborn is not
# imported in this notebook).
fig, ax = plt.subplots(figsize=(8, 6))
ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=class_labels).plot(
    cmap='Blues', values_format='d', ax=ax
)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title(f'Confusion Matrix\nAccuracy: {accuracy:.3f}')
plt.show()

In [None]:
# Ensure both y_test and y_pred are 1D arrays
y_test = np.array(y_test).ravel()
y_pred = np.array(y_pred).ravel()

# 5. Report the Pearson correlation
corr, _ = pearsonr(y_test, y_pred)
print(f'Pearson correlation: {corr:.3f}')

# Report the R^2 score. It is computed from y_pred directly, so the number in
# the title describes exactly the points plotted below and the cell does not
# depend on a separately fitted estimator object.
r2 = r2_score(y_test, y_pred)
print(f'R^2 score: {r2:.3f}')

# 6. Plot the predicted values against the true labels
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_xlabel('True Labels')
ax.set_ylabel('Predictions')
ax.set_title(f'True vs Predicted Values\nR^2 score: {r2:.3f}')
# Diagonal reference line: points on it are perfect predictions.
lower, upper = y_test.min(), y_test.max()
ax.plot([lower, upper], [lower, upper], color='red')
plt.show()