In [48]:
import pandas as pd
import numpy as np

We'll train both a Random Forest Classifier and a K-Nearest Neighbors Classifier using our menopause data. We'll then compare the two models and see which one performs better.

# 📖 Loading codebook values

### ℹ Reading in the data and variables to be used

In [49]:
menopause_data = pd.read_csv('./datasets/visit_dfs.csv', dtype='str')
menopause_data.head()

Unnamed: 0,HAVEPER,EXPENSI,BROKEBO,AGE,EXERCIS,DNTKNOW,AVCIGDA,NOREASO,SIDEEFF,OSTEOPO,...,DIETNUT,LIKEFEL,COMBIN2,BONES3,PRGNANT,SMOKERE,ESTRNJ1,STOPOTH,BONES2,HOURSPA
0,-1,-1,0,53,-9999,-1,-1,-1,-1,1,...,-9999,-1,-9999,-1,1,1,1,-1,-1,-1
1,-1,-1,0,52,-9999,-1,-1,-1,-1,-1,...,-9999,-1,-9999,-1,1,1,1,-1,-1,-1
2,-1,-1,1,46,-9999,-1,-1,-1,-1,-1,...,-9999,-1,-9999,-1,1,1,1,-1,-1,-1
3,-1,-1,0,50,-9999,-1,-1,-1,-1,-1,...,-9999,-1,-9999,-1,1,1,1,-1,-1,4
4,-1,-1,0,52,-9999,-1,-1,-1,-1,-1,...,-9999,-1,-9999,-1,1,1,1,-1,-1,-1


In [50]:
ages = menopause_data['AGE'].astype(int).where(menopause_data['AGE'].astype(int) >= 0)
print(
    'Ages in dataset:',
    f'Min age: {ages.min()}',
    f'Max age: {ages.max()}',
    sep='\n'
)

Ages in dataset:
Min age: 42.0
Max age: 64.0


In [51]:
with open('./lista_variables/variables_selected_final.txt', 'r') as f:
    features = f.read().splitlines()

features

['MENODEP',
 'EXERCIS',
 'DIETNUT',
 'OUTCOME',
 'ESTRNJ2',
 'COMBIN2',
 'ESTROG2',
 'E2AVE',
 'ALCHL24',
 'ESTRDA2',
 'HOURSPA',
 'HEIGHT',
 'ESTRDA1',
 'WEIGHT',
 'HAPPY',
 'PRGNANT',
 'VAGINDR',
 'SMOKERE',
 'HOTFLAS',
 'PHYSILL',
 'AVCIGDA',
 'TRBLSLE',
 'CANCERS',
 'DIABETE',
 'BROKEBO',
 'OSTEOPR',
 'STATUS',
 'ESTROG1',
 'ESTRNJ1',
 'AGE',
 'RACE',
 'NUMHOTF',
 'VISIT']

In [52]:
targets = [
    'STATUS',    # Indicating if the patient is menopausal or not, among other options
    'NUMHOTF',  # Number of hot flashes daily
    'OSTEOPR',  # Indicating if the patient has osteoporosis or not
    'MENODEP',  # Indicating if the patient has depression or not
]

# Remove targets from features
for target in targets:
    features.remove(target)

features.remove('VISIT')  # Visit should not be used as a feature

> Do note that our variables are mainly categorical, but not all of them. We'll read a JSON file to know which variables aren't categorical and we'll change them to float.

In [53]:
menopause_data = menopause_data[features + targets]

In [54]:
import json

with open('./lista_variables/variables_selected_labels.json', 'r') as file:
    features_labels = json.load(file)

### ✅ Parsing data to correct data type

In [55]:
integer_features = features_labels['NUMERICAL']['vars']['int']
integer_features.remove('VISIT')
float_features = features_labels['NUMERICAL']['vars']['float']

def parse_numbers():

    for feature in integer_features:
        menopause_data[feature] = menopause_data[feature].astype(int)

    for feature in float_features:
        menopause_data[feature] = menopause_data[feature].astype(float)

In [56]:
parse_numbers()
menopause_data.dtypes

EXERCIS     object
DIETNUT     object
OUTCOME     object
ESTRNJ2     object
COMBIN2     object
ESTROG2     object
E2AVE      float64
ALCHL24     object
ESTRDA2     object
HOURSPA     object
HEIGHT     float64
ESTRDA1     object
WEIGHT     float64
HAPPY       object
PRGNANT     object
VAGINDR     object
SMOKERE     object
HOTFLAS     object
PHYSILL     object
AVCIGDA      int32
TRBLSLE     object
CANCERS     object
DIABETE     object
BROKEBO      int32
ESTROG1     object
ESTRNJ1     object
AGE          int32
RACE        object
STATUS      object
NUMHOTF    float64
OSTEOPR     object
MENODEP     object
dtype: object

### ✨ Pipeline for negative values in data

In some of the variables, negative values are used to indicate missing data, or that the patient refused to answer. We'll create a pipeline to replace these negative values with NaNs,
and we'll use KNN to impute the missing values.

This will aid in standardizing the data.

In [57]:
numerical_features = integer_features + float_features

In [58]:
# Set negatives as NaN
menopause_data[numerical_features] = menopause_data[numerical_features].apply(lambda x: x.where(x > 0, np.nan))


from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)

# DONE
# menopause_data[numerical_features] = imputer.fit_transform(menopause_data[numerical_features])

In [59]:
menopause_data = pd.read_csv('./datasets/visit_dfs_imputed.csv', dtype='str', index_col=0)
# Imputed values by KNN are float, so parse all as float
menopause_data[numerical_features] = menopause_data[numerical_features].astype(float)

In [60]:
menopause_data[numerical_features].isna().sum()  # Hopefully no NaNs!

AVCIGDA    0
BROKEBO    0
AGE        0
NUMHOTF    0
E2AVE      0
HEIGHT     0
WEIGHT     0
dtype: int64

In [61]:
# Save dataset
# DONE
# menopause_data.to_csv('./datasets/visit_dfs_imputed.csv', index=True)

# 🧠 Model training!

In [62]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from mlflow import log_metric, log_param, log_artifact, start_run


In [63]:
# Shuffle order to avoid order bias
menopause_data = shuffle(menopause_data)


### 🧮 Categorical values picky tuning

For our purposes, we will use a classifier, so we need to encode numerical targets as categorical ones. Luckily, only 'NUMHOTF' is needed to be processed.

In [64]:
def get_hotf_category(hot_flashes: float):

    if hot_flashes <= 0 and hot_flashes < 5:
        return 1
    elif hot_flashes >= 5 and hot_flashes < 10:
        return 2
    elif hot_flashes >= 10 and hot_flashes < 15:
        return 3
    elif hot_flashes >= 15 and hot_flashes < 20:
        return 4
    elif hot_flashes >= 20 and hot_flashes < 25:
        return 5
    elif hot_flashes >= 25 and hot_flashes < 30:
        return 6
    else:
        return 7

In [65]:
menp_data_encoded = menopause_data.copy()

# Turn 'NUMHOTF' into a categorical feature
menp_data_encoded['NUMHOTF'] = menp_data_encoded['NUMHOTF'].apply(lambda x: get_hotf_category(x)).astype(str)

In [66]:
from pandas import DataFrame
from sklearn.preprocessing import LabelEncoder

""" Encode categorical features for KNN model """

def encode_labels(df: DataFrame, targets: list[str]):

    le = LabelEncoder()

    for target in targets:
        df[target] = le.fit_transform(df[target])


In [67]:
non_numerical_features = list(set(features) - set(numerical_features))

encode_labels(menp_data_encoded, non_numerical_features)

In [68]:
menp_data_encoded[non_numerical_features] = menp_data_encoded[non_numerical_features].astype(int)
menp_data_encoded[non_numerical_features].dtypes

EXERCIS    int32
ESTRNJ1    int32
HOTFLAS    int32
CANCERS    int32
RACE       int32
OUTCOME    int32
ESTRDA1    int32
DIABETE    int32
ESTROG1    int32
ESTRDA2    int32
HAPPY      int32
DIETNUT    int32
VAGINDR    int32
PHYSILL    int32
SMOKERE    int32
HOURSPA    int32
TRBLSLE    int32
PRGNANT    int32
ALCHL24    int32
ESTRNJ2    int32
COMBIN2    int32
ESTROG2    int32
dtype: object

So far, all of out features are numerical and all out targets are categorical!

In [69]:
menp_data_encoded[features].dtypes

EXERCIS      int32
DIETNUT      int32
OUTCOME      int32
ESTRNJ2      int32
COMBIN2      int32
ESTROG2      int32
E2AVE      float64
ALCHL24      int32
ESTRDA2      int32
HOURSPA      int32
HEIGHT     float64
ESTRDA1      int32
WEIGHT     float64
HAPPY        int32
PRGNANT      int32
VAGINDR      int32
SMOKERE      int32
HOTFLAS      int32
PHYSILL      int32
AVCIGDA    float64
TRBLSLE      int32
CANCERS      int32
DIABETE      int32
BROKEBO    float64
ESTROG1      int32
ESTRNJ1      int32
AGE        float64
RACE         int32
dtype: object

In [70]:
menp_data_encoded[targets].dtypes

STATUS     object
NUMHOTF    object
OSTEOPR    object
MENODEP    object
dtype: object

### 👨‍⚖️ Standardizing the data

In [73]:
# Standardize numerical features
scaler = MinMaxScaler()
std_menp_data = menp_data_encoded.copy()
std_menp_data.head()

Unnamed: 0,EXERCIS,DIETNUT,OUTCOME,ESTRNJ2,COMBIN2,ESTROG2,E2AVE,ALCHL24,ESTRDA2,HOURSPA,...,DIABETE,BROKEBO,ESTROG1,ESTRNJ1,AGE,RACE,STATUS,NUMHOTF,OSTEOPR,MENODEP
2303,1,1,1,2,2,1,92.45,3,0,0,...,3,1.2,2,2,44.0,0,5,7,1,-9999
24383,3,3,0,0,0,0,16.2,4,0,6,...,3,1.0,2,2,53.0,2,2,7,1,-9999
441,1,1,1,2,2,1,45.4,3,0,0,...,3,1.2,2,2,47.0,0,4,7,1,-9999
14350,1,1,0,0,3,0,17.75,3,0,0,...,3,1.2,2,2,48.0,3,7,7,1,-9999
18812,3,3,0,0,0,0,27.39,2,0,0,...,3,1.0,2,2,51.0,2,2,7,1,-9999


In [75]:
std_menp_data[features] = scaler.fit_transform(menopause_data[features])
std_menp_data[features].head()

Unnamed: 0,EXERCIS,DIETNUT,OUTCOME,ESTRNJ2,COMBIN2,ESTROG2,E2AVE,ALCHL24,ESTRDA2,HOURSPA,...,PHYSILL,AVCIGDA,TRBLSLE,CANCERS,DIABETE,BROKEBO,ESTROG1,ESTRNJ1,AGE,RACE
2303,0.0,0.0,0.0,0.0,0.0,0.0,0.012956,0.9999,0.9997,0.9993,...,0.9996,0.250847,1.0,0.9999,0.9999,0.066667,0.9999,0.9999,0.090909,0.0
24383,1.0,1.0,0.9993,0.9997,0.9997,0.9997,0.00204,1.0,0.9997,0.9996,...,0.9998,0.227119,0.9998,0.9999,0.9999,0.0,0.9999,0.9999,0.5,0.5
441,0.0,0.0,0.0,0.0,0.0,0.0,0.00622,0.9999,0.9997,0.9993,...,0.9996,0.169492,0.9997,0.9999,0.9999,0.066667,0.9999,0.9999,0.227273,0.0
14350,0.0,0.0,0.9993,0.9997,0.9999,0.9997,0.002262,0.9999,0.9997,0.9993,...,0.998601,0.094915,0.998601,0.9999,0.9999,0.066667,0.9999,0.9999,0.272727,0.75
18812,1.0,1.0,0.9993,0.9997,0.9997,0.9997,0.003642,0.0,0.9997,0.9993,...,0.9996,0.176271,1.0,0.9999,0.9999,0.0,0.9999,0.9999,0.409091,0.5


### ⚙ Defining resources

In [76]:
# Define cross-validation rules

cv_seed = 1
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=cv_seed)

In [77]:
# Classification models
knn_classifier = KNeighborsClassifier(
    n_neighbors=5
)
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=0
)

# Define as multioutput classifier
multi_knn_classifier = MultiOutputClassifier(knn_classifier, n_jobs=-1)
multi_rf_classifier = MultiOutputClassifier(rf_classifier, n_jobs=-1)

### 🏋️ Training and validating classifiers

In [79]:
models_performance = {
    'knn': {
        'model': multi_knn_classifier,
        'scores': cross_val_score(
            multi_knn_classifier,
            std_menp_data[features], 
            std_menp_data[targets],
            scoring='neg_mean_absolute_error',
            error_score='raise',
            cv=cv, 
            n_jobs=-1
        )
    },
    'rf': {
        'model': multi_rf_classifier,
        'scores': cross_val_score(
            multi_rf_classifier,
            std_menp_data[features], 
            std_menp_data[targets],
            scoring='neg_mean_absolute_error', 
            cv=cv, 
            n_jobs=-1
        )
    }

}

In [28]:
models_performance['knn']['scores']

array([-0.07718713, -0.07493135, -0.06522166, -0.07875638, -0.07453903,
       -0.07640251, -0.08003138, -0.06877943, -0.07672684, -0.07319466,
       -0.07434288, -0.07306787, -0.07914869, -0.07659867, -0.07365634,
       -0.0715967 , -0.07385249, -0.07397959, -0.07633438, -0.07339089,
       -0.07571597, -0.0751275 , -0.0751275 , -0.07434288, -0.07012554,
       -0.07306787, -0.07208709, -0.07339089, -0.07309655, -0.07702119])

In [30]:
n_scores = np.absolute(models_performance['knn']['scores'])
print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

MAE: 0.074 (0.003)


In [31]:
n_scores = np.absolute(models_performance['rf']['scores'])
print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

MAE: 0.005 (0.001)
