In [1]:
import pandas as pd
import numpy as np

We'll train both a Random Forest Classifier and a K-Nearest Neighbors Classifier using our menopause data. We'll then compare the two models and see which one performs better.

# 📖 Loading codebook values

### ℹ Reading in the data and variables to be used

In [27]:
menopause_data = pd.read_csv('./datasets/visit_dfs.csv', dtype='str')
menopause_data.head()

Unnamed: 0,HAVEPER,EXPENSI,BROKEBO,AGE,EXERCIS,DNTKNOW,AVCIGDA,NOREASO,SIDEEFF,OSTEOPO,...,DIETNUT,LIKEFEL,COMBIN2,BONES3,PRGNANT,SMOKERE,ESTRNJ1,STOPOTH,BONES2,HOURSPA
0,-1,-1,0,53,-9999,-1,-1,-1,-1,1,...,-9999,-1,-9999,-1,1,1,1,-1,-1,-1
1,-1,-1,0,52,-9999,-1,-1,-1,-1,-1,...,-9999,-1,-9999,-1,1,1,1,-1,-1,-1
2,-1,-1,1,46,-9999,-1,-1,-1,-1,-1,...,-9999,-1,-9999,-1,1,1,1,-1,-1,-1
3,-1,-1,0,50,-9999,-1,-1,-1,-1,-1,...,-9999,-1,-9999,-1,1,1,1,-1,-1,4
4,-1,-1,0,52,-9999,-1,-1,-1,-1,-1,...,-9999,-1,-9999,-1,1,1,1,-1,-1,-1


pandas.core.series.Series

In [41]:
ages = menopause_data['AGE'].astype(int).where(menopause_data['AGE'].astype(int) >= 0)
print(
    'Ages in dataset:',
    f'Min age: {ages.min()}',
    f'Max age: {ages.max()}',
    sep='\n'
)

Ages in dataset:
Min age: 42.0
Max age: 64.0


In [3]:
with open('./lista_variables/variables_selected_final.txt', 'r') as f:
    features = f.read().splitlines()

features

['MENODEP',
 'EXERCIS',
 'DIETNUT',
 'OUTCOME',
 'ESTRNJ2',
 'COMBIN2',
 'ESTROG2',
 'E2AVE',
 'ALCHL24',
 'ESTRDA2',
 'HOURSPA',
 'HEIGHT',
 'ESTRDA1',
 'WEIGHT',
 'HAPPY',
 'PRGNANT',
 'VAGINDR',
 'SMOKERE',
 'HOTFLAS',
 'PHYSILL',
 'AVCIGDA',
 'TRBLSLE',
 'CANCERS',
 'DIABETE',
 'BROKEBO',
 'OSTEOPR',
 'STATUS',
 'ESTROG1',
 'ESTRNJ1',
 'AGE',
 'RACE',
 'NUMHOTF',
 'VISIT']

> Do note that our variables are mainly categorical, but not all of them. We'll read a JSON file to know which variables aren't categorical and we'll change them to float.

In [4]:
menopause_data = menopause_data[features]

In [5]:
import json

with open('./lista_variables/variables_selected_labels.json', 'r') as file:
    features_labels = json.load(file)

### ✅ Parsing data to correct data type

In [6]:
integer_features = features_labels['NUMERICAL']['vars']['int']
float_features = features_labels['NUMERICAL']['vars']['float']

def parse_numbers():

    for feature in integer_features:
        menopause_data[feature] = menopause_data[feature].astype(int)

    for feature in float_features:
        menopause_data[feature] = menopause_data[feature].astype(float)

In [7]:
parse_numbers()
menopause_data.dtypes

MENODEP     object
EXERCIS     object
DIETNUT     object
OUTCOME     object
ESTRNJ2     object
COMBIN2     object
ESTROG2     object
E2AVE      float64
ALCHL24     object
ESTRDA2     object
HOURSPA     object
HEIGHT     float64
ESTRDA1     object
WEIGHT     float64
HAPPY       object
PRGNANT     object
VAGINDR     object
SMOKERE     object
HOTFLAS     object
PHYSILL     object
AVCIGDA      int32
TRBLSLE     object
CANCERS     object
DIABETE     object
BROKEBO      int32
OSTEOPR     object
STATUS      object
ESTROG1     object
ESTRNJ1     object
AGE          int32
RACE        object
NUMHOTF    float64
VISIT        int32
dtype: object

### ✨ Pipeline for negative values in data

In some of the variables, negative values are used to indicate missing data, or that the patient refused to answer. We'll create a pipeline to replace these negative values with NaNs,
and we'll use KNN to impute the missing values.

This will aid in standardizing the data.

In [8]:
numerical_features = integer_features + float_features
numerical_features.remove('VISIT')


In [9]:
# Set negatives as NaN
menopause_data[numerical_features] = menopause_data[numerical_features].apply(lambda x: x.where(x > 0, np.nan))


from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)

# DONE
# menopause_data[numerical_features] = imputer.fit_transform(menopause_data[numerical_features])

In [10]:
menopause_data = pd.read_csv('./datasets/visit_dfs_imputed.csv', dtype='str')
# parse_numbers()  # Imputed values by KNN are float, so parse all as float
menopause_data[numerical_features] = menopause_data[numerical_features].astype(float)

In [11]:
menopause_data[numerical_features].isna().sum()

AVCIGDA    0
BROKEBO    0
AGE        0
NUMHOTF    0
E2AVE      0
HEIGHT     0
WEIGHT     0
dtype: int64

In [12]:
menopause_data[numerical_features].head()

Unnamed: 0,AVCIGDA,BROKEBO,AGE,NUMHOTF,E2AVE,HEIGHT,WEIGHT
0,12.4,1.0,53.0,2.8,70.15,155.4,74.0
1,20.4,1.0,52.0,2.8,194.55,161.3,53.2
2,12.2,1.0,46.0,1.8,250.65,166.1,85.6
3,17.8,1.0,50.0,2.8,187.95,161.5,78.4
4,13.2,1.0,52.0,2.2,119.5,149.5,68.0


In [13]:
# Save dataset
# DONE
# menopause_data.to_csv('./datasets/visit_dfs_imputed.csv', index=True)

# 🧠 Model training!

In [14]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from mlflow import log_metric, log_param, log_artifact, start_run



* 'schema_extra' has been renamed to 'json_schema_extra'


In [15]:
targets = [
    'STATUS',    # Indicating if the patient is menopausal or not, among other options
    'NUMHOTF',  # Number of hot flashes daily
    'OSTEOPR',  # Indicating if the patient has osteoporosis or not
    'MENODEP',  # Indicating if the patient has depression or not
]

In [16]:
# Shuffle order to avoid order bias
menopause_data = shuffle(menopause_data)


### 👨‍⚖️ Standardizing the data

In [17]:
# Standardize numerical features
scaler = MinMaxScaler()
std_menp_data = menopause_data.copy()
std_menp_data.head()

Unnamed: 0.1,Unnamed: 0,MENODEP,EXERCIS,DIETNUT,OUTCOME,ESTRNJ2,COMBIN2,ESTROG2,E2AVE,ALCHL24,...,DIABETE,BROKEBO,OSTEOPR,STATUS,ESTROG1,ESTRNJ1,AGE,RACE,NUMHOTF,VISIT
15124,15124,-9999,2,2,-1,-1,-1,-1,22.45,1,...,1,1.0,1,5,1,1,55.0,4,3.0,6
7623,7623,-9999,-9999,-9999,-1,-1,-1,-1,15.4,2,...,1,1.0,1,2,1,1,52.0,4,2.2,3
16463,16463,-9999,2,2,-1,-1,-1,-1,15.5,1,...,1,1.0,1,2,1,1,55.0,4,2.8,7
13970,13970,-9999,1,1,-1,-1,1,-1,18.7,1,...,1,1.0,1,2,1,1,54.0,4,1.2,6
14416,14416,-9999,2,1,-1,-1,-1,-1,27.25,2,...,1,1.0,1,2,1,1,53.0,4,3.6,6


In [18]:
numerical_features_no_target = list(set(numerical_features) - set(targets))

In [19]:
std_menp_data[numerical_features_no_target] = scaler.fit_transform(menopause_data[numerical_features_no_target])
std_menp_data[numerical_features_no_target].head()

Unnamed: 0,WEIGHT,AGE,AVCIGDA,E2AVE,HEIGHT,BROKEBO
15124,0.272517,0.590909,0.264407,0.002935,0.461215,0.0
7623,0.101617,0.454545,0.016949,0.001925,0.377149,0.0
16463,0.33545,0.590909,0.318644,0.00194,0.507069,0.0
13970,0.373557,0.545455,0.237288,0.002398,0.600688,0.0
14416,0.394342,0.5,0.220339,0.003622,0.636989,0.0


### ⚙ Defining resources

In [20]:
# Define cross-validation rules

cv_seed = 1
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=cv_seed)

In [21]:
# Classification models
knn_classifier = KNeighborsClassifier(
    n_neighbors=5
)
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=0
)

# Define as multioutput classifier
multi_knn_classifier = MultiOutputClassifier(knn_classifier, n_jobs=-1)
multi_rf_classifier = MultiOutputClassifier(rf_classifier, n_jobs=-1)

### 🧮 Categorical values picky tuning

In [22]:
from pandas import DataFrame
from sklearn.preprocessing import LabelEncoder

def encode_labels(df: DataFrame, targets: list[str]):

    le = LabelEncoder()

    for target in targets:
        df[target] = le.fit_transform(df[target])


In [23]:
def get_hotf_category(hot_flashes: float):

    if hot_flashes <= 0 and hot_flashes < 5:
        return 1
    elif hot_flashes >= 5 and hot_flashes < 10:
        return 2
    elif hot_flashes >= 10 and hot_flashes < 15:
        return 3
    elif hot_flashes >= 15 and hot_flashes < 20:
        return 4
    elif hot_flashes >= 20 and hot_flashes < 25:
        return 5
    elif hot_flashes >= 25 and hot_flashes < 30:
        return 6
    else:
        return 7

In [24]:
# Encode categorical features for KNN model

std_menp_data_encoded = std_menp_data.copy()

# Turn 'NUMHOTF' into a categorical feature

std_menp_data_encoded['NUMHOTF'] = std_menp_data_encoded['NUMHOTF'].apply(lambda x: get_hotf_category(x))

non_numerical_features = list(set(features) - set(numerical_features))

encode_labels(std_menp_data_encoded, non_numerical_features)


In [25]:
std_menp_data_encoded[non_numerical_features] = std_menp_data_encoded[non_numerical_features].astype(int)
std_menp_data_encoded.dtypes.unique()

array([dtype('O'), dtype('int32'), dtype('float64'), dtype('int64')],
      dtype=object)

In [26]:
std_menp_data_encoded[std_menp_data_encoded == '0'].columns

Index(['Unnamed: 0', 'MENODEP', 'EXERCIS', 'DIETNUT', 'OUTCOME', 'ESTRNJ2',
       'COMBIN2', 'ESTROG2', 'E2AVE', 'ALCHL24', 'ESTRDA2', 'HOURSPA',
       'HEIGHT', 'ESTRDA1', 'WEIGHT', 'HAPPY', 'PRGNANT', 'VAGINDR', 'SMOKERE',
       'HOTFLAS', 'PHYSILL', 'AVCIGDA', 'TRBLSLE', 'CANCERS', 'DIABETE',
       'BROKEBO', 'OSTEOPR', 'STATUS', 'ESTROG1', 'ESTRNJ1', 'AGE', 'RACE',
       'NUMHOTF', 'VISIT'],
      dtype='object')

### 🏋️ Training and validating classifiers

In [27]:
models_performance = {
    'knn': {
        'model': multi_knn_classifier,
        'scores': cross_val_score(
            multi_knn_classifier,
            std_menp_data_encoded[features], 
            std_menp_data_encoded[targets],
            scoring='neg_mean_absolute_error',
            error_score='raise',
            cv=cv, 
            n_jobs=-1
        )
    },
    'rf': {
        'model': multi_rf_classifier,
        'scores': cross_val_score(
            multi_rf_classifier,
            std_menp_data_encoded[features], 
            std_menp_data_encoded[targets],
            scoring='neg_mean_absolute_error', 
            cv=cv, 
            n_jobs=-1
        )
    }

}

In [28]:
models_performance['knn']['scores']

array([-0.07718713, -0.07493135, -0.06522166, -0.07875638, -0.07453903,
       -0.07640251, -0.08003138, -0.06877943, -0.07672684, -0.07319466,
       -0.07434288, -0.07306787, -0.07914869, -0.07659867, -0.07365634,
       -0.0715967 , -0.07385249, -0.07397959, -0.07633438, -0.07339089,
       -0.07571597, -0.0751275 , -0.0751275 , -0.07434288, -0.07012554,
       -0.07306787, -0.07208709, -0.07339089, -0.07309655, -0.07702119])

In [30]:
n_scores = np.absolute(models_performance['knn']['scores'])
print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

MAE: 0.074 (0.003)


In [31]:
n_scores = np.absolute(models_performance['rf']['scores'])
print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

MAE: 0.005 (0.001)
