In [None]:
import pandas as pd
import numpy as np

We'll train both a Random Forest Classifier and a K-Nearest Neighbors Classifier using our menopause data. We'll then compare the two models and see which one performs better.

# 📖 Loading codebook values

### ℹ Reading in the data and variables to be used

In [None]:
# Read every column as a string first; numeric columns are cast in a later cell.
raw_visits_path = './datasets/visit_dfs.csv'
menopause_data = pd.read_csv(raw_visits_path, dtype='str')
menopause_data.head()

In [None]:
# The selected feature names are stored one per line in a plain-text file.
with open('./lista_variables/variables_selected_final.txt', 'r') as features_file:
    raw_feature_text = features_file.read()

features = raw_feature_text.splitlines()
features

> Note that most of our variables are categorical, but not all of them. We'll read a JSON file that lists which variables are numerical, and we'll convert those to the appropriate numeric type (int or float).

In [None]:
import json

# JSON text is UTF-8 by specification, so decode it explicitly rather than
# relying on the platform's locale-dependent default encoding.
with open('./lista_variables/variables_selected_labels.json', 'r', encoding='utf-8') as file:
    features_labels = json.load(file)

### ✅ Parsing data to correct data type

In [None]:
# The labels JSON lists which columns are numeric, split by target dtype.
numeric_vars = features_labels['NUMERICAL']['vars']
integer_features = numeric_vars['int']
float_features = numeric_vars['float']

# Cast column by column: integer columns first, then float columns.
for target_dtype, columns in ((int, integer_features), (float, float_features)):
    for column in columns:
        menopause_data[column] = menopause_data[column].astype(target_dtype)

In [None]:
# Display the dtype of every column so the numeric casts above can be checked.
menopause_data.dtypes

### ✨ Pipeline for negative values in data

In some of the variables, negative values are used to indicate missing data, or that the patient refused to answer. We'll create a pipeline to replace these negative values with NaNs,
and we'll use KNN to impute the missing values.

This will aid in standardizing the data.

In [None]:
# Every numeric column (integers followed by floats), with 'VISIT' left out.
numerical_features = [*integer_features, *float_features]
numerical_features.remove('VISIT')  # raises ValueError if 'VISIT' is not listed


In [None]:
# Set negatives as NaN.
# Negative codes mark missing or refused answers. The test is `>= 0` rather
# than `> 0` so that genuine zero readings are kept as data instead of being
# thrown away as if they were missing.
# `Series.where` keeps the entries where the condition holds and replaces the rest.
menopause_data[numerical_features] = menopause_data[numerical_features].apply(
    lambda column: column.where(column >= 0, np.nan)
)


from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)

# DONE: the imputation below was already run once, so it stays commented out.
# NOTE(review): the result is presumably what './datasets/visit_dfs_imputed.csv'
# holds; if so, that file must be regenerated for the `>= 0` mask to take effect.
# menopause_data[numerical_features] = imputer.fit_transform(menopause_data[numerical_features])

In [None]:
# Replace the working frame with the imputed dataset stored on disk.
# No dtype is passed here, so pandas infers the column types.
imputed_visits_path = './datasets/visit_dfs_imputed.csv'
menopause_data = pd.read_csv(imputed_visits_path)

In [None]:
# Count the NaN entries in each numeric column.
menopause_data[numerical_features].isna().sum()

In [None]:
# Preview the first rows of the numeric columns.
menopause_data[numerical_features].head()

In [None]:
# Save dataset
# DONE: already executed once; presumably kept commented out so that a re-run
# does not overwrite the stored file.
# NOTE(review): `index=True` writes the row index as an extra first column, which
# comes back as 'Unnamed: 0' unless the file is read with `index_col=0` - confirm
# this is intended.
# menopause_data.to_csv('./datasets/visit_dfs_imputed.csv', index=True)

# 🧠 Model training!

In [81]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from mlflow import log_metric, log_param, log_artifact, start_run


In [None]:
# Target columns for the models.
targets = [
    # Indicating if the patient is menopausal or not, among other options
    'STATUS',
    # Number of hot flashes daily
    'NUMHOTF',
    # Indicating if the patient has osteoporosis or not
    'OSTEOPR',
    # Indicating if the patient has depression or not
    'MENODEP',
]

In [None]:
# Shuffle order to avoid order bias.
# The seed is fixed so that Restart & Run All produces the same row order,
# which keeps everything computed from this frame reproducible between runs.
SHUFFLE_SEED = 42
menopause_data = shuffle(menopause_data, random_state=SHUFFLE_SEED)


In [82]:
# Prepare min-max scaling of the numerical features (MinMaxScaler rescales each
# column to a fixed range; it does not standardize to zero mean / unit variance).
# The scaled values go into a copy so the unscaled frame stays available.
std_menp_data = menopause_data.copy()
scaler = MinMaxScaler()
std_menp_data.head()

Unnamed: 0,HAVEPER,EXPENSI,BROKEBO,AGE,EXERCIS,DNTKNOW,AVCIGDA,NOREASO,SIDEEFF,OSTEOPO,...,DIETNUT,LIKEFEL,COMBIN2,BONES3,PRGNANT,SMOKERE,ESTRNJ1,STOPOTH,BONES2,HOURSPA
16079,-1,-1,1.0,54.0,2,-1,20.0,-1,-1,-1,...,1,-1,1,-1,1,2,1,-1,-1,-1
19647,-1,-1,1.2,56.0,-9999,-9999,7.0,-9999,-1,-1,...,-9999,-1,-1,-9999,-9999,1,1,-1,-9999,-9999
2100,-1,-1,1.2,47.0,-9999,-1,9.6,-1,-1,-1,...,-9999,-1,-9999,-1,1,1,1,-1,-1,4
15252,-1,-1,1.0,58.0,2,-1,13.6,-1,-1,-1,...,2,-1,-1,-1,1,1,1,-1,-1,4
10304,-1,-1,1.0,51.0,-9999,-1,11.4,-1,-1,-1,...,-9999,-1,-1,-1,1,1,1,-1,-1,5


In [83]:
# Fit the scaler on the numeric columns and write the scaled values into the copy.
# NOTE(review): 'NUMHOTF' is listed in `targets` and also appears among the scaled
# columns in this cell's output - confirm that scaling a target is intended.
# NOTE(review): the scaler is fit on the full dataset before any train/test split;
# confirm this is acceptable for the evaluation that follows.
scaled_numeric_values = scaler.fit_transform(menopause_data[numerical_features])
std_menp_data[numerical_features] = scaled_numeric_values
std_menp_data[numerical_features].head()

Unnamed: 0,AVCIGDA,BROKEBO,AGE,NUMHOTF,E2AVE,HEIGHT,WEIGHT
16079,0.322034,0.0,0.545455,0.00603,0.004782,0.424914,0.199192
19647,0.101695,0.066667,0.636364,0.00804,0.005786,0.457394,0.266051
2100,0.145763,0.066667,0.227273,0.00603,0.025905,0.440199,0.248268
15252,0.213559,0.0,0.727273,0.013065,0.002226,0.24914,0.042148
10304,0.176271,0.0,0.409091,0.015075,0.003642,0.364157,0.196137


In [None]:
# Divide data into train and test, but only for the KNN model,
# since RF does the bootstrapping of the dataset itself.
# NOTE(review): the call below is an unfinished placeholder kept inside a string
# literal, and train_test_split is not imported in the visible import cell.

""" X_train, X_test, y_train, y_test = train_test_split(
    
) """

