In [19]:
import pandas as pd
import numpy as np

We'll train both a Random Forest Classifier and a K-Nearest Neighbors Classifier using our menopause data. We'll then compare the two models and see which one performs better.

# 📖 Loading codebook values

### ℹ Reading in the data and variables to be used

In [20]:
# Load the visits table, forcing every column to be read as text
# (dtype='str'); the cell ends with a preview of the first rows.
visits_csv_path = './datasets/visit_dfs.csv'
menopause_data = pd.read_csv(visits_csv_path, dtype='str')
menopause_data.head()

Unnamed: 0,HAVEPER,EXPENSI,BROKEBO,AGE,EXERCIS,DNTKNOW,AVCIGDA,NOREASO,SIDEEFF,OSTEOPO,...,DIETNUT,LIKEFEL,COMBIN2,BONES3,PRGNANT,SMOKERE,ESTRNJ1,STOPOTH,BONES2,HOURSPA
0,-1,-1,0,53,-9999,-1,-1,-1,-1,1,...,-9999,-1,-9999,-1,1,1,1,-1,-1,-1
1,-1,-1,0,52,-9999,-1,-1,-1,-1,-1,...,-9999,-1,-9999,-1,1,1,1,-1,-1,-1
2,-1,-1,1,46,-9999,-1,-1,-1,-1,-1,...,-9999,-1,-9999,-1,1,1,1,-1,-1,-1
3,-1,-1,0,50,-9999,-1,-1,-1,-1,-1,...,-9999,-1,-9999,-1,1,1,1,-1,-1,4
4,-1,-1,0,52,-9999,-1,-1,-1,-1,-1,...,-9999,-1,-9999,-1,1,1,1,-1,-1,-1


In [21]:
# Read the selected feature names, one name per line.
# - The encoding is pinned so decoding does not depend on the platform's
#   default locale (assumes the file is UTF-8/ASCII -- TODO confirm).
# - Blank lines and surrounding whitespace are dropped so they cannot end up
#   as bogus feature names when the list is later used to select columns.
with open('./lista_variables/variables_selected_final.txt', 'r', encoding='utf-8') as f:
    features = [line.strip() for line in f.read().splitlines() if line.strip()]

features

['MENODEP',
 'EXERCIS',
 'DIETNUT',
 'OUTCOME',
 'ESTRNJ2',
 'COMBIN2',
 'ESTROG2',
 'E2AVE',
 'ALCHL24',
 'ESTRDA2',
 'HOURSPA',
 'HEIGHT',
 'ESTRDA1',
 'WEIGHT',
 'HAPPY',
 'PRGNANT',
 'VAGINDR',
 'SMOKERE',
 'HOTFLAS',
 'PHYSILL',
 'AVCIGDA',
 'TRBLSLE',
 'CANCERS',
 'DIABETE',
 'BROKEBO',
 'OSTEOPR',
 'STATUS',
 'ESTROG1',
 'ESTRNJ1',
 'AGE',
 'RACE',
 'NUMHOTF',
 'VISIT']

> Note that most, but not all, of our variables are categorical. We'll read a JSON file that lists the numerical variables and cast each of them to the appropriate numeric type (int or float).

In [26]:
import json

# Load the variable-type metadata from JSON. JSON text is UTF-8 by
# specification, so the encoding is set explicitly instead of being inherited
# from the platform's default locale.
with open('./lista_variables/variables_selected_labels.json', 'r', encoding='utf-8') as file:
    features_labels = json.load(file)

### ✅ Parsing data to the correct data types

In [27]:
# Cast the columns listed as numerical in the metadata to their numeric type.
# Integer columns are processed first, then float columns; each column is
# replaced in place on `menopause_data`.
numerical_vars = features_labels['NUMERICAL']['vars']
for type_key, target_type in (('int', int), ('float', float)):
    for feature in numerical_vars[type_key]:
        converted = menopause_data[feature].astype(target_type)
        menopause_data[feature] = converted

In [None]:
# Display the dtype of every column to check the result of the numeric casts.
menopause_data.dtypes

HAVEPER     int64
EXPENSI     int64
BROKEBO     int32
AGE         int32
EXERCIS     int64
            ...  
SMOKERE     int64
ESTRNJ1     int64
STOPOTH     int64
BONES2     object
HOURSPA     int64
Length: 64, dtype: object

# 🧠 Model training!

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import shuffle