In [31]:
import pandas as pd
import os

# 🚿 Filtering column names

### 📒 Reading dataframes and filtering columns

In [32]:
base_path = './datasets/'
file_name_convention = 'visit'
extension = '.tsv'

# Load the data, scanning in the base path, but load files as needed, not all at once

def get_visit_records():

    filenames = []

    for root, dirs, files in os.walk(base_path):

        for filename in files:
            if filename.startswith(file_name_convention) and filename.endswith(extension):
                filenames.append(filename)
        
    for filename in filenames:
        df = pd.read_csv(
            base_path + filename,
            sep='\t',
            index_col=False
        )
        yield df


In [33]:
# Load variable names

variables = []

with open('./lista_variables/variables_selectas.txt', ) as file:
    variables = [var[:-1] for var in file.readlines()]

print(variables)

['ABBLEED', 'AGE1', 'AGE10', 'AGE11', 'AGE12', 'AGE2', 'AGE3', 'AGE4', 'AGE5', 'AGE6', 'AGE7', 'AGE8', 'AGE9', 'AGE', 'ALCHL24', 'WORSE', 'FOODPNA', 'AVCIGDA', 'GLASBEE', 'GLASLIQ', 'GLASWIN', 'BONES1', 'BONES2', 'BONES3', 'BOTHOTF', 'LEKBOTH', 'COMBIN1', 'COMBIN2', 'NOREMEB', 'BATCARB', 'DTTDFIB', 'BATKCAL', 'DTTALCH', 'BATNIAC', 'BATPHOS', 'BATPOTS', 'BATPROT', 'BATRIBO', 'BATTFAT', 'HOMEXPD', 'DIABETE', 'HAVEPER', 'LIKEFEL', 'ALLBCAR', 'ALLCALC', 'ALLFOL', 'ALLIRON', 'ALLARE', 'ALLB1', 'ALLB12', 'ALLB6', 'ALLVITC', 'ALLVITD', 'ALLVITE', 'ALLZINC', 'ALLB2', 'NOLIKE', 'EXPECT', 'DNTKNOW', 'DONTKNO', 'MENODEP', 'IMEDTHR', 'EMBDDEV', 'E2AVE', 'ESTRDA1', 'ESTRDA2', 'ESTRNJ1', 'ESTRNJ2', 'ESTROG1', 'ESTROG2', 'EFPDFIB', 'EFPB1', 'EFPB12', 'EFPB6', 'EFPARE', 'EFP', 'LEKDISC', 'EXERCIS', 'EXERGEN', 'EXERHAR', 'EXERMEM', 'EXERMEN', 'EXEROST', 'EXEROTH', 'EXERSPE', 'EXERADV', 'EXERPER', 'EXERWGH', 'EXERLOO', 'EXER12H', 'FACEI1', 'FACEI10', 'FACEI11', 'FACEI12', 'FACEI13', 'FACEI14', 'FACEI15'

### ⚖ Normalizing variable names to find common variables

Given that the column names have in their name the visit id at the end, we have to take it away to proceed more easily

In [34]:
"""
    Normalize the variable names.

    This requires getting rid of the last character in each column name, which, as stated
    above, is the visit ID, so we can concatenate all the dataframes together.

    Note: not all the dataframes have the same columns, so we need to get the union of all
    the columns in all the dataframes.
"""

visit_dfs = [*get_visit_records()]

  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(


In [35]:
visit_dfs[0].head()

Unnamed: 0,SWANID,VISIT,INTDAY1,AGE1,LANGINT1,RACE,PREGNAN1,PREVBLO1,EATDRIN1,STRTPER1,...,SPSCDAY1,SPSCTIM1,SPSCMOD1,HPSCDAY1,HPSCTIM1,HPSCMOD1,SPBMDT1,HPBMDT1,BMDFLG1,STATUS1
0,10046,1,413,53,1,2,1,1.0,1,2,...,413.0,0:10:08,5.0,413.0,0:09:52,5.0,1.1277,0.9619,0.0,7
1,10056,1,357,52,1,4,1,,1,2,...,441.0,0:12:55,11.0,441.0,0:13:03,11.0,0.914,0.8499,0.0,4
2,10092,1,364,46,1,4,1,,1,1,...,364.0,0:18:01,5.0,364.0,0:17:39,5.0,1.0377,0.8412,0.0,4
3,10126,1,442,50,1,1,1,,1,2,...,,.,,,.,,,,,4
4,10153,1,374,52,1,3,1,,1,2,...,402.0,0:10:47,11.0,402.0,0:10:40,11.0,1.0673,1.0313,0.0,4


In [36]:
def normalize_columns(df, suffix: str):
    # Remove last digit (matching index + 1) at the end, if it exists
    return [col.removesuffix(suffix) if col.endswith(suffix) else col for col in df.columns]


unique_variables = set(normalize_columns(visit_dfs[0], '1'))


for index, df in enumerate(visit_dfs[1:]):

    index_str = str(index + 2)  # Index starts at 0, but we continue at 2
    columns_no_visit = normalize_columns(df, index_str)
    unique_variables &= set(columns_no_visit)

variable_inter = list(unique_variables)
len(variable_inter)

291

In [37]:
variable_inter

['OOPHORE',
 'FLGDIF',
 'TRBLSLE',
 'PROGTW2',
 'PRTUNEM',
 'BROKEBO',
 'WEIGHT',
 'FAMLVIO',
 'THYROID',
 'HEART1',
 'HAPPY',
 'OSTEOPR',
 'BLDRWAT',
 'BLDDRAW',
 'OTHMED4',
 'SMOKERE',
 'IMPRMEM',
 'HYSTDAY',
 'PHYDAY',
 'OTHRTW8',
 'CANCERS',
 'GETGOIN',
 'EINJTW1',
 'VAGINDR',
 'CRYING',
 'STEROI2',
 'PULSE',
 'HOPEFUL',
 'PRGNANT',
 'HIGHBP',
 'DIURET1',
 'NERVS1',
 'RELATEN',
 'OTC1',
 'SIDEEFF',
 'LMPDAY',
 'SPSCMOD',
 'HPSCMOD',
 'LONELY',
 'ALCHL24',
 'BPTW1',
 'KEEPMIN',
 'CHEMOTH',
 'OTC2',
 'HAPPEN2',
 'LANGSAA',
 'BCP1',
 'E2AVE',
 'HORMOTH',
 'BOTHER',
 'MONEYPR',
 'SITESPE',
 'SPSCTIM',
 'BONES3',
 'DIURTW2',
 'SHBG',
 'CYCDAY',
 'FSH',
 'OTHRTW3',
 'DONTKNO',
 'UNFRNDL',
 'ACHES',
 'STOPOTH',
 'COMBIN1',
 'OTC4',
 'HARTTW1',
 'HCPADVI',
 'NUMDAND',
 'HBCHOLE',
 'FLGCV',
 'TALKLES',
 'OTHRTW9',
 'PROGES2',
 'SYSBP2',
 'BCPTWI2',
 'FRTLTW1',
 'DESCPER',
 'HEARTAT',
 'ONEOVAR',
 'SLEPTW1',
 'HAVEPER',
 'VISIT',
 'EINJTW2',
 'INTDAY',
 'SLEPTW2',
 'OTHRTW7',
 'OTHTW11',
 'I

In [38]:
# Having variables shared across all the dataframes, we can now filter them out by
# our list of selected variables

selected_shared_variables = set(variables) & set(variable_inter)
len(selected_shared_variables)

49

In [39]:
selected_shared_variables

{'ADVISTO',
 'AGE',
 'ALCHL24',
 'AVCIGDA',
 'BONES1',
 'BONES2',
 'BONES3',
 'BROKEBO',
 'CANCER',
 'COMBIN1',
 'COMBIN2',
 'DIABETE',
 'DNTKNOW',
 'DONTKNO',
 'E2AVE',
 'ESTRDA1',
 'ESTRDA2',
 'ESTRNJ1',
 'ESTRNJ2',
 'ESTROG1',
 'ESTROG2',
 'EXPENSI',
 'FRNADVI',
 'HAVEPER',
 'HCPADVI',
 'HORMOTH',
 'HOTFLAS',
 'HOURSPA',
 'IMPRMEM',
 'LANGINT',
 'LANGSAA',
 'LIKEFEL',
 'MENOSYM',
 'NOLIKE',
 'NOREASO',
 'NOREMEB',
 'OSTEOPO',
 'OSTEOPR',
 'OUTCOME',
 'PHYSILL',
 'PRBBLEE',
 'PRGNANT',
 'RACE',
 'REDUHAR',
 'REGPERI',
 'SIDEEFF',
 'STOPOTH',
 'VAGINDR',
 'YOUNGLK'}

### 📝 Renaming columns and final dataframe

In [89]:
from pandas import DataFrame


def rename_df_columns_to_normalized(df: DataFrame, selected_variables: set, suffix: str):

    normalized_columns = normalize_columns(df, suffix)

    # Check each column name in each dataframe, and see if it's in the
    # list of selected variables
    for col, normal_col in zip(df.columns, normalized_columns):

        columns_to_rename = {}

        if normal_col in selected_variables:
            columns_to_rename[col] = normal_col

        # Substitute the column name with the normalized one
        df.rename(columns=columns_to_rename, inplace=True)


for index, df in enumerate(visit_dfs):

    index_str = str(index + 1)
    rename_df_columns_to_normalized(df, selected_shared_variables, index_str)


In [41]:
# Check that the columns have been renamed correctly
visit_dfs[0][list(selected_shared_variables)].head()

Unnamed: 0,OSTEOPO,BROKEBO,HAVEPER,LIKEFEL,NOLIKE,FRNADVI,OSTEOPR,REDUHAR,OUTCOME,ESTROG1,...,AVCIGDA,DONTKNO,BONES3,STOPOTH,COMBIN1,HCPADVI,RACE,YOUNGLK,AGE,NOREMEB
0,1,0,-1,-1,-1,1,1,1,,2,...,-1,1,-1,-1,1,1,2,1,53,-1
1,-1,0,-1,-1,-1,-1,1,-1,,1,...,-1,-1,-1,-1,1,-1,4,-1,52,-1
2,-1,1,-1,-1,-1,-1,1,-1,,1,...,-1,-1,-1,-1,1,-1,4,-1,46,-1
3,-1,0,-1,-1,-1,-1,1,-1,,1,...,-1,-1,-1,-1,1,-1,1,-1,50,-1
4,-1,0,-1,-1,-1,-1,1,-1,,1,...,-1,-1,-1,-1,1,-1,3,-1,52,-1


In [42]:
visit_dfs[1][list(selected_shared_variables)]

Unnamed: 0,OSTEOPO,BROKEBO,HAVEPER,LIKEFEL,NOLIKE,FRNADVI,OSTEOPR,REDUHAR,OUTCOME,ESTROG1,...,AVCIGDA,DONTKNO,BONES3,STOPOTH,COMBIN1,HCPADVI,RACE,YOUNGLK,AGE,NOREMEB
0,1,0,,,,1,1,1,,2,...,-1,1,,,1,2,2,1,54,
1,,0,,,,,1,,,1,...,-1,,,,1,,4,,53,
2,,0,,,,,1,,,1,...,-1,,,,1,,1,,50,
3,,0,,,,,1,,,1,...,-1,,,,1,,3,,53,
4,,0,,,,,1,,,1,...,-1,,,,1,,2,,48,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2743,1,0,,,,1,1,1,,1,...,-1,1,,,1,2,4,1,45,
2744,,0,,,,,1,,,1,...,20,,,,1,,3,,50,
2745,,0,,,,,1,,,1,...,-1,,,,1,,4,,47,
2746,,0,,,,,1,,,1,...,-1,,,,1,,2,,49,


In [43]:
visit_dfs[1]['OSTEOPO'].unique()

array(['1', ' ', '2'], dtype=object)

### ✨ Encoding empty values

In [44]:
"""
    It was chosen that missing values would be represented by -9999;
    however, values here are still strings (object, according to pandas),
    so we need to replace them with the actual value.
"""

# Before continuing, check null values
for index, df in enumerate(visit_dfs):
    df.replace(' ', None, inplace=True)
    print(f'Visit {index + 1} information:\n\n')
    df[list(selected_shared_variables)].info()
    print('------------------------------------')



Visit 1 information:


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2881 entries, 0 to 2880
Columns: 576 entries, SWANID to STATUS1
dtypes: int64(3), object(573)
memory usage: 12.7+ MB
------------------------------------
Visit 2 information:


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2748 entries, 0 to 2747
Columns: 551 entries, SWANID to BMDFLG2
dtypes: int64(3), object(548)
memory usage: 11.6+ MB
------------------------------------
Visit 3 information:


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2709 entries, 0 to 2708
Columns: 627 entries, SWANID to BMDFLG3
dtypes: int64(3), object(624)
memory usage: 13.0+ MB
------------------------------------
Visit 4 information:


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2679 entries, 0 to 2678
Columns: 675 entries, SWANID to STATUS4
dtypes: int64(3), object(672)
memory usage: 13.8+ MB
------------------------------------
Visit 5 information:


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2617 entries, 0 to 

In [59]:
for df in visit_dfs:
    df.fillna(-9999, inplace=True)

In [61]:
visit_dfs[1][list(selected_shared_variables)].head()

Unnamed: 0,OSTEOPO,BROKEBO,HAVEPER,LIKEFEL,NOLIKE,FRNADVI,OSTEOPR,REDUHAR,OUTCOME,ESTROG1,...,AVCIGDA,DONTKNO,BONES3,STOPOTH,COMBIN1,HCPADVI,RACE,YOUNGLK,AGE,NOREMEB
0,1,0,-9999,-9999,-9999,1,1,1,-9999,2,...,-1,1,-9999,-9999,1,2,2,1,54,-9999
1,-9999,0,-9999,-9999,-9999,-9999,1,-9999,-9999,1,...,-1,-9999,-9999,-9999,1,-9999,4,-9999,53,-9999
2,-9999,0,-9999,-9999,-9999,-9999,1,-9999,-9999,1,...,-1,-9999,-9999,-9999,1,-9999,1,-9999,50,-9999
3,-9999,0,-9999,-9999,-9999,-9999,1,-9999,-9999,1,...,-1,-9999,-9999,-9999,1,-9999,3,-9999,53,-9999
4,-9999,0,-9999,-9999,-9999,-9999,1,-9999,-9999,1,...,-1,-9999,-9999,-9999,1,-9999,2,-9999,48,-9999


In [62]:
# For time columns, empty values are coded ad '.'. Replace them with ''

for df in visit_dfs:
    df.replace('.', '', inplace=True)

visit_dfs[1][list(selected_shared_variables)].head()

Unnamed: 0,OSTEOPO,BROKEBO,HAVEPER,LIKEFEL,NOLIKE,FRNADVI,OSTEOPR,REDUHAR,OUTCOME,ESTROG1,...,AVCIGDA,DONTKNO,BONES3,STOPOTH,COMBIN1,HCPADVI,RACE,YOUNGLK,AGE,NOREMEB
0,1,0,-9999,-9999,-9999,1,1,1,-9999,2,...,-1,1,-9999,-9999,1,2,2,1,54,-9999
1,-9999,0,-9999,-9999,-9999,-9999,1,-9999,-9999,1,...,-1,-9999,-9999,-9999,1,-9999,4,-9999,53,-9999
2,-9999,0,-9999,-9999,-9999,-9999,1,-9999,-9999,1,...,-1,-9999,-9999,-9999,1,-9999,1,-9999,50,-9999
3,-9999,0,-9999,-9999,-9999,-9999,1,-9999,-9999,1,...,-1,-9999,-9999,-9999,1,-9999,3,-9999,53,-9999
4,-9999,0,-9999,-9999,-9999,-9999,1,-9999,-9999,1,...,-1,-9999,-9999,-9999,1,-9999,2,-9999,48,-9999


### 🔗 Concatenating dataframes

In [64]:
# Concatenate all the dataframes together
# TODO: check a way to preserve empty values
full_df = pd.concat((df[list(selected_shared_variables)] for df in visit_dfs))

# Write it to a file
full_df.to_csv('./datasets/visit_dfs.csv', index=False)

# ❌ Removing columns with several missing values

In [65]:
# Check for each column how many -9999 values there are
# Make a dictionary with the column name and the number of -9999 values
missing_values = {}

for var in full_df.columns:
    missing_values[var] = len(full_df[full_df[var] == -9999])

# Sort the dictionary by the number of missing values
sorted_missing_values = {k: v for k, v in reversed(sorted(missing_values.items(), key=lambda item: item[1]))}

In [69]:
full_df.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            2235, 2236, 2237, 2238, 2239, 2240, 2241, 2242, 2243, 2244],
           dtype='int64', length=25487)

In [101]:
# Show the percentage of missing values for each column
missing_ratio = pd.Series({k: v/len(full_df.index) * 100 for k,v in sorted_missing_values.items()})
missing_ratio.name = 'null_ratio'
missing_ratio

RACE        0.000000
AGE         0.160866
LANGINT     0.466905
ESTRNJ1     0.470828
COMBIN1     0.470828
ESTROG1     0.478675
BROKEBO     0.537529
OSTEOPR     0.537529
DIABETE     0.537529
LANGSAA     2.660180
AVCIGDA     2.660180
HOTFLAS     2.664103
PHYSILL     2.664103
VAGINDR     2.668027
PRGNANT     4.861302
ESTRDA1    10.644642
OSTEOPO    11.735395
FRNADVI    11.735395
REDUHAR    11.735395
IMPRMEM    11.735395
MENOSYM    11.735395
REGPERI    11.735395
HCPADVI    11.735395
YOUNGLK    11.735395
HORMOTH    11.739318
PRBBLEE    12.739828
HAVEPER    12.743752
LIKEFEL    12.743752
NOLIKE     12.743752
ADVISTO    12.743752
CANCER     12.743752
EXPENSI    12.743752
SIDEEFF    12.743752
STOPOTH    12.743752
NOREMEB    12.743752
HOURSPA    13.469612
ESTRDA2    13.603013
DONTKNO    14.179778
NOREASO    15.172441
DNTKNOW    15.172441
ALCHL24    15.576568
E2AVE      15.878683
ESTROG2    21.493310
COMBIN2    21.901361
ESTRNJ2    22.325107
OUTCOME    26.703810
BONES1     46.906266
BONES2     47

In [94]:
# Read csv with variable names and descriptions, and join by variable ID
var_names = pd.read_csv('./lista_variables/variables.csv')
normalized_colnames = {var:var.removesuffix('9') if var[-1].isdigit() else var for var in var_names['id']}
var_names['id'] = var_names['id'].map(normalized_colnames)
var_names.set_index('id', inplace=True)
var_names

Unnamed: 0_level_0,nombre,desc
id,Unnamed: 1_level_1,Unnamed: 2_level_1
FORMPHY,Abbreviated or Full Physical Measures,Abbreviated or Full Physical Measures
FORMINT,"Abbreviated, mailed or regular interview",Which version of the interview was administered:
ABBLEED,Abnormal vaginal bleeding since last visit,"Since your last study visit, have you had any ..."
ACTIPAN,Active this week,Indicate in the space next to each item how st...
ACUPUNC,Acupuncture,"During the past 12 months, have you used any o..."
...,...,...
YOGASPE,Yoga - Other specify,If YES to YOGA9: Please tell me whether or not...
YOGAADV,Yoga - Provider advice,If YES to YOGA9: Please tell me whether or not...
YOGAPER,Yoga - Regulate Periods,If YES to YOGA9: Please tell me whether or not...
YOGAWGH,Yoga - Weight,If YES to YOGA9: Please tell me whether or not...


In [102]:
ratio_comparison_df = pd.concat(
    [
        var_names['nombre'],
        missing_ratio
    ],
    axis=1,
    join='inner'
)

In [104]:
ratio_comparison_df.sort_values(by='null_ratio', ascending=False)

Unnamed: 0,nombre,null_ratio
BONES3,Bone #3 broken,47.08675
BONES2,Bone #2 broken,47.067132
BONES1,Bone #1 broken,46.906266
OUTCOME,Outcome of pregnancy,26.70381
ESTRNJ2,Estrogen injection/patch #2,22.325107
COMBIN2,Combination estrogen/progestin #2,21.901361
ESTROG2,Estrogen pills #2,21.49331
E2AVE,"Estradiol (average, pg/mL)",15.878683
ALCHL24,Alcohol in Last 24 hours,15.576568
NOREASO,Stopped hormones no reason given,15.172441


### Check here the columns to preserve

| nombre  | nombre                  | null_ratio | preserved              |
|---------|-------------------------|------------|------------------------|
| BONES3  | Bone #3 broken          | 47.086750  | <input type="checkbox"> |
| BONES2  | Bone #2 broken          | 47.067132  | <input type="checkbox"> |
| BONES1  | Bone #1 broken          | 46.906266  | <input type="checkbox"> |
| OUTCOME | Outcome of pregnancy    | 26.703810  | <input type="checkbox"> |
| ESTRNJ2 | Estrogen injection/patch #2 | 22.325107 | <input type="checkbox"> |
| COMBIN2 | Combination estrogen/progestin #2 | 21.901361 | <input type="checkbox"> |
| ESTROG2 | Estrogen pills #2       | 21.493310  | <input type="checkbox"> |
| E2AVE   | Estradiol (average, pg/mL) | 15.878683 | <input type="checkbox"> |
| ALCHL24 | Alcohol in Last 24 hours | 15.576568 | <input type="checkbox"> |
| NOREASO | Stopped hormones no reason given | 15.172441 | <input type="checkbox"> |
| DNTKNOW | Don't know why stopped hormones | 15.172441 | <input type="checkbox"> |
| DONTKNO | Don't know/remember why take hormones | 14.179778 | <input type="checkbox"> |
| ESTRDA2 | Estrogen #2 prescription daily or off & on | 13.603013 | <input type="checkbox"> |
| HOURSPA | How many hours/week work for pay | 13.469612 | <input type="checkbox"> |
| STOPOTH | Stopped hormones other reason | 12.743752 | <input type="checkbox"> |
| EXPENSI | Too expensive           | 12.743752  | <input type="checkbox"> |
| CANCER  | Worried about cancer    | 12.743752  | <input type="checkbox"> |
| ADVISTO | Health care provider advised to stop | 12.743752 | <input type="checkbox"> |
| SIDEEFF | Worried about possible side effects | 12.743752 | <input type="checkbox"> |
| HAVEPER | Did not like having periods | 12.743752 | <input type="checkbox"> |
| NOLIKE  | Do not like taking any medications | 12.743752 | <input type="checkbox"> |
| NOREMEB | Couldn't remember to take them | 12.743752 | <input type="checkbox"> |
| LIKEFEL | Did not like how felt on them | 12.743752 | <input type="checkbox"> |
| PRBBLEE | Problems with bleeding  | 12.739828  | <input type="checkbox"> |
| HORMOTH | Take hormones for other reasons | 11.739318 | <input type="checkbox"> |
| REDUHAR | Reduce risk of heart disease | 11.735395 | <input type="checkbox"> |
| YOUNGLK | Stay young-looking      | 11.735395  | <input type="checkbox"> |
| MENOSYM | Relieve menopausal symptoms | 11.735395 | <input type="checkbox"> |
| REGPERI | Regulate periods        | 11.735395  | <input type="checkbox"> |
| OSTEOPO | Reduce risk of osteoporosis | 11.735395 | <input type="checkbox"> |
| HCPADVI | Health care provider advised | 11.735395 | <input type="checkbox"> |
| IMPRMEM | Improve memory          | 11.735395  | <input type="checkbox"> |
| FRNADVI | Friend/relative advised | 11.735395  | <input type="checkbox"> |
| ESTRDA1 | Estrogen #1 prescription daily or off & on | 10.644642 | <input type="checkbox"> |
| PRGNANT | Pregnant since last visit | 4.861302   | <input type="checkbox"> |
| VAGINDR | Vaginal dryness past 2 weeks | 2.668027 | <input type="checkbox"> |
| HOTFLAS | Hot flashes past 2 weeks | 2.664103   | <input type="checkbox"> |
| PHYSILL | Serious illness family upsetting since last visit | 2.664103 | <input type="checkbox"> |
| LANGSAA | Language of Self-A      | 2.660180   | <input type="checkbox"> |
| AVCIGDA | Average cigarettes/day since last visit | 2.660180 | <input type="checkbox"> |
| OSTEOPR | Osteoporosis since last visit | 0.537529 | <input type="checkbox"> |
| BROKEBO | Number events where bone(s) broken since last ... | 0.537529 | <input type="checkbox"> |
| DIABETE | Diabetes since last visit | 0.537529 | <input type="checkbox"> |
| ESTROG1 | Estrogen pills #1       | 0.478675   | <input type="checkbox"> |
| COMBIN1 | Combination estrogen/progestin #1 | 0.470828 | <input type="checkbox"> |
| ESTRNJ1 | Estrogen injection/patch #1 | 0.470828 | <input type="checkbox"> |
| LANGINT | Language of Interview   | 0.466905   | <input type="checkbox"> |
| AGE     | Age At Current Visit (Integer) | 0.160866 | <input type="checkbox"> |
| RACE    | Race/Ethnicity          | 0.000000   | <input type="checkbox"> |


> During the process of manual selection of variables, we have found that the following variables are not present in all the visits (or, perhaps, were not detected given by the prompt asked to the LLM used to filter variables), but do appear in most of them, and are relevant to the study:

- **STATUS:** menopause status
- **WEIGHT:** weight in kg
- **HEIGHT:** height in cm
- **EXERCIS:** exercise or not in past 12 months
- **DIETNUT:** if diet carried out is considered healthy
- **EMOCTDW:** emotional cutdown due to menopause symptoms
- **EMOCARE:** how careful you've been given how symptoms make you feel
- **SOCIAL:** how much menopause symptoms have affected social life
- **CANCERS:** has had or not cancer since last visit and which type

> ... therefore the notebook will be rerun with these variables included in the final dataframe.