In [1]:
import pandas as pd
import os

# 🚿 Filtering column names

### 📒 Reading dataframes and filtering columns

In [2]:
base_path = './datasets/'
file_name_convention = 'visit'
extension = '.tsv'

# Load the data, scanning in the base path, but load files as needed, not all at once

def _visit_sort_key(path):
    """Sort key: the number that ends the file stem (e.g. 10 for 'visit10.tsv'), then the path."""
    stem = os.path.splitext(os.path.basename(path))[0]
    digits = stem[len(stem.rstrip('0123456789')):]
    # Files without a trailing number sort first; the path breaks ties deterministically.
    return (int(digits) if digits else -1, path)


def get_visit_records(directory=None, prefix=None, suffix=None):
    """
    Lazily yield one dataframe per visit file found under ``directory``.

    Parameters
    ----------
    directory : str, optional
        Folder to scan recursively. Defaults to the module-level ``base_path``.
    prefix : str, optional
        Required start of the file name. Defaults to ``file_name_convention``.
    suffix : str, optional
        Required end of the file name. Defaults to ``extension``.

    Yields
    ------
    pandas.DataFrame
        One tab-separated file per iteration, read only when requested.

    Notes
    -----
    Files are yielded in ascending order of the number that ends their stem.
    The cells further down map list position ``i`` to the visit suffix
    ``str(i + 1)``, so the order must not depend on how the file system
    happens to list the directory.
    NOTE(review): this assumes the files are named like 'visit1.tsv',
    'visit2.tsv', ... -- confirm against the actual dataset folder.
    """
    # Resolve the defaults at call time so edits to the globals in a later cell are honoured.
    directory = base_path if directory is None else directory
    prefix = file_name_convention if prefix is None else prefix
    suffix = extension if suffix is None else suffix

    paths = []

    for root, _dirs, files in os.walk(directory):
        for filename in files:
            if filename.startswith(prefix) and filename.endswith(suffix):
                # Join with ``root`` (not the base folder): os.walk also reports
                # files living in sub-directories.
                paths.append(os.path.join(root, filename))

    for path in sorted(paths, key=_visit_sort_key):
        yield pd.read_csv(
            path,
            sep='\t',
            index_col=False
        )


In [3]:
# Load variable names: one name per line of the text file.

with open('./lista_variables/variables_selectas.txt') as file:
    # strip() rather than slicing off the last character: the final line may
    # lack a trailing newline, in which case slicing would cut a real letter
    # off the variable name. Blank lines are skipped so that no empty name
    # ends up in the list.
    variables = [line.strip() for line in file if line.strip()]

print(variables)

['ABBLEED', 'AGE1', 'AGE10', 'AGE11', 'AGE12', 'AGE2', 'AGE3', 'AGE4', 'AGE5', 'AGE6', 'AGE7', 'AGE8', 'AGE9', 'AGE', 'ALCHL24', 'WORSE', 'FOODPNA', 'AVCIGDA', 'GLASBEE', 'GLASLIQ', 'GLASWIN', 'BONES1', 'BONES2', 'BONES3', 'BOTHOTF', 'LEKBOTH', 'COMBIN1', 'COMBIN2', 'NOREMEB', 'BATCARB', 'DTTDFIB', 'BATKCAL', 'DTTALCH', 'BATNIAC', 'BATPHOS', 'BATPOTS', 'BATPROT', 'BATRIBO', 'BATTFAT', 'HOMEXPD', 'DIABETE', 'HAVEPER', 'LIKEFEL', 'ALLBCAR', 'ALLCALC', 'ALLFOL', 'ALLIRON', 'ALLARE', 'ALLB1', 'ALLB12', 'ALLB6', 'ALLVITC', 'ALLVITD', 'ALLVITE', 'ALLZINC', 'ALLB2', 'NOLIKE', 'EXPECT', 'DNTKNOW', 'DONTKNO', 'MENODEP', 'IMEDTHR', 'EMBDDEV', 'E2AVE', 'ESTRDA1', 'ESTRDA2', 'ESTRNJ1', 'ESTRNJ2', 'ESTROG1', 'ESTROG2', 'EFPDFIB', 'EFPB1', 'EFPB12', 'EFPB6', 'EFPARE', 'EFP', 'LEKDISC', 'EXERCIS', 'EXERGEN', 'EXERHAR', 'EXERMEM', 'EXERMEN', 'EXEROST', 'EXEROTH', 'EXERSPE', 'EXERADV', 'EXERPER', 'EXERWGH', 'EXERLOO', 'EXER12H', 'FACEI1', 'FACEI10', 'FACEI11', 'FACEI12', 'FACEI13', 'FACEI14', 'FACEI15'

### ⚖ Normalizing variable names to find common variables

Because each column name ends with the visit ID, we strip that suffix so that the same variable can be matched across visits.

In [4]:
"""
    Normalize the variable names.

    This requires getting rid of the last character in each column name, which, as stated
    above, is the visit ID, so we can concatenate all the dataframes together.

    Note: not all the dataframes have the same columns, so we need to get the union of all
    the columns in all the dataframes.
"""

# Exhaust the generator so every visit dataframe is kept in a list.
visit_dfs = list(get_visit_records())

  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(


In [5]:
# Preview the first rows of the first dataframe in the list.
visit_dfs[0].head()

Unnamed: 0,SWANID,VISIT,INTDAY1,AGE1,LANGINT1,RACE,PREGNAN1,PREVBLO1,EATDRIN1,STRTPER1,...,SPSCDAY1,SPSCTIM1,SPSCMOD1,HPSCDAY1,HPSCTIM1,HPSCMOD1,SPBMDT1,HPBMDT1,BMDFLG1,STATUS1
0,10046,1,413,53,1,2,1,1.0,1,2,...,413.0,0:10:08,5.0,413.0,0:09:52,5.0,1.1277,0.9619,0.0,7
1,10056,1,357,52,1,4,1,,1,2,...,441.0,0:12:55,11.0,441.0,0:13:03,11.0,0.914,0.8499,0.0,4
2,10092,1,364,46,1,4,1,,1,1,...,364.0,0:18:01,5.0,364.0,0:17:39,5.0,1.0377,0.8412,0.0,4
3,10126,1,442,50,1,1,1,,1,2,...,,.,,,.,,,,,4
4,10153,1,374,52,1,3,1,,1,2,...,402.0,0:10:47,11.0,402.0,0:10:40,11.0,1.0673,1.0313,0.0,4


In [6]:
def normalize_columns(df, suffix: str):
    """
    Return the column names of ``df`` with a trailing ``suffix`` removed.

    Names that do not end with ``suffix`` are returned unchanged. The result
    is a new list in column order; ``df`` itself is not modified.
    """
    stripped_names = []
    for name in df.columns:
        # removesuffix() already leaves non-matching names untouched.
        stripped_names.append(name.removesuffix(suffix))
    return stripped_names


# Seed the intersection with the first dataframe, whose columns end in '1'.
unique_variables = set(normalize_columns(visit_dfs[0], '1'))

# The remaining dataframes use the suffixes '2', '3', ... in list order;
# keep only the names that every one of them shares.
for visit_number, visit_df in enumerate(visit_dfs[1:], start=2):
    unique_variables.intersection_update(
        normalize_columns(visit_df, str(visit_number))
    )

variable_inter = list(unique_variables)
len(variable_inter)

291

In [7]:
# Display the normalized names shared by all the dataframes.
variable_inter

['BRSTEXA',
 'SPSCMOD',
 'LONELY',
 'CRYING',
 'CHOLST1',
 'SLEEP2',
 'FLGDIF',
 'FERTIL1',
 'FEELBLU',
 'LANGINT',
 'OTHMED5',
 'DIURTW1',
 'HAPPEN1',
 'SIDEEFF',
 'MONEYPR',
 'CHILDMO',
 'TALKLES',
 'PREGNAN',
 'OTHME11',
 'MAJEVEN',
 'HARTTW2',
 'THYRTW2',
 'MAMOGRA',
 'CHNGJOB',
 'WORKTRB',
 'OTHRTW2',
 'OTHRTW3',
 'OTHTW14',
 'THYRTW1',
 'OTC1',
 'MENOSYM',
 'OUTCOME',
 'OSTEOPO',
 'STATUS',
 'EFFORT',
 'WAKEUP',
 'GOOD',
 'OOPHORE',
 'OTC4',
 'CANCER',
 'BRSTFEE',
 'LEGALPR',
 'PHYDAY',
 'WEIGHT',
 'SITESPE',
 'FERTIL2',
 'VISIT',
 'HPSCTIM',
 'SYSBP1',
 'HAPPEN2',
 'BMI',
 'PRBBLEE',
 'THYROI1',
 'HLTHSER',
 'BLEEDNG',
 'PROGTW2',
 'DIZZY',
 'WORSREL',
 'BCPTWI2',
 'STROKE',
 'PILING',
 'STRTPER',
 'GETGOIN',
 'SERIPRO',
 'DIURET1',
 'HYSTDAY',
 'CHOLTW1',
 'BCPTWI1',
 'OTHRTW8',
 'LANGSAA',
 'OTHTW12',
 'PRTUNEM',
 'SPSCTIM',
 'ESTRTW2',
 'OTHRTW5',
 'YOUNGLK',
 'UNFRNDL',
 'PAINTW2',
 'HRMDAY',
 'THYROID',
 'T',
 'CANCERS',
 'THYROI2',
 'SWANID',
 'FAILURE',
 'OTHRTW1',
 'SPSC

In [8]:
# Having variables shared across all the dataframes, we can now filter them out by
# our list of selected variables

# 2023-12-02: Added the manually selected variables.
# These are always included, whether or not they are in the shared set.
manually_selected_variables = {
    'STATUS',
    'WEIGHT',
    'HEIGHT',
    'EXERCIS',
    'DIETNUT',
    'EMOCTDW',
    'EMOCARE',
    'SOCIAL',
    'CANCERS',
    'TRBLSLE',
    'HAPPY',
    'SMOKERE',
    'MENODEP',
}

selected_shared_variables = (
    set(variables)
    .intersection(variable_inter)
    .union(manually_selected_variables)
)
len(selected_shared_variables)

62

In [9]:
# Display the final set of selected variable names.
selected_shared_variables

{'ADVISTO',
 'AGE',
 'ALCHL24',
 'AVCIGDA',
 'BONES1',
 'BONES2',
 'BONES3',
 'BROKEBO',
 'CANCER',
 'CANCERS',
 'COMBIN1',
 'COMBIN2',
 'DIABETE',
 'DIETNUT',
 'DNTKNOW',
 'DONTKNO',
 'E2AVE',
 'EMOCARE',
 'EMOCTDW',
 'ESTRDA1',
 'ESTRDA2',
 'ESTRNJ1',
 'ESTRNJ2',
 'ESTROG1',
 'ESTROG2',
 'EXERCIS',
 'EXPENSI',
 'FRNADVI',
 'HAPPY',
 'HAVEPER',
 'HCPADVI',
 'HEIGHT',
 'HORMOTH',
 'HOTFLAS',
 'HOURSPA',
 'IMPRMEM',
 'LANGINT',
 'LANGSAA',
 'LIKEFEL',
 'MENODEP',
 'MENOSYM',
 'NOLIKE',
 'NOREASO',
 'NOREMEB',
 'OSTEOPO',
 'OSTEOPR',
 'OUTCOME',
 'PHYSILL',
 'PRBBLEE',
 'PRGNANT',
 'RACE',
 'REDUHAR',
 'REGPERI',
 'SIDEEFF',
 'SMOKERE',
 'SOCIAL',
 'STATUS',
 'STOPOTH',
 'TRBLSLE',
 'VAGINDR',
 'WEIGHT',
 'YOUNGLK'}

### 📝 Renaming columns and final dataframe

In [17]:
from pandas import DataFrame


def rename_df_columns_to_normalized(df: DataFrame, selected_variables: set, suffix: str):
    """
    Rename, in place, the columns of ``df`` whose normalized name is selected.

    Parameters
    ----------
    df : DataFrame
        Dataframe to modify; its columns are renamed in place.
    selected_variables : set
        Normalized variable names that should lose their visit suffix.
    suffix : str
        Visit suffix to remove from the end of the column names.

    Returns
    -------
    None
    """
    normalized_columns = normalize_columns(df, suffix)

    # Build the whole mapping first and rename once. Renaming column by column
    # is slower and lets renames chain: a column just renamed to 'X' would be
    # caught again by a later 'X' -> 'Y' step.
    columns_to_rename = {
        col: normal_col
        for col, normal_col in zip(df.columns, normalized_columns)
        if normal_col in selected_variables and col != normal_col
    }

    if columns_to_rename:
        # Substitute the column names with the normalized ones
        df.rename(columns=columns_to_rename, inplace=True)


def add_missing_columns(df: DataFrame, selected_variables: set):
    """
    Add, in place, an all-``None`` column for every selected variable that
    ``df`` does not already have.

    Existing columns are left untouched. Returns ``None``.
    """
    # Evaluated lazily, so membership is checked against the current columns.
    absent = (name for name in selected_variables if name not in df.columns)
    for name in absent:
        df[name] = None

for index, df in enumerate(visit_dfs):

    # Position i in the list holds the visit whose columns end in str(i + 1).
    index_str = str(index + 1)

    # Rename BEFORE adding missing columns. While a column is still called
    # e.g. 'AGE1', the name 'AGE' looks missing; adding an empty 'AGE' first
    # and renaming afterwards would leave two columns named 'AGE'.
    rename_df_columns_to_normalized(df, selected_shared_variables, index_str)
    add_missing_columns(df, selected_shared_variables)


In [18]:
# Check that the columns have been renamed correctly: selecting by the
# normalized names requires each selected variable to be a column of the
# first dataframe.
visit_dfs[0][list(selected_shared_variables)].head()

Unnamed: 0,DIABETE,CANCERS,EXPENSI,RACE,LIKEFEL,EXERCIS,LANGINT,SIDEEFF,VAGINDR,EMOCTDW,...,LANGSAA,SMOKERE,E2AVE,AGE,ESTROG1,NOREASO,YOUNGLK,BONES1,ALCHL24,SOCIAL
0,1,1,-1,2,-1,,1,-1,1,1,...,1,1,70.15,53,2,-1,1,-1,1,5
1,1,1,-1,4,-1,,1,-1,1,1,...,1,1,194.55,52,1,-1,-1,-1,2,5
2,1,1,-1,4,-1,,1,-1,1,1,...,1,1,250.65,46,1,-1,-1,PINKY FINGER BONE BROKE,2,5
3,1,1,-1,1,-1,,1,-1,2,2,...,1,1,187.95,50,1,-1,-1,-1,1,3
4,1,1,-1,3,-1,,1,-1,1,1,...,1,1,119.5,52,1,-1,-1,-1,1,5


In [19]:
# Same selection on the second dataframe in the list.
# NOTE(review): this displays the whole frame; .head() would keep the output short.
visit_dfs[1][list(selected_shared_variables)]

Unnamed: 0,DIABETE,CANCERS,EXPENSI,RACE,LIKEFEL,EXERCIS,LANGINT,SIDEEFF,VAGINDR,EMOCTDW,...,LANGSAA,SMOKERE,E2AVE,AGE,ESTROG1,NOREASO,YOUNGLK,BONES1,ALCHL24,SOCIAL
0,1,1,,2,,,1,,1,1,...,1,1,33.1,54,2,,1,,1,5
1,1,1,,4,,,1,,1,1,...,1,1,45.05,53,1,,,,2,5
2,1,1,,1,,,1,,2,2,...,1,1,17.9,50,1,,,,1,4
3,1,1,,3,,,1,,1,1,...,1,1,28.25,53,1,,,,1,5
4,1,1,,2,,,1,,1,1,...,1,1,57.1,48,1,,,,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2743,1,1,,4,,,1,,2,1,...,1,1,,45,1,,1,,1,5
2744,1,1,,3,,,1,,1,1,...,1,2,26.75,50,1,,,,1,4
2745,1,1,,4,,,1,,1,1,...,1,1,370.75,47,1,,,,1,5
2746,1,1,,2,,,3,,1,1,...,3,1,302.2000000001,49,1,,,,1,5


In [26]:
# Explore the distinct raw values of the OSTEOPO column in the second
# dataframe, to see how empty answers are stored.
visit_dfs[1]['OSTEOPO'].unique()

array(['1', ' ', '2'], dtype=object)

### ✨ Encoding empty values

In [27]:
"""
    It was chosen that missing values would be represented by -9999;
    however, values here are still strings (object, according to pandas),
    so we need to replace them with the actual value.
"""

# Before continuing, check null values: turn single-space cells into nulls
# in place, then print the per-column summary of the selected variables.
selected_columns = list(selected_shared_variables)

for index, visit_df in enumerate(visit_dfs):
    visit_df.replace(' ', None, inplace=True)
    print(f'Visit {index + 1} information:\n\n')
    visit_df[selected_columns].info()
    print('------------------------------------')



Visit 1 information:


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2881 entries, 0 to 2880
Data columns (total 62 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   DIABETE  2864 non-null   object
 1   CANCERS  2864 non-null   object
 2   EXPENSI  2864 non-null   object
 3   RACE     2881 non-null   int64 
 4   LIKEFEL  2864 non-null   object
 5   EXERCIS  0 non-null      object
 6   LANGINT  2867 non-null   object
 7   SIDEEFF  2864 non-null   object
 8   VAGINDR  2861 non-null   object
 9   EMOCTDW  2863 non-null   object
 10  HORMOTH  2866 non-null   object
 11  PHYSILL  2862 non-null   object
 12  NOREMEB  2864 non-null   object
 13  DONTKNO  2867 non-null   object
 14  REDUHAR  2867 non-null   object
 15  REGPERI  2867 non-null   object
 16  FRNADVI  2867 non-null   object
 17  NOLIKE   2864 non-null   object
 18  HOTFLAS  2862 non-null   object
 19  IMPRMEM  2867 non-null   object
 20  OSTEOPR  2864 non-null   object
 21  MENOSYM  2867 

In [28]:
# Encode every null cell with the -9999 sentinel, modifying each dataframe in place.
for visit_df in visit_dfs:
    visit_df.fillna(value=-9999, inplace=True)

In [29]:
# Preview the selected columns of the second dataframe after the fill.
visit_dfs[1][list(selected_shared_variables)].head()

Unnamed: 0,DIABETE,CANCERS,EXPENSI,RACE,LIKEFEL,EXERCIS,LANGINT,SIDEEFF,VAGINDR,EMOCTDW,...,LANGSAA,SMOKERE,E2AVE,AGE,ESTROG1,NOREASO,YOUNGLK,BONES1,ALCHL24,SOCIAL
0,1,1,-9999,2,-9999,-9999,1,-9999,1,1,...,1,1,33.1,54,2,-9999,1,-9999,1,5
1,1,1,-9999,4,-9999,-9999,1,-9999,1,1,...,1,1,45.05,53,1,-9999,-9999,-9999,2,5
2,1,1,-9999,1,-9999,-9999,1,-9999,2,2,...,1,1,17.9,50,1,-9999,-9999,-9999,1,4
3,1,1,-9999,3,-9999,-9999,1,-9999,1,1,...,1,1,28.25,53,1,-9999,-9999,-9999,1,5
4,1,1,-9999,2,-9999,-9999,1,-9999,1,1,...,1,1,57.1,48,1,-9999,-9999,-9999,1,5


In [30]:
# For time columns, empty values are coded as '.'. Replace them with ''

for visit_df in visit_dfs:
    visit_df.replace(to_replace='.', value='', inplace=True)

visit_dfs[1][list(selected_shared_variables)].head()

Unnamed: 0,DIABETE,CANCERS,EXPENSI,RACE,LIKEFEL,EXERCIS,LANGINT,SIDEEFF,VAGINDR,EMOCTDW,...,LANGSAA,SMOKERE,E2AVE,AGE,ESTROG1,NOREASO,YOUNGLK,BONES1,ALCHL24,SOCIAL
0,1,1,-9999,2,-9999,-9999,1,-9999,1,1,...,1,1,33.1,54,2,-9999,1,-9999,1,5
1,1,1,-9999,4,-9999,-9999,1,-9999,1,1,...,1,1,45.05,53,1,-9999,-9999,-9999,2,5
2,1,1,-9999,1,-9999,-9999,1,-9999,2,2,...,1,1,17.9,50,1,-9999,-9999,-9999,1,4
3,1,1,-9999,3,-9999,-9999,1,-9999,1,1,...,1,1,28.25,53,1,-9999,-9999,-9999,1,5
4,1,1,-9999,2,-9999,-9999,1,-9999,1,1,...,1,1,57.1,48,1,-9999,-9999,-9999,1,5


### 🔗 Concatenating dataframes

In [31]:
# Concatenate all the dataframes together, keeping only the selected variables.
# Each frame keeps its own row labels, so labels repeat in the result.
selected_columns = list(selected_shared_variables)
full_df = pd.concat([visit_df[selected_columns] for visit_df in visit_dfs])

# Write it to a file (row labels are not written)
full_df.to_csv('./datasets/visit_dfs.csv', index=False)

# ❌ Removing columns with several missing values

In [32]:
# Count, for each column, how many cells hold the -9999 sentinel.
missing_values = {
    var: len(full_df[full_df[var] == -9999])
    for var in full_df.columns
}

# Order the counts from most to least missing. The ascending sort is reversed
# (instead of sorting with reverse=True) so that tied columns keep the same
# relative order as before.
ranked_counts = sorted(missing_values.items(), key=lambda item: item[1])
ranked_counts.reverse()
sorted_missing_values = dict(ranked_counts)

In [33]:
# Inspect the row labels of the concatenated dataframe.
full_df.index

Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       2235, 2236, 2237, 2238, 2239, 2240, 2241, 2242, 2243, 2244],
      dtype='int64', length=25487)

In [34]:
# Show the percentage of missing values for each column
total_rows = len(full_df.index)
missing_ratio = pd.Series(
    {column: count / total_rows * 100 for column, count in sorted_missing_values.items()},
    name='null_ratio',
)
missing_ratio

MENODEP    80.962844
DIETNUT    59.822655
EXERCIS    59.822655
BONES3     47.086750
BONES2     47.067132
             ...    
ESTRNJ1     0.470828
COMBIN1     0.470828
LANGINT     0.466905
AGE         0.160866
RACE        0.000000
Name: null_ratio, Length: 62, dtype: float64

In [37]:
# Read csv with variable names and descriptions, and index it by variable ID
var_names = pd.read_csv('./lista_variables/variables.csv')

# Map each ID to its normalized form: a trailing '9' is removed from IDs
# that end in a digit; every other ID is kept as is.
normalized_colnames = {}
for var in var_names['id']:
    normalized_colnames[var] = var.removesuffix('9') if var[-1].isdigit() else var

var_names['id'] = var_names['id'].map(normalized_colnames)
var_names = var_names.set_index('id')
var_names

Unnamed: 0_level_0,nombre,desc
id,Unnamed: 1_level_1,Unnamed: 2_level_1
FORMPHY,Abbreviated or Full Physical Measures,Abbreviated or Full Physical Measures
FORMINT,"Abbreviated, mailed or regular interview",Which version of the interview was administered:
ABBLEED,Abnormal vaginal bleeding since last visit,"Since your last study visit, have you had any ..."
ACTIPAN,Active this week,Indicate in the space next to each item how st...
ACUPUNC,Acupuncture,"During the past 12 months, have you used any o..."
...,...,...
YOGASPE,Yoga - Other specify,If YES to YOGA9: Please tell me whether or not...
YOGAADV,Yoga - Provider advice,If YES to YOGA9: Please tell me whether or not...
YOGAPER,Yoga - Regulate Periods,If YES to YOGA9: Please tell me whether or not...
YOGAWGH,Yoga - Weight,If YES to YOGA9: Please tell me whether or not...


In [38]:
# Put each variable's readable name next to its missing-value percentage;
# the inner join keeps only the IDs present in both inputs.
name_and_ratio = [var_names['nombre'], missing_ratio]
ratio_comparison_df = pd.concat(name_and_ratio, axis=1, join='inner')

In [39]:
# Rank the variables from the highest to the lowest null_ratio.
ratio_comparison_df.sort_values(by='null_ratio', ascending=False)

Unnamed: 0,nombre,null_ratio
MENODEP,"During menopause, depressed",80.962844
EXERCIS,Exercise,59.822655
DIETNUT,Nutritious diet,59.822655
BONES3,Bone #3 broken,47.08675
BONES2,Bone #2 broken,47.067132
BONES1,Bone #1 broken,46.906266
OUTCOME,Outcome of pregnancy,26.70381
ESTRNJ2,Estrogen injection/patch #2,22.325107
COMBIN2,Combination estrogen/progestin #2,21.901361
ESTROG2,Estrogen pills #2,21.49331


### Check here the columns to preserve

| id      | name                                        | null_ratio | preserved |
|---------|---------------------------------------------|------------|-----------|
| MENODEP | During menopause, depressed                 | 80.962844  | ✅        |
| EXERCIS | Exercise                                    | 59.822655  | ✅        |
| DIETNUT | Nutritious diet                             | 59.822655  | ✅        |
| BONES3  | Bone #3 broken                              | 47.086750  | ❌        |
| BONES2  | Bone #2 broken                              | 47.067132  | ❌        |
| BONES1  | Bone #1 broken                              | 46.906266  | ❌        |
| OUTCOME | Outcome of pregnancy                        | 26.703810  | ✅        |
| ESTRNJ2 | Estrogen injection/patch #2                | 22.325107  | ✅        |
| COMBIN2 | Combination estrogen/progestin #2          | 21.901361  | ✅        |
| ESTROG2 | Estrogen pills #2                          | 21.493310  | ✅        |
| E2AVE   | Estradiol (average, pg/mL)                 | 15.878683  | ✅        |
| ALCHL24 | Alcohol in Last 24 hours                   | 15.576568  | ✅        |
| DNTKNOW | Don't know why stopped hormones            | 15.172441  | ❌        |
| NOREASO | Stopped hormones no reason given           | 15.172441  | ❌        |
| DONTKNO | Don't know/remember why take hormones      | 14.179778  | ❌        |
| ESTRDA2 | Estrogen #2 prescription daily or off & on | 13.603013  | ✅        |
| HOURSPA | How many hours/week work for pay           | 13.469612  | ✅        |
| NOLIKE  | Do not like taking any medications         | 12.743752  | ❌        |
| EXPENSI | Too expensive                              | 12.743752  | ❌        |
| CANCER  | Worried about cancer                       | 12.743752  | ❌        |
| ADVISTO | Health care provider advised to stop       | 12.743752  | ❌        |
| SIDEEFF | Worried about possible side effects        | 12.743752  | ❌        |
| NOREMEB | Couldn't remember to take them             | 12.743752  | ❌        |
| STOPOTH | Stopped hormones other reason              | 12.743752  | ❌        |
| HAVEPER | Did not like having periods                | 12.743752  | ❌        |
| LIKEFEL | Did not like how felt on them              | 12.743752  | ❌        |
| PRBBLEE | Problems with bleeding                     | 12.739828  | ❌        |
| HORMOTH | Take hormones for other reasons            | 11.739318  | ❌        |
| MENOSYM | Relieve menopausal symptoms                | 11.735395  | ❌        |
| YOUNGLK | Stay young-looking                         | 11.735395  | ❌        |
| REGPERI | Regulate periods                           | 11.735395  | ❌        |
| HCPADVI | Health care provider advised               | 11.735395  | ❌        |
| OSTEOPR | Reduce risk of osteoporosis                | 11.735395  | ❌        |
| IMPRMEM | Improve memory                             | 11.735395  | ❌        |
| REDUHAR | Reduce risk of heart disease               | 11.735395  | ❌        |
| FRNADVI | Friend/relative advised                    | 11.735395  | ❌        |
| HEIGHT  | Height (in cm)                             | 10.829050  | ✅        |
| ESTRDA1 | Estrogen #1 prescription daily or off & on | 10.644642  | ✅        |
| WEIGHT  | Weight (in kg)                             | 6.493506   | ✅        |
| HAPPY   | Happy past week                            | 6.081532   | ✅        |
| PRGNANT | Pregnant since last visit                  | 4.861302   | ✅        |
| VAGINDR | Vaginal dryness past 2 weeks               | 2.668027   | ✅        |
| SMOKERE | Smoked regularly since last visit          | 2.664103   | ✅        |
| HOTFLAS | Hot flashes past 2 weeks                   | 2.664103   | ✅        |
| PHYSILL | Serious illness family upsetting since last visit | 2.664103   | ✅        |
| AVCIGDA | Average cigarettes/day since last visit    | 2.660180   | ✅        |
| LANGSAA | Language of Self-A                         | 2.660180   | ❌        |
| TRBLSLE | Trouble falling asleep past 2 weeks        | 2.660180   | ✅        |
| CANCERS | Cancer since last visit                    | 0.541453   | ✅        |
| DIABETE | Diabetes since last visit                  | 0.537529   | ✅        |
| BROKEBO | Number events where bone(s) broken since last visit | 0.537529   | ✅        |
| OSTEOPO | Osteoporosis since last visit              | 0.537529   | ✅        |
| STATUS  | Menopausal Status                          | 0.537529   | ✅        |
| ESTROG1 | Estrogen pills #1                          | 0.478675   | ✅        |
| ESTRNJ1 | Estrogen injection/patch #1                | 0.470828   | ✅        |
| COMBIN1 | Combination estrogen/progestin #1          | 0.470828   | ❌        |
| LANGINT | Language of Interview                      | 0.466905   | ❌        |
| AGE     | Age At Current Visit (Integer)             | 0.160866   | ✅        |
| RACE    | Race/Ethnicity                             | 0.000000   | ✅        |


> During the manual selection of variables, we found that the following variables are not present in all the visits (or, perhaps, were missed because of the prompt given to the LLM used to filter variables), but they do appear in most of the visits and are relevant to the study:

- **STATUS:** menopause status.
- **WEIGHT:** weight in kg.
- **HEIGHT:** height in cm.
- **EXERCIS:** exercise or not in past 12 months.
- **DIETNUT:** if diet carried out is considered healthy.
- **EMOCTDW:** emotional cutdown due to menopause symptoms.
- **EMOCARE:** how careful you've been given how symptoms make you feel.
- **SOCIAL:** how much menopause symptoms have affected social life.
- **CANCERS:** has had or not cancer since last visit and which type.
- **TRBLSLE:** trouble falling asleep past 2 weeks.
- **HAPPY:** happy with life last week.
- **SMOKERE:** regular smoker.
- **MENODEP:** depression status due to menopause symptoms.

> ... therefore the notebook will be rerun with these variables included in the final dataframe.