In [15]:
%%capture
! pip install pyarrow

In [16]:
import pandas as pd
import numpy as np

In [17]:
# Read the parquet file (path is relative to the notebook's working directory)
# into the frame that every later cell refers to as `df`.
df = pd.read_parquet(path='data/processed/randhrs_stroke_4yr.parquet')

# Preview the first five rows as the cell's rich output.
df.head(n=5)

Unnamed: 0,person_id,calendar_year,birth_year,age,age_squared,female,ethnicity,education_years,college_plus,self_rated_health,...,sleep_problem,sleep_change,new_sleep_problem,former_smoker,quit_smoking,sedentary,stopped_activity,drinks_per_week,heavy_drinking,incident_stroke_4yr
0,3010,1996,1936.0,60.0,3600.0,False,White,3.0,False,4.0,...,0,0.0,0,0,0,0,0,3.0,0,0.0
1,3020,1996,1938.0,58.0,3364.0,True,White,5.0,True,3.0,...,0,0.0,0,0,0,0,0,0.0,0,0.0
2,10001010,1996,1939.0,57.0,3249.0,False,White,3.0,False,3.0,...,1,0.0,0,0,0,0,0,0.0,0,0.0
3,10004010,1996,1939.0,57.0,3249.0,False,White,5.0,True,3.0,...,1,0.0,0,1,0,0,0,12.0,0,0.0
4,10004040,1996,1946.0,50.0,2500.0,True,White,3.0,False,1.0,...,0,0.0,0,1,0,0,0,28.0,1,0.0


In [18]:
# Print every column label on its own line; the default head()/repr output
# elides the middle columns of a wide frame, so this shows the full list.
for col in df.columns.tolist():
    print(col)

person_id
calendar_year
birth_year
age
age_squared
female
ethnicity
education_years
college_plus
self_rated_health
bmi
weight
height
mobility_limitations
large_muscle_limitations
adl_limitations
iadl_limitations
fine_motor_limitations
cognition_score
memory_recall
immediate_recall
delayed_recall
serial7
depression_score
felt_depressed
everything_effort
restless_sleep
felt_lonely
ever_smoked
current_smoker
drinks_per_day
drink_days_per_week
vigorous_activity
marital_status
self_rated_health_lag1
bmi_lag1
weight_lag1
height_lag1
mobility_limitations_lag1
large_muscle_limitations_lag1
adl_limitations_lag1
iadl_limitations_lag1
fine_motor_limitations_lag1
cognition_score_lag1
memory_recall_lag1
immediate_recall_lag1
delayed_recall_lag1
serial7_lag1
depression_score_lag1
felt_depressed_lag1
everything_effort_lag1
restless_sleep_lag1
felt_lonely_lag1
ever_smoked_lag1
current_smoker_lag1
drinks_per_day_lag1
drink_days_per_week_lag1
vigorous_activity_lag1
marital_status_lag1
self_rated_health_

In [19]:
# Report the frame's dimensions: the (rows, columns) tuple first,
# then each axis length on its own line.
print("Shape:", df.shape)
print("Number of rows:", len(df.index))
print("Number of columns:", len(df.columns))

Shape: (113886, 122)
Number of rows: 113886
Number of columns: 122


In [20]:
# The target is taken to be the first column whose name starts with
# "incident_"; an IndexError here means no such column exists.
incident_cols = [name for name in df.columns if name.startswith("incident_")]
target_col = incident_cols[0]

# Every column other than the target is counted as a feature
# (no other columns are excluded by this cell).
n_features = len(df.drop(columns=target_col).columns)
print("Number of feature columns:", n_features)

Number of feature columns: 121


In [21]:
# describe() yields one row per statistic and one column per variable.
# Keep the four statistics of interest (in this display order), then
# transpose so that each variable becomes a row.
summary = df.describe().loc[['mean', 'min', 'max', 'std']].T
print(summary)

                             mean     min          max           std
person_id            2.225515e+08  3010.0  959738010.0  2.346911e+08
calendar_year        2.002899e+03  1996.0       2010.0  4.535107e+00
birth_year           1.940881e+03  1897.0       1995.0  1.139670e+01
age                  6.201855e+01    15.0        101.0  1.035951e+01
age_squared          3.953620e+03   225.0      10201.0  1.311368e+03
...                           ...     ...          ...           ...
sedentary            1.367683e-01     0.0          1.0  3.436041e-01
stopped_activity     3.802048e-02     0.0          1.0  1.912465e-01
drinks_per_week      2.502524e+00     0.0        350.0  6.326500e+00
heavy_drinking       3.502625e-02     0.0          1.0  1.838470e-01
incident_stroke_4yr  3.491211e-02     0.0          1.0  1.835580e-01

[119 rows x 4 columns]


In [22]:
# Skewness of each numeric column, largest first; show the ten most
# right-skewed columns as the cell's output.
numeric_skew = df.select_dtypes(include=np.number).skew()
numeric_skew.sort_values(ascending=False).head(n=10)

drinks_per_week          7.465262
quit_smoking             6.787381
iadl_limitations_lag2    5.319457
incident_stroke_4yr      5.067565
heavy_drinking           5.058358
iadl_limitations_lag1    4.957879
health_crash             4.890623
stopped_activity         4.831331
iadl_limitations         4.524491
drink_days_per_week      4.500250
dtype: float64

In [23]:
# Re-derive the target column here so this cell does not depend on the
# earlier feature-count cell having been run.
incident_cols = [name for name in df.columns if name.startswith("incident_")]
target_col = incident_cols[0]
features = df.drop(columns=target_col)

# Pairwise correlations between the numeric feature columns.
corr_matrix = features.corr(numeric_only=True)

# True strictly above the diagonal: keeps each unordered pair exactly once
# and excludes every column's correlation with itself.
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

# Absolute correlations -> masked to the upper triangle -> flattened to a
# (column_a, column_b) indexed Series -> strongest pairs first.
corr_pairs = (
    corr_matrix.abs()
    .where(upper_triangle)
    .stack()
    .sort_values(ascending=False)
) # pyright: ignore[reportCallIssue]

print(corr_pairs.head(20))

vigorous_activity   sedentary                1.000000
restless_sleep      sleep_problem            1.000000
ever_smoked_lag1    ever_smoked_lag2         0.999585
ever_smoked         ever_smoked_lag1         0.999561
                    ever_smoked_lag2         0.999298
age                 age_squared              0.994394
height_lag1         height_lag2              0.983539
height              height_lag1              0.979984
                    height_lag2              0.967810
weight_change_kg    weight_change_pct        0.957108
weight_lag1         weight_lag2              0.949445
memory_recall       delayed_recall           0.948592
weight              weight_lag1              0.948274
memory_recall_lag1  delayed_recall_lag1      0.948138
memory_recall_lag2  delayed_recall_lag2      0.947806
weight              weight_lag2              0.929459
bmi_change          weight_change_kg         0.928369
memory_recall       immediate_recall         0.925016
bmi_lag1            bmi_lag2

In [24]:
# Share of missing values per column, highest first.
missing = (
    df.isnull()                      # True wherever a value is missing
    .mean()                          # column-wise mean of the mask = missing fraction
    .sort_values(ascending=False)
)
print(missing.head(n=15))


vigorous_activity         0.749399
vigorous_activity_lag1    0.651660
vigorous_activity_lag2    0.594788
everything_effort_lag2    0.244429
felt_lonely_lag2          0.244341
felt_depressed_lag2       0.244183
restless_sleep_lag2       0.244121
depression_score_lag2     0.243972
memory_recall_lag2        0.243682
serial7_lag2              0.243682
cognition_score_lag2      0.243682
immediate_recall_lag2     0.243682
delayed_recall_lag2       0.243682
bmi_lag2                  0.219623
weight_lag2               0.217700
dtype: float64


In [25]:
# A column is reported as constant when it has at most one distinct value.
# nunique() ignores missing values, so an all-NaN column, or a column with
# a single value plus NaNs, is reported as well.
distinct_counts = df.nunique()
constant_cols = distinct_counts[distinct_counts <= 1].index.tolist()
print("Constant columns:", constant_cols)

Constant columns: []


In [26]:
# List any column whose name contains "eligible_" or "target_";
# an empty list means no such columns are present in the frame.
[
    name
    for name in df.columns
    if any(marker in name for marker in ("eligible_", "target_"))
]

[]

In [27]:
# Count rows that exactly repeat an earlier row across all columns
# (this compares whole rows; it does not check any subset of key columns).
n_duplicate_rows = df.duplicated().sum()
print("Duplicate rows:", n_duplicate_rows)

Duplicate rows: 0


In [28]:
# Show the first five rows again after the checks above.
df.head(n=5)

Unnamed: 0,person_id,calendar_year,birth_year,age,age_squared,female,ethnicity,education_years,college_plus,self_rated_health,...,sleep_problem,sleep_change,new_sleep_problem,former_smoker,quit_smoking,sedentary,stopped_activity,drinks_per_week,heavy_drinking,incident_stroke_4yr
0,3010,1996,1936.0,60.0,3600.0,False,White,3.0,False,4.0,...,0,0.0,0,0,0,0,0,3.0,0,0.0
1,3020,1996,1938.0,58.0,3364.0,True,White,5.0,True,3.0,...,0,0.0,0,0,0,0,0,0.0,0,0.0
2,10001010,1996,1939.0,57.0,3249.0,False,White,3.0,False,3.0,...,1,0.0,0,0,0,0,0,0.0,0,0.0
3,10004010,1996,1939.0,57.0,3249.0,False,White,5.0,True,3.0,...,1,0.0,0,1,0,0,0,12.0,0,0.0
4,10004040,1996,1946.0,50.0,2500.0,True,White,3.0,False,1.0,...,0,0.0,0,1,0,0,0,28.0,1,0.0
