In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

In [15]:
df = pd.read_csv('Sleep_data__norm_ML2568.csv')

df = df.drop(df.columns[0], axis=1)  # drop col 0
df = df.drop(columns=['pid', 'date', 'date_'])  # drop non-informative columns

In [19]:
# CONTINUATION: target-aware imputation to fill missing target values

target_col = 'f_slp:fitbit_sleep_summary_rapids_sumdurationasleepmain_norm:allday'

# Drop rows with more than 80% missing values
row_missing_ratio = df.isna().mean(axis=1)
threshold = 0.8  # 80%
df_filtered = df.loc[row_missing_ratio <= threshold].copy()

print(f"Original rows: {len(df)}")
print(f"Remaining rows after dropping > 80% missing: {len(df_filtered)}")
print(f"dropped rows: {len(df) - len(df_filtered)}")

Original rows: 11470
Remaining rows after dropping > 80% missing: 10066
dropped rows: 1404


In [None]:

# Split into rows with and without target
df_with_target = df.dropna(subset=[target_col]).copy()
df_missing_target = df[df[target_col].isna()].copy()

# print size of dataset
print(f"Dataset shape: {df.shape}")
print(f"Rows with target: {len(df_with_target)} size: {df_with_target.shape}\nRows with missing target: {len(df_missing_target)} size: {df_missing_target.shape}")

# 1️⃣  Compute feature importance using Mutual Information instead of correlation
from sklearn.feature_selection import mutual_info_regression

numeric_cols = df_with_target.select_dtypes(include='number').columns.drop(target_col, errors='ignore')
X_num = df_with_target[numeric_cols].copy()
y_target = df_with_target[target_col].copy()

# Fill temporary NaNs for MI computation (MI cannot handle NaNs)
X_num_filled = X_num.fillna(X_num.median())

# Compute mutual information (nonlinear relationships)
mi_scores = mutual_info_regression(X_num_filled, y_target, random_state=42)

# Convert to Series for easier handling
mi_series = pd.Series(mi_scores, index=numeric_cols).sort_values(ascending=False)
top_features = mi_series.index[:40].tolist()

print("Top features with highest mutual information to target:")
print(top_features)

# for all these top features, check missingness in df_missing_target
print("Missingness in influential features (in rows with missing target):")
foundUsefulColumn = False
for feature in top_features:
    missing_count = df_missing_target[feature].isna().sum()
    total_count = df_missing_target.shape[0]
    missing_percentage = (missing_count / total_count * 100) if total_count > 0 else 0
    # criteria: missing less than 70% and MI > threshold
    if (missing_percentage < 70 and mi_series[feature] > 0.01):  # MI > 0.01 roughly means some relationship
        print(f"Missing in {feature}: {missing_percentage:.2f}%, mutual info: {mi_series[feature]:.4f}")
        foundUsefulColumn = True

if not foundUsefulColumn:
    print("No useful influential columns found for target imputation.")
    # Proceed with other imputation strategies 


# 2️⃣  Measure missingness in those features (in rows that have no target)
missing_counts = df_missing_target[top_features].isna().sum(axis=1)
df_missing_target['missing_in_influencers'] = missing_counts

# 3️⃣  Decide a threshold: how many influential features must be present
# Here we require that at least 80% of top_features are non-missing
threshold = int(0.7 * len(top_features))
eligible_mask = df_missing_target['missing_in_influencers'] <= (len(top_features) - threshold)
df_eligible_for_impute = df_missing_target[eligible_mask].copy()
df_drop_for_missing = df_missing_target[~eligible_mask].copy()

print(f"Eligible for target imputation: {len(df_eligible_for_impute)} rows")
print(f"Dropped (too incomplete): {len(df_drop_for_missing)} rows")

# 4️⃣  Prepare data for modeling target from top features
X_train = df_with_target[top_features].copy()
y_train = df_with_target[target_col].copy()
X_pred = df_eligible_for_impute[top_features].copy()

# Basic cleaning: impute median for feature missing values before modeling
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_pred_imputed = imputer.transform(X_pred)

# 5️⃣  Train a regression model to predict target for missing rows
from sklearn.ensemble import RandomForestRegressor

rf_imputer = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
    max_depth=10
)
rf_imputer.fit(X_train_imputed, y_train)

# 6️⃣  Predict missing targets
predicted_targets = rf_imputer.predict(X_pred_imputed)
df_eligible_for_impute[target_col] = predicted_targets
df_eligible_for_impute['target_imputed'] = True

# 7️⃣  Combine back into one DataFrame
df_with_target['target_imputed'] = False
df_filled = pd.concat([df_with_target, df_eligible_for_impute], ignore_index=True)

print(f"Final data size after imputation: {len(df_filled)}")
print(f"Rows with actual targets: {(~df_filled['target_imputed']).sum()}")
print(f"Rows with imputed targets: {df_filled['target_imputed'].sum()}")

# 8️⃣  Optional sanity check — compare distributions
plt.figure(figsize=(8,4))
plt.hist(df_with_target[target_col], bins=40, alpha=0.6, label='Original target')
plt.hist(predicted_targets, bins=40, alpha=0.6, label='Imputed target')
plt.xlabel('Sleep duration (normalized)')
plt.ylabel('Frequency')
plt.legend()
plt.title('Distribution: Original vs Imputed Target')
plt.show()

# df_filled is now your main dataset with both original and imputed targets


Dataset shape: (11470, 139)
Rows with target: 7336 size: (7336, 139)
Rows with missing target: 4134 size: (4134, 139)


ValueError: Input X contains NaN.

: 

In [None]:
for col in [c for c in df.columns if 'screen' in c]:
    corr = df.corr(numeric_only=True)[col].sort_values(ascending=False)
    print(f"\nTop correlations with {col}:\n", corr[1:6])



Top correlations with f_screen:phone_screen_rapids_countepisodeunlock_norm:afternoon:
 f_screen:phone_screen_rapids_countepisodeunlock_norm:allday      0.764269
f_screen:phone_screen_rapids_countepisodeunlock_norm:evening     0.309548
f_screen:phone_screen_rapids_countepisodeunlock_norm:morning     0.298422
f_blue:phone_bluetooth_doryab_uniquedevicesall_norm:allday       0.274187
f_blue:phone_bluetooth_doryab_uniquedevicesall_norm:afternoon    0.253772
Name: f_screen:phone_screen_rapids_countepisodeunlock_norm:afternoon, dtype: float64

Top correlations with f_screen:phone_screen_rapids_sumdurationunlock_norm:afternoon:
 f_screen:phone_screen_rapids_sumdurationunlock_locmap_home_norm:afternoon      0.583025
f_screen:phone_screen_rapids_sumdurationunlock_norm:allday                     0.497595
f_screen:phone_screen_rapids_sumdurationunlock_locmap_living_norm:afternoon    0.496432
f_screen:phone_screen_rapids_sumdurationunlock_locmap_home_norm:allday         0.306515
f_screen:phone_scr

In [None]:
def count_missing_values(data, columns=None):
    # If no specific columns provided, use all columns
    if columns is None:
        columns = data.columns
    
    # Count missing values for specified columns
    missing_counts = data[columns].isnull().sum()
    
    # Add percentage of missing values
    missing_percentage = (missing_counts / len(data)) * 100
    
    # Create a DataFrame with both counts and percentages
    missing_info = pd.DataFrame({
        'Missing Count': missing_counts,
        'Missing Percentage': missing_percentage
    })
    
    return missing_info.sort_values('Missing Count', ascending=False)

In [None]:
missing_stats = count_missing_values(df)
display(missing_stats)

Unnamed: 0,Missing Count,Missing Percentage
f_loc:phone_locations_barnett_hometime_norm:morning,11470,100.000000
f_loc:phone_locations_barnett_circdnrtn_norm:afternoon,11470,100.000000
f_loc:phone_locations_barnett_rog_norm:afternoon,11470,100.000000
f_loc:phone_locations_barnett_siglocsvisited_norm:afternoon,11470,100.000000
f_loc:phone_locations_barnett_wkenddayrtn_norm:afternoon,11470,100.000000
...,...,...
f_loc:phone_locations_doryab_totaldistance_norm:allday,1406,12.258065
Unnamed: 0,0,0.000000
pid,0,0.000000
date,0,0.000000


In [None]:
# drop col with more than 70% missing values
threshold = 0.7  # 70%
missing_rate = df.isnull().mean()
to_drop = missing_rate[missing_rate > threshold].index
df = df.drop(columns=to_drop)

In [None]:
missing_stats = count_missing_values(df.filter(like='f_loc'))
display(missing_stats)

Unnamed: 0,Missing Count,Missing Percentage
f_loc:phone_locations_locmap_duration_in_locmap_greens_norm:night,2431,21.19442
f_loc:phone_locations_locmap_duration_in_locmap_exercise_norm:night,2431,21.19442
f_loc:phone_locations_locmap_duration_in_locmap_study_norm:night,2431,21.19442
f_loc:phone_locations_doryab_totaldistance_norm:night,2431,21.19442
f_loc:phone_locations_doryab_timeathome_norm:night,2431,21.19442
f_loc:phone_locations_doryab_movingtostaticratio_norm:night,2431,21.19442
f_loc:phone_locations_doryab_movingtostaticratio_norm:morning,1930,16.826504
f_loc:phone_locations_doryab_timeathome_norm:morning,1930,16.826504
f_loc:phone_locations_doryab_totaldistance_norm:morning,1929,16.817786
f_loc:phone_locations_locmap_duration_in_locmap_exercise_norm:morning,1929,16.817786


In [None]:
missing_stats = count_missing_values(df.filter(like='f_screen'))
display(missing_stats)

Unnamed: 0,Missing Count,Missing Percentage
f_screen:phone_screen_rapids_sumdurationunlock_locmap_greens_norm:morning,7876,68.666085
f_screen:phone_screen_rapids_sumdurationunlock_locmap_living_norm:night,7309,63.722755
f_screen:phone_screen_rapids_sumdurationunlock_locmap_study_norm:afternoon,6826,59.51177
f_screen:phone_screen_rapids_sumdurationunlock_locmap_greens_norm:afternoon,6427,56.03313
f_screen:phone_screen_rapids_sumdurationunlock_locmap_living_norm:morning,6254,54.524847
f_screen:phone_screen_rapids_sumdurationunlock_locmap_living_norm:afternoon,5823,50.767219
f_screen:phone_screen_rapids_sumdurationunlock_locmap_living_norm:evening,5507,48.012206
f_screen:phone_screen_rapids_sumdurationunlock_locmap_study_norm:allday,5349,46.634699
f_screen:phone_screen_rapids_sumdurationunlock_locmap_home_norm:night,5278,46.015693
f_screen:phone_screen_rapids_sumdurationunlock_locmap_greens_norm:allday,4778,41.656495


In [None]:
missing_stats = count_missing_values(df.filter(like='f_call'))
display(missing_stats)

Unnamed: 0,Missing Count,Missing Percentage
f_call:phone_calls_rapids_missed_count_norm:allday,7305,63.687881
f_call:phone_calls_rapids_incoming_count_norm:allday,7305,63.687881
f_call:phone_calls_rapids_outgoing_count_norm:allday,7305,63.687881


In [None]:
missing_stats = count_missing_values(df.filter(like='f_blue'))
display(missing_stats)

Unnamed: 0,Missing Count,Missing Percentage
f_blue:phone_bluetooth_doryab_uniquedevicesall_norm:night,6069,52.911944
f_blue:phone_bluetooth_doryab_uniquedevicesall_norm:morning,3909,34.080209
f_blue:phone_bluetooth_doryab_uniquedevicesall_norm:evening,3508,30.584133
f_blue:phone_bluetooth_doryab_uniquedevicesall_norm:afternoon,2769,24.141238
f_blue:phone_bluetooth_doryab_uniquedevicesall_norm:allday,1892,16.495205


In [None]:
missing_stats = count_missing_values(df.filter(like='f_steps'))
display(missing_stats)

Unnamed: 0,Missing Count,Missing Percentage
f_steps:fitbit_steps_intraday_rapids_countepisodeactivebout_norm:night,5323,46.408021
f_steps:fitbit_steps_intraday_rapids_sumdurationactivebout_norm:night,5323,46.408021
f_steps:fitbit_steps_intraday_rapids_countepisodeactivebout_norm:morning,4074,35.518745
f_steps:fitbit_steps_intraday_rapids_sumdurationactivebout_norm:morning,4074,35.518745
f_steps:fitbit_steps_intraday_rapids_countepisodeactivebout_norm:evening,3796,33.095031
f_steps:fitbit_steps_intraday_rapids_sumdurationactivebout_norm:evening,3796,33.095031
f_steps:fitbit_steps_intraday_rapids_countepisodeactivebout_norm:afternoon,3703,32.28422
f_steps:fitbit_steps_intraday_rapids_sumdurationactivebout_norm:afternoon,3703,32.28422
f_steps:fitbit_steps_intraday_rapids_sumsteps_norm:afternoon,3397,29.616391
f_steps:fitbit_steps_intraday_rapids_sumsteps_norm:allday,3397,29.616391


In [None]:
missing_stats = count_missing_values(df.filter(like='f_slp'))
display(missing_stats)

Unnamed: 0,Missing Count,Missing Percentage
f_slp:fitbit_sleep_summary_rapids_avgefficiencymain:allday,4214,36.73932
f_slp:fitbit_sleep_summary_rapids_avgefficiencymain_norm:allday,4214,36.73932
f_slp:fitbit_sleep_summary_rapids_sumdurationasleepmain_norm:allday,4134,36.041848
f_slp:fitbit_sleep_summary_rapids_sumdurationasleepmain:allday,4134,36.041848


In [None]:

target_col = 'f_slp:fitbit_sleep_summary_rapids_sumdurationasleepmain_norm:allday'
