In [1]:
import os
import math
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
import sys

# Make the helper modules under ../src importable from this notebook.
# Guarded so that re-running the cell does not keep appending duplicate
# entries to sys.path.
if '../src' not in sys.path:
    sys.path.append('../src')

In [16]:
from data_helpers import load_static_feat

In [13]:
# Root directory of the data files. Other cells build paths by plain string
# concatenation (file_loc + 'sub/dir/file'), so the value must end with a
# path separator; os.path.join(..., '') guarantees that.
# Set the BUMP_DATA_DIR environment variable to point at a different copy of
# the data; when it is unset or empty the original location is used.
file_loc = os.path.join(os.environ.get('BUMP_DATA_DIR') or '/storage/data/bump/bump/', '')

In [4]:
# Flat lookup: cleaned (stripped, lower-cased) free-text answer -> normalised label.
# Built once instead of on every call. Aliases that ended in '\n' were dropped:
# the input is stripped before lookup, so they could never match.
_ETHNICITY_ALIASES = {
    alias: label
    for label, aliases in {
        'white': ['caucasian', 'white, nothing even remotely diverse', 'white american'],
        'asian': ['chinese', 'asian (taiwanese)', 'korean'],
        'black/african/caribbean': ['black american', 'black/african/caribbean'],
        'american': ['american', 'italian american', 'irish american', 'canadian'],
        'hispanic': ['mexican', 'french mexican', 'jamaican cuban'],
        'european': ['european', 'northern european', 'western and eastern european', 'mixed european'],
        'other': ['unknown? non-hispanic caucasian?', 'no ethic identification'],
    }.items()
    for alias in aliases
}


def ethnicity_func(ethnicity):
    """Normalise a free-text ethnicity answer to a lower-case label.

    Parameters
    ----------
    ethnicity : scalar
        Raw answer. Missing values (None / NaN) are accepted, and non-string
        scalars are converted with ``str()`` rather than raising.

    Returns
    -------
    str
        ``'Unknown'`` for missing values; the mapped label when the cleaned
        answer is a known alias; otherwise the cleaned answer itself.
    """
    if pd.isnull(ethnicity):
        return 'Unknown'
    # Strip surrounding whitespace and lower-case so lookups ignore case and padding.
    cleaned = str(ethnicity).strip().lower()
    return _ETHNICITY_ALIASES.get(cleaned, cleaned)


In [5]:
# Cleaned (stripped, lower-cased) label -> broad reporting category.
_ETHNICITY_CATEGORIES = {
    label: category
    for category, labels in {
        'White': ['white', 'caucasian', 'white american'],
        'Asian': ['asian', 'chinese', 'korean', 'asian indian', 'asian (taiwanese)'],
        'Black': ['black', 'black/african/caribbean', 'black american', 'african'],
        'Hispanic': ['hispanic', 'mexican', 'jamaican', 'cuban'],
    }.items()
    for label in labels
}


def categorize_ethnicity(ethnicity):
    """Collapse an ethnicity label into one of five broad categories.

    Parameters
    ----------
    ethnicity : str or missing
        Label to categorise; matching ignores case and surrounding whitespace.
        Missing or non-string values (None, NaN, numbers) no longer raise and
        are reported as ``'Other'``.

    Returns
    -------
    str
        One of ``'White'``, ``'Asian'``, ``'Black'``, ``'Hispanic'`` or
        ``'Other'`` (the fallback for anything unrecognised).
    """
    if not isinstance(ethnicity, str):
        # Previously this path failed with AttributeError on .strip().
        return 'Other'
    cleaned = ethnicity.strip().lower()
    return _ETHNICITY_CATEGORIES.get(cleaned, 'Other')

In [14]:
# Read both source tables first ...
df_sleep = pd.read_csv(file_loc + 'oura/oura_sleep.csv.gz')
key = 'redcap/birthing_data.csv.gz'
df_birth = pd.read_csv(file_loc + key, compression='gzip')

# ... then give each a `date` column holding plain calendar dates parsed from
# its own date column (event_date for sleep, birth_date for births).
df_sleep['date'] = pd.to_datetime(df_sleep['event_date']).dt.date
df_birth['date'] = pd.to_datetime(df_birth['birth_date']).dt.date

In [17]:
# Sorted unique record ids present in each table.
birth_ids = np.sort(df_birth.record_id.unique())
sleep_ids = np.sort(df_sleep.record_id.unique())

# Keep the sleep ids that also have birthing data, preserving sorted order.
# A set built once gives constant-time membership tests; the earlier form
# rebuilt list(birth_ids) and scanned it for every sleep id.
_birth_id_set = set(birth_ids)
sleep_ids_new = [s_id for s_id in sleep_ids if s_id in _birth_id_set]

In [18]:
# Static feature table for the participants found in both sources.
# load_static_feat comes from the project's data_helpers module; the meaning
# of the parity / pred_cond switches is defined there, not in this notebook.
static_feat = load_static_feat(
    file_loc,
    sleep_ids_new,
    df_birth,
    parity=True,
    pred_cond=True,
)


  df_survey = pd.read_csv(file_loc+key, compression='gzip') #dtype={"user_id": int, "username": "string"}


In [19]:
# Summary statistics for the rows whose age is greater than 17.
static_feat[static_feat['age'] > 17].describe()

Unnamed: 0,gestage_week,age,234,236,239,240,241,242,243,244,245,1160,weight_pre,bmi,n_prev_baby
count,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0
mean,38.520558,35.619289,0.03934,0.109137,0.236041,0.062183,0.314721,0.041878,0.138325,0.106599,0.436548,0.03934,153.583306,25.398735,0.360406
std,2.003841,4.265637,0.189685,0.306034,0.41916,0.232404,0.45949,0.19575,0.339178,0.306929,0.495305,0.189685,36.978452,5.62647,0.711273
min,26.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,81.0,12.314663,0.0
25%,38.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,127.0,21.400536,0.0
50%,39.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,148.0,24.27378,0.0
75%,40.0,39.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,170.75,28.334497,0.0
max,42.0,52.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,319.0,46.659945,4.0


In [21]:
key = 'app_activities/surveys.csv.gz'
# NOTE(review): the recorded output shows pandas emitting a warning for this
# read_csv call — possibly mixed column dtypes; consider passing an explicit
# dtype= mapping once the schema is confirmed.
df_survey = pd.read_csv(file_loc + key, compression='gzip')

# Restrict the survey to participants that have both sleep and birthing data.
df_survey_new = df_survey.loc[df_survey.record_id.isin(sleep_ids_new)]

# Build the mask from the same frame it is applied to, instead of relying on
# index alignment with the unfiltered df_survey. na=False makes rows with a
# missing question_text simply not match rather than producing a NaN mask.
_ethnicity_mask = df_survey_new.question_text.str.contains('ethnicity', na=False)

# NOTE: despite its name, prev_condition_q holds the distinct *ethnicity*
# question texts; the name is kept because later cells reference it.
prev_condition_q = df_survey_new.loc[_ethnicity_mask, 'question_text'].unique()

  df_survey = pd.read_csv(file_loc+key, compression='gzip') #dtype={"user_id": int, "username": "string"}


In [22]:
# Rows answering one of the selected (ethnicity) questions, reduced to the
# columns of interest, keeping only the first such row per record_id.
df = (
    df_survey_new
    .loc[
        df_survey_new.question_text.isin(prev_condition_q),
        ['record_id', 'answer_text', 'question_id', 'question_text'],
    ]
    .copy()
    .drop_duplicates(subset='record_id', keep='first')
)



In [23]:
# Step 1: normalise each free-text answer, then collapse it into a broad category.
df['data_ethnicity'] = (
    df['answer_text']
    .apply(ethnicity_func)
    .apply(categorize_ethnicity)
)

# Step 2: count rows per category.
ethnicity_counts = df['data_ethnicity'].value_counts()

# Step 3: express each count as a percentage of the total.
ethnicity_percentage = ethnicity_counts.div(ethnicity_counts.sum()).mul(100)

# Step 4: present the result as a two-column table.
ethnicity_distribution = (
    ethnicity_percentage
    .rename_axis('Ethnicity')
    .reset_index(name='Percentage')
)

In [24]:
# Display the percentage breakdown per ethnicity category.
ethnicity_distribution

Unnamed: 0,Ethnicity,Percentage
0,White,48.730964
1,Other,42.639594
2,Asian,5.329949
3,Black,1.77665
4,Hispanic,1.522843


In [25]:
# NOTE(review): this selection is bound to `f`, which no later visible line
# reads; the statements below operate on the existing `df` instead. Possibly a
# typo for `df` — confirm the intent before relying on this cell.
f = df_survey_new.loc[:,['record_id','answer_text', 'question_id', 'question_text']].loc[df_survey_new.question_text.isin(prev_condition_q)].copy()
# Keep only the text after the last newline of each question; non-string
# values are passed through unchanged.
df['question_text'] = df['question_text'].apply(lambda x: x.split('\n')[-1] if isinstance(x, str) else x)
# Numeric encoding of the answers: 'No' -> 0, 'Yes' -> 1, 'Not sure' -> NaN.
# Series.map with a dict also yields NaN for any answer not listed here.
mapping = {'No': 0, 'Not sure': np.nan, 'Yes': 1}
df['answer_text_mapped'] = df['answer_text'].map(mapping)


In [None]:
# Per-question mean, standard deviation and count of the encoded answers.
(
    df
    .groupby('question_text')['answer_text_mapped']
    .agg(['mean', 'std', 'count'])
    .reset_index()
)
