# Symptoms and Triggers

In [2]:
import pandas as pd
from datetime import datetime
import numpy as np
import json

In [5]:
filename = 'raw_data/symptoms.csv'

In [6]:
data = pd.read_csv(open(filename, 'r'))
data.head()

Unnamed: 0,uid,name,time,triggers,severity,discreet,note,allday,alldaynum
0,ObjectId(5f8e90320054a6c5fab1544e),breast_swelling,2020-10-20T18:52:00.002Z,[],2.0,,,False,
1,ObjectId(5f8e90320054a6c5fab1544e),bloating,2020-10-20T18:52:00.002Z,[],2.0,,,False,
2,ObjectId(5f8e90320054a6c5fab1544e),nipple_discharge,2020-10-20T18:53:00.002Z,[],2.0,,,False,
3,ObjectId(5f8e90320054a6c5fab1544e),hot_flashes,2020-10-20T07:00:00.002Z,[],3.0,,,True,3.0
4,ObjectId(5f8e90320054a6c5fab1544e),nipple_discharge,2020-10-19T18:56:00.002Z,"[{""question"":""Are you experiencing any of thes...",3.0,,,False,


In [108]:
def fixUid(frame, colname='uid'):
    uids = frame[colname].values
    uids_new = []
    for u in uids:
        uids_new += [u.split('(')[1].split(')')[0]]
    frame[colname] = uids_new
    
    return frame

In [5]:
data = fixUid(data)
data.head()

Unnamed: 0,uid,name,time,triggers,severity,discreet,note,allday,alldaynum
0,5f8e90320054a6c5fab1544e,breast_swelling,2020-10-20T18:52:00.002Z,[],2.0,,,False,
1,5f8e90320054a6c5fab1544e,bloating,2020-10-20T18:52:00.002Z,[],2.0,,,False,
2,5f8e90320054a6c5fab1544e,nipple_discharge,2020-10-20T18:53:00.002Z,[],2.0,,,False,
3,5f8e90320054a6c5fab1544e,hot_flashes,2020-10-20T07:00:00.002Z,[],3.0,,,True,3.0
4,5f8e90320054a6c5fab1544e,nipple_discharge,2020-10-19T18:56:00.002Z,"[{""question"":""Are you experiencing any of thes...",3.0,,,False,


In [9]:
def fixTime(frame, column_name, output_name='datetime'):
    """
    Function has side effects. Changes the argument in place, saves on memory.
    """
    time = frame[column_name].values
    time_new = []
    for t in time:
        if pd.isna(t):
            time_new += [None]
        else:
            d = datetime.strptime(t, '%Y-%m-%dT%H:%M:%S.%fZ')
            time_new += [d.strftime('%Y-%m-%d %H:%M:%S')]              
    frame[column_name] = time_new
    
    # Also change the name of the "time" field to "datetime" for consistency.
    index = -1
    columns = frame.columns.values
    for (idx, c) in enumerate(columns):
        if c == column_name:
            index = idx
            break
            
    columns[index] = output_name
    frame.columns= columns
    
    # Also change the datatype of the datetime column.
    frame[output_name] = frame[output_name].astype('datetime64')

    return frame

In [7]:
data_new = fixTime(data, 'time')
data_new.head()

Unnamed: 0,uid,name,datetime,triggers,severity,discreet,note,allday,alldaynum
0,5f8e90320054a6c5fab1544e,breast_swelling,2020-10-20 18:52:00,[],2.0,,,False,
1,5f8e90320054a6c5fab1544e,bloating,2020-10-20 18:52:00,[],2.0,,,False,
2,5f8e90320054a6c5fab1544e,nipple_discharge,2020-10-20 18:53:00,[],2.0,,,False,
3,5f8e90320054a6c5fab1544e,hot_flashes,2020-10-20 07:00:00,[],3.0,,,True,3.0
4,5f8e90320054a6c5fab1544e,nipple_discharge,2020-10-19 18:56:00,"[{""question"":""Are you experiencing any of thes...",3.0,,,False,


In [8]:
for (idx, triggers) in zip(data.index, data['triggers']):
    if idx > 10:
        break
    print(idx, triggers)

0 []
1 []
2 []
3 []
4 [{"question":"Are you experiencing any of these common triggers?","answer":["Excessive breast stimulation","Breast infection"]}]
5 [{"question":"Are you experiencing any of these common triggers?","answer":["Am or may be pregnant","Abscess"]}]
6 [{"question":"Are you experiencing any of these common triggers?","answer":["Am or may be pregnant","Abscess"]}]
7 [{"question":"Are you experiencing any of these common triggers?","answer":["Abscess"]}]
8 [{"question":"Did you experience any of these common triggers?","answer":["Asparagus"]}]
9 [{"question":"Did you consume any of these common triggers? ","answer":["Change in fiber intake","Apples","Beans","Onions","Lentils","Rye"]}]
10 [{"question":"Are you experiencing any of these common triggers?","answer":["Relationship conflict"]}]


In [9]:
data_new.dtypes

uid                  object
name                 object
datetime     datetime64[ns]
triggers             object
severity            float64
discreet             object
note                float64
allday                 bool
alldaynum           float64
dtype: object

Need to flatten out triggers JSON into a table, so we can parse this easier.

I would suggest the scehma to be -

`trigger_id (primary), symptom_id (foriegn), question, answers`

If a question has multiple answers (which in most situations it does), then each answer will be its own row / record.

In [10]:
def generateUniqueId(frame : pd.DataFrame, column_name='symptom_id'):
    """
    Method which generates a unique ID for each record and then creates a column
    for the frame in which all of this will be stored.
    """
    
    time = int(datetime.now().timestamp())
    symptom_id = np.arange(time, time+frame.shape[0])
    
    frame[column_name] = symptom_id
    
    return frame

In [11]:
time = int(datetime.now().timestamp())
symptom_id = np.arange(time, time+data_new.shape[0])

In [12]:
symptom_id.size

448772

In [13]:
data = generateUniqueId(data_new)

In [14]:
data.head()

Unnamed: 0,uid,name,datetime,triggers,severity,discreet,note,allday,alldaynum,symptom_id
0,5f8e90320054a6c5fab1544e,breast_swelling,2020-10-20 18:52:00,[],2.0,,,False,,1686178445
1,5f8e90320054a6c5fab1544e,bloating,2020-10-20 18:52:00,[],2.0,,,False,,1686178446
2,5f8e90320054a6c5fab1544e,nipple_discharge,2020-10-20 18:53:00,[],2.0,,,False,,1686178447
3,5f8e90320054a6c5fab1544e,hot_flashes,2020-10-20 07:00:00,[],3.0,,,True,3.0,1686178448
4,5f8e90320054a6c5fab1544e,nipple_discharge,2020-10-19 18:56:00,"[{""question"":""Are you experiencing any of thes...",3.0,,,False,,1686178449


In [15]:
def createTriggersTable(dataframe : pd.DataFrame) -> pd.DataFrame:
    """
    Method which flattens the JSON in triggers and creates a seperate table. 
    """
    
    # Lists that will hold all the column data.
    sid = []
    questions = []
    answers = []
    
    for (symptom_id, trigger) in zip(dataframe['symptom_id'], dataframe['triggers']):
        trigger = json.loads(trigger)
        if len(trigger) <= 0:
            continue
        
        # For each question answer pair, we will flatten it.
        for qa in trigger:
            q = qa['question']
            ans = qa['answer']
            # Each question can have multiple answers, so we flatten this out as well,
            # and then add this to the column lists.
            for a in ans:
                sid += [symptom_id]
                questions += [q]
                answers += [a]
    
    # Create a triggers dataframe with all the information.
    frame = pd.DataFrame({
        'symptom_id' : sid,
        'question' : questions,
        'answer' : answers
    })
    
    # Create the index for this as well.
    t = int(datetime.now().timestamp())
    frame['trigger_id'] = np.arange(t, t+frame.shape[0])
    frame.set_index('trigger_id')
    
    # Reset some of the type information.
    frame['question'] = frame['question'].astype('string')
    frame['answer'] = frame['answer'].astype('string') 
    
    return frame

In [16]:
triggers = createTriggersTable(data)

In [17]:
triggers.head()

Unnamed: 0,symptom_id,question,answer,trigger_id
0,1686178449,Are you experiencing any of these common trigg...,Excessive breast stimulation,1686178449
1,1686178449,Are you experiencing any of these common trigg...,Breast infection,1686178450
2,1686178450,Are you experiencing any of these common trigg...,Am or may be pregnant,1686178451
3,1686178450,Are you experiencing any of these common trigg...,Abscess,1686178452
4,1686178451,Are you experiencing any of these common trigg...,Am or may be pregnant,1686178453


In [18]:
triggers.dtypes

symptom_id     int64
question      string
answer        string
trigger_id     int64
dtype: object

In [19]:
# Drop triggers column from the main dataframe to generate the symptom specific datafame.
symptoms = data.drop(columns=['triggers'])
symptoms.head()

Unnamed: 0,uid,name,datetime,severity,discreet,note,allday,alldaynum,symptom_id
0,5f8e90320054a6c5fab1544e,breast_swelling,2020-10-20 18:52:00,2.0,,,False,,1686178445
1,5f8e90320054a6c5fab1544e,bloating,2020-10-20 18:52:00,2.0,,,False,,1686178446
2,5f8e90320054a6c5fab1544e,nipple_discharge,2020-10-20 18:53:00,2.0,,,False,,1686178447
3,5f8e90320054a6c5fab1544e,hot_flashes,2020-10-20 07:00:00,3.0,,,True,3.0,1686178448
4,5f8e90320054a6c5fab1544e,nipple_discharge,2020-10-19 18:56:00,3.0,,,False,,1686178449


In [20]:
symptoms.dtypes

uid                   object
name                  object
datetime      datetime64[ns]
severity             float64
discreet              object
note                 float64
allday                  bool
alldaynum            float64
symptom_id             int64
dtype: object

In [21]:
symptoms['name'].unique()

array(['breast_swelling', 'bloating', 'nipple_discharge', 'hot_flashes',
       'odor', 'anxiety', 'headaches', 'dry_skin', 'nausea',
       'constipation', 'ovulation', 'depression', 'vomiting', 'allergies',
       'joint_pain', 'cramps', 'fatigue', 'muscle_tension', 'breast_pain',
       'painful_intercourse', 'heartburn', 'tingling_extremeties',
       'yeast_infection', 'dry_eyes', 'hair_loss', 'itchy_skin',
       'irregular_beat', 'diarrhea', 'insomnia', 'sex_drive',
       'facial_hair', 'mood_swings', 'ringing_ears', 'brain_fog', 'uti',
       'vaginal_dryness', 'night_sweats', 'chills', 'spotting',
       'brittle_nails', 'ovulation_pain', 'memory_lapse', 'incontinence',
       'dizzy', 'vertigo'], dtype=object)

In [22]:
symptoms.shape

(448772, 9)

In [23]:
symptoms['discreet'].unique()

array([nan, '["Funky"]', '["Yes"]', '["Sweet"]', '["Patchy, thinning"]',
       '["A lot of new hair"]', '["No new facial hair growth"]',
       '["A few new hairs"]', '["A little off"]', '["Normal"]',
       '["Cracking, brittle"]', '[]', '["No"]', '["Strong, healthy"]'],
      dtype=object)

In [24]:
def fixSymptomTypes(symptoms):
    """
    Method to fix the symptom types and also clean it up.
    """
    
    sym = symptoms.copy(deep=True)
    sym['severity'] = symptoms['severity'].replace(to_replace=np.nan, value=-1)
    sym['severity'] = sym['severity'].astype('int64')
    sym['alldaynum'] = symptoms['alldaynum'].replace(to_replace=np.nan, value=-1)
    sym['alldaynum'] = sym['alldaynum'].astype('int64')
    sym['uid'] = sym['uid'].astype('string')
    sym['name'] = sym['name'].astype('string')

    sym['note'] = sym['note'].replace(to_replace=np.nan, value='')
    sym['note'] = sym['note'].astype('string')
    
    # For discreet, we need to flatten it inside the column itself. 
    # Scanning the data shows that even if the values are arrays, each of them only has 1 element. 
    discreet = sym['discreet'].values
    for (idx, d) in enumerate(discreet):
        if d == np.nan:
            discreet[idx] = ''
            continue
        try:
            discreet[idx] = d.split('["')[1].split('"]')[0]
        except IndexError as ex:
            #print('Exception raised when processing {d} : {ex}'.format(d=d, ex=ex))
            discreet[idx] = ''
        except AttributeError as ex:
            #print('Exception raised when processing {d} : {ex}'.format(d=d, ex=ex))
            discreet[idx] = ''
            
    sym['discreet'] = discreet
    sym['discreet'] = sym['discreet'].astype('string')
    
    
    return sym

In [25]:
sym = fixSymptomTypes(symptoms)

In [26]:
sym.head()

Unnamed: 0,uid,name,datetime,severity,discreet,note,allday,alldaynum,symptom_id
0,5f8e90320054a6c5fab1544e,breast_swelling,2020-10-20 18:52:00,2,,,False,-1,1686178445
1,5f8e90320054a6c5fab1544e,bloating,2020-10-20 18:52:00,2,,,False,-1,1686178446
2,5f8e90320054a6c5fab1544e,nipple_discharge,2020-10-20 18:53:00,2,,,False,-1,1686178447
3,5f8e90320054a6c5fab1544e,hot_flashes,2020-10-20 07:00:00,3,,,True,3,1686178448
4,5f8e90320054a6c5fab1544e,nipple_discharge,2020-10-19 18:56:00,3,,,False,-1,1686178449


In [27]:
sym[sym['uid'] == '640a88974c33a20028c60365']

Unnamed: 0,uid,name,datetime,severity,discreet,note,allday,alldaynum,symptom_id
441680,640a88974c33a20028c60365,cramps,2023-03-10 18:39:53,2,,,True,2,1686620125
441681,640a88974c33a20028c60365,dry_skin,2023-03-10 18:39:56,2,,,False,-1,1686620126
441682,640a88974c33a20028c60365,itchy_skin,2023-03-10 18:39:56,2,,,False,-1,1686620127
441818,640a88974c33a20028c60365,bloating,2023-03-10 18:39:53,2,,,True,3,1686620263


In [28]:
sym.dtypes

uid                   string
name                  string
datetime      datetime64[ns]
severity               int64
discreet              string
note                  string
allday                  bool
alldaynum              int64
symptom_id             int64
dtype: object

We are now ready to upload the tables to Glue.

In [29]:
%env AWS_PROFILE=opensci

env: AWS_PROFILE=opensci


In [30]:
sym.shape

(448772, 9)

In [31]:
import awswrangler as aw

In [36]:
# Storing data on Data Lake
aw.s3.to_parquet(
    df=sym,
    path="s3://menolife-opensci/dataset/symptoms/",
    dataset=True,
    database="menolife",
    table="symptoms",
    index=False
)

{'paths': ['s3://menolife-opensci/dataset/symptoms/cfa925e452f146658f4db4e996035333.snappy.parquet'],
 'partitions_values': {}}

In [37]:
# Storing triggers data on Data Lake
aw.s3.to_parquet(
    df=triggers,
    path="s3://menolife-opensci/dataset/triggers/",
    dataset=True,
    database="menolife",
    table="triggers",
    index=False
)

{'paths': ['s3://menolife-opensci/dataset/triggers/6b7ae26ce076488190d32e81f759317b.snappy.parquet'],
 'partitions_values': {}}

<hr>

# Checkins

In [35]:
data = pd.read_csv(open('raw_data/checkin.csv', 'r'), low_memory=False)
data.head()

Unnamed: 0,uid,timestamp,type,weight,weight_unit,mood,exercise,sleep
0,ObjectId(6045a7bdbea29a1eb66bb4e8),2021-03-08T04:37:03.919Z,PM,195.0,lbs,"Not good, but not bad either.",No,8.0
1,ObjectId(5f8e22b60054a6c5fab15447),2020-10-19T23:37:43.890Z,PM,,,Awful.,No,
2,ObjectId(5f8e22b60054a6c5fab15447),2020-10-19T23:39:50.201Z,PM,,,Awful.,No,
3,ObjectId(5f8e22b60054a6c5fab15447),2020-10-19T23:40:01.399Z,PM,,,Great!,Yes,
4,ObjectId(5f8e89640054a6c5fab1544d),2020-10-20T07:59:24.540Z,PM,,,Great!,Yes,


In [36]:
data.dtypes

uid             object
timestamp       object
type            object
weight         float64
weight_unit     object
mood            object
exercise        object
sleep           object
dtype: object

In [37]:
data['exercise'].unique()

array(['No', 'Yes', nan], dtype=object)

In [38]:
data['type'].unique()

array(['PM', 'AM'], dtype=object)

In [39]:
data = fixUid(data)
data = fixTime(data, 'timestamp')

In [40]:
data.head()

Unnamed: 0,uid,datetime,type,weight,weight_unit,mood,exercise,sleep
0,6045a7bdbea29a1eb66bb4e8,2021-03-08 04:37:03,PM,195.0,lbs,"Not good, but not bad either.",No,8.0
1,5f8e22b60054a6c5fab15447,2020-10-19 23:37:43,PM,,,Awful.,No,
2,5f8e22b60054a6c5fab15447,2020-10-19 23:39:50,PM,,,Awful.,No,
3,5f8e22b60054a6c5fab15447,2020-10-19 23:40:01,PM,,,Great!,Yes,
4,5f8e89640054a6c5fab1544d,2020-10-20 07:59:24,PM,,,Great!,Yes,


In [41]:
data.dtypes

uid                    object
datetime       datetime64[ns]
type                   object
weight                float64
weight_unit            object
mood                   object
exercise               object
sleep                  object
dtype: object

In [42]:
data['sleep'].unique()

array(['8', nan, 'More than 8 hours', 'None', '6 to 8 hours',
       '2 to 4 hours', '6', '4', '1 to 2 hours', '4 to 6 hours', '5', '7',
       '9', '3', '10', '0', '2', '15', '11', '1', '12', '13', '14', '24',
       '23', '18', '17', '16', '21', '22', '20', '19'], dtype=object)

In [43]:
def fixTypesCheckin(frame):
    """
    Method which fixes the types for the checkin frame.
    """
    
    checkin = frame.copy(deep=True)
    
    checkin['uid'] = checkin['uid'].astype('string')
    checkin['type'] = checkin['type'].astype('string')
    
    checkin['weight'] = checkin['weight'].replace(to_replace=np.nan, value=-1)
    checkin['weight'] = checkin['weight'].astype('float64')
    
    checkin['weight_unit'] = checkin['weight_unit'].replace(to_replace=np.nan, value='')
    checkin['weight_unit'] = checkin['weight_unit'].astype('string')
    
    checkin['mood'] = checkin['mood'].replace(to_replace=np.nan, value='')
    checkin['mood'] = checkin['mood'].astype('string')
    
    checkin['exercise'] = checkin['exercise'].replace(to_replace=np.nan, value='')
    checkin['exercise'] = checkin['exercise'].astype('string')
    
    """
    For sleep, we will split it into 2 separate columns. 
    (sleep, sleep_text)
    * sleep column will have the numeric value if the users have entered that. (dtype=int)
    * sleep_text will have the text value if the users had chosen that, or ''. (dtype=string)
    """
    sleep = []
    sleep_text = []
    checkin['sleep'] = checkin['sleep'].replace(to_replace=np.nan, value='')
    for s in checkin['sleep'].values:
        try:
            temp = int(s)
            sleep += [s]
            sleep_text += ['']
        except ValueError as ex:
            sleep += [-1]
            sleep_text += [s]
            
    checkin['sleep'] = sleep
    checkin['sleep'] = checkin['sleep'].astype('int64')
    checkin['sleep_text'] = sleep_text
    checkin['sleep_text'] = checkin['sleep_text'].astype('string')
    
    return checkin

In [44]:
checkin = fixTypesCheckin(data)

In [45]:
checkin.head()

Unnamed: 0,uid,datetime,type,weight,weight_unit,mood,exercise,sleep,sleep_text
0,6045a7bdbea29a1eb66bb4e8,2021-03-08 04:37:03,PM,195.0,lbs,"Not good, but not bad either.",No,8,
1,5f8e22b60054a6c5fab15447,2020-10-19 23:37:43,PM,-1.0,,Awful.,No,-1,
2,5f8e22b60054a6c5fab15447,2020-10-19 23:39:50,PM,-1.0,,Awful.,No,-1,
3,5f8e22b60054a6c5fab15447,2020-10-19 23:40:01,PM,-1.0,,Great!,Yes,-1,
4,5f8e89640054a6c5fab1544d,2020-10-20 07:59:24,PM,-1.0,,Great!,Yes,-1,


In [46]:
checkin.dtypes

uid                    string
datetime       datetime64[ns]
type                   string
weight                float64
weight_unit            string
mood                   string
exercise               string
sleep                   int64
sleep_text             string
dtype: object

In [47]:
# Storing checki data on Data Lake
aw.s3.to_parquet(
    df=checkin,
    path="s3://menolife-opensci/dataset/checkins/",
    dataset=True,
    database="menolife",
    table="checkins",
    index=False
)

{'paths': ['s3://menolife-opensci/dataset/checkins/e323428158c74f289c9e662f9420c2db.snappy.parquet'],
 'partitions_values': {}}

# Products data

In [48]:
products = pd.read_csv(open('raw_data/product.csv', 'r'))
products.head()

Unnamed: 0,uid,sku,date_added
0,ObjectId(5f07bcc8f284553b7b6a0a94),MENOGLOW90,2021-03-23T21:40:00.000Z
1,ObjectId(5f07bcc8f284553b7b6a0a94),MENOFIT90,2021-03-23T21:40:00.000Z
2,ObjectId(609bf58e5a14d7f274edf921),MENOFIT,2020-07-08T18:49:28.000Z
3,ObjectId(609bf58e5a14d7f274edf921),MENOFIT90,2020-07-13T19:29:36.000Z
4,ObjectId(609bf58e5a14d7f274edf921),MENOGLOW90,2020-07-13T19:29:36.000Z


In [49]:
products = fixUid(products)
products.head()

Unnamed: 0,uid,sku,date_added
0,5f07bcc8f284553b7b6a0a94,MENOGLOW90,2021-03-23T21:40:00.000Z
1,5f07bcc8f284553b7b6a0a94,MENOFIT90,2021-03-23T21:40:00.000Z
2,609bf58e5a14d7f274edf921,MENOFIT,2020-07-08T18:49:28.000Z
3,609bf58e5a14d7f274edf921,MENOFIT90,2020-07-13T19:29:36.000Z
4,609bf58e5a14d7f274edf921,MENOGLOW90,2020-07-13T19:29:36.000Z


In [50]:
products = fixTime(products, 'date_added')
products.head()

Unnamed: 0,uid,sku,datetime
0,5f07bcc8f284553b7b6a0a94,MENOGLOW90,2021-03-23 21:40:00
1,5f07bcc8f284553b7b6a0a94,MENOFIT90,2021-03-23 21:40:00
2,609bf58e5a14d7f274edf921,MENOFIT,2020-07-08 18:49:28
3,609bf58e5a14d7f274edf921,MENOFIT90,2020-07-13 19:29:36
4,609bf58e5a14d7f274edf921,MENOGLOW90,2020-07-13 19:29:36


In [51]:
products.dtypes

uid                 object
sku                 object
datetime    datetime64[ns]
dtype: object

In [52]:
def fixTypesProducts(frame):
    frame['uid'] = frame['uid'].astype('string')
    frame['sku'] = frame['sku'].astype('string')
    
    return frame

In [53]:
products = fixTypesProducts(products)
products.dtypes

uid                 string
sku                 string
datetime    datetime64[ns]
dtype: object

In [54]:
products.head()

Unnamed: 0,uid,sku,datetime
0,5f07bcc8f284553b7b6a0a94,MENOGLOW90,2021-03-23 21:40:00
1,5f07bcc8f284553b7b6a0a94,MENOFIT90,2021-03-23 21:40:00
2,609bf58e5a14d7f274edf921,MENOFIT,2020-07-08 18:49:28
3,609bf58e5a14d7f274edf921,MENOFIT90,2020-07-13 19:29:36
4,609bf58e5a14d7f274edf921,MENOGLOW90,2020-07-13 19:29:36


In [55]:
# Storing checki data on Data Lake
aw.s3.to_parquet(
    df=products,
    path="s3://menolife-opensci/dataset/products/",
    dataset=True,
    database="menolife",
    table="products",
    index=False
)

{'paths': ['s3://menolife-opensci/dataset/products/24a3ae50be7e415d8557041272ad9e61.snappy.parquet'],
 'partitions_values': {}}

# Period Data

In [56]:
filename = 'raw_data/periods.csv'
periods = pd.read_csv(open(filename, 'r'))
periods.head()

Unnamed: 0,uid,start_time,flow
0,ObjectId(640a88974c33a20028c60365),2023-03-10T05:00:00.000Z,"[{""flow_time"":{""$date"":""2023-03-10T05:00:00.00..."
1,ObjectId(60fd791bbea29a1eb66d0610),2021-10-31T04:00:00.000Z,"[{""flow_time"":{""$date"":""2021-10-31T04:00:00.00..."
2,ObjectId(60d07cf4bea29a1eb66ca764),2021-06-15T04:00:00.000Z,"[{""flow_time"":{""$date"":""2021-06-15T04:00:00.00..."
3,ObjectId(5f8e89640054a6c5fab1544d),2020-10-17T18:30:00.000Z,"[{""flow_time"":{""$date"":""2020-10-17T18:30:00.00..."
4,ObjectId(60283634bea29a1eb66b7c1d),2021-06-20T10:06:01.032Z,"[{""flow_time"":{""$date"":""2021-06-20T10:06:01.03..."


In [57]:
periods = fixUid(periods)
periods = fixTime(periods, 'start_time', 'start_time')
periods.head()

Unnamed: 0,uid,start_time,flow
0,640a88974c33a20028c60365,2023-03-10 05:00:00,"[{""flow_time"":{""$date"":""2023-03-10T05:00:00.00..."
1,60fd791bbea29a1eb66d0610,2021-10-31 04:00:00,"[{""flow_time"":{""$date"":""2021-10-31T04:00:00.00..."
2,60d07cf4bea29a1eb66ca764,2021-06-15 04:00:00,"[{""flow_time"":{""$date"":""2021-06-15T04:00:00.00..."
3,5f8e89640054a6c5fab1544d,2020-10-17 18:30:00,"[{""flow_time"":{""$date"":""2020-10-17T18:30:00.00..."
4,60283634bea29a1eb66b7c1d,2021-06-20 10:06:01,"[{""flow_time"":{""$date"":""2021-06-20T10:06:01.03..."


In [58]:
flows = periods['flow'].values
counter = {}

In [59]:
periods.iloc[11]['flow']

'[{"flow_time":{"$date":"2020-10-24T01:38:38.287Z"},"value":2},{"flow_time":{"$date":"2020-10-24T01:39:05.837Z"},"value":2},{"flow_time":{"$date":"2020-10-24T01:39:15.010Z"},"value":2},{"flow_time":{"$date":"2020-10-24T01:39:44.017Z"},"value":2},{"flow_time":{"$date":"2020-10-24T01:39:57.008Z"},"value":2},{"flow_time":{"$date":"2020-10-24T01:44:31.387Z"},"value":2},{"flow_time":{"$date":"2020-10-24T01:44:45.357Z"},"value":3},{"flow_time":{"$date":"2020-10-24T01:47:03.488Z"},"value":2},{"flow_time":{"$date":"2020-10-24T01:51:52.509Z"},"value":2},{"flow_time":{"$date":"2020-10-25T14:54:38.776Z"},"value":1},{"flow_time":{"$date":"2020-10-25T18:44:14.661Z"},"value":0}]'

Looks like each period entry indicates when a period started for a user. The flow array indicates a flow recording for the user and when they added it. 

TODO: We need to check with the app logic what value=0, value=1, etc. means. We had things like spotting, heavy flow, etc.

In [60]:
periods = generateUniqueId(periods, 'period_id')

In [61]:
def createFlowTable(dataframe : pd.DataFrame) -> pd.DataFrame:
    """
    Method which flattens the json and creates a flow table from the period data. 
    """
    
    # Lists that will hold all the column data.
    periodid = []
    flow_time = []
    flow_value = []
    
    for (period_id, flows) in zip(dataframe['period_id'], dataframe['flow']):
        flows = json.loads(flows)
        if len(flows) <= 0:
            continue
        
        # For each flow entry we will flatten it.
        for flow in flows:
            ft = flow['flow_time']  # This in itself is a json that needs to be parsed.
            ft = ft['$date']
            # Convert from mongodb timestamp 
            d = datetime.strptime(ft, '%Y-%m-%dT%H:%M:%S.%fZ')
            ft = d.strftime('%Y-%m-%d %H:%M:%S')
            
            fv = flow['value']
            
            # Add everything to the column arrays.
            periodid += [period_id]
            flow_time += [ft]
            flow_value += [fv]
            
    
    # Create a flows dataframe with all the information.
    frame = pd.DataFrame({
        'period_id' : periodid,
        'datetime' : flow_time,
        'value' : flow_value
    })
    
    # Create the index for this as well.
    t = int(datetime.now().timestamp())
    frame['flow_id'] = np.arange(t, t+frame.shape[0])
    frame.set_index('flow_id')
    
    # Reset some of the type information.
    frame['datetime'] = frame['datetime'].astype('datetime64')
    frame['value'] = frame['value'].astype('int64') 
    
    return frame

In [62]:
flows = createFlowTable(periods)

In [63]:
flows.head()

Unnamed: 0,period_id,datetime,value,flow_id
0,1681881458,2023-03-10 05:00:00,2,1681881459
1,1681881459,2021-10-31 04:00:00,1,1681881460
2,1681881460,2021-06-15 04:00:00,2,1681881461
3,1681881460,2021-06-19 04:00:00,0,1681881462
4,1681881461,2020-10-17 18:30:00,3,1681881463


In [64]:
# Storing  data on Data Lake
aw.s3.to_parquet(
    df=periods,
    path="s3://menolife-opensci/dataset/periods/",
    dataset=True,
    database="menolife",
    table="periods",
    index=False
)

{'paths': ['s3://menolife-opensci/dataset/periods/c42b499801a84522bf1efd81c0e5bb2f.snappy.parquet'],
 'partitions_values': {}}

In [65]:
# Storing checki data on Data Lake
aw.s3.to_parquet(
    df=flows,
    path="s3://menolife-opensci/dataset/flows/",
    dataset=True,
    database="menolife",
    table="flows",
    index=False
)

{'paths': ['s3://menolife-opensci/dataset/flows/af8746ee07be4729a5a715307e711aa9.snappy.parquet'],
 'partitions_values': {}}

# Menopause State

In [106]:
menopause = pd.read_csv(open('raw_data/menopause.csv', 'r'))
print(menopause.shape)
print(menopause.columns)
print(menopause.head())

(58926, 2)
Index(['uid', 'menopause'], dtype='object')
                                  uid menopause
0  ObjectId(61d8615fbea29a1eb66f7bc4)      True
1  ObjectId(62e5342ce46e8f00284ad6f3)     False
2  ObjectId(5f93d4c3a42cd9d7053ec24a)     False
3  ObjectId(5f8e89640054a6c5fab1544d)     False
4  ObjectId(5f8f1b7566dae0d0328adb78)     False


In [109]:
menopause = fixUid(menopause)

In [110]:
menopause['menopause'].astype = 'boolean'
menopause.dtypes

uid          object
menopause    object
dtype: object

In [114]:
cols = menopause.columns.values

In [115]:
cols[1] = 'state'
menopause.columns = cols

In [116]:
menopause.head()

Unnamed: 0,uid,state
0,61d8615fbea29a1eb66f7bc4,True
1,62e5342ce46e8f00284ad6f3,False
2,5f93d4c3a42cd9d7053ec24a,False
3,5f8e89640054a6c5fab1544d,False
4,5f8f1b7566dae0d0328adb78,False


In [117]:
import awswrangler as aw

# Storing menopause data on Data Lake
aw.s3.to_parquet(
    df=menopause,
    path="s3://menolife-opensci/dataset/menopause/",
    dataset=True,
    database="menolife",
    table="menopause",
    index=False
)

{'paths': ['s3://menolife-opensci/dataset/menopause/18a6e20e38e64f4bac768e8968f08f72.snappy.parquet'],
 'partitions_values': {}}

# Birthdays

In [28]:
birthdays = pd.read_csv(open('raw_data/birthday.csv', 'r'))
print(birthdays.columns)

Index(['uid', 'birthday'], dtype='object')


In [29]:
birthdays = fixUid(birthdays)

In [32]:
birthdays = fixTime(birthdays, 'birthday')

In [35]:
# Storing birthday data on Data Lake
aw.s3.to_parquet(
    df=birthdays,
    path="s3://menolife-opensci/dataset/menopause/",
    dataset=True,
    database="menolife",
    table="birthdays",
    index=False
)

{'paths': ['s3://menolife-opensci/dataset/menopause/b48ac34d4f7e4587a26d4ea9d4682953.snappy.parquet'],
 'partitions_values': {}}

# User on-boarding

The original csv file had to be modified to change the `,` delimiter to `&` because the file contains a large number of json values also with `,`.

The vim commands used to carry out these substitutions were
* `%s/],202/]\&202/g`
* `%s/Z,false`
* `%s/Z,true/Z\&true/g`

In [129]:
onboarding = pd.read_csv(open('raw_data/user_on_boarding 1.csv', 'r'), delimiter='&')
print(onboarding.shape)

(56647, 6)


In [130]:
onboarding

Unnamed: 0,_id,uid,on_board_time,on_board_dirty,no_of_time_survey_taken,questions
0,ObjectId(64942c7373e2d10028c61f31),ObjectId(64942c7273e2d10028c61f2b),2023-06-22T11:11:47.225Z,true,1,"[{""answer"":[""Anxiety"",""Bloating"",""Fatigue"",""In..."
1,ObjectId(6494357173e2d10028c62150),ObjectId(6494356f73e2d10028c6214a),2023-06-22T11:50:09.283Z,true,1,"[{""answer"":[""Anxiety"",""Bloating"",""Difficulty C..."
2,ObjectId(6495a79c73e2d10028c6bdd8),ObjectId(6495a79773e2d10028c6bdd2),2023-06-23T14:09:31.980Z,true,1,"[{""answer"":[""Anxiety"",""Bloating"",""Difficulty C..."
3,ObjectId(6495d7c173e2d10028c6cd19),ObjectId(6495d7c173e2d10028c6cd13),2023-06-23T17:34:57.596Z,true,1,"[{""answer"":[""Fatigue"",""Hot Flashes/Night Sweat..."
4,ObjectId(6497a95373e2d10028c73233),ObjectId(6497a94f73e2d10028c7322d),2023-06-25T02:41:22.902Z,true,1,"[{""answer"":[""Difficulty Concentrating"",""Brittl..."
...,...,...,...,...,...,...
56642,ObjectId(6493a06673e2d10028c60a0a),ObjectId(6493a06473e2d10028c60a04),2023-06-22T01:14:14.027Z,true,1,"[{""answer"":[""Low Sex Drive"",""Irregular Periods..."
56643,ObjectId(6493ae5173e2d10028c60d9b),ObjectId(6493ae4f73e2d10028c60d95),2023-06-22T02:13:36.884Z,true,1,"[{""answer"":[""Anxiety"",""Brittle Hair/Nails"",""Di..."
56644,ObjectId(6493538a73e2d10028c5eea9),ObjectId(6493538973e2d10028c5eea3),2023-06-21T19:46:18.149Z,true,1,"[{""answer"":[""Anxiety"",""Bloating"",""Difficulty C..."
56645,ObjectId(64939ca973e2d10028c60892),ObjectId(64939ca773e2d10028c6088c),2023-06-22T00:58:16.947Z,true,1,"[{""answer"":[""Anxiety"",""Bloating"",""Difficulty C..."


In [131]:
def createQaTable(frame : pd.DataFrame):
    questions = []
    answers = []
    community_id = []
    
    for (uid, question) in zip(frame['uid'].values, frame['questions'].values):
        # Parse the json array.
        try:
            jdata = json.loads(question)
        except:
            # If we are unable to decode the json, just skip over it.
            continue
        # For privacy reasons we will not ingest any identifiable information into our data.
        # This includes, names, emails, phone numbers.
        not_include = ['Name', 'Email', 'PhoneNumber']
        
        for qa in jdata:
            q = qa['question']
            a = qa['answer']
            if not q in not_include:
                # Flatten everything into comma seperated strings, if they are arrays.
                # AWSWrangler library is not able to ingest it otherwise.
                str = ''
                for item in a:
                    if len(str) <= 0:
                        str += item
                    else:
                        str += ','+item
                
                a = str
                # Convert dob into correct date format.
                # This is sad, but over the evolution of the app we have had many  
                dateformats = ['%m/%d/%Y', '%Y-%m-%dT%H:%M:%S.%fZ', '%d-%m-%Y', '%Y-%m-%dT%H:%M:%S.%fZ']
                if q == 'DateOfBirth' and (not a == ''):
                    for df in dateformats:
                        try:
                            d = datetime.strptime(a, df)
                            a = d.strftime('%Y-%m-%d %H:%M:%S')
                            break
                        except ValueError:
                           # Not this format we move on to the next.
                            pass

                # Add this to our tables.
                community_id += [uid.split('(')[1].split(')')[0]]
                questions += [q]
                answers += [a]

    return pd.DataFrame({'community_id':community_id, 'questions':questions, 'answers':answers})

In [132]:
qframe = createQaTable(onboarding)

In [133]:
qframe

Unnamed: 0,community_id,questions,answers
0,64942c7273e2d10028c61f2b,Choose the symptoms you deal with the most.,"Anxiety,Bloating,Fatigue,Insomnia,Mood Swings/..."
1,64942c7273e2d10028c61f2b,What topics are you most interested in?,"Education,Sleep & Wellness,Lifestyle,Health & ..."
2,64942c7273e2d10028c61f2b,How would you describe your periods?,Regular cycle
3,64942c7273e2d10028c61f2b,,
4,64942c7273e2d10028c61f2b,,
...,...,...,...
253572,64939ca773e2d10028c6088c,How did you enter menopause?,Naturally
253573,64939ca773e2d10028c6088c,,
253574,623f092fb67903e071d12d40,Choose the symptoms you deal with the most.,"Fatigue,Irregular Periods,Weight Gain"
253575,623f092fb67903e071d12d40,What topics are you most interested in?,Hot Flashes &\nSymptoms


In [134]:
%set_env AWS_PROFILE=opensci
import awswrangler as wr
# Store this into our database.
wr.s3.to_parquet(
    df=qframe,
    path="s3://menolife-opensci/dataset/onboarding/",
    dataset=True,
    database="menolife",
    table="onboarding",
    index=False
)

env: AWS_PROFILE=opensci


{'paths': ['s3://menolife-opensci/dataset/onboarding/9160c8de8450428d92cd0618f507b887.snappy.parquet'],
 'partitions_values': {}}

# Upload test users dataset. 

These are users who were internal to the testing procedure, and hence their uids should be removed from the customer data.

In [125]:
testusers = pd.read_csv(open('raw_data/test_users.csv', 'r'))
testusers.head()

Unnamed: 0,uid,email
0,5ed7ceb017a07779525e3bb6,carmela.c@menolabs.com
1,5ed9637917a07779525e3c55,supportninja@menolabs.com
2,5ed7cda40aa5594e18fc1c8f,irene.p@menolabs.com
3,5e4c6dc64d8bcf4e32f7ec5c,vanessa@menolabs.com
4,5ed7ce4b17a07779525e3bb4,rio.s@menolabs.com


In [127]:
%set_env AWS_PROFILE=opensci
import awswrangler as wr
# Store this into our database.
wr.s3.to_parquet(
    df=testusers,
    path="s3://menolife-opensci/dataset/testusers/",
    dataset=True,
    database="menolife",
    table="testusers",
    index=False
)

env: AWS_PROFILE=opensci


{'paths': ['s3://menolife-opensci/dataset/testusers/35baeb9c76a94b17bd3c90a5edc90c7c.snappy.parquet'],
 'partitions_values': {}}

# Period start and end times

The original period data dump we got did not have the end date for the periods. Hence we extract just the start and end dates for the period from the new data dump. 

In [19]:
filename = 'raw_data/periods_start_end.csv'
periods = pd.read_csv(open(filename, 'r'))
periods.head()

Unnamed: 0,_id,community_id,end_time,start_time,uid
0,6463472bfed62d3e645e05cf,5e4a9bb73c52676149bb86f2,2020-10-20 19:44:12,2020-10-17 18:30:00,5f8e89640054a6c5fab1544d
1,6463472bfed62d3e645e05d0,5efb7a38be8ac404c2a8f8e9,2020-10-21 18:30:00,2020-10-14 18:30:00,5f8f274466dae0d0328adb80
2,6463472bfed62d3e645e05d1,5e81f0663e55077bcf96dfc1,,2020-09-22 18:30:00,5f8f260566dae0d0328adb7e
3,6463472bfed62d3e645e05d2,5f924ecb148e7ccea682ae43,2020-10-28 13:57:29,2020-10-23 03:32:53,5f924ee1a42cd9d7053ec1f2
4,6463472bfed62d3e645e05d3,5f935270d02b92cbe65cce83,,2020-10-23 22:01:39,5f935278a42cd9d7053ec212


In [20]:
periods.dtypes

_id             object
community_id    object
end_time        object
start_time      object
uid             object
dtype: object

In [21]:
periods['start_time'] = periods['start_time'].astype('datetime64')
periods['end_time'] = periods['end_time'].astype('datetime64')

In [22]:
periods.dtypes

_id                     object
community_id            object
end_time        datetime64[ns]
start_time      datetime64[ns]
uid                     object
dtype: object

In [23]:
%set_env AWS_PROFILE=opensci
import awswrangler as wr
# Store this into our database.
wr.s3.to_parquet(
    df=periods,
    path="s3://menolife-opensci/dataset/periodsstartend/",
    dataset=True,
    database="menolife",
    table="periods_start_end",
    index=False
)

env: AWS_PROFILE=opensci


{'paths': ['s3://menolife-opensci/dataset/periodsstartend/119760049e9b42c68444f2945e0623e8.snappy.parquet'],
 'partitions_values': {}}