# Prepare success data for analysis

In [60]:
import pandas as pd
from collections import Counter
# Date suffix of the Daylio export to process (e.g. '11_20' for the export
# whose latest entry is 2023-11-20); used in the input and output CSV filenames
DAYLIO_DATE = '11_20'
# Minimum number of occurrences an activity label needs in order to be kept
# as a one-hot encoded column; rarer labels are removed
FREQ_THRESHOLD = 4
# Number of lagged copies (shifted by 1..DAYS_BEFORE_LIMIT rows) created for
# each time-shifted feature
DAYS_BEFORE_LIMIT = 1

### Read app data

In [61]:
# Load the Daylio export for the configured date; the free-text diary
# columns are not used by the analysis, so they are discarded right away
filename = f'daylio_export_2023_{DAYLIO_DATE}.csv'
df = pd.read_csv(filename).drop(columns=['note_title', 'note'])
df.head(2)

Unnamed: 0,full_date,date,weekday,time,mood,activities
0,2023-11-20,November 20,Monday,03:38,strategic failure,takeaway | exhausted | 7-9h | family | board g...
1,2023-11-18,November 18,Saturday,23:29,day off,fast cooking | treated | slow | 7-9h | 1 shot ...


### Create datetime column

In [62]:
# Build a single timestamp per entry from the separate date and time strings
timestamp_text = df['full_date'] + ' ' + df['time']
df['datetime'] = pd.to_datetime(timestamp_text, format='%Y-%m-%d %H:%M')
# The individual date/time columns are redundant once `datetime` exists
df = df.drop(columns=['full_date', 'weekday', 'date', 'time'])
# Report the period covered by the export
first_entry = df['datetime'].min()
last_entry = df['datetime'].max()
print(f"From {first_entry} to {last_entry}")

From 2023-10-07 23:59:00 to 2023-11-20 03:38:00


### Scale Feature Functions

In [63]:
def clean_activities(scale):
    """Strip every label that is a key of `scale` from each activity list.

    Rewrites the `activities` column of the module-level `df`; the order of
    the remaining labels is preserved.
    """
    def without_scale_labels(labels):
        return [label for label in labels if label not in scale]

    df['activities'] = df['activities'].apply(without_scale_labels)

def scale_activities(scale, new_col_name):
    """Move the labels of `scale` out of `activities` into a numeric column.

    For every row of the module-level `df`, the first activity that is a key
    of `scale` determines the value stored in `df[new_col_name]`; rows without
    such an activity get None. All labels of `scale` are then removed from
    the `activities` lists.
    """
    def first_scaled_value(labels):
        matches = [label for label in labels if label in scale]
        return scale[matches[0]] if matches else None

    # Values must be read before the labels are stripped from `activities`
    scaled_values = df['activities'].apply(first_scaled_value)
    clean_activities(scale)
    df[new_col_name] = scaled_values

def increment_scale(activities, scale):
    """Return the summed `scale` values of all `activities` that are keys of `scale`.

    Activities missing from `scale` contribute nothing; the result is 0 when
    no activity matches.
    """
    return sum(scale[label] for label in activities if label in scale)

def balance_feature(col, default=0):
    """Fill missing values of `df[col]` with `default` and cap the column at 1.

    Operates on the module-level `df`.

    Parameters
    ----------
    col : str
        Name of the column to normalise.
    default : float, optional
        Replacement for missing values (0 unless specified).
    """
    # Assign the result back instead of calling `fillna(..., inplace=True)` on
    # the selected column: that form is chained assignment, which emits a
    # warning on recent pandas and does not modify `df` under Copy-on-Write.
    df[col] = df[col].fillna(default).clip(upper=1)

# Parse the ' | '-separated activity string into a list of labels.
# An entry exported without any activities is read by pandas as NaN (a float),
# on which `.split` would raise; treat such entries as an empty activity list.
df['activities'] = df['activities'].fillna('').apply(
    lambda raw: raw.split(' | ') if raw else []
)

### Scale ordinal success labels

In [64]:
success_col = 'success'
# Ordinal value assigned to each mood label, from best to worst outcome
success_scale = {
    'strategic success': 1.00,
    'gain': 0.75,
    'expensive gain': 0.50,
    'day off': 0.25,
    'strategic failure': 0.00
}
# Convert the mood labels to numbers, then expose them as the target column
# (labels missing from the scale become NaN)
df['mood'] = df['mood'].map(success_scale)
df = df.rename(columns={'mood': success_col})
df.head(2)

Unnamed: 0,success,activities,datetime
0,0.0,"[takeaway, exhausted, 7-9h, family, board game...",2023-11-20 03:38:00
1,0.25,"[fast cooking, treated, slow, 7-9h, 1 shot, sh...",2023-11-18 23:29:00


### Scale energy

In [65]:
# Ordinal value of each energy label, highest energy first
energy_scale = {'rush': 1.00, 'quick': 0.66, 'slow': 0.33, 'exhausted': 0}
# Move the energy label out of `activities` into a numeric `energy` column
scale_activities(energy_scale, 'energy')
df.head(2)

Unnamed: 0,success,activities,datetime,energy
0,0.0,"[takeaway, 7-9h, family, board games, youtube,...",2023-11-20 03:38:00,0.0
1,0.25,"[fast cooking, treated, 7-9h, 1 shot, shower, ...",2023-11-18 23:29:00,0.33


### Scale ordinal sleep quality labels

In [66]:
# Ordinal value of each sleep-duration label, longest sleep first
sleep_scale = {'9+h': 1.00, '7-9h': 0.66, '5-7h': 0.33, '<5h': 0}
# Move the sleep label out of `activities` into a numeric `sleep` column
scale_activities(sleep_scale, 'sleep')
df.head(2)

Unnamed: 0,success,activities,datetime,energy,sleep
0,0.0,"[takeaway, family, board games, youtube, reels...",2023-11-20 03:38:00,0.0,0.66
1,0.25,"[fast cooking, treated, 1 shot, shower, family...",2023-11-18 23:29:00,0.33,0.66


### Scale ordinal caffeine labels

In [67]:
# Scale coffee shots
caffeine_scale = {
    '3+ shots': 1.00,
    '2 shots': 0.67,
    '1 shot': 0.34,
}
scale_activities(caffeine_scale, 'caffeine')
# Add other caffeine drinks
other_caffeine_drinks = {
    'soda': 0.16,
    'green tea': 0.25
}
# Entries without a coffee-shot label are NaN at this point, and NaN + x stays
# NaN, which would silently discard soda / green tea logged on those entries.
# Start them from 0 before adding the other drinks.
df['caffeine'] = df['caffeine'].fillna(0) + df['activities'].apply(increment_scale, args=(other_caffeine_drinks, ))
clean_activities(other_caffeine_drinks)
# Cap the combined caffeine score at 1
balance_feature('caffeine')
df.head(2)

Unnamed: 0,success,activities,datetime,energy,sleep,caffeine
0,0.0,"[takeaway, family, board games, youtube, reels...",2023-11-20 03:38:00,0.0,0.66,0.0
1,0.25,"[fast cooking, treated, shower, family, friend...",2023-11-18 23:29:00,0.33,0.66,0.34


### Scale Job

In [68]:
# Ordinal value of each job-hours label, longest working time first
job_scale = {
    'worked 7+ hours': 1.00, 'worked 5-6 hours': 0.75,
    'worked 3-4 hours': 0.50, 'worked <2 hours': 0.25,
}
# Extract the label into `job_hours`; entries without a job label count as 0
scale_activities(job_scale, 'job_hours')
balance_feature('job_hours')
df.head(2)

Unnamed: 0,success,activities,datetime,energy,sleep,caffeine,job_hours
0,0.0,"[takeaway, family, board games, youtube, reels...",2023-11-20 03:38:00,0.0,0.66,0.0,0.0
1,0.25,"[fast cooking, treated, shower, family, friend...",2023-11-18 23:29:00,0.33,0.66,0.34,0.0


### Scale Studies

In [69]:
# Ordinal value of each study-hours label, longest study time first
study_scale = {
    'studied 7+ hours': 1.00, 'studied 5-6 hours': 0.75,
    'studied 3-4 hours': 0.50, 'studied <2 hours': 0.25,
}
# Extract the label into `study_hours`; entries without a study label count as 0
scale_activities(study_scale, 'study_hours')
balance_feature('study_hours')
df.head(2)

Unnamed: 0,success,activities,datetime,energy,sleep,caffeine,job_hours,study_hours
0,0.0,"[takeaway, family, board games, youtube, reels...",2023-11-20 03:38:00,0.0,0.66,0.0,0.0,0.0
1,0.25,"[fast cooking, treated, shower, family, friend...",2023-11-18 23:29:00,0.33,0.66,0.34,0.0,0.0


### Combine Studies and Job into Work

In [70]:
# Total workload is the sum of the study and job scores, capped at 1
df["total_work_hours"] = df["study_hours"].add(df["job_hours"])
balance_feature('total_work_hours')
df.head(2)

Unnamed: 0,success,activities,datetime,energy,sleep,caffeine,job_hours,study_hours,total_work_hours
0,0.0,"[takeaway, family, board games, youtube, reels...",2023-11-20 03:38:00,0.0,0.66,0.0,0.0,0.0,0.0
1,0.25,"[fast cooking, treated, shower, family, friend...",2023-11-18 23:29:00,0.33,0.66,0.34,0.0,0.0,0.0


### Scale Luck

In [71]:
# Luck is binary when logged: lucky -> 1, unlucky -> 0
luck_scale = {'lucky': 1.00, 'unlucky': 0.00}
scale_activities(luck_scale, 'luck')
# Entries without a luck label get the neutral midpoint 0.5
balance_feature('luck', 0.5)
df.head(2)

Unnamed: 0,success,activities,datetime,energy,sleep,caffeine,job_hours,study_hours,total_work_hours,luck
0,0.0,"[takeaway, family, board games, youtube, reels...",2023-11-20 03:38:00,0.0,0.66,0.0,0.0,0.0,0.0,0.5
1,0.25,"[fast cooking, treated, shower, family, friend...",2023-11-18 23:29:00,0.33,0.66,0.34,0.0,0.0,0.0,0.5


### Scale Strategy

In [72]:
# Every strategic label found in an entry adds 0.25 to its strategy score
strategic_scale = {
    'Merry Xmas': 0.25,
    'dive into deepest shit': 0.25,
    'stop consuming content': 0.25,
    'avoid mistakes': 0.25,
}
df['strategy'] = df['activities'].apply(
    lambda labels: increment_scale(labels, strategic_scale)
)
# The strategic labels are now represented by `strategy`; drop them from the lists
clean_activities(strategic_scale)
df.head(2)

Unnamed: 0,success,activities,datetime,energy,sleep,caffeine,job_hours,study_hours,total_work_hours,luck,strategy
0,0.0,"[takeaway, family, board games, youtube, reels...",2023-11-20 03:38:00,0.0,0.66,0.0,0.0,0.0,0.0,0.5,0.0
1,0.25,"[fast cooking, treated, shower, family, friend...",2023-11-18 23:29:00,0.33,0.66,0.34,0.0,0.0,0.0,0.5,0.0


### Scale external influence

In [73]:
# Saying "yes" to people maps to 1, saying "no" maps to 0
# NOTE(review): the trailing `""` in both keys looks like a quote-escaping
# artefact - confirm they match the labels exactly as they appear in `activities`
yes_scale = {'said yes""': 1.00, 'said no""': 0.00}
scale_activities(yes_scale, 'agreeableness')
# Entries with neither label get the neutral midpoint 0.5
balance_feature('agreeableness', 0.5)
df.head(2)

Unnamed: 0,success,activities,datetime,energy,sleep,caffeine,job_hours,study_hours,total_work_hours,luck,strategy,agreeableness
0,0.0,"[takeaway, family, board games, youtube, reels...",2023-11-20 03:38:00,0.0,0.66,0.0,0.0,0.0,0.0,0.5,0.0,0.5
1,0.25,"[fast cooking, treated, shower, family, friend...",2023-11-18 23:29:00,0.33,0.66,0.34,0.0,0.0,0.0,0.5,0.0,0.5


### One-hot encode other activities

In [74]:
# Count how often each remaining activity label occurs
activity_labels = [label for activities in df['activities'] for label in activities]
label_to_count = dict(Counter(activity_labels))
# Keep only labels that occur at least FREQ_THRESHOLD times
high_freq_labels = [label for label, count in label_to_count.items() if count >= FREQ_THRESHOLD]
kept_labels = set(high_freq_labels)
df['activities'] = df['activities'].apply(
    lambda labels: [label for label in labels if label in kept_labels]
)
# One-hot encode: one 0/1 column per kept label, in first-seen order
for label in high_freq_labels:
    df[label] = df['activities'].apply(lambda labels: int(label in labels))
# The activity lists are fully encoded now
df = df.drop(columns=['activities'])
df.head(2)

Unnamed: 0,success,datetime,energy,sleep,caffeine,job_hours,study_hours,total_work_hours,luck,strategy,...,fasted,healthy cooking,notion,snoosing,angry,burtnieku,coworkers,rbs,office,spend money
0,0.0,2023-11-20 03:38:00,0.0,0.66,0.0,0.0,0.0,0.0,0.5,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.25,2023-11-18 23:29:00,0.33,0.66,0.34,0.0,0.0,0.0,0.5,0.0,...,0,0,0,0,0,0,0,0,0,0


### Add day-at-home flag

In [75]:
# Labels that mark an entry as spent away from home. Each one only exists as a
# column if it passed FREQ_THRESHOLD during one-hot encoding, so use whichever
# are present instead of failing with a KeyError on a missing one.
away_cols = [col for col in ('office', 'rbs', 'burtnieku') if col in df.columns]
# 1 when none of the away labels is set for the entry, otherwise 0
spent_away = df[away_cols].eq(1).any(axis=1)
df['day_at_home'] = (~spent_away).astype('int64')
df.head(2)

Unnamed: 0,success,datetime,energy,sleep,caffeine,job_hours,study_hours,total_work_hours,luck,strategy,...,healthy cooking,notion,snoosing,angry,burtnieku,coworkers,rbs,office,spend money,day_at_home
0,0.0,2023-11-20 03:38:00,0.0,0.66,0.0,0.0,0.0,0.0,0.5,0.0,...,0,0,0,0,0,0,0,0,0,1
1,0.25,2023-11-18 23:29:00,0.33,0.66,0.34,0.0,0.0,0.0,0.5,0.0,...,0,0,0,0,0,0,0,0,0,1


### Add lagged features from previous entries

In [76]:
# Order the entries chronologically so that shifting looks back in time
df = df.sort_values(by='datetime', ascending=True)
# Features that get a lagged copy, in the order their columns are appended
lagged_features = ['success', 'sleep', 'caffeine', 'energy', 'party', 'total_work_hours']
# NOTE: shift(lag) moves values by `lag` rows (entries), not calendar days;
# the export can skip a date, so "day before" means "previous entry"
for lag in range(1, DAYS_BEFORE_LIMIT + 1):
    for feature in lagged_features:
        df[f'{feature}_{lag}_day_before'] = df[feature].shift(lag)
df.head(2)

Unnamed: 0,success,datetime,energy,sleep,caffeine,job_hours,study_hours,total_work_hours,luck,strategy,...,rbs,office,spend money,day_at_home,success_1_day_before,sleep_1_day_before,caffeine_1_day_before,energy_1_day_before,party_1_day_before,total_work_hours_1_day_before
43,0.5,2023-10-07 23:59:00,0.66,1.0,0.75,0.0,0.0,0.0,0.5,0.25,...,0,0,1,0,,,,,,
42,1.0,2023-10-08 15:09:00,0.66,0.66,0.34,0.0,0.0,0.0,0.5,0.5,...,0,0,1,1,0.5,1.0,0.75,0.66,0.0,0.0


### Minor Improvements

In [77]:
# Fix the misspelled activity label in the column name
df = df.rename(columns={"snoosing": "snoozing"})

### Save for Machine Learning

In [78]:
# Features excluded from the export because they are meaningless or private
social_cols = ['friends', 'family', 'coworkers']
emotional_cols = ['sad', 'happy', 'angry', 'feared', 'guilty']
strategy_cols = ['focus on career', 'self- reflect', 'balance studies and job', 'avoid mistakes']
undefined_cols = ['other']
excluded_cols = [*social_cols, *emotional_cols, *strategy_cols, *undefined_cols]
# errors='ignore' skips any listed column that is absent from the frame
df = df.drop(columns=excluded_cols, errors='ignore')
# Write the time-ordered dataset to CSV without the index
df.to_csv(f'transformed_data_{DAYLIO_DATE}.csv', index=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44 entries, 43 to 0
Data columns (total 36 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   success                        44 non-null     float64       
 1   datetime                       44 non-null     datetime64[ns]
 2   energy                         44 non-null     float64       
 3   sleep                          44 non-null     float64       
 4   caffeine                       44 non-null     float64       
 5   job_hours                      44 non-null     float64       
 6   study_hours                    44 non-null     float64       
 7   total_work_hours               44 non-null     float64       
 8   luck                           44 non-null     float64       
 9   strategy                       44 non-null     float64       
 10  agreeableness                  44 non-null     float64       
 11  takeaway                  