# Prepare success data for analysis

In [1]:
import pandas as pd
from collections import Counter
# Filename of the raw Daylio CSV export that is loaded with pd.read_csv
filename = 'daylio_export_2023_10_27.csv'
# Project name; used as the suffix of the exported dataset file
# (success_data_<project_name>.csv)
project_name = 'oct_27'
# Minimum number of occurrences an activity label needs in order to be
# kept as a one-hot column; labels seen fewer times are discarded
freq_threshold = 3

### Read app data

In [2]:
# Load the Daylio CSV export and discard the free-text diary columns in one step
df = pd.read_csv(filename).drop(columns=['note_title', 'note'])
# Preview the first rows
df.head()

Unnamed: 0,full_date,date,weekday,time,mood,activities
0,2023-10-27,October 27,Friday,22:05,day off,treated | exhausted | 9+h | focus on career | ...
1,2023-10-26,October 26,Thursday,22:20,strategic failure,treated | exhausted | <5h | 3+ shots | focus o...
2,2023-10-25,October 25,Wednesday,21:30,strategic success,fast cooking | treated | quick | 5-7h | 3+ sho...
3,2023-10-24,October 24,Tuesday,20:01,strategic success,treated | fasted | rush | 7-9h | 2 shots | gre...
4,2023-10-23,October 23,Monday,20:00,strategic success,fast cooking | fasted | rush | 7-9h | 3+ shots...


### Create datetime column

In [3]:
# Join the ISO date and the HH:MM time into one string per entry, then parse it
timestamp_text = df['full_date'] + ' ' + df['time']
df['datetime'] = pd.to_datetime(timestamp_text, format='%Y-%m-%d %H:%M')
# The separate date/time columns are now redundant
df = df.drop(columns=['full_date', 'weekday', 'date', 'time'])
# Report the time span covered by the export
earliest, latest = df['datetime'].min(), df['datetime'].max()
print(f"From {earliest} to {latest}")

From 2023-10-07 23:59:00 to 2023-10-27 22:05:00


### Scale ordinal success labels

In [4]:
success_col = 'success'
# Map each success label to an ordinal score between 0 and 1
success_scale = {
    'strategic success': 1.00,
    'gain': 0.75,
    'expensive gain': 0.50,
    'day off': 0.25,
    'strategic failure': 0.00
}
# Rename the Daylio mood column to the target column (keeps its position)
df.rename(columns={'mood': success_col}, inplace=True)
# Series.map turns every value missing from the mapping into NaN without any
# warning. Fail loudly instead, so a new/renamed mood label (or an accidental
# re-run of this cell on already scaled values) cannot silently erase the target.
unknown_labels = set(df[success_col].dropna()) - set(success_scale)
if unknown_labels:
    raise ValueError(
        f"Labels without an entry in success_scale: {sorted(unknown_labels, key=str)}"
    )
# Replace the labels by their ordinal score
df[success_col] = df[success_col].map(success_scale)
df.head()

Unnamed: 0,success,activities,datetime
0,0.25,treated | exhausted | 9+h | focus on career | ...,2023-10-27 22:05:00
1,0.0,treated | exhausted | <5h | 3+ shots | focus o...,2023-10-26 22:20:00
2,1.0,fast cooking | treated | quick | 5-7h | 3+ sho...,2023-10-25 21:30:00
3,1.0,treated | fasted | rush | 7-9h | 2 shots | gre...,2023-10-24 20:01:00
4,1.0,fast cooking | fasted | rush | 7-9h | 3+ shots...,2023-10-23 20:00:00


### Scale ordinal sleep duration labels

In [5]:
# Parse the ' | '-separated activity string of each entry into a list of labels
def split_activities(raw):
    """Return the activity labels of one entry as a list.

    Strings are split on ' | '. Values that are already lists are returned
    unchanged, so re-running the cell does not fail. Anything else (e.g. the
    NaN that read_csv produces for an entry without activities) becomes an
    empty list instead of raising AttributeError.
    """
    if isinstance(raw, list):
        return raw
    if isinstance(raw, str):
        return raw.split(' | ')
    return []
df['activities'] = df['activities'].apply(split_activities)
# Clean activity
def clean_activities(scale):
    """Drop every label that is a key of `scale` from each entry's activity list.

    Rewrites the `activities` column of the module-level `df`.
    """
    def without_scaled_labels(labels):
        # Keep only the labels that `scale` does not know about
        return [label for label in labels if label not in scale]

    df['activities'] = df['activities'].apply(without_scaled_labels)
# Prepare a method to scale
def scale_activities(scale, new_col_name):
    """Turn the labels of one ordinal scale into a numeric column.

    For every entry, the first activity that is a key of `scale` determines the
    value stored in `df[new_col_name]`; entries without such an activity get
    None. The labels of `scale` are then removed from the activity lists via
    `clean_activities`.
    """
    def first_scaled_value(labels):
        # Value of the first label that belongs to the scale, if any
        for label in labels:
            if label in scale:
                return scale[label]
        return None

    df[new_col_name] = df['activities'].apply(first_scaled_value)
    clean_activities(scale)
# Map each sleep duration label to an ordinal score between 0 and 1
sleep_scale = {
    '9+h': 1.00,
    '7-9h': 0.66,
    '5-7h': 0.33,
    '<5h': 0
}
# Adds the 'sleep' column and removes the sleep labels from the activity lists;
# entries without any sleep label are left without a value
scale_activities(sleep_scale, 'sleep')
df.head()

Unnamed: 0,success,activities,datetime,sleep
0,0.25,"[treated, exhausted, focus on career, balance ...",2023-10-27 22:05:00,1.0
1,0.0,"[treated, exhausted, 3+ shots, focus on career...",2023-10-26 22:20:00,0.0
2,1.0,"[fast cooking, treated, quick, 3+ shots, soda,...",2023-10-25 21:30:00,0.33
3,1.0,"[treated, fasted, rush, 2 shots, green tea, fo...",2023-10-24 20:01:00,0.66
4,1.0,"[fast cooking, fasted, rush, 3+ shots, soda, f...",2023-10-23 20:00:00,0.66


### Scale ordinal caffeine labels

In [6]:
# Prepare a method to increment scale
def increment_scale(activities, scale):
    """Add up the `scale` values of all activities that are keys of `scale`.

    Activities unknown to `scale` are skipped; returns 0 when nothing matches.
    """
    matched_values = [scale[label] for label in activities if label in scale]
    running_total = 0
    # Accumulate left to right (plain addition keeps float results unchanged)
    for value in matched_values:
        running_total = running_total + value
    return running_total
# Balance any feature
def balance_feature(col, default=0):
    """Fill missing values of `df[col]` with `default` and cap values at 1.

    Parameters
    ----------
    col : str
        Name of the column of the module-level `df` to normalise.
    default : number, optional
        Replacement for missing values (0 unless stated otherwise).
    """
    # Assign the result back explicitly: `df[col].fillna(..., inplace=True)`
    # operates on a temporary Series, which pandas warns about and which has
    # no effect on `df` once Copy-on-Write is active.
    df[col] = df[col].fillna(default).clip(upper=1)
# Scale coffee shots
caffeine_scale = {
    '3+ shots': 1.00,
    '2 shots': 0.67,
    '1 shot': 0.34,
}
scale_activities(caffeine_scale, 'caffeine')
# Entries without a coffee label have no value yet. Treat them as 0 *before*
# adding the other drinks: a missing value plus a number stays missing, which
# would throw away soda/green tea on coffee-free days.
df['caffeine'] = df['caffeine'].astype(float).fillna(0)
# Add other caffeine drinks on top of the coffee score
other_caffeine_drinks = {
    'soda': 0.16,
    'green tea': 0.25
}
df['caffeine'] += df['activities'].apply(increment_scale, args=(other_caffeine_drinks, ))
clean_activities(other_caffeine_drinks)
# Cap the combined score at 1
balance_feature('caffeine')
df.head()

Unnamed: 0,success,activities,datetime,sleep,caffeine
0,0.25,"[treated, exhausted, focus on career, balance ...",2023-10-27 22:05:00,1.0,0.0
1,0.0,"[treated, exhausted, focus on career, balance ...",2023-10-26 22:20:00,0.0,1.0
2,1.0,"[fast cooking, treated, quick, focus on career...",2023-10-25 21:30:00,0.33,1.0
3,1.0,"[treated, fasted, rush, focus on career, balan...",2023-10-24 20:01:00,0.66,0.92
4,1.0,"[fast cooking, fasted, rush, focus on career, ...",2023-10-23 20:00:00,0.66,1.0


### Scale energy

In [7]:
# Map each energy label to an ordinal score between 0 and 1
energy_scale = {
    'rush': 1.00,
    'quick': 0.66,
    'slow': 0.33,
    'exhausted': 0
}
# Adds the 'energy' column and removes the energy labels from the activity
# lists; entries without any energy label are left without a value
scale_activities(energy_scale, 'energy')
df.head()

Unnamed: 0,success,activities,datetime,sleep,caffeine,energy
0,0.25,"[treated, focus on career, balance studies and...",2023-10-27 22:05:00,1.0,0.0,0.0
1,0.0,"[treated, focus on career, balance studies and...",2023-10-26 22:20:00,0.0,1.0,0.0
2,1.0,"[fast cooking, treated, focus on career, balan...",2023-10-25 21:30:00,0.33,1.0,0.66
3,1.0,"[treated, fasted, focus on career, balance stu...",2023-10-24 20:01:00,0.66,0.92,1.0
4,1.0,"[fast cooking, fasted, focus on career, balanc...",2023-10-23 20:00:00,0.66,1.0,1.0


### Scale Job

In [8]:
# Map each worked-hours label to an ordinal score between 0.25 and 1
job_scale = {
    'worked 7+ hours': 1.00,
    'worked 5-6 hours': 0.75,
    'worked 3-4 hours': 0.50,
    'worked <2 hours': 0.25
}
# Adds the 'job' column and removes the job labels from the activity lists
scale_activities(job_scale, 'job')
# Entries without a job label get 0 (the default of balance_feature)
balance_feature('job')
df.head()

Unnamed: 0,success,activities,datetime,sleep,caffeine,energy,job
0,0.25,"[treated, focus on career, balance studies and...",2023-10-27 22:05:00,1.0,0.0,0.0,0.25
1,0.0,"[treated, focus on career, balance studies and...",2023-10-26 22:20:00,0.0,1.0,0.0,0.5
2,1.0,"[fast cooking, treated, focus on career, balan...",2023-10-25 21:30:00,0.33,1.0,0.66,0.5
3,1.0,"[treated, fasted, focus on career, balance stu...",2023-10-24 20:01:00,0.66,0.92,1.0,0.75
4,1.0,"[fast cooking, fasted, focus on career, balanc...",2023-10-23 20:00:00,0.66,1.0,1.0,0.5


### Scale Studies

In [9]:
# Map each studied-hours label to an ordinal score between 0.25 and 1
study_scale = {
    'studied 7+ hours': 1.00,
    'studied 5-6 hours': 0.75,
    'studied 3-4 hours': 0.50,
    'studied <2 hours': 0.25
}
# Adds the 'study' column and removes the study labels from the activity lists
scale_activities(study_scale, 'study')
# Entries without a study label get 0 (the default of balance_feature)
balance_feature('study')
df.head()

Unnamed: 0,success,activities,datetime,sleep,caffeine,energy,job,study
0,0.25,"[treated, focus on career, balance studies and...",2023-10-27 22:05:00,1.0,0.0,0.0,0.25,0.0
1,0.0,"[treated, focus on career, balance studies and...",2023-10-26 22:20:00,0.0,1.0,0.0,0.5,0.75
2,1.0,"[fast cooking, treated, focus on career, balan...",2023-10-25 21:30:00,0.33,1.0,0.66,0.5,0.5
3,1.0,"[treated, fasted, focus on career, balance stu...",2023-10-24 20:01:00,0.66,0.92,1.0,0.75,0.75
4,1.0,"[fast cooking, fasted, focus on career, balanc...",2023-10-23 20:00:00,0.66,1.0,1.0,0.5,1.0


### One-hot encode other activities

In [10]:
# Count the occurrences of every remaining activity label across all entries
activity_labels = []
for entry_labels in df['activities']:
    activity_labels.extend(entry_labels)
label_to_count = dict(Counter(activity_labels))
# Keep only labels that occur at least `freq_threshold` times (first-seen order)
high_freq_labels = [label for label, count in label_to_count.items() if count >= freq_threshold]
# One 0/1 indicator column per kept label; rarer labels get no column at all
for label in high_freq_labels:
    df[label] = df['activities'].apply(lambda entry_labels: int(label in entry_labels))
# The list column is fully encoded now and can go
df = df.drop(columns=['activities'])
df.head()

Unnamed: 0,success,datetime,sleep,caffeine,energy,job,study,treated,focus on career,balance studies and job,...,office,fasted,happy,lucky,healthy cooking,notion,feared,restaurant,spend money,movie
0,0.25,2023-10-27 22:05:00,1.0,0.0,0.0,0.25,0.0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0.0,2023-10-26 22:20:00,0.0,1.0,0.0,0.5,0.75,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1.0,2023-10-25 21:30:00,0.33,1.0,0.66,0.5,0.5,1,1,1,...,1,0,0,0,0,0,0,0,0,0
3,1.0,2023-10-24 20:01:00,0.66,0.92,1.0,0.75,0.75,1,1,1,...,0,1,1,0,0,0,0,0,0,0
4,1.0,2023-10-23 20:00:00,0.66,1.0,1.0,0.5,1.0,0,1,1,...,1,1,0,1,0,0,0,0,0,0


### Add previous-entry (lagged) features

In [11]:
# Order the entries chronologically so that "previous row" means "previous entry"
df = df.sort_values(by='datetime', ascending=True)
# Lagged copies of selected features: each row receives the value of the row
# before it (the first row has no predecessor and stays empty)
lagged_features = {
    'success_yesterday': 'success',
    'sleep_yesterday': 'sleep',
    'caffeine_yesterday': 'caffeine',
    'energy_yesterday': 'energy',
}
for lagged_col, source_col in lagged_features.items():
    df[lagged_col] = df[source_col].shift(1)
df.head()

Unnamed: 0,success,datetime,sleep,caffeine,energy,job,study,treated,focus on career,balance studies and job,...,healthy cooking,notion,feared,restaurant,spend money,movie,success_yesterday,sleep_yesterday,caffeine_yesterday,energy_yesterday
20,0.5,2023-10-07 23:59:00,1.0,0.75,0.66,0.0,0.0,1,0,0,...,0,0,0,1,1,1,,,,
19,1.0,2023-10-08 15:09:00,0.66,0.34,0.66,0.0,0.0,0,0,0,...,1,1,0,0,1,0,0.5,1.0,0.75,0.66
18,0.75,2023-10-09 22:15:00,0.66,1.0,0.66,1.0,0.25,1,1,0,...,1,1,1,0,0,0,1.0,0.66,0.34,0.66
17,0.5,2023-10-10 20:00:00,0.33,1.0,0.0,0.75,0.5,1,1,1,...,0,1,1,0,1,0,0.75,0.66,1.0,0.66
16,1.0,2023-10-11 20:06:00,0.33,1.0,1.0,0.25,1.0,1,1,1,...,1,1,0,0,1,0,0.5,0.33,1.0,0.0


### Save for Machine Learning

In [12]:
# Columns considered meaningless or private, grouped by theme
social_cols = ['friends', 'family', 'coworkers']
emotional_cols = ['sad', 'happy', 'angry', 'feared', 'guilty']
strategy_cols = ['focus on career', 'self- reflect', 'balance studies and job', 'avoid mistakes']
# Remove them; names that are not present in the frame are skipped silently
cols_to_drop = [*social_cols, *emotional_cols, *strategy_cols]
df = df.drop(columns=cols_to_drop, errors='ignore')
# Write the time-series dataset to CSV without the index column
output_path = f'success_data_{project_name}.csv'
df.to_csv(output_path, index=False)
# Summarise the exported columns
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21 entries, 20 to 0
Data columns (total 29 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   success             21 non-null     float64       
 1   datetime            21 non-null     datetime64[ns]
 2   sleep               21 non-null     float64       
 3   caffeine            21 non-null     float64       
 4   energy              21 non-null     float64       
 5   job                 21 non-null     float64       
 6   study               21 non-null     float64       
 7   treated             21 non-null     int64         
 8   board games         21 non-null     int64         
 9   youtube             21 non-null     int64         
 10  laziness            21 non-null     int64         
 11  shower              21 non-null     int64         
 12  rbs                 21 non-null     int64         
 13  burtnieku           21 non-null     int64         
 14  s