In [None]:
import pandas as pd
from collections import namedtuple, OrderedDict
import datetime
import ujson
import os
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.ticker
%matplotlib inline
import seaborn as sns
import matplotlib.dates as mdates
import math
import time
import gzip

# Scale seaborn's plot fonts by 1.3 for every figure drawn in this notebook.
sns.set(font_scale=1.3)

# Directory whose files are read as window-focus logs (see load() / load_data()).
LOGS = './logs/'
# One logged window record; load() fills the fields positionally from each log entry.
Window = namedtuple('Window', ['pid', 'name', 'start_time', 'last_update', 'focus_time', 'exe', 'cmd'])
# A marker for the day sequence chart: when it happened, the category used for its colour,
# the label text, and a running index used to stagger label heights.
Event = namedtuple('Event', ['time', 'category', 'text', 'index'])

def load(file):
    """Read one log file and return its entries as a list of Window tuples.

    A path whose last dot-separated component is 'gz' is opened with gzip and
    decoded as UTF-8; any other path is opened as UTF-8 text. In both cases
    the content is parsed as JSON and every entry is unpacked positionally
    into a Window.
    """
    is_gzipped = file.rsplit('.', 1)[-1] == 'gz'
    if is_gzipped:
        with gzip.open(file) as handle:
            raw_text = handle.read().decode('utf-8')
        records = ujson.loads(raw_text)
    else:
        with open(file, encoding='utf-8') as handle:
            records = ujson.load(handle)
    return [Window(*fields) for fields in records]

def load_data():
    """Load every log file under LOGS into one categorized DataFrame.

    Each file is parsed with load() and converted to a frame with the Window
    fields as columns. Per file:

    * 'start_time' / 'last_update' become pd.Timestamp, 'focus_time' becomes
      pd.Timedelta;
    * 'boot' is set to the earliest logged start_time of that file;
    * 'start_time' is then replaced by ``last_update - focus_time``.

    After concatenation a 'category' column is added: the first match from
    the window-name table, else from the exe table, else the last
    backslash-separated component of the exe path.

    Returns:
        The combined DataFrame (the per-file row indices are kept, so index
        values repeat across files), or None when no log entries were found.
    """
    frames = []
    # Sorted so the row order does not depend on the directory listing order.
    for file in sorted(os.listdir(LOGS)):
        records = load(os.path.join(LOGS, file))
        if not records:
            # An empty log contributes nothing and has no boot time.
            continue
        day = pd.DataFrame.from_records(records, columns=Window._fields)
        day['start_time'] = day['start_time'].apply(pd.Timestamp)
        day['last_update'] = day['last_update'].apply(pd.Timestamp)
        day['focus_time'] = day['focus_time'].apply(pd.Timedelta)
        # 'boot' must be taken from the logged start times before they are overwritten.
        day['boot'] = day['start_time'].min()
        day['start_time'] = day['last_update'] - day['focus_time']
        frames.append(day)

    if not frames:
        return None

    # Concatenate once instead of growing the frame inside the loop.
    data = pd.concat(frames)
    data['category'] = merge(data['name'].apply(lambda x: categorize(x, categories_name)).values,
                             data['exe'].apply(lambda x: categorize(x, categories_exe)).values,
                             data['exe'].str.split('\\').apply(lambda x: x[-1]).values)
    return data

def reindex(colname):
    """Index the module-level `data` frame by one of its columns, newest/largest first.

    The frame is modified in place: its index is replaced by the values of
    `colname` and the rows are sorted by that index in descending order.
    The same frame object is returned.
    """
    key_column = data[colname]
    data.index = key_column
    data.sort_index(ascending=False, inplace=True)
    return data

def expand_multi_dict(key_val_pair):
    """Flatten (key-or-list-of-keys, value) pairs into plain (key, value) pairs.

    A pair whose first element is exactly a `list` is expanded into one
    (key, value) tuple per list element; every other pair is passed through
    unchanged. The input order is preserved.
    """
    flattened = []
    for pair in key_val_pair:
        if type(pair[0]) is list:
            flattened.extend((alias, pair[1]) for alias in pair[0])
        else:
            flattened.append(pair)
    return flattened

def categorize(x, dictionary):
    """Return the value of the first key that occurs in `x`, ignoring case.

    Keys are tried in the dictionary's iteration order and matched as
    substrings of `x`. Returns None when no key matches.
    """
    matches = (label for pattern, label in dictionary.items()
               if pattern.lower() in x.lower())
    return next(matches, None)
        
def merge(*lists):
    """Combine equally long sequences element-wise, taking the first truthy value.

    For every position the sequences are consulted in argument order and the
    first truthy element is used. When no sequence has a truthy element at a
    position, the element of the first sequence is kept.

    Args:
        *lists: mutable sequences supporting ``.copy()`` (lists, NumPy
            arrays), all of the same length.

    Returns:
        A copy of the first sequence with the merged values. The arguments
        themselves are left untouched (the previous implementation wrote the
        result into the first argument).

    Raises:
        AssertionError: if the sequences differ in length.
        IndexError: if called without arguments.
    """
    expected_len = len(lists[-1])
    for l in lists[:-1]:
        assert len(l) == expected_len
    # Work on a copy so the caller's first sequence is not modified.
    ret = lists[0].copy()
    for i in range(len(ret)):
        for candidate in lists:
            if candidate[i]:
                ret[i] = candidate[i]
                break
    return ret

def time_ticks(x, pos):
    """Tick formatter: render an axis value as a duration string.

    Axis values are in units of 3.6 ms, so `x * 3.6` is the duration in
    milliseconds. `pos` is the tick position and is ignored.
    """
    elapsed = datetime.timedelta(milliseconds=x * 3.6)
    return str(elapsed)
def label_ticks(y, pos):
    """Tick formatter: map a y position to its category label.

    The tick value is rounded to the nearest integer and used as an index
    into the module-level `positions_sequence`. Ticks that fall outside the
    list get an empty label; previously a negative tick wrapped around to a
    wrong label and a tick past the end raised IndexError. `pos` is ignored.
    """
    global positions_sequence
    slot = int(round(y))
    if 0 <= slot < len(positions_sequence):
        return positions_sequence[slot]
    return ''
def date_boot_ticks(x, pos):
    """Tick formatter: render an axis value as an absolute date-time string.

    Axis values are offsets from the module-level `boot_time_round` in units
    of 3.6 ms. `pos` is the tick position and is ignored.
    """
    global boot_time_round
    offset = datetime.timedelta(milliseconds=x * 3.6)
    tick_time = boot_time_round + offset
    return tick_time.strftime("%Y-%m-%d %H:%M:%S")

In [None]:
# Categorizes data points by window name. Patterns are matched as case-insensitive
# substrings of the window name (see categorize()), and the first matching entry
# wins, so the order of the entries below matters.
# Format: (pattern or list of patterns, category); lists are flattened by expand_multi_dict().
categories_name = OrderedDict(expand_multi_dict([
    ('- TODO', 'todo'),
    (['notebook', 'jupyter', 'ipython', 'python', 'hackerrank', 'topcoder', 'codingame', 'Focus'], 'python'),
    ('git', 'git'),
    (['Wiki'], 'wiki'),
    ('Stack Overflow', 'stackoverflow'),
    ('Google Search', 'google'),
    ('documentation', 'docs'),
    (['.png', '.jpg', 'imgur', 'gif', 'gifv'], 'img'),
    ('excel', 'excel'),
    (['reddit'], 'reddit'),
    (['AFK', 'Program Manager'], 'afk'),
    ('Twitch','twitch'),
    ('YouTube','youtube'),
    ('https://', 'loading'),
    (['- 360Chrome', 'coolnovo', 'chrome', 'firefox', 'opera', 'vivaldi'], 'browser'),
    ('- Clover','files'),
]))

# Categorizes data points by executable path; consulted only when no window-name
# pattern matched (see the merge() call in load_data()). Matching is the same
# case-insensitive, first-match-wins substring test as for window names.
# Format: (pattern or list of patterns, category)
categories_exe = OrderedDict(expand_multi_dict([
    ('chrome.exe' , 'browser'),
    ('pycharm' , 'python'),
    ('Spotify.exe' , 'spotify'),
    (['ConEmu64.exe', 'cmd.exe'], 'console'),
    ('notepad++.exe', 'notepad'),
    ('taskmgr.exe', 'taskmgr'),
    ('clover.exe', 'files'),
    ('wox.exe', 'wox'),
    ('excel.exe', 'excel'),
    ('calc.exe', 'calculator'),           
    # NOTE(review): this pattern uses forward slashes while exe paths are split on '\\'
    # elsewhere in this notebook; confirm it can actually match.
    (r'/games/', 'games'),
    ('explorer.exe', 'files'),
]))

# Focus times are plotted in units of 3.6 ms (milliseconds / 3.6), so
# 20 minutes = 20 * 60 * 1000 / 3.6 = 20 * 1e6 / 60 units.
CUTOFF = 20*1e6/60  # display categories with at least 20 minutes total focus time
# Loaded once at module level; the plotting cells below take it as their `data` argument.
data = load_data()

In [None]:
def plot_top_categories(data, category_count=None):
    """Draw a horizontal bar chart of total focus time per category.

    Args:
        data: frame with 'category' and 'focus_time' (timedelta) columns.
        category_count: when truthy, show that many top categories;
            otherwise show every category whose total exceeds CUTOFF.

    Totals are converted to units of 3.6 ms (milliseconds / 3.6), which is
    the unit time_ticks() expects when labelling the x axis.
    """
    totals = data.groupby('category')['focus_time'].sum().apply(lambda x: x / np.timedelta64(1, 'ms') / 3.6)
    if category_count:
        totals = totals.sort_values(ascending=False)[:category_count]
    else:
        totals = totals[totals > CUTOFF].sort_values(ascending=False)
    # Fewer categories than requested may exist; size the palette to what is drawn.
    category_count = len(totals.index)
    plt.figure(figsize=(20, 6))
    # x/y are passed by keyword: seaborn >= 0.12 no longer accepts the data
    # vectors positionally, and keywords work on older versions as well.
    ax = sns.barplot(x=totals.values, y=totals.index, orient='h',
                     palette=sns.color_palette('husl', category_count))
    ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(time_ticks))
#plot_top_categories(data, 10)

In [None]:
def plot_top_by_date(data):
    """Draw stacked horizontal bars of focus time per category, one segment per boot.

    Focus time is summed per (category, boot), converted to units of 3.6 ms
    and pivoted so that each boot becomes a column. Only categories whose
    total over all boots exceeds CUTOFF are drawn, smallest total first.

    Side effects: stores the plotted category order in the module-level
    `positions_sequence` and switches seaborn's palette to 'colorblind'.
    """
    global positions_sequence
    focus = data.groupby(['category', 'boot'])['focus_time'].sum()
    focus = focus.apply(lambda td: td / np.timedelta64(1, 'ms') / 3.6)
    table = focus.sort_values(ascending=False).unstack(level=1)

    # Temporary helper column used only for the cutoff filter and the ordering.
    table['sum'] = table.sum(1)
    above_cutoff = table['sum'] > CUTOFF
    table = table[above_cutoff].sort_values('sum', ascending=True)
    del table['sum']

    positions_sequence = list(table.index)
    sns.set_palette('colorblind')
    ax = table.plot.barh(stacked=True, figsize=(20,6), width=0.8, fontsize=13, legend=False)
    formatter = matplotlib.ticker.FuncFormatter(time_ticks)
    ax.xaxis.set_major_formatter(formatter)
#plot_top_by_date(data)

In [None]:
def plot_category_by_day(data, exe_name):
    """Plot one category's focus time per boot as a line chart.

    Args:
        data: frame with 'category', 'boot' and 'focus_time' columns.
        exe_name: name of the category to plot (despite the parameter name
            it is compared against the 'category' values).

    Focus time is summed per (category, boot) and converted to units of
    3.6 ms. Categories whose total does not exceed CUTOFF are dropped, so
    asking for such a category yields no rows. Boots in which the category
    has no data are left out of the line.
    """
    d = data.groupby(['category', 'boot'])['focus_time'].sum().apply(lambda x: x / np.timedelta64(1, 'ms') / 3.6)
    d = d.sort_values(ascending=False).unstack(level=1)

    d['sum'] = d.sum(axis=1)
    d = d[d['sum'] > CUTOFF].sort_values('sum', ascending=True)
    del d['sum']

    # Select by comparing the index instead of building a query string, so
    # category names containing quotes cannot break the expression.
    selected = d[d.index == exe_name]
    # `axis` must be a keyword: DataFrame.dropna is keyword-only in pandas >= 2.0.
    ax = selected.dropna(axis=1).T.plot()
    ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(time_ticks))
#plot_category_by_day(data, 'python')

In [None]:
def plot_timeline_by_category_time(data, categories):
    """Plot daily focus time for the top categories as one line per category.

    Args:
        data: frame with 'category', 'boot' and 'focus_time' columns.
        categories: how many categories to draw, ranked by total focus time.
            (The previous body ignored this argument and read an undefined
            name `HEAD` instead, so every call raised NameError.)

    Focus time is summed per (category, boot), resampled to calendar days on
    the boot time, and converted to units of 3.6 ms for time_ticks().
    """
    totals = data.groupby('category')['focus_time'].sum().apply(lambda x: x / np.timedelta64(1, 'ms') / 3.6)
    top_categories = totals.sort_values(ascending=False).head(categories).index

    per_boot = data.groupby(['category', 'boot']).focus_time.sum().unstack(0)
    daily = per_boot[top_categories].resample('D').sum().fillna(pd.Timedelta(0))
    daily = daily.apply(lambda x: x / np.timedelta64(1, 'ms') / 3.6)

    ax = daily.plot(figsize=(27, 7), x_compat=False)
    ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(time_ticks))
    ax.xaxis.grid(True, which="minor")
#plot_timeline_by_category_time(data, 10)

In [None]:
def plot_day_sequence_chart(category_count):
    """Draw a timeline of the most recent logged day, one row per category.

    The "most recent day" is the date (file creation time minus 6 hours) of
    the lexicographically last file name in LOGS; every log file with that
    date is loaded. The `category_count` categories with the most focus time
    each get one row of horizontal bars showing when they had focus, and
    windows categorized as 'todo' are marked with labelled vertical lines.

    Args:
        category_count: number of top categories (by total focus time) to show.

    Side effects: sets the module-level `positions_sequence` (row labels,
    bottom to top) and `boot_time_round` (x-axis origin) used by the tick
    formatters, and shows the figure.

    All x positions are in units of 3.6 ms relative to `boot_time_round`.
    """
    global positions_sequence, boot_time_round

    def add_event_vline(event):
        # Vertical marker plus label; labels cycle through three heights to limit overlap.
        pos = (event.time - boot_time_round) / np.timedelta64(1, 'ms') / 3.6
        if event.category in positions_sequence:
            color = palette[positions_sequence.index(event.category)]
        else:
            # The dominant category around an event is not necessarily one of
            # the displayed rows; fall back to a neutral colour instead of raising.
            color = 'gray'
        plt.axvline(pos, color=color)
        plt.text(pos, -1-0.7*(event.index%3), event.text, rotation=0, fontsize=13)

    def log_date(ctime):
        # Shift by 6 hours so files created before 06:00 count towards the previous day.
        return (dt.datetime.fromtimestamp(ctime) - pd.Timedelta('6 hours')).date()

    files = {file : os.path.getctime(os.path.join(LOGS, file)) for file in os.listdir(LOGS)}
    today = log_date(files[sorted(files.keys())[-1]])
    records = []
    for file_name, ctime in files.items():
        if log_date(ctime) == today:
            records.extend(load(os.path.join(LOGS, file_name)))

    day = pd.DataFrame.from_records(records, columns=Window._fields)
    day['focus_time'] = day['focus_time'].apply(pd.Timedelta)
    day['start_time'] = day['start_time'].apply(pd.Timestamp)
    day['last_update'] = day['last_update'].apply(pd.Timestamp)
    day['category'] = merge(day['name'].apply(lambda x: categorize(x, categories_name)).values,
                            day['exe'].apply(lambda x: categorize(x, categories_exe)).values,
                            day['exe'].str.split('\\').apply(lambda x: x[-1]).values)

    totals = day.groupby('category')['focus_time'].sum().sort_values(ascending=False)
    # Reversed so the category with the most focus time ends up on the top row.
    positions_sequence = list(totals.iloc[:category_count].index)[::-1]

    # One row per distinct window; string aggregations are what pandas maps the
    # builtin sum/max to, without the deprecation warning for builtin callables.
    windows = (day.groupby(['pid', 'name', 'start_time'])
                  .agg({'focus_time': 'sum', 'last_update': 'max', 'exe': 'max', 'cmd': 'max', 'category': 'max'})
                  .reset_index()
                  .sort_values('focus_time', ascending=False))

    d = day.set_index('category')[['focus_time', 'last_update']].copy()
    d['start_time'] = d['last_update'] - d['focus_time']
    d = d.sort_values('start_time')

    # x-axis origin: the earliest logged start time, truncated to the full hour.
    boot_time = day['start_time'].min()
    boot_time_round = boot_time.replace(minute=0, second=0, microsecond=0, nanosecond=0)

    fig = plt.figure(figsize=(27,7))
    ax = fig.add_subplot(111)
    palette = list(reversed(sns.color_palette('husl', category_count)))
    pad = 60*1000/3.6  # 60sec = expand length of event for quick events
    stitch = 60*1000/3.6 # 60sec = events with a (gap < stitch) become one

    # Stitching: within each category, consecutive focus intervals separated by
    # a gap of at most `stitch` are joined into one segment (category, start, end).
    segments = []
    for category in d.index.unique():
        group = d[d.index == category].reset_index()
        group['gap'] = (group['start_time'] - group['last_update'].shift(1)) / np.timedelta64(1, 'ms') / 3.6
        lg = len(group)
        stime = group.loc[0, 'start_time']
        for row in range(1, lg + 1):
            # Close the running segment at the end of the group or at a large gap.
            if row == lg or group.loc[row, 'gap'] > stitch:
                segments.append((category, stime, group.loc[row - 1, 'last_update']))
                if row < lg:
                    stime = group.loc[row, 'start_time']

    for category, seg_start, seg_end in segments:
        if category in positions_sequence:
            level = positions_sequence.index(category)
            span = (seg_end - seg_start) / np.timedelta64(1, 'ms') / 3.6
            # Each bar is extended by `pad` to the left so very short events stay visible.
            ax.barh(level, span + pad, height=.8, align='center', color=palette[level],
                    left=(seg_start - boot_time_round) / np.timedelta64(1, 'ms') / 3.6 - pad, edgecolor='none')

    ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(date_boot_ticks))
    ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(label_ticks))
    ax.xaxis.set_minor_locator(matplotlib.ticker.AutoMinorLocator(6))
    ax.tick_params(labelright=True, labeltop=True)
    # Visibility is passed positionally: the `b` keyword is not accepted by current matplotlib.
    ax.grid(True, which='minor', color='w', linewidth=0.7)
    ax.grid(True, which='major', color='w', linewidth=1.5)
    ax.set_yticks(range(len(positions_sequence)))
    ax.set_yticklabels(positions_sequence)
    # Room below row 0 is reserved for the event labels.
    ax.set_ylim(-2.7, len(positions_sequence))

    # Event lines: one per 'todo' window, coloured by the category with the most
    # focus time among windows that start or end within `event_delta` of the
    # event, or that span it.
    event_delta = pd.Timedelta('10min')
    # NOTE(review): hard-coded cutoff; events last updated at or before it are skipped. Confirm it is still wanted.
    events_since = pd.Timestamp('2016-05-28 17:00')
    focus_starts = day['last_update'] - day['focus_time']
    focus_ends = day['last_update']
    todo_windows = windows[windows['category'] == 'todo'].sort_values('last_update')
    for i, (_, event_row) in enumerate(todo_windows.iterrows()):
        if event_row['last_update'] > events_since:
            event_time = event_row['last_update'] - event_row['focus_time']
            window_lo = event_time - event_delta
            window_hi = event_time + event_delta
            near_start = (window_lo < focus_starts) & (focus_starts < window_hi)
            near_end = (window_lo < focus_ends) & (focus_ends < window_hi)
            spanning = (focus_starts < event_time) & (event_time < focus_ends)
            category = day[near_start | near_end | spanning].groupby('category')['focus_time'].sum().idxmax()
            add_event_vline(Event(event_time, category, event_row['name'].split('- TODO')[0], i))
    plt.show()
#plot_day_sequence_chart(7)