In [1]:
import pandas as pd
import numpy as np
from IPython.display import display
from datetime import timedelta
from urllib.parse import urlparse, parse_qs
import re

In [2]:
# Read the records from '_data.json' and discard the 'products' and
# 'details' columns in the same step.
df = pd.read_json('_data.json').drop(columns=['products', 'details'])
# Convert the 'time' column to pandas datetimes.
df['time'] = pd.to_datetime(df['time'])
df

Unnamed: 0,header,title,titleUrl,time
0,Launcher 10,Used Launcher 10,https://play.google.com/store/apps/details?id=...,2020-11-02 08:02:47.171000+00:00
1,"Readably - RSS | Feedbin, Inoreader and Fever API","Used Readably - RSS | Feedbin, Inoreader and F...",https://play.google.com/store/apps/details?id=...,2020-11-02 08:02:43.505000+00:00
2,"Readably - RSS | Feedbin, Inoreader and Fever API","Used Readably - RSS | Feedbin, Inoreader and F...",https://play.google.com/store/apps/details?id=...,2020-11-02 05:07:25.396000+00:00
3,Launcher 10,Used Launcher 10,https://play.google.com/store/apps/details?id=...,2020-11-02 05:07:22.160000+00:00
4,VK — live chatting & free calls,Used VK — live chatting & free calls,https://play.google.com/store/apps/details?id=...,2020-11-02 05:07:11.265000+00:00
...,...,...,...,...
57649,"ivi - фильмы, сериалы, мультфильмы",Viewed Ледяной шторм,https://www.google.com/url?q=http://www.ivi.ru...,2016-04-22 05:00:08.248000+00:00
57650,"ivi - фильмы, сериалы, мультфильмы",Viewed Последние часы,https://www.google.com/url?q=http://www.ivi.ru...,2016-04-22 05:00:07.744000+00:00
57651,"ivi - фильмы, сериалы, мультфильмы",Viewed Звездные войны: Пробуждение силы (Бонус...,https://www.google.com/url?q=http://www.ivi.ru...,2016-04-22 05:00:07.744000+00:00
57652,"ivi - фильмы, сериалы, мультфильмы",Viewed Игра на понижение,https://www.google.com/url?q=http://www.ivi.ru...,2016-04-22 05:00:07.744000+00:00


In [3]:
def fix_time(time, offset=timedelta(hours=3)):
    """Shift a timestamp forward by a fixed offset.

    Args:
        time: A datetime-like value that supports adding a ``timedelta``
            (for example ``datetime.datetime`` or ``pandas.Timestamp``).
        offset: Amount to add. Defaults to three hours, the shift this
            function applied before the parameter existed.

    Returns:
        ``time + offset``. Only the clock value moves; any timezone
        information attached to ``time`` is left exactly as it was.
    """
    # TODO: carried over from the original code, which does not say what
    # remains to be done here.
    # NOTE(review): the 3-hour default looks like a manual UTC -> local time
    # correction -- confirm, and consider a real timezone conversion instead.
    return time + offset

# Shift every timestamp with fix_time. The column is overwritten in place,
# so re-running this cell shifts the values again.
df['time'] = df['time'].apply(fix_time)
df

Unnamed: 0,header,title,titleUrl,time
0,Launcher 10,Used Launcher 10,https://play.google.com/store/apps/details?id=...,2020-11-02 11:02:47.171000+00:00
1,"Readably - RSS | Feedbin, Inoreader and Fever API","Used Readably - RSS | Feedbin, Inoreader and F...",https://play.google.com/store/apps/details?id=...,2020-11-02 11:02:43.505000+00:00
2,"Readably - RSS | Feedbin, Inoreader and Fever API","Used Readably - RSS | Feedbin, Inoreader and F...",https://play.google.com/store/apps/details?id=...,2020-11-02 08:07:25.396000+00:00
3,Launcher 10,Used Launcher 10,https://play.google.com/store/apps/details?id=...,2020-11-02 08:07:22.160000+00:00
4,VK — live chatting & free calls,Used VK — live chatting & free calls,https://play.google.com/store/apps/details?id=...,2020-11-02 08:07:11.265000+00:00
...,...,...,...,...
57649,"ivi - фильмы, сериалы, мультфильмы",Viewed Ледяной шторм,https://www.google.com/url?q=http://www.ivi.ru...,2016-04-22 08:00:08.248000+00:00
57650,"ivi - фильмы, сериалы, мультфильмы",Viewed Последние часы,https://www.google.com/url?q=http://www.ivi.ru...,2016-04-22 08:00:07.744000+00:00
57651,"ivi - фильмы, сериалы, мультфильмы",Viewed Звездные войны: Пробуждение силы (Бонус...,https://www.google.com/url?q=http://www.ivi.ru...,2016-04-22 08:00:07.744000+00:00
57652,"ivi - фильмы, сериалы, мультфильмы",Viewed Игра на понижение,https://www.google.com/url?q=http://www.ivi.ru...,2016-04-22 08:00:07.744000+00:00


In [4]:
def _field(datum, key):
    """Return ``datum[key]``, or ``None`` when the key is absent."""
    try:
        return datum[key]
    except KeyError:
        return None


def get_app_id(datum):
    """Pick an identifier for the app a record belongs to.

    The following sources are tried in order:

    1. the ``id`` query parameter of ``datum['titleUrl']``;
    2. the text after the first space of ``datum['title']``, when the title
       starts with ``'Used'``;
    3. ``datum['header']``.

    Args:
        datum: Mapping-like record (a ``dict`` or a DataFrame row) with a
            ``'header'`` entry and, optionally, ``'titleUrl'`` and
            ``'title'`` entries.

    Returns:
        The first identifier produced by the steps above.

    Raises:
        KeyError: If the ``'header'`` fallback is needed but missing.
    """
    title_url = _field(datum, 'titleUrl')
    # Only real strings are parsed; this skips missing values such as
    # NaN/None without needing a separate null check.
    if isinstance(title_url, str):
        try:
            ids = parse_qs(urlparse(title_url).query).get('id')
        except ValueError:
            # urlparse rejects some malformed URLs; treat them as "no id"
            # and continue with the next source.
            ids = None
        if ids:
            return ids[0]

    title = _field(datum, 'title')
    # A non-string title (e.g. NaN) has no startswith(); skip it instead of
    # raising AttributeError.
    if isinstance(title, str) and title.startswith('Used'):
        # Everything after the first space; '' when there is no space.
        return title.partition(' ')[2]

    return datum['header']

# Compute one app identifier per row by calling get_app_id on each row.
df['app_id'] = df.apply(get_app_id, axis='columns')

In [5]:
def fix_name(name):
    """Turn a raw app header into a short display name.

    * A name starting with ``'com.'`` is treated as a dotted package name:
      the leading ``com`` is dropped, and each remaining non-empty part gets
      its first character upper-cased before the parts are joined with
      spaces.
    * Any other name is cut at the first ``:``, ``-``, ``—`` or ``–`` (that
      character and what follows it on the line are removed), then
      parenthesised text is removed, then surrounding whitespace is stripped.

    Args:
        name: The raw name as a string.

    Returns:
        The cleaned name; it may be an empty string.
    """
    if name.startswith('com.'):
        # Skip empty parts (from 'com.' or 'com.a..b'): indexing part[0] on
        # an empty string would raise IndexError.
        parts = [part for part in name.split('.')[1:] if part]
        return ' '.join(part[0].upper() + part[1:] for part in parts)
    name = re.sub(r'(:|-|—|–).*$', '', name)
    name = re.sub(r'\(.*\)', '', name)
    return name.strip()
# Map each app_id to a cleaned display name, built from the 'header' value
# of the first row in that app_id's group.
app_names = {
    app_id: fix_name(rows['header'].iloc[0])
    for app_id, rows in df.groupby('app_id')
}

# Look up the display name for every row's app_id.
df['app_name'] = df['app_id'].apply(lambda app_id: app_names[app_id])

In [6]:
def align_time(time):
    """Round a timestamp down to the start of its half-hour slot.

    Args:
        time: A datetime-like value with a ``minute`` attribute and a
            ``replace`` method (e.g. ``datetime`` or ``pandas.Timestamp``).

    Returns:
        A copy of ``time`` whose minute is 0 or 30 (whichever is the largest
        not exceeding the original minute) and whose second and microsecond
        are zero.
    """
    slot_minute = 30 if time.minute >= 30 else 0
    return time.replace(minute=slot_minute, second=0, microsecond=0)
    
# Snap every timestamp to the start of its half-hour slot.
df['time'] = df['time'].apply(align_time)

In [7]:
# Keep one row per (app_name, time) pair -- the first row of each group --
# ordered by time, newest first, then drop the 'title' and 'titleUrl' columns.
dfg = (
    df.groupby(['app_name', 'time'])
    .agg(lambda column: column.iloc[0])
    .reset_index()
    .sort_values('time', ascending=False)
    .reset_index(drop=True)
    .drop(columns=['title', 'titleUrl'])
)
dfg

Unnamed: 0,app_name,time,header,app_id
0,Launcher 10,2020-11-02 11:00:00+00:00,Launcher 10,com.nfwebdev.launcher10
1,Readably,2020-11-02 11:00:00+00:00,"Readably - RSS | Feedbin, Inoreader and Fever API",com.isaiasmatewos.readably
2,VK,2020-11-02 08:00:00+00:00,VK — live chatting & free calls,com.vkontakte.android
3,Readably,2020-11-02 08:00:00+00:00,"Readably - RSS | Feedbin, Inoreader and Fever API",com.isaiasmatewos.readably
4,Launcher 10,2020-11-02 08:00:00+00:00,Launcher 10,com.nfwebdev.launcher10
...,...,...,...,...
30950,СберБанк Онлайн,2016-07-11 17:00:00+00:00,СберБанк Онлайн,СберБанк Онлайн
30951,СберБанк Онлайн,2016-07-10 16:30:00+00:00,СберБанк Онлайн,СберБанк Онлайн
30952,ivi,2016-04-24 14:00:00+00:00,"ivi - фильмы, сериалы, мультфильмы","ivi - фильмы, сериалы, мультфильмы"
30953,ivi,2016-04-22 08:00:00+00:00,"ivi - фильмы, сериалы, мультфильмы","ivi - фильмы, сериалы, мультфильмы"
