# Analysis of aggregated data

In [84]:
from pathlib import Path
import json
from math import pi

import pandas as pd
import numpy as np

from bokeh.io import output_file, show, output_notebook, export_png
from bokeh.palettes import Category20c
from bokeh.plotting import figure
from bokeh.transform import cumsum

output_notebook()

In [21]:
BASE_DIR = Path.cwd().parent
SETTINGS_PATH = BASE_DIR / 'config' / 'settings.json'

def get_settings():
    if not SETTINGS_PATH.exists():
        raise Exception('Settings file not found')

    with open(str(SETTINGS_PATH)) as settings_file:
        data = json.load(settings_file)
    return data

In [97]:
def print_summary(label, data, is_pie=False, is_bar=False):
    print('#################################### \n {} \n####################################'.format(label))
    total = 0
    plot_dict = {}
    for key, value in data.items():
        if isinstance(value, int):
            tot = value
        else:
            tot = len(value['data'])
            
        total += tot
        print(key, ': ', tot)
        plot_dict["{} ({})".format(key, tot)] = tot
    
    if is_pie:
        plot_pie_chart(plot_dict, label)
        
    if is_bar:
        plot_bar_chart(plot_dict, label)
    
    print('\nTotal: ', total)
    print('#################################### \n')
    return total
    
def print_total_summary(dataset, sample):
    print('Total: ', dataset, 'Percent: ', 100 * sample / dataset, " % ")
    

In [98]:
def plot_pie_chart(source, label):
    data = pd.Series(source).reset_index(name='value').rename(columns={'index':'class'})
    data['angle'] = data['value'] / data['value'].sum() * 2*pi
    data['color'] = Category20c[len(source)]

    p = figure(
        plot_height=350, title="{} Class Distribution".format(label), toolbar_location=None,
        tools="hover", tooltips="@class: @value", x_range=(-0.5, 1.0)
    )

    p.wedge(x=0, y=1, radius=0.4,
            start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
            line_color="white", fill_color='color', legend='class', source=data)

    p.axis.axis_label=None
    p.axis.visible=False
    p.grid.grid_line_color = None

    show(p)
    
    export_png(p, '{} pie.png'.format(label))
    

def plot_bar_chart(source, label):
    x = list(source.keys())
    y = list(source.values())
    
    p = figure(
        x_range=x, title="{} Class Distribution".format(label),
        toolbar_location=None, tools="", plot_width=800
    )

    p.vbar(x=x, top=y, width=0.9)
#     p.hbar(y=y, left='Time_min', right='Time_max', height=0.4, source=source)

    p.xgrid.grid_line_color = None
    p.y_range.start = 0
    p.x_range.range_padding = 0.1
    p.xaxis.major_label_orientation = 1

    show(p)
    
    export_png(p, '{} bar.png'.format(label))

## Activity Net

In [7]:
def get_activity_net():
    SETTINGS = get_settings()
    
    JSON_PATH = Path(SETTINGS['activity_net']['json'])
    if not JSON_PATH.exists():
        raise Exception('Activity Net JSON Path does not exist')
    
    with open(str(JSON_PATH)) as settings_file:
        data = json.load(settings_file)
    assert data['version'], 'VERSION 1.3'
    
    # Getting only dance taxonomies
    dance_taxonomy = list(filter(lambda x: x.get('parentName') == 'Dancing', data['taxonomy']))
    result = dict([ (taxonomy['nodeName'], {'meta': taxonomy, 'data': []}) for taxonomy in dance_taxonomy])
    
    for key, value in data['database'].items():
        for annotation in value['annotations']:
            label = annotation['label']

            if result.get(label):
                result[label]['data'].append({ **value, 'key': key, 'source': 'activity_net' })
            
                # Even if one annotation has the required key add the single item
                continue
    # Getting the stats break down
    total = print_summary('Activity Net', result, is_pie=True)
    
    print_total_summary(len(data['database'].keys()), total)
        
    return result

## Kinetics

In [8]:
def get_kinetics_individual(source):
    SETTINGS = get_settings()
    
    version = str(SETTINGS['kinetics']['default'])
    
    JSON_PATH = Path(SETTINGS['kinetics'][version]['json']) / '{}.json'.format(source)
    if not JSON_PATH.exists():
        raise Exception('Kinetics Net JSON Path does not exist')
    
    with open(str(JSON_PATH)) as settings_file:
        data = json.load(settings_file)
    
    return data

def get_kinetics_categories():
    SETTINGS = get_settings()
    
    JSON_PATH = Path(SETTINGS['kinetics']['categories'])
    if not JSON_PATH.exists():
        raise Exception('Categories does not exist')
    
    with open(str(JSON_PATH)) as settings_file:
        data = json.load(settings_file)
    
    return data


def get_kinetics_classes():
    SETTINGS = get_settings()
    
    JSON_PATH = Path(SETTINGS['kinetics']['classes'])
    if not JSON_PATH.exists():
        raise Exception('Classes does not exist')
    
    with open(str(JSON_PATH)) as settings_file:
        data = json.load(settings_file)
    
    return data

def get_kinetics_video_data():
    val_data = get_kinetics_individual('val')
    train_data = get_kinetics_individual('train')
    test_data = get_kinetics_individual('test')
    

    combined = { **val_data, **train_data, **test_data }
    assert len(combined.keys()), len(val_data.keys()) + len(train_data.keys()) + len(test_data.keys())
    
#     key = next(iter(combined.keys()))
#     print(combined[key])
    
    classes = set([value['annotations']['label'].lower() for key, value in combined.items()])
    
#     excluded = set(default_classes) - set(classes)
#     print(len(excluded))
#     print(excluded)
    
    return combined


def get_kinetics():
    result = get_kinetics_video_data()
    default_classes = get_kinetics_classes()
    categories = get_kinetics_categories()
    dance_dict = dict([(dance, { 'data': []}) for dance in categories['dancing']])
    
    for key, value in result.items():
        label = value['annotations']['label']
        if dance_dict.get(label):
            dance_dict[label]['data'].append({
                **value,
                'source': 'kinetics'
            })
    
    total = print_summary('Kinetics', dance_dict, is_bar=True)
    print_total_summary(len(result.keys()), total)
    
    return dance_dict

In [9]:
def run():
    get_activity_net()
    get_kinetics()
run()

#################################### 
 Activity Net 
####################################
Tango :  92
Cheerleading :  143
Cumbia :  86
Breakdancing :  107
Belly dance :  75



Total:  503
#################################### 

Total:  19994 Percent:  2.515754726417925  % 
#################################### 
 Kinetics 
####################################
belly dancing :  1147
breakdancing :  1051
cheerleading :  1149
country line dancing :  993
dancing ballet :  1146
dancing charleston :  713
dancing gangnam style :  665
dancing macarena :  951
jumpstyle dancing :  778
krumping :  708
marching :  990
robot dancing :  925
salsa dancing :  1088
swing dancing :  808
tango dancing :  968
tap dancing :  1035
zumba :  1074
cumbia :  716
square dancing :  704
mosh pit dancing :  637
pirouetting :  859



Total:  19105
#################################### 

Total:  480173 Percent:  3.978774316756669  % 


## UCF-101

In [11]:
def get_ucf():
    SETTINGS = get_settings()
    
    DATA_PATH = Path(SETTINGS['ucf']['data'])
    if not DATA_PATH.exists():
        raise Exception('Activity Net Data Path does not exist')
    
    SALSA_SPIN_PATH = DATA_PATH / 'SalsaSpin'
    total_salsa_spin = len([x for x in SALSA_SPIN_PATH.iterdir()])
    total = 0
    for dir in DATA_PATH.iterdir():
        if dir.is_dir():
            t = len([x for x in dir.iterdir()])
            total += t
#             print(dir.name, t)
    print(total_salsa_spin, total, 100 * total_salsa_spin / total)
get_ucf()

133 13320 0.9984984984984985


In [110]:
def get_lets_dance():
    SETTINGS = get_settings()
    
    DATA_PATH = Path(SETTINGS['lets_dance']['rgb_data'])
    if not DATA_PATH.exists():
        raise Exception('Lets Dance RGB Path does not exist')
    
    original = ['ballet', 'flamenco', 'latin', 'square', 'tango', 'breakdancing', 'foxtrot', 'quickstep', 'swing', 'waltz']
    recent = [x.name for x in DATA_PATH.iterdir()]
    #     print(sorted(recent), len(recent), sorted(original))
    #     print(set(recent) - set(original))
    
    META_DATA_PATH = Path(SETTINGS['lets_dance']['meta'])
    
    if not META_DATA_PATH.exists():
        raise Exception('Lets Dance Meta Path does not exist')
    
    data_dict = {}
    with open(str(META_DATA_PATH)) as f:
        data = [x.strip() for x in f.readlines() if '.jpg' in x]  
        
    print(len(data))
    print(data[:5])
    
    for index, item in enumerate(data):
        _, dance, filename = item.split("/")
#         print(filename)
        
        if not data_dict.get(dance):
            data_dict[dance] = {}
        
        seg = filename.split("_")
        
        z = seg.pop().split(".")[0]
        y = seg.pop()
        
        uuid = "_".join(seg)
        uuid = "{}___{}".format(uuid, y)
        
        x = seg[-1]
#         print(uuid, y, z)
        
        if not data_dict[dance].get(uuid):
            data_dict[dance][uuid] = {'y': [], 'z': []}
        
        if y not in data_dict[dance][uuid]['y']:
            data_dict[dance][uuid]['y'].append(y)
            
        data_dict[dance][uuid]['z'].append(z)
        
        data_dict[dance][uuid]['z'] = sorted(data_dict[dance][uuid]['z'])
    
    frame_len = []
    
    data_display = {}
    i = 0
    for key, value in data_dict.items():
        key_frame = []
        y = []
        for uuid, f in value.items():
            frame_len.append(frame_len)
            
            key_frame.append(len(f['z']))
            frame_len.append(len(f['z']))
            y.append(f['y'])
            if len(f['y']) > 1:
                print(len(f['y']))
            
        data_display[key] = len(key_frame)
        
#         print(key, len(value.keys()), 'Mean: ', int(np.mean(key_frame)), 'Std: ', int(np.std(key_frame)))
        
#         if i > 2:
#             break
        if(i % 1000 == 0):
            print(uuid)
        
        i += 1
        
    print_summary("Let's dance", data_display, is_bar=True)
    total_list = [v for v in data_display.values()]
    print(np.mean(total_list))
    print(np.std(total_list))
    print(np.sum(total_list))
        
get_lets_dance() 

412668
['rgb/tap/5xxTkB5bGy4_046_0026.jpg', 'rgb/tap/Rl88sW_rtv0_115_0249.jpg', 'rgb/tap/7Tftcimjo5o_210_0168.jpg', 'rgb/tap/5xxTkB5bGy4_046_0081.jpg', 'rgb/tap/ISeCp56ud4I_056_0165.jpg']
OAfHveS6cMw___052
#################################### 
 Let's dance 
####################################
tap :  95
ballet :  89
break :  95
foxtrot :  79
tango :  80
jive :  106
square :  97
waltz :  80
swing :  95
latin :  90
rumba :  94
quickstep :  82
samba :  97
pasodoble :  98
flamenco :  88
cha :  98



Total:  1463
#################################### 

91.4375
7.623719154717073
1463
