> ## Data preparation scripts
In this kernel I want to share the data preparation scripts that were used for this visualization project.
The data used on the demo page cover the 2006–2007 period, but since their format is the same as the current one, the code below is applicable to any period of time.
You can build your own dataset using these scripts and visualize it using the project, which is publicly available on GitHub.

Code: https://github.com/edmarisov/cavernsoftime 

Visualization demo: https://www.cavernsoftime.net/

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from collections import Counter

In [None]:
#Data export tools.
#usage: prepare_data(category, dataframe) 
# - Saves data in chunks of 2 days under a folder named with value from "category" variable. 
# - Also generates index.json file with mapping (<date>: <chunk file path>)

import json
import os
import math

def _is_nan(value):
    """Return True when ``value`` is a NaN number; non-numeric values are never NaN."""
    try:
        return math.isnan(value)
    except (TypeError, OverflowError):
        # Strings, lists, timestamps, huge ints, ... cannot be NaN.
        return False


def del_none(d):
    """
    Delete entries whose value is ``None`` or NaN from a dictionary, recursively.

    Entries whose key is not a string are dropped as well, unless their value
    is itself a dictionary; in that case the entry is kept and the nested
    dictionary is cleaned recursively.

    This alters the input so you may wish to ``copy`` the dict first.

    Args:
        d: Dictionary to clean in place; may contain nested dictionaries.

    Returns:
        The same dictionary object ``d``, for convenience.
    """
    # Iterate over a snapshot of the items so entries can be deleted safely.
    for key, value in list(d.items()):
        if isinstance(value, dict):
            del_none(value)
        elif value is None or _is_nan(value) or not isinstance(key, str):
            del d[key]

    return d  # For convenience


def prepare_data(category, df):
    """
    Export ``df`` as JSON chunk files plus an index of those chunks.

    The rows of ``df`` are walked in index order. For every timestamp (level 0
    of the index) the matching rows are stored under a ``'%m/%d/%y %H:%M:%S'``
    key. A chunk is written once the current timestamp lies more than two days
    after the first timestamp of the chunk, or when the last row is reached.
    Each chunk is cleaned with ``del_none`` before being dumped.

    Chunk files go to ``../cavernsoftime/data/<category>/``; that folder is
    deleted and recreated on every call. The mapping
    ``<chunk start date>: /data/<category>/<chunk file>`` is written to
    ``data/index.json`` relative to the current working directory.

    Args:
        category: Name of the output sub-folder; also used in the indexed paths.
        df: DataFrame whose index level 0 holds timestamps (objects supporting
            ``strftime`` and subtraction).
    """
    # Local import: the function relied on ``shutil`` without the file importing it,
    # which raised NameError whenever the output folder already existed.
    import shutil

    directory = '../cavernsoftime/data/%s' % category

    # Start from an empty folder so stale chunks never survive a re-export.
    if os.path.exists(directory):
        shutil.rmtree(directory)

    os.makedirs(directory)

    # The timestamp level does not change during the loop; compute it once.
    timestamps = df.index.get_level_values(0)
    last_position = len(timestamps) - 1

    chunk_start = None
    previous_stamp = None
    data = {}
    index = {}
    for position, stamp in enumerate(timestamps):
        if chunk_start is None:
            chunk_start = stamp
        key = stamp.strftime('%m/%d/%y %H:%M:%S')
        # Consecutive rows usually share a timestamp; the cross-section only has
        # to be rebuilt when the timestamp changes or a new chunk was started.
        if key not in data or stamp != previous_stamp:
            data[key] = df.xs(stamp).to_dict(orient='index')
        previous_stamp = stamp
        if stamp - chunk_start > pd.Timedelta(days=2) or position == last_position:
            print(stamp)
            file_name = chunk_start.strftime('%m_%d_%y_%H_%M_%S.json')
            file_path = '%s/%s' % (directory, file_name)
            index[chunk_start.strftime('%m/%d/%y %H:%M:%S')] = '/data/%s/%s' % (category, file_name)
            with open(file_path, "w") as data_file:
                json.dump(del_none(data), data_file, indent=2)

            # Begin a fresh chunk with the next row.
            chunk_start = None
            data = {}

    # NOTE(review): the index goes to ./data/index.json, not into ``directory``;
    # every category therefore overwrites the same file, and ./data must already
    # exist. Confirm that this location is intended.
    with open('data/index.json', "w") as data_file:
        json.dump(index, data_file, indent=2)

In [None]:
# Load the raw export; with error_bad_lines=False pandas drops malformed rows
# instead of raising.
# NOTE(review): `error_bad_lines` is not accepted by recent pandas releases
# (replaced by `on_bad_lines`) — confirm the pandas version this runs on.
df = pd.read_csv('../input/wowah_data.csv', sep=',', error_bad_lines=False)

# The column names in the CSV header contain spaces; strip them out.
df.columns = df.columns.map(lambda name: name.replace(' ', ''))

# Normalise the dtypes used by the later cells.
df['level'] = df['level'].astype('int64')
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%m/%d/%y %H:%M:%S', utc=False)

In [None]:
#New column "moved_to" containing information about player's next zone:
# NOTE(review): shift() with its default period moves values one row forward
# within each character, so every row receives the zone of that character's
# *previous* observation, not the next one — confirm which direction the
# visualization expects.
char_grp = df.groupby('char', sort=False)
df['moved_to'] = char_grp['zone'].shift()
# When the shifted zone equals the row's own zone no move happened: blank it out.
df.loc[df['moved_to'] == df['zone'], 'moved_to'] = None

#New column lvl_diff which contains player's level difference between two consecutive observations
# (the first observation of each character has no predecessor and is filled with 0)
df['lvl_diff'] = char_grp['level'].diff().fillna(0)

In [None]:
#exporting data, which contains number of players per zone and how many players moved from one specific zone to another.

# Rows are grouped into 10-minute timestamp bins per zone. For each group:
#   char_count - number of distinct characters seen in the group
#   moves      - Counter of the group's "moved_to" values
# NOTE(review): the nested-dict form of agg() (renaming inside the dict) was
# deprecated in pandas 0.20 and is rejected by newer releases — confirm the
# pandas version before re-running.
all_data = df.groupby([pd.Grouper(key='timestamp', freq='10Min'), 'zone'], sort=False).agg({
            'char': {
                'char_count': 'nunique'
            },
            'moved_to': {
                'moves': lambda s: Counter(s)
            }
    })

# Drop the outer column level ('char' / 'moved_to') so only the
# 'char_count' / 'moves' names remain.
all_data.columns = all_data.columns.droplevel()
# Preview a few rows from the middle of the result.
all_data[1000:].head()

#export the data
#prepare_data('default', all_data)

In [None]:
#Now I want to find player with "average leveling path from level 1 to lvl 60". 
#It is done by selecting all players who leveled from lvl 1 to lvl 60,
#And then taking average time taken to level-up (time between different levels) per player followed by median across those values.

# All observations made at level 1 and at level 60, respectively.
start = df.loc[df['level'] == 1]
end = df.loc[df['level'] == 60]

#take all users who did 1-60 lvl
# The inner merge pairs every level-1 row with every level-60 row of the same
# character; columns from `start` get the suffix 'l', those from `end` get 'r'.
joined = start.merge(end, on='char', how='inner', suffixes=('l', 'r'))

#cleaning the data, since they are a bit skewed, there are cases when same charId has level 60 earlier than it had lvl 1
# Keep characters having at least one level-1 observation before a level-60 one.
oneToSixty = joined.loc[joined['timestampl'] < joined['timestampr']]['char'].unique()

# Work on a copy so the helper columns below are not added to `df`.
leveling = df[df.char.isin(oneToSixty)].copy()

#calculating online time, I count all time diffs above 15 minutes as 0s since the data are being polled every 10 minutes. 
# Time elapsed since the character's previous observation (0 for the first one).
leveling['diff_time'] = leveling.groupby(['char'], sort=False)['timestamp'].diff().fillna(pd.Timedelta('0 second'))

# Convert the gap to minutes; gaps above 15 minutes are treated as offline time.
leveling['online_time'] = leveling['diff_time'] /pd.Timedelta('1 minutes')
leveling.loc[leveling['online_time'] > 15, 'online_time'] = 0

# Minutes online per (character, level), then the mean of those values per character.
char_level_grp = leveling.groupby([leveling.char, leveling.level])['online_time'].sum().reset_index().rename(columns={'online_time': 'time_per_level'})
time_per_level = char_level_grp.groupby('char')['time_per_level'].mean()

# Show one character whose mean time per level is at or above the median.
# NOTE(review): head(1) returns the first such character in index order, which
# is not necessarily the one closest to the median — sort by value first if the
# "most average" player is wanted.
time_per_level.loc[time_per_level>=time_per_level.median()].head(1)

In [None]:
#Now let's take that charId and export the "average player" leveling path
def build_aggregations(s):
    """
    Summarise one group of observations as a three-element Series.

    Positions in the returned Series:
        0 - Counter of the group's ``moved_to`` values
        1 - number of distinct ``char`` values
        2 - metadata dict: ``radius`` is 15 when the largest ``lvl_diff`` in
            the group equals 0 and 55 otherwise; ``tooltip.level`` is the
            highest ``level`` in the group
    """
    leveled_up = s.lvl_diff.max() != 0
    meta = {
        'radius': 55 if leveled_up else 15,
        'tooltip': {'level': max(s.level)},
    }
    moves = Counter(s.moved_to)
    distinct_chars = s.char.nunique()
    return pd.Series([moves, distinct_chars, meta])

# Observations of character 4629 only, grouped into 10-minute bins per zone and
# summarised by build_aggregations; the positional result columns are then
# given their export names.
avg_leveling = (
    df.loc[df['char'] == 4629]
    .groupby([pd.Grouper(key='timestamp', freq='10Min'), 'zone'], sort=False)
    .apply(build_aggregations)
    .rename(columns={0: 'moves', 1: 'char_count', 2: 'meta'})
)
avg_leveling.head()

#export the data
#prepare_data('avg_leveling', avg_leveling)