In [1]:
'''GENERAL'''
import os
import collections
import glob
'''DATA'''
import numpy as np
import pandas as pd
import databricks.koalas as ks
'''SIGNATURE'''
!fortune | cowsay -f dragon

 ______________________________________ 
/ Suicide is simply a case of mistaken \
\ identity.                            /
 -------------------------------------- 
      \                    / \  //\
       \    |\___/|      /   \//  \\
            /0  0  \__  /    //  | \ \    
           /     /  \/_/    //   |  \  \  
           @_^_@'/   \/_   //    |   \   \ 
           //_^_/     \/_ //     |    \    \
        ( //) |        \///      |     \     \
      ( / /) _|_ /   )  //       |      \     _\
    ( // /) '/,_ _ _/  ( ; -.    |    _ _\.-~        .-~~~^-.
  (( / / )) ,-{        _      `-.|.-~-.           .~         `.
 (( // / ))  '/\      /                 ~-. _ .-~      .-~^-.  \
 (( /// ))      `.   {            }                   /      \  \
  (( / ))     .----~-.\        \-'                 .~         \  `. \^-.
             ///.----..>        \             _ -~             `.  ^-`  ^-_
               ///-._ _ _ _ _ _ _}^ - - - - ~                     ~-

# Initial Processing

I have a directory of csvs from stat.ink which contains player submitted match data. I want to process them into one cleaned database. Leveraging the power of Koalas, I can use a Spark backend with a Pandas-like interface. I will first clean one csv to devolop a pipeline.

### Combining csv's
My data is currently spread among hundreds of csvs.

In [2]:
# Gather list of raw data filenames
raw = glob.glob(os.path.join('data/raw', '*.csv'))
# Count the number of csvs
len(raw)

684

In [3]:
# Load one to inspect
test = pd.read_csv(raw[0])
test.head()

Unnamed: 0,# period,game-ver,lobby-mode,lobby,mode,stage,time,win,knockout,A1-weapon,...,B3-level,B4-weapon,B4-kill-assist,B4-kill,B4-assist,B4-death,B4-special,B4-inked,B4-rank,B4-level
0,2018-08-31T12:00:00+00:00,3.2.2,gachi,standard,hoko,sumeshi,300,bravo,False,dynamo,...,19,maneuver,10.0,9.0,1.0,5.0,4.0,1298.0,b+,23.0
1,2018-08-31T12:00:00+00:00,3.2.2,gachi,standard,hoko,sumeshi,167,alpha,True,dynamo,...,45,furo,2.0,2.0,0.0,6.0,2.0,716.0,b+,45.0
2,2018-08-31T12:00:00+00:00,3.2.2,gachi,standard,hoko,sumeshi,108,alpha,True,dynamo,...,13,longblaster,4.0,4.0,0.0,2.0,2.0,623.0,b,27.0
3,2018-09-06T12:00:00+00:00,3.2.2,gachi,standard,hoko,shottsuru,312,bravo,False,dynamo,...,15,maneuver,3.0,1.0,2.0,7.0,1.0,0.0,b+,24.0
4,2018-09-06T12:00:00+00:00,3.2.2,gachi,standard,hoko,zatou,66,bravo,True,dynamo,...,31,furo,5.0,3.0,2.0,2.0,1.0,354.0,b+,26.0


I will need to combine all these into something usable like a pandas dataframe.

In [4]:
def splatoon_concat(path = 'data/raw'):
    """ 
    Concatanates multiple csvs from one directory into a dataframe
  
    Parameters: 
    path (string): optional specified file path
  
    Returns: 
    DataFrame: Koalas dataframe containing data from all csvs.

    """
    # get filenames
    files = glob.glob(os.path.join(path, '*.csv'))
    # concat all files
    return pd.concat((pd.read_csv(f) for f in files))

In [5]:
df = splatoon_concat()
df.shape

(1831760, 81)

### Preliminary Feature Selection

Since this data is from match results and I want to create a predictive model for win rate, there are many features that cannot be used as they are only knowable at the end of a match:

- period: when the match was played
    - irrelevant because this is not a timeseries model
    - version number better covers patch changes
- time: how long the match took
- knockout: if the match was won through the objective as opposed to a time-out
- player kill/assist/death/special/inked: player statistics calculated at the end of each match

In [6]:
def splatoon_drop(df):
    """ 
    Drops unnessecary features in stat.ink dataframes
  
    Parameters: 
    df (DataFrame): Pandas dataframe on which to drop values
  
    Returns: 
    DataFrame: the dataframe with dropped features
  
    """
    # initialize list with first features
    drop_lst = ['# period','time', 'knockout']
    # concatanate player statistics features for each player
    for player in ['A1', 'A2', 'A3', 'A4', 'B1', 'B2', 'B3', 'B4']:
        drop_lst += [player+'-kill-assist', player+'-kill', player+'-assist',
                     player+'-death', player+'-special', player+'-inked']
    # drop features in list
    return df.drop(drop_lst, axis=1)

In [7]:
df = splatoon_drop(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1831760 entries, 0 to 2873
Data columns (total 30 columns):
game-ver      object
lobby-mode    object
lobby         object
mode          object
stage         object
win           object
A1-weapon     object
A1-rank       object
A1-level      int64
A2-weapon     object
A2-rank       object
A2-level      float64
A3-weapon     object
A3-rank       object
A3-level      float64
A4-weapon     object
A4-rank       object
A4-level      float64
B1-weapon     object
B1-rank       object
B1-level      int64
B2-weapon     object
B2-rank       object
B2-level      float64
B3-weapon     object
B3-rank       object
B3-level      float64
B4-weapon     object
B4-rank       object
B4-level      float64
dtypes: float64(6), int64(2), object(22)
memory usage: 433.2+ MB


Save output as a pickle file for midpoint backup, the next part can get dicey.

In [8]:
df.to_pickle('data/merge.pkl')

### Exploding the rows

Since I am interested in making predictions based off data from one player, I will explode each match data into 8 separate rows, one for every player. The function will drop data from A1 by default as A1 is the player who submitted the data, which can lead to bias. I have more than enough data to drop it.

In [15]:
def splatoon_explode(df, drop=True):
    """ 
    Explodes rows of stat.ink dataframes into one row for each player.
    This function is intended to be run after splatoon_drop and
    is hardcoded to accept its output features.
  
    Parameters: 
    df (DataFrame): Pandas dataframe on which to explode
  
    Returns: 
    koalas: Exploded dataframe stored in a koalas database
  
    """
    # get feature names
    features = df.columns.tolist()
    # features for all players
    shared = features[:5]
    # features for specific player
    a1 = features[6:9]
    a2 = features[9:12]
    a3 = features[12:15]
    a4 = features[15:18]
    b1 = features[18:21]
    b2 = features[21:24]
    b3 = features[24:27]
    b4 = features[27:]
    # group players by team
    if drop:
        a_team = [a2, a3, a4]
    else:
        a_team = [a1, a2, a3, a4]
    b_team = [b1, b2, b3, b4]

    # initialize temporaty storage list
    tmp = collections.deque()
    # iterate through rows
    for index, row in df.iterrows():
        # get features common to both teams
        both = [index] + row[shared].tolist()
        # append features for each a team player
        for player in a_team:
            tmp.append(both + row[player].tolist() + [row.win == 'alpha'])
        # append features for each b team player
        for player in b_team:
            tmp.append(both + row[player].tolist() + [row.win == 'bravo'])

    #create new column names, including the index of the match the data was from
    new_cols = ['match'] + shared + ['weapon', 'rank', 'level', 'win']
    # return new koalas database
    return ks.DataFrame(tmp, columns=new_cols)

In [None]:
df = splatoon_explode(df)
df.info()

Save a spark parquet file of the output

In [None]:
df.to_spark_io('data/explode.parquet', format='parquet')
# check file
ks.read_spark_io('data/explode.parquet', format='parquet').head(3)