In [1]:
'''GENERAL'''
import os
import collections
import glob
'''DATA'''
import numpy as np
import pandas as pd
import databricks.koalas as ks
'''SIGNATURE'''
!fortune | cowsay -f dragon

 _________________________________________ 
/  A master was explaining the nature of  \
| Tao to one of his novices. "The Tao is  |
| embodied in all software -- regardless  |
| of how insignificant," said the master. |
|                                         |
| "Is Tao in a hand-held calculator?"     |
| asked the novice.                       |
|                                         |
| "It is," came the reply.                |
|                                         |
| "Is the Tao in a video game?" continued |
| the novice.                             |
|                                         |
| "It is even in a video game," said the  |
| master.                                 |
|                                         |
| "And is the Tao in the DOS for a        |
| personal computer?"                     |
|                                         |
| The master coughed and shifted his      |
| position slightly. "The lesson is over  |
| for toda

# Initial Processing

I have a directory of csvs from stat.ink which contain player-submitted match data. I want to process them into one cleaned database. Leveraging the power of Koalas, I can use a Spark backend with a Pandas-like interface. I will first clean one csv to develop a pipeline.

### Combining csvs
My data is currently spread among hundreds of csvs.

In [None]:
# Collect the path of every csv file found directly inside data/raw
raw = glob.glob(os.path.join('data/raw', '*.csv'))
# Display how many csv files were found
len(raw)

In [None]:
# Load the first csv from the list into a pandas dataframe to inspect it
test = pd.read_csv(raw[0])
# Preview the first rows
test.head()

I will need to combine all these into something usable like a pandas dataframe.

In [None]:
def splatoon_concat(path='data/raw'):
    """
    Concatenates every csv in one directory into a single dataframe.

    Parameters:
    path (string): optional directory that is searched for '*.csv' files

    Returns:
    DataFrame: pandas dataframe containing the rows of all csvs, relabelled
    with one continuous 0..n-1 index so every row has a unique label.

    Raises:
    ValueError: if the directory contains no csv files.

    """
    # get filenames; sorted because glob makes no promise about ordering and
    # the row order of the result should be reproducible between runs
    files = sorted(glob.glob(os.path.join(path, '*.csv')))
    if not files:
        # pd.concat would raise ValueError here as well, but without saying
        # which directory turned out to be empty
        raise ValueError('No csv files found in {!r}'.format(path))
    # concat all files; ignore_index replaces the per-file 0-based indexes,
    # which would otherwise repeat across files (splatoon_explode uses the
    # index as the match identifier, so it has to be unique)
    return pd.concat((pd.read_csv(f) for f in files), ignore_index=True)

In [None]:
# Combine every csv under the default path ('data/raw') into one dataframe
df = splatoon_concat()
# Display the (rows, columns) of the combined data
df.shape

### Preliminary Feature Selection

Since this data is from match results and I want to create a predictive model for win rate, there are many features that cannot be used as they are only knowable at the end of a match:

- period: when the match was played
    - irrelevant because this is not a timeseries model
    - version number better covers patch changes
- time: how long the match took
- knockout: if the match was won through the objective as opposed to a time-out
- player kill/assist/death/special/inked: player statistics calculated at the end of each match

In [None]:
def splatoon_drop(df):
    """
    Removes the columns of a stat.ink dataframe that are not kept as features.

    Parameters:
    df (DataFrame): pandas or koalas dataframe to remove columns from

    Returns:
    DataFrame: the dataframe without the match-level columns and the
    per-player statistic columns listed below

    """
    # match-level columns to remove
    match_cols = ['# period', 'time', 'knockout']
    # the eight player slots and the statistic suffixes recorded for each
    slots = ['A1', 'A2', 'A3', 'A4', 'B1', 'B2', 'B3', 'B4']
    suffixes = ['-kill-assist', '-kill', '-assist',
                '-death', '-special', '-inked']
    # one column name per (slot, statistic) pair
    stat_cols = [slot + suffix for slot in slots for suffix in suffixes]
    # drop everything in one call
    return df.drop(match_cols + stat_cols, axis=1)

In [None]:
# Remove the match-level and per-player statistic columns listed in splatoon_drop
df = splatoon_drop(df)
# Display the (rows, columns) that remain
df.shape

Save the output as a pickle file for a midpoint backup, since the next part can get dicey.

In [None]:
# df.to_pickle('data/merge.pkl')

Load from pickle due to crashing kernel

In [None]:
# Reload the pandas dataframe from the pickle backup. The explode step further
# down reads `df`, which no longer exists after a kernel restart, so it is
# bound again here.
# NOTE: unpickling can run arbitrary code; only load files this notebook wrote.
df = pd.read_pickle('data/merge.pkl')
# Koalas copy of the same data
kdf = ks.from_pandas(df)
# Display the (rows, columns) of the reloaded data
kdf.shape

### Exploding the rows

Since I am interested in making predictions based on data from one player, I will explode each match's data into 8 separate rows, one for every player. The function will drop data from A1 by default, as A1 is the player who submitted the data, which can lead to bias. I have more than enough data to drop it.

In [None]:
def splatoon_explode(df, drop=True):
    """
    Turns every match row of a stat.ink dataframe into one row per player.
    Meant to be run on the output of splatoon_drop; the column positions
    used below are hardcoded to that layout.

    Parameters:
    df (DataFrame): Pandas dataframe on which to explode
    drop (bool): when True (the default) no rows are produced for the
    first player of the A team

    Returns:
    koalas: Exploded dataframe stored in a koalas database

    """
    columns = df.columns.tolist()
    # the first five columns are copied onto every player row
    shared = columns[:5]
    # three columns per player starting at position 6; position 5 is not
    # copied (presumably the 'win' column read below -- confirm against the
    # splatoon_drop output)
    per_player = [columns[start:start + 3] for start in range(6, 27, 3)]
    # the last player takes whatever columns remain
    per_player.append(columns[27:])
    # first four groups are the A team, the rest the B team
    a_team = per_player[1:4] if drop else per_player[:4]
    b_team = per_player[4:]
    teams = (('alpha', a_team), ('bravo', b_team))

    # build the exploded rows: A team players first, then B team, per match
    exploded = []
    for match_id, record in df.iterrows():
        common = [match_id] + record[shared].tolist()
        for side, members in teams:
            for cols in members:
                # a player wins when the match's 'win' value names their side
                exploded.append(
                    common + record[cols].tolist() + [record.win == side]
                )

    # column names, led by the index of the match each row came from
    header = ['match'] + shared + ['weapon', 'rank', 'level', 'win']
    return ks.DataFrame(exploded, columns=header)

In [None]:
# Explode each match into one row per player; with the default drop=True
# no rows are produced for player A1
df = splatoon_explode(df)
# Display the (rows, columns) of the exploded data
df.shape

Save a spark parquet file of the output

In [None]:
# Write the exploded dataframe to data/explode.parquet in parquet format
df.to_spark_io('data/explode.parquet', format='parquet')
# Read the file back and preview its first 3 rows to check the write
ks.read_spark_io('data/explode.parquet', format='parquet').head(3)