# Data Cleaning Playground Notebook

This notebook contains random code snippets, thoughts, and ideas for cleaning up my Splatoon 3 battle data.

## Package Imports and Set Up

In [8]:
import pandas as pd
import json
import requests
import numpy as np

## Read in My Personal Battle Data

In [51]:
# Load the battle data CSV, using each row's `id` column as the row label.
battle_df = pd.read_csv('./../data/statink-super64guy.csv', index_col='id')

# Number of rows that were loaded.
len(battle_df)

327

In [52]:
# Print the number of missing values in each column of interest, one per line.
for column in ('lobby', 'rule', 'stage', 'rank_before', 'game_version'):
    print(battle_df[column].isna().sum())

0
0
0
221
0


## General Function to Pop Out JSON Data

In [10]:
def pop_general_json_data(df, col_name):
    """Replace dict-literal strings in ``df[col_name]`` with their English name.

    Every string cell is parsed as a Python dict literal shaped like
    ``{'name': {'en_US': ...}, ...}`` and replaced by its ``en_US`` value.
    Non-string cells (for example missing values) are left unchanged.

    Note that a cell that has already been converted is a plain name such as
    ``'Turf War'``, which is not a dict literal, so calling this twice on the
    same column raises.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; the column is replaced in place.
    col_name : str
        Name of the column to convert.

    Raises
    ------
    ValueError, SyntaxError
        If a string cell is not a plain Python literal.
    KeyError
        If a parsed cell has no ``['name']['en_US']`` entry.
    """
    # Local import so the function does not depend on the notebook's import cell.
    import ast

    def english_name(cell):
        if not isinstance(cell, str):
            return cell
        # SECURITY: this used to be eval(), which executes arbitrary code found
        # in the CSV. literal_eval only accepts literals (dicts, strings,
        # numbers, None, ...), which is all the exported data contains.
        return ast.literal_eval(cell)['name']['en_US']

    # Convert the whole column at once instead of chained per-row indexing.
    df[col_name] = df[col_name].map(english_name)

In [54]:
# Columns whose string cells hold a dict literal with a ['name']['en_US'] entry.
col_names = ['lobby', 'rule', 'stage', 'game_version']

# Replace each of those cells with its English name (battle_df is modified in place).
for name in col_names:
    pop_general_json_data(battle_df, name)

# Write the current state of battle_df, including the id index, to test.csv.
battle_df.to_csv('test.csv', index=True)

## Function for Popping Timestamps

In [8]:
def pop_timestamp_json_data(df, col_name):
    """Replace dict-literal strings in ``df[col_name]`` with their ``'time'`` value.

    Every string cell is parsed as a Python dict literal containing a
    ``'time'`` key and replaced by that value. Non-string cells (missing
    values, or values converted by an earlier call) are left unchanged, so
    re-running the cell that calls this function is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; the column is replaced in place.
    col_name : str
        Name of the timestamp column to convert.

    Raises
    ------
    ValueError, SyntaxError
        If a string cell is not a plain Python literal.
    KeyError
        If a parsed cell has no ``'time'`` entry.
    """
    # Local import so the function does not depend on the notebook's import cell.
    import ast

    def time_value(cell):
        if not isinstance(cell, str):
            return cell
        # SECURITY: this used to be eval(), which executes arbitrary code found
        # in the CSV; literal_eval only accepts plain literals.
        return ast.literal_eval(cell)['time']

    # dtype=object keeps the extracted values exactly as parsed; letting pandas
    # infer the dtype would turn integers into floats whenever a row is missing.
    df[col_name] = pd.Series(
        [time_value(cell) for cell in df[col_name]],
        index=df.index,
        dtype=object,
    )

In [56]:
# Timestamp columns whose cells hold a dict literal with a 'time' entry.
col_names = ['start_at', 'end_at']

# Replace each of those cells with its 'time' value (battle_df is modified in place).
for name in col_names:
    pop_timestamp_json_data(battle_df, name)

# Write the current state of battle_df, including the id index, to test.csv.
battle_df.to_csv('test.csv', index=True)

## Function for Popping Weapons

In [57]:
def pop_weapon_data(df):
    """Add main, sub and special weapon name columns parsed from ``df['weapon']``.

    Every string cell of ``weapon`` is parsed as a Python dict literal and
    three English names are read from it:

    * ``['name']['en_US']``            -> ``main_weapon``
    * ``['sub']['name']['en_US']``     -> ``sub_weapon``
    * ``['special']['name']['en_US']`` -> ``special_weapon``

    Rows whose ``weapon`` cell is not a string (for example a missing value)
    get ``None`` in all three new columns. The original ``weapon`` column is
    kept as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``weapon`` column; the new columns are added in place.

    Raises
    ------
    ValueError, SyntaxError
        If a string cell is not a plain Python literal.
    KeyError
        If a parsed cell lacks one of the entries listed above.
    """
    # Local import so the function does not depend on the notebook's import cell.
    import ast

    main_list = []
    sub_list = []
    special_list = []

    for cell in df['weapon']:
        if isinstance(cell, str):
            # SECURITY: this used to be eval(), which executes arbitrary code
            # found in the CSV; literal_eval only accepts plain literals.
            weapon = ast.literal_eval(cell)
            main_list.append(weapon['name']['en_US'])
            sub_list.append(weapon['sub']['name']['en_US'])
            special_list.append(weapon['special']['name']['en_US'])
        else:
            # Keep the three lists aligned with the frame's rows.
            main_list.append(None)
            sub_list.append(None)
            special_list.append(None)

    # Exactly one entry is appended to each list per row, so the lists always
    # match the frame's length and can be assigned directly.
    df['main_weapon'] = main_list
    df['sub_weapon'] = sub_list
    df['special_weapon'] = special_list

# Add the main_weapon / sub_weapon / special_weapon columns to battle_df in place.
pop_weapon_data(battle_df)
# Write the current state of battle_df, including the id index, to test.csv.
battle_df.to_csv('test.csv', index=True)

## Update Rule to Remove Lobby

In [58]:
# Distinct lobby names present in the battle data.
set(battle_df['lobby'])

{'Anarchy Battle (Open)',
 'Anarchy Battle (Series)',
 'Private Battle',
 'Regular Battle',
 'Splatfest (Open)'}

In [49]:
def update_lobby(df):
    """Append a lobby tag to each row's ``rule`` so the rule alone identifies the mode.

    The tag is chosen from the row's ``lobby`` value:

    * ``Regular Battle``          -> ``' (Regular)'``
    * ``Anarchy Battle (Open)``   -> ``' (Open)'``
    * ``Anarchy Battle (Series)`` -> ``' (Series)'``
    * ``Private Battle``          -> ``' (Private)'``
    * ``Splatfest (Open)``        -> ``' (Splatfest Open)'``, except for the
      rule ``Tricolor Turf War``, which is left untouched
    * ``Splatfest (Series)``      -> ``' (Splatfest Series)'``

    Rows with any other lobby, or with a non-string rule (for example a
    missing value), are left unchanged. A rule that already ends with its tag
    is not tagged again, so re-running the cell that calls this function does
    not produce values such as ``'Turf War (Regular) (Regular)'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``lobby`` and ``rule`` columns; ``rule`` is replaced in place.
    """
    lobby_suffixes = {
        'Regular Battle': ' (Regular)',
        'Anarchy Battle (Open)': ' (Open)',
        'Anarchy Battle (Series)': ' (Series)',
        'Private Battle': ' (Private)',
        'Splatfest (Open)': ' (Splatfest Open)',
        'Splatfest (Series)': ' (Splatfest Series)',
    }

    def tagged_rule(lobby, rule):
        suffix = lobby_suffixes.get(lobby)
        if suffix is None or not isinstance(rule, str):
            return rule
        # Tricolor battles in the open Splatfest lobby keep their plain name.
        if lobby == 'Splatfest (Open)' and rule == 'Tricolor Turf War':
            return rule
        # Already tagged by an earlier run: do not append the tag twice.
        if rule.endswith(suffix):
            return rule
        return rule + suffix

    # Build the whole column in one pass instead of chained per-row indexing.
    df['rule'] = [
        tagged_rule(lobby, rule) for lobby, rule in zip(df['lobby'], df['rule'])
    ]

# Tag each battle's rule with its lobby (battle_df is modified in place).
update_lobby(battle_df)
# Write the current state of battle_df, including the id index, to test.csv.
battle_df.to_csv('test.csv', index=True)

## Data Cleaning for Salmon Run

Many of the functions written above can also be used to clean the Salmon Run data. We can see the effects of this in the following code cells:

In [6]:
import pandas as pd

salmonrun_df = pd.read_csv('./../data/statink-super64guy-salmonrun.csv', index_col='id')

# Look up one known row once, then show its fail_reason cell, the cell's type,
# and whether that type is str.
sample_fail_reason = salmonrun_df['fail_reason']['d83dd9d7-a4ae-46d9-a34c-1835fe3c0f89']
print(sample_fail_reason)
print(type(sample_fail_reason))
print(type(sample_fail_reason) is str)

{'key': 'wipe_out', 'name': {'en_US': 'Wipe out', 'ja_JP': '全滅'}}
<class 'str'>
True


In [9]:
# Replace the dict-literal strings in both timestamp columns with their 'time'
# value (salmonrun_df is modified in place).
pop_timestamp_json_data(salmonrun_df, 'created_at')
pop_timestamp_json_data(salmonrun_df, 'start_at')

In [11]:
# Columns whose string cells hold a dict literal with a ['name']['en_US'] entry.
cols = ['game_version', 'fail_reason', 'king_salmonid', 'stage', 'title_before', 'title_after']
for col in cols:
    # Only string cells are converted; other cells are left as they are.
    pop_general_json_data(salmonrun_df, col)

In [12]:
# Write the current state of salmonrun_df, including the id index, to test_salmon.csv.
salmonrun_df.to_csv('test_salmon.csv', index=True)

### Salmon Run: Navigating Player Processing

* This function parses the per-player information recorded for each Salmon Run shift
* My thought is to create a dictionary mapping all the values needed, then append it to the existing dataframe

In [23]:
def pop_player_salmon_run_info(df):
    """Expand ``df['players']`` into one set of stat columns per player.

    Every string cell of ``players`` is parsed as a Python list literal of
    player dicts. The player flagged ``'me'`` fills the ``my_*`` columns and
    the remaining players fill ``p1_*``, ``p2_*`` and ``p3_*`` in the order
    they appear in the list. For each of those four slots the following
    columns are written (new column suffix <- key in the player dict):

    ``is_me`` <- ``me``, ``golden_eggs`` <- ``golden_eggs``,
    ``golden_assists`` <- ``golden_assist``, ``power_eggs`` <- ``power_eggs``,
    ``rescues`` <- ``rescue``, ``rescued`` <- ``rescued``,
    ``boss_kills`` <- ``defeat_boss``.

    Slots with no player (a roster shorter than four, no ``'me'`` entry, or a
    ``players`` cell that is not a string) are filled with ``None`` so every
    new column always has one value per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``players`` column; the new columns are added in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError, SyntaxError
        If a string cell is not a plain Python literal.
    KeyError
        If a player dict lacks one of the keys listed above.
    """
    # Local import so the function does not depend on the notebook's import cell.
    import ast

    # New column suffix -> key of the value inside each player dict.
    stat_fields = {
        'is_me': 'me',
        'golden_eggs': 'golden_eggs',
        'golden_assists': 'golden_assist',
        'power_eggs': 'power_eggs',
        'rescues': 'rescue',
        'rescued': 'rescued',
        'boss_kills': 'defeat_boss',
    }
    teammate_slots = ('p1', 'p2', 'p3')
    all_slots = teammate_slots + ('my',)

    # One list per output column, created up front so the column order is
    # fixed: p1_*, p2_*, p3_*, then my_*.
    columns = {
        slot + '_' + field: []
        for slot in all_slots
        for field in stat_fields
    }

    for cell in df['players']:
        # SECURITY: this used to be eval(), which executes arbitrary code found
        # in the CSV; literal_eval only accepts plain literals.
        players = ast.literal_eval(cell) if isinstance(cell, str) else []

        teammates = [player for player in players if not player['me']]
        # zip stops at the shorter input, so at most three teammates are kept
        # and missing teammates simply have no slot entry.
        by_slot = dict(zip(teammate_slots, teammates))
        by_slot['my'] = next((player for player in players if player['me']), None)

        for slot in all_slots:
            player = by_slot.get(slot)
            for field, source_key in stat_fields.items():
                value = None if player is None else player[source_key]
                columns[slot + '_' + field].append(value)

    for column_name, values in columns.items():
        df[column_name] = values

    return df

# Add the per-player columns to salmonrun_df (the function returns the frame it
# was given) and write the result, including the id index, to test_salmon.csv.
pop_player_salmon_run_info(salmonrun_df).to_csv('test_salmon.csv', index=True)

{'p1': {'is_me': [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, Fa

In [2]:
# Scratch check: each key gets its own empty dict, so writing into one nested
# dict must not show up in the other.
tmp_dict = {slot: {} for slot in (1, 2)}
tmp_dict[1]['a'] = 'test'

for nested in tmp_dict.values():
    print(nested)

{'a': 'test'}
{}


In [3]:
# Scratch check: ordering a dict of dicts by a boolean field puts the False
# entries first and keeps the original relative order among equal entries.
tmp_dict = {
    1: {'test': True},
    2: {'test': True},
    3: {'test': False},
}

sorted_dict = {
    key: tmp_dict[key]
    for key in sorted(tmp_dict, key=lambda k: tmp_dict[k]['test'])
}
sorted_dict

{3: {'test': False}, 1: {'test': True}, 2: {'test': True}}

## Cleaning the Worldwide Data

This code will now clean the worldwide data. There really isn't much cleaning that we have to do - mostly we just have to go through and update the keys with their English names