# Data Collection

#### Here we take all of the match JSONs from the first stage of the data collection and extract useful information. We'll get an overview of each player's performance in the match, the match details, and the URL for the telemetry file, which gives a comprehensive match breakdown.

In [3]:
from tqdm import tqdm
import requests
import pandas as pd
import numpy as np
import time
import json
from datetime import datetime
import os
import pickle

In [4]:
# Absolute paths to local directories of match files. `folder_path` is the
# directory whose contents are listed below; `other_solos` is not used in the
# cells visible here (NOTE(review): presumably a second batch of solo matches -
# confirm before relying on it).
folder_path = r'/Users/tim/Desktop/GA/lessons/DSI16-lessons/project/project-capstone/My capstone/solo_b_matches'
other_solos = r'/Users/tim/Desktop/GA/lessons/DSI16-lessons/project/project-capstone/My capstone/other_solo_matches'

#This function returns a list of the names of the entries in the directory passed to it.

def listDir(dir):
    """Return a new list with the name of every entry found in directory `dir`.

    The names come straight from os.listdir, so they are bare entry names
    (not full paths) in whatever order os.listdir yields them.
    """
    entry_names = list(os.listdir(dir))
    return entry_names

# Names of every entry in folder_path; the cells below loop over this list and
# load each entry as a match JSON file.
solo_match_files_ = listDir(folder_path)

This makes a dataframe with a summary of each player's actions in a match.

In [None]:
# Build one row per participant per match: the match-level attributes followed
# by that participant's platform and stats, then save everything to CSV.
player_list = []

for match_file in tqdm(solo_match_files_):
    # The file names were listed from folder_path, so open them from that same
    # directory instead of a path relative to the current working directory.
    with open(os.path.join(folder_path, match_file), encoding='utf-8') as json_file:
        match_data = json.load(json_file)

    # Match-level values, in sorted attribute-key order so that they line up
    # with the hard-coded column_names below.
    attributes = match_data['data']['attributes']
    match_values = [attributes[key] for key in sorted(attributes)]
    match_values.append(len(match_data['data']['relationships']['rosters']['data']))
    match_values.append(match_data['data']['id'])

    for item in match_data['included']:
        # Only participants carry player stats. Rosters and any other included
        # object (e.g. the telemetry asset) must not produce a row, otherwise
        # the output gains a row per match with no player data in it.
        if item['type'] != 'participant':
            continue
        stats = item['attributes']['stats']
        player_row = match_values + [item['attributes']['shardId']]
        # sorted() is case-sensitive, which is why 'DBNOs' is the first stat column.
        player_row.extend(stats[key] for key in sorted(stats))
        player_list.append(player_row)

# Order must match the rows built above: sorted match attributes, roster count,
# match id, participant shardId ('console'), then the sorted participant stats.
column_names = ['createdAt','duration', 'gameMode', 'isCustomMatch', 'mapName', 'matchType', 'seasonState', 'shardId', 'stats', 'tags','titleId','starting_players',
                'matchID','console','DBNOs', 'assists', 'boosts', 'damageDealt', 'deathType', 'headshotKills', 'heals', 'killPlace', 'killStreaks', 'kills', 'longestKill',
                'name', 'playerId', 'revives', 'rideDistance', 'roadKills', 'swimDistance', 'teamKills', 'timeSurvived', 'vehicleDestroys', 'walkDistance',
                'weaponsAcquired', 'winPlace']

df = pd.DataFrame(player_list,columns=column_names)
df.to_csv(r'match_df_expanded.csv', index = False)

This pulls slightly different info. The top three placing players, two middle and two bottom players, and the top three players by kill rank are identified as 'key players'. I take data on these players so I can later track their movements and position through a game. We also pull the telemetry URL at this point, which can be used for in-depth details of the match.

In [None]:
#for pulling solo baltic files and pushing that df.

def _pick_player_indices(players, stat_name, targets):
    """Return {label: index into players} for the given (label, value) targets.

    Each player is compared against the targets in order and is assigned to
    the first label whose value equals that player's `stat_name` stat. If
    several players match the same label, the last one in `players` wins.
    Labels that no player matches are absent from the result.
    """
    picked = {}
    for index, player in enumerate(players):
        value = player['attributes']['stats'][stat_name]
        for label, target in targets:
            if value == target:
                picked[label] = index
                break
    return picked


rows = []
col_names = []
telemetry_name = ['telemetry_url']
# (file name, reason) for every match that passed the size filter but could
# not be turned into a row, so that failures are visible instead of silent.
skipped_matches = []

for match_file in tqdm(solo_match_files_):
    # The file names were listed from folder_path, so open them from that same
    # directory instead of a path relative to the current working directory.
    with open(os.path.join(folder_path, match_file), encoding='utf-8') as json_file:
        match_data = json.load(json_file)

    #here we're checking that a match has at least 70 players in it. If it hasn't, I'm not interested.
    if len(match_data['data']['relationships']['rosters']['data']) < 70:
        continue

    try:
        ##THIS PULLS BASIC MATCH ATTRIBUTES AS WELL AS COLUMNS NAMES FOR THEM.
        attributes = match_data['data']['attributes']
        attribute_keys = sorted(attributes)
        match_attributes = [match_data['data']['id']] + [attributes[key] for key in attribute_keys]
        match_attributes_col_names = ['matchId'] + attribute_keys

        # Everything below is rebuilt from scratch for every match, so a value
        # missing from this match can never be filled in from a previous one.
        list_of_players = []
        telemetry_url = None
        for item in match_data['included']:
            if item['type'] == 'participant':
                list_of_players.append(item)
            #this pulls the telemetry url for indepth match data.
            elif item['type'] == 'asset':
                telemetry_url = item['attributes']['URL']

        if telemetry_url is None:
            skipped_matches.append((match_file, 'no telemetry asset'))
            continue

        placements = sorted(p['attributes']['stats']['winPlace'] for p in list_of_players)
        kill_ranks = sorted(p['attributes']['stats']['killPlace'] for p in list_of_players)

        # Key players by final placement: top three, two around the middle,
        # tenth from last and last. The order matters: a player is given the
        # first label it matches.
        middle = round(len(placements) / 2)
        placement_targets = [
            ('P_1', placements[0]),
            ('P_2', placements[1]),
            ('P_3', placements[2]),
            ('P_mid', placements[middle]),
            ('P_mid2', placements[middle - 1]),
            ('P_last_but10', placements[-10]),
            ('P_last', placements[-1]),
        ]
        # Key players by kill rank: the top three.
        kill_targets = [
            ('K_1', kill_ranks[0]),
            ('K_2', kill_ranks[1]),
            ('K_3', kill_ranks[2]),
        ]

        picked = _pick_player_indices(list_of_players, 'winPlace', placement_targets)
        picked.update(_pick_player_indices(list_of_players, 'killPlace', kill_targets))

        labels = [label for label, _ in placement_targets + kill_targets]
        missing = [label for label in labels if label not in picked]
        if missing:
            # Happens when tied values leave a label with no player of its own.
            skipped_matches.append((match_file, 'no player found for ' + ', '.join(missing)))
            continue

        #this creates a list of keys for the 'stats' dictionary
        stats_keys = sorted(list_of_players[0]['attributes']['stats'])

        #This compiles all the player stat values and column titles
        player_stat_values = []
        player_stat_column_names = []
        for label in labels:
            player = list_of_players[picked[label]]
            for key in stats_keys:
                player_stat_values.append(player['attributes']['stats'][key])
                player_stat_column_names.append(f'{label}_{key}')
            player_stat_values.append(player['attributes']['shardId'])
            player_stat_column_names.append(label + '_gaming_system')

    except (KeyError, IndexError, TypeError) as err:
        # Malformed or incomplete match file: skip it, but keep a record.
        skipped_matches.append((match_file, repr(err)))
        continue

    rows.append(match_attributes + [telemetry_url] + player_stat_values)
    if not col_names:
        # Take the column names from the first match that produced a row.
        col_names = match_attributes_col_names + telemetry_name + player_stat_column_names

#don't indent this.
if skipped_matches:
    print(f'{len(skipped_matches)} match file(s) could not be processed; see skipped_matches for details.')

df = pd.DataFrame(rows,columns=col_names)
df.to_csv(r'match_df.csv', index = False)
