In [1]:
import datetime
import pandas as pd
import json
import pickle
# Load the previously pickled match records into `data_list`; later cells
# iterate over it one match at a time.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so 'json_dict.pkl' must come from a trusted source.
with open('json_dict.pkl', 'rb') as file:
    data_list = pickle.load(file)

In [2]:
def aggregate_data_collect(data_list):
    """Build one flat record per club per match from the aggregate statistics.

    For every match, the first two keys of ``match["clubs"]`` are treated as
    the two club ids. Each club yields a dict holding ``matchId``,
    ``timestamp`` (a ``datetime`` built with ``datetime.fromtimestamp``),
    ``teamId``, ``teamSize`` (number of entries in ``match["players"][club]``)
    and then every key/value pair of ``match["aggregate"][club]``, copied
    unchanged.

    Parameters
    ----------
    data_list : iterable of dict
        Match records providing ``"matchId"``, ``"timestamp"``, ``"clubs"``,
        ``"players"`` and ``"aggregate"``.

    Returns
    -------
    list of dict
        Two records per match, in club order.
    """
    records = []

    for match in data_list:
        club_keys = list(match["clubs"].keys())
        match_id = int(match["matchId"])
        played_at = datetime.datetime.fromtimestamp(int(match['timestamp']))

        # First pass: identity columns for both clubs.
        match_records = []
        for slot in (0, 1):
            club_key = club_keys[slot]
            match_records.append({
                'matchId': match_id,
                'timestamp': played_at,
                'teamId': int(club_key),
                'teamSize': len(match["players"][club_key]),
            })

        # Second pass: merge each club's aggregate statistics into its record.
        for slot, record in enumerate(match_records):
            record.update(match["aggregate"][club_keys[slot]].items())

        records.extend(match_records)

    return records


In [3]:
def player_data_collect(data_list):
    """Flatten per-player match statistics into one record per player per match.

    For every match, the first two keys of ``match["clubs"]`` are treated as
    the two club ids. Each player found under ``match["players"][club]``
    yields a dict holding ``matchId``, ``playerId`` and ``teamId`` followed
    by the player's own statistics.

    String values made only of digits are converted to ``int``, except for
    the ``"playername"`` field, which always stays as-is. Values that are not
    strings are copied unchanged. The ``"vproattr"`` field is omitted from
    the output.

    Parameters
    ----------
    data_list : iterable of dict
        Match records providing ``"matchId"``, ``"clubs"`` and ``"players"``.

    Returns
    -------
    list of dict
        One record per player per match; both clubs' players are included and
        each is tagged with the id of the club they are listed under.

    Raises
    ------
    IndexError
        If a match lists fewer than two clubs.
    KeyError
        If a match lacks a required key or has no player entry for a club.
    """
    player_rows = []

    for match in data_list:
        club_keys = list(match["clubs"].keys())
        match_id = int(match["matchId"])

        # Index explicitly (rather than slicing) so a match with fewer than
        # two clubs still fails loudly instead of being partially processed.
        for club_key in (club_keys[0], club_keys[1]):
            # Each player is tagged with the club they are listed under.
            team_id = int(club_key)

            for player_id, player_stats in match["players"][club_key].items():
                row = {
                    'matchId': match_id,
                    "playerId": int(player_id),
                    'teamId': team_id,
                }
                for key, value in player_stats.items():
                    # The attribute string is left out of this table.
                    if key == "vproattr":
                        continue
                    # Only digit-only strings are converted; player names are
                    # kept verbatim even when they consist solely of digits.
                    if key != "playername" and isinstance(value, str) and value.isdigit():
                        row[key] = int(value)
                    else:
                        row[key] = value
                player_rows.append(row)

    return player_rows


In [4]:
# Column names for the positional attribute values: "vproattr_1" .. "vproattr_34".
vproattr_key_names = [f"vproattr_{position}" for position in range(1, 35)]


In [5]:
def player_build_attributes(data_list, vproattr_key_names):
    """Expand each player's ``"vproattr"`` string into named integer columns.

    The ``"vproattr"`` value is split on ``"|"``; pieces that are not made
    only of digits are dropped, and the remaining integers are paired in
    order with ``vproattr_key_names``. Surplus names or surplus values are
    ignored. A player without a ``"vproattr"`` entry gets only the identity
    columns.

    Parameters
    ----------
    data_list : iterable of dict
        Match records providing ``"matchId"``, ``"clubs"`` and ``"players"``.
    vproattr_key_names : sequence of str
        Column names to assign to the attribute values, in positional order.

    Returns
    -------
    list of dict
        One record per player per match with ``gameId``, ``playerId``,
        ``teamId`` and the named attribute values.
    """
    attribute_rows = []

    for match in data_list:
        club_keys = list(match["clubs"].keys())

        # Resolve both team ids up front, before any player is processed.
        sides = [
            (club_keys[0], int(club_keys[0])),
            (club_keys[1], int(club_keys[1])),
        ]

        for club_key, team_id in sides:
            for raw_player_id, player_data in match["players"][club_key].items():
                row = {
                    "gameId": int(match["matchId"]),
                    "playerId": int(raw_player_id),
                    "teamId": team_id,
                }

                pieces = player_data.get("vproattr", "").split("|")
                numbers = [int(piece) for piece in pieces if piece.isdigit()]

                # zip stops at the shorter side, so names without a value
                # (and values without a name) are simply left out.
                row.update(zip(vproattr_key_names, numbers))

                attribute_rows.append(row)

    return attribute_rows

In [6]:
def extract_cosmetic_data(data_list):
    """Collect the most recent club details and custom-kit settings per club.

    Every club of every match produces a candidate record made of
    ``clubId``, ``timestamp``, ``name``, ``regionId``, ``teamId`` and all
    entries of the club's ``details["customKit"]`` mapping. Every value except
    ``name`` and ``stadName`` is converted to ``int`` when possible and left
    unchanged otherwise. For each club key, only the candidate with the
    latest timestamp is kept; on equal timestamps the first one seen wins.

    Parameters
    ----------
    data_list : iterable of dict
        Match records providing ``"timestamp"`` and ``"clubs"``.

    Returns
    -------
    list of dict
        One record per distinct club, in order of first appearance.
    """
    text_fields = ("name", "stadName")

    def as_int_if_possible(value):
        # Best-effort conversion: anything int() rejects is kept untouched.
        try:
            return int(value)
        except (ValueError, TypeError):
            return value

    latest_by_club = {}

    for match in data_list:
        for club_key in list(match["clubs"].keys()):
            candidate = {
                'clubId': int(club_key),
                'timestamp': datetime.datetime.fromtimestamp(int(match['timestamp'])),
            }
            details = match['clubs'][club_key]['details']
            candidate['name'] = details['name']
            candidate['regionId'] = int(details['regionId'])
            candidate['teamId'] = int(details['teamId'])
            candidate.update(details['customKit'])  # all customKit values

            candidate = {
                field: value if field in text_fields else as_int_if_possible(value)
                for field, value in candidate.items()
            }

            # Keep the candidate only if the club is new or this match is newer.
            current = latest_by_club.get(club_key)
            if current is None or candidate['timestamp'] > current['timestamp']:
                latest_by_club[club_key] = candidate

    return list(latest_by_club.values())

In [7]:
# Run every extractor over the loaded matches.
aggregate_data = aggregate_data_collect(data_list)
player_data = player_data_collect(data_list)
player_build_stats = player_build_attributes(data_list, vproattr_key_names)
cosmetic_data = extract_cosmetic_data(data_list)

# The *_final names hold separate list objects with the same records.
aggregate_data_final = list(aggregate_data)
player_data_final = list(player_data)
player_build_stats_final = list(player_build_stats)
cosmetic_data_final = list(cosmetic_data)

In [8]:
# One DataFrame per record list, built in a single pass.
aggregate_df, player_data_df, player_build_stats_df, cosmetic_data_df = (
    pd.DataFrame(records)
    for records in (
        aggregate_data_final,
        player_data_final,
        player_build_stats_final,
        cosmetic_data_final,
    )
)

In [9]:
# Row count, then a preview of the per-club aggregate table.
print(aggregate_df.shape[0])
aggregate_df.head()

10


Unnamed: 0,matchId,timestamp,teamId,teamSize,assists,cleansheetsany,cleansheetsdef,cleansheetsgk,goals,goalsconceded,...,realtimeidle,redcards,saves,SCORE,shots,tackleattempts,tacklesmade,vproattr,vprohackreason,wins
0,103580633270055,2023-07-03 15:08:02,1298463,9,3,0,0,0,5,0,...,138,0,0,54,9,11,6,0,72,9
1,103580633270055,2023-07-03 15:08:02,13718725,7,0,0,0,0,0,6,...,100,0,0,0,0,2,1,0,48,0
2,103542747090343,2023-07-03 14:53:49,1298463,9,4,0,0,0,4,0,...,89,0,0,36,7,9,3,0,64,9
3,103542747090343,2023-07-03 14:53:49,7804248,2,0,0,0,0,0,4,...,20,0,0,0,0,1,0,0,16,0
4,98578949310256,2023-06-19 14:59:40,1298463,9,0,0,4,0,0,0,...,65,0,0,0,2,34,14,0,64,0


In [10]:
# Row count, then a preview of the per-player statistics table.
print(player_data_df.shape[0])
player_data_df.head()

84


Unnamed: 0,matchId,playerId,teamId,assists,cleansheetsany,cleansheetsdef,cleansheetsgk,goals,goalsconceded,losses,...,realtimeidle,redcards,saves,SCORE,shots,tackleattempts,tacklesmade,vprohackreason,wins,playername
0,103580633270055,211006765,1298463,1,0,0,0,1,0,0,...,3,0,0,6,1,1,0,8,1,gl0rf1n
1,103580633270055,246851342,1298463,0,0,0,0,0,0,0,...,3,0,0,6,0,0,0,8,1,MCBizzare_SNR
2,103580633270055,261249817,1298463,1,0,0,0,1,0,0,...,16,0,0,6,1,1,0,8,1,Alean1ck
3,103580633270055,263109809,1298463,0,0,0,0,1,0,0,...,20,0,0,6,2,2,1,8,1,Radi4ego07
4,103580633270055,302183775,1298463,0,0,0,0,0,0,0,...,45,0,0,6,0,0,0,8,1,Ch1chha


In [11]:
# Row count, then a preview of the per-player attribute table.
print(player_build_stats_df.shape[0])
player_build_stats_df.head()

84


Unnamed: 0,gameId,playerId,teamId,vproattr_1,vproattr_2,vproattr_3,vproattr_4,vproattr_5,vproattr_6,vproattr_7,...,vproattr_25,vproattr_26,vproattr_27,vproattr_28,vproattr_29,vproattr_30,vproattr_31,vproattr_32,vproattr_33,vproattr_34
0,103580633270055,211006765,1298463,84.0,91.0,81.0,82.0,78.0,87.0,95.0,...,51.0,44.0,88.0,74.0,90.0,10.0,10.0,10.0,10.0,10.0
1,103580633270055,246851342,1298463,86.0,92.0,72.0,65.0,90.0,95.0,91.0,...,92.0,84.0,66.0,92.0,68.0,10.0,10.0,10.0,10.0,10.0
2,103580633270055,261249817,1298463,93.0,95.0,94.0,90.0,79.0,94.0,85.0,...,64.0,61.0,73.0,79.0,75.0,10.0,10.0,10.0,10.0,10.0
3,103580633270055,263109809,1298463,89.0,97.0,90.0,90.0,80.0,90.0,89.0,...,67.0,64.0,82.0,78.0,81.0,10.0,10.0,10.0,10.0,10.0
4,103580633270055,302183775,1298463,84.0,94.0,73.0,81.0,93.0,93.0,88.0,...,96.0,94.0,70.0,76.0,68.0,10.0,10.0,10.0,10.0,10.0


In [12]:
# Row count, then a preview of the per-club cosmetic table.
print(cosmetic_data_df.shape[0])
cosmetic_data_df.head()

6


Unnamed: 0,clubId,timestamp,name,regionId,teamId,stadName,kitId,isCustomTeam,customKitId,customAwayKitId,...,kitColor2,kitColor3,kitColor4,kitAColor1,kitAColor2,kitAColor3,kitAColor4,dCustomKit,crestColor,crestAssetId
0,1298463,2023-07-03 15:08:02,Echelon FC,4539733,111473,Kirklees Stadium,1826373632,0,7509,7509,...,592397,592397,592397,592397,15921906,15921906,15921906,0,-1,99160728
1,13718725,2023-07-03 15:08:02,SHADDUP,4344147,1474,Stadion Hanguk,12075008,1,7513,7517,...,592397,14358546,14358546,7192366,592397,592397,592397,0,0,99160119
2,7804248,2023-07-03 14:53:49,Bad Boys 2023,4539733,1335,Parc des Princes,21872640,0,7509,7509,...,592397,592397,592397,592397,15921906,15921906,15921906,0,-1,99160212
3,13075737,2023-06-19 14:59:40,FC Continental,5129557,100409,Coliseum Alfonso Pérez,822550529,0,7509,7509,...,592397,592397,592397,592397,15921906,15921906,15921906,0,-1,99160601
4,859463,2023-06-15 15:28:57,VFC RED STAR,5719381,217,red star,1777667,0,7646,7646,...,33627,12525622,12525622,1713203,15921906,33627,15921906,1,1611589,99160603
