In [1]:
import pandas as pd
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as soup
import datetime
import json
import requests
import sys
import time

In [None]:
# Read the leaderboard export and keep only the two columns the scraper needs
# to build request URLs (club id and platform).
leaderboard_df = pd.read_csv('T100Leaderboard.csv').loc[:, ["clubId", "platform"]]
leaderboard_df.head(3)

In [None]:
# Start a visible (non-headless) Chrome session through splinter, handing it the
# driver path returned by ChromeDriverManager().install().
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

In [None]:
# Accumulators filled by the scraping loop: one list of row dictionaries per
# output table (match overview, player builds, player stats, club cosmetics).
match_overview_dictionary_list, player_build_stat_list = [], []
player_stat_list, club_cosmetic_list = [], []

In [None]:
# Column names assigned, by position, to the values of a player's
# pipe-separated "vproattr" string: vproattr_1 through vproattr_34.
vproattr_key_names = [f"vproattr_{position}" for position in range(1, 35)]

In [None]:
def match_overview_data_assign(json_dict):
    """Flatten every match of a decoded API response into one row dictionary.

    Parameters
    ----------
    json_dict : iterable of dict
        Decoded match list. Each match must provide ``matchId``, ``timestamp``
        (epoch seconds, given as a number or a numeric string) and ``clubs``,
        a mapping whose values hold the per-club data of that match.

    Returns
    -------
    list of dict
        One dictionary per match with ``matchId``, ``timestamp`` (formatted as
        ``'%Y-%m-%d %H:%M:%S UTC'``) and, for the N-th club in ``clubs``,
        ``team_N_<key>`` columns. The ``details`` entry is expanded into
        ``team_N_details_name/clubId/regionId/teamId/stadName``; every other
        club value is stored as ``str(value)``.

    Raises
    ------
    KeyError
        If a match or a club's ``details`` lacks one of the fields read here.
    """
    result = []

    for game in json_dict:
        # Build an aware UTC datetime instead of calling the deprecated
        # utcfromtimestamp(); int() mirrors the coercion applied to the same
        # field in extract_cosmetic_data, so numeric strings are accepted too.
        played_at = datetime.datetime.fromtimestamp(
            int(game['timestamp']), datetime.timezone.utc
        )
        match_dict = {
            'matchId': game['matchId'],
            'timestamp': played_at.strftime('%Y-%m-%d %H:%M:%S UTC'),
        }

        # Clubs are numbered by their position in the mapping, not by club id.
        for team_index, data in enumerate(game['clubs'].values(), start=1):
            prefix = f'team_{team_index}'

            for key, value in data.items():
                if key == 'details':
                    match_dict[f'{prefix}_{key}_name'] = value['name']
                    match_dict[f'{prefix}_{key}_clubId'] = str(value['clubId'])
                    match_dict[f'{prefix}_{key}_regionId'] = str(value['regionId'])
                    match_dict[f'{prefix}_{key}_teamId'] = str(value['teamId'])
                    match_dict[f'{prefix}_{key}_stadName'] = value['customKit']['stadName']
                else:
                    match_dict[f'{prefix}_{key}'] = str(value)

        result.append(match_dict)

    return result

In [None]:
def player_build_attributes(data_list, vproattr_key_names):
    """Extract each player's build attributes ("vproattr") for every match.

    Parameters
    ----------
    data_list : iterable of dict
        Decoded match list. Each match must provide ``matchId``, ``clubs``
        (mapping keyed by club id) and ``players`` (club id -> player id ->
        player data).
    vproattr_key_names : sequence of str
        Column names for the pipe-separated ``vproattr`` values, by position.

    Returns
    -------
    list of dict
        One dictionary per player per match holding ``gameId``, ``playerId``,
        ``teamId`` and one entry per numeric ``vproattr`` value. Clubs are
        processed in the order they appear in ``clubs``.

    Raises
    ------
    KeyError
        If a match lacks ``matchId``/``clubs``/``players`` or ``players`` has
        no entry for one of the club ids.
    """
    vproattr_data_list = []

    for match in data_list:
        game_id = int(match["matchId"])

        # One loop over the clubs replaces the former copy-pasted team 1 /
        # team 2 blocks and no longer assumes exactly two clubs are present.
        for club_key in match["clubs"]:
            team_id = int(club_key)

            for player_id, player_data in match["players"][club_key].items():
                raw_values = player_data.get("vproattr", "").split("|")

                vproattr_data = {
                    "gameId": game_id,
                    "playerId": int(player_id),
                    "teamId": team_id,
                }

                # Pair names with values by position BEFORE dropping
                # non-numeric tokens; filtering first would shift every later
                # value onto the wrong column name.
                for key_name, raw_value in zip(vproattr_key_names, raw_values):
                    if raw_value.isdigit():
                        vproattr_data[key_name] = int(raw_value)

                vproattr_data_list.append(vproattr_data)

    return vproattr_data_list

In [None]:
def player_data_collect(data_list):
    """Collect one row of per-match data for every player of every club.

    Parameters
    ----------
    data_list : iterable of dict
        Decoded match list. Each match must provide ``matchId``, ``clubs``
        (mapping keyed by club id) and ``players`` (club id -> player id ->
        mapping of field name to value).

    Returns
    -------
    list of dict
        One dictionary per player per match containing ``matchId``,
        ``teamId``, ``playerId`` and every field of the player's data.
        Digit-only string values are converted to ``int`` except for
        ``playername``; all other values are copied unchanged. Clubs are
        processed in the order they appear in ``clubs``.

    Raises
    ------
    KeyError
        If a match lacks ``matchId``/``clubs``/``players`` or ``players`` has
        no entry for one of the club ids.
    """
    aggregate_data_list = []

    for match in data_list:
        match_id = int(match["matchId"])

        # A single loop over the clubs replaces the duplicated team 1 / team 2
        # blocks (and the unused per-team locals that accompanied them).
        for club_key in match["clubs"]:
            team_id = int(club_key)

            for player_id, player_data in match["players"][club_key].items():
                player_data_dict = {
                    'matchId': match_id,
                    'teamId': team_id,
                    "playerId": int(player_id),
                }
                for key, value in player_data.items():
                    # Only strings have isdigit(); numbers, None or nested
                    # values are kept as-is instead of raising AttributeError.
                    is_numeric_text = isinstance(value, str) and value.isdigit()
                    if is_numeric_text and key != "playername":
                        player_data_dict[key] = int(value)
                    else:
                        player_data_dict[key] = value
                aggregate_data_list.append(player_data_dict)

    return aggregate_data_list

In [None]:
def extract_cosmetic_data(data_list):
    """Return the most recent cosmetic (custom kit) record of each club.

    Parameters
    ----------
    data_list : iterable of dict
        Decoded match list. Each match must provide ``timestamp`` (epoch
        seconds, number or numeric string) and ``clubs``, a mapping of club id
        to a dictionary whose ``details`` holds ``name``, ``regionId``,
        ``teamId`` and ``customKit``.

    Returns
    -------
    list of dict
        One dictionary per club id seen in ``data_list``, taken from the match
        with the latest timestamp. It contains ``clubId``, ``timestamp`` (a
        timezone-aware UTC ``datetime``), ``name``, ``regionId``, ``teamId``
        and every ``customKit`` entry. Values other than ``name`` and
        ``stadName`` are converted to ``int`` where possible.

    Raises
    ------
    KeyError
        If a match or a club's ``details`` lacks one of the fields read here.
    """
    text_fields = ("name", "stadName")
    club_data_dict = {}  # club id (as given in "clubs") -> newest record

    for match in data_list:
        for key, club in match["clubs"].items():
            details = club['details']
            club_data = {
                'clubId': int(key),
                # Aware UTC rather than machine-local naive time, so records
                # do not depend on where the notebook runs and agree with the
                # UTC stamps produced by match_overview_data_assign.
                'timestamp': datetime.datetime.fromtimestamp(
                    int(match['timestamp']), datetime.timezone.utc
                ),
                'name': details['name'],
                'regionId': int(details['regionId']),
                'teamId': int(details['teamId']),
                **details['customKit'],  # Include all customKit values
            }

            # Convert as many values as possible to integers; values that
            # int() rejects (the datetime, free text) are left as they are.
            for field, raw_value in club_data.items():
                if field in text_fields:
                    continue
                try:
                    club_data[field] = int(raw_value)
                except (ValueError, TypeError):
                    pass

            # Keep only the newest record per club.
            existing = club_data_dict.get(key)
            if existing is None or club_data['timestamp'] > existing['timestamp']:
                club_data_dict[key] = club_data

    return list(club_data_dict.values())

In [None]:
# Initialize variables to track progress
total_rows = len(leaderboard_df)
successful_scrapes = 0
failed_scrapes = 0
failed_rows = []

# Visit the match endpoint of every leaderboard club and accumulate the parsed rows.
# NOTE(review): the accumulator lists are created in an earlier cell, so re-running
# only this cell appends the same matches again.
for index, row in leaderboard_df.iterrows():
    # Construct the URL based on clubId and platform
    club_id = row['clubId']
    platform = row['platform']
    url = f"https://proclubs.ea.com/api/fc/clubs/matches?matchType=leagueMatch&platform={platform}&clubIds={club_id}"

    try:
        browser.visit(url)
        site = soup(browser.html, "html.parser")

        # The JSON payload is read from the <pre> tag of the rendered page
        pre_tag = site.find('pre')
        if pre_tag is None:
            # Treat a missing payload as a failure; it used to be skipped
            # without being counted as either success or failure.
            raise ValueError("no <pre> tag with JSON data found")

        # Convert JSON to a Python object
        json_dict = json.loads(pre_tag.text)

        # Parse all four tables first so that an error in any parser cannot
        # leave the accumulators extended for only some of the tables.
        match_rows = match_overview_data_assign(json_dict)
        build_rows = player_build_attributes(json_dict, vproattr_key_names)
        player_rows = player_data_collect(json_dict)
        cosmetic_rows = extract_cosmetic_data(json_dict)

    except Exception as e:
        failed_scrapes += 1
        failed_rows.append(index)
        # Printed on its own line: with "\r" and end="" the message was
        # immediately overwritten by the progress line below.
        print(f"\nFailed to scrape row {index}: {str(e)}")

    else:
        match_overview_dictionary_list.extend(match_rows)
        player_build_stat_list.extend(build_rows)
        player_stat_list.extend(player_rows)
        club_cosmetic_list.extend(cosmetic_rows)

        # Increment the successful scrape count
        successful_scrapes += 1

    finally:
        # One-second pause between requests, after failures as well
        time.sleep(1)

    # Print progress with carriage return to overwrite previous line
    progress_msg = f"Scraping progress: {successful_scrapes} out of {total_rows} complete, {failed_scrapes} failed"
    print(f"\r{progress_msg}", end="", flush=True)

In [None]:
# Close the browser session that was opened for scraping.
browser.quit()

In [None]:
# Turn each accumulated list of row dictionaries into its own DataFrame.
(
    match_overview_dictionary_df,
    player_build_stat_df,
    player_data_collect_df,
    club_cosmetic_df,
) = (
    pd.DataFrame(rows)
    for rows in (
        match_overview_dictionary_list,
        player_build_stat_list,
        player_stat_list,
        club_cosmetic_list,
    )
)

In [5]:
# Preview the first rows of the match-level table (built from match_overview_dictionary_list).
match_overview_dictionary_df.head()

NameError: name 'match_overview_dictionary_df' is not defined

In [None]:
# Preview the first rows of the player build-attribute table (vproattr_* columns).
player_build_stat_df.head()

In [None]:
# Preview the first rows of the per-match player data table (built from player_stat_list).
player_data_collect_df.head()

In [None]:
# Preview the first rows of the club cosmetics table (built from club_cosmetic_list).
club_cosmetic_df.head()

In [None]:
# Saving of Files

# Generate a minute-resolution timestamp that is embedded in every file name
timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M")

match_overview_dictionary_df.to_csv(f"Match_Information_{timestamp}.csv", index=False)
# Per-match player data (player_stat_list). This line used to reference
# player_match_data_df, a name that is never defined, and raised NameError.
player_data_collect_df.to_csv(f"Player_Performances{timestamp}.csv", index=False)
# Player build attributes (vproattr_* columns). This frame was previously never
# saved; the per-match player data was written under this file name instead.
player_build_stat_df.to_csv(f"Player_Attributes{timestamp}.csv", index=False)
club_cosmetic_df.to_csv(f"Club_Cosmetics{timestamp}.csv", index=False)