In [17]:
import requests
import json
import os
import pandas as pd
import time
import re

Define the URL of the single series to scrape (the tournament URL is set in the next cell)<br>
series_url = 'https://www.rib.gg/series/52003'

In [18]:
# Event page that scrapeTourney downloads and reads the child-event brackets from.
tourney_url = 'https://www.rib.gg/events/champions-tour-2023-americas-last-chance-qualifier/matches/2917'

File to store scraped seriesIds

In [19]:
# Text file (one series ID per line) recording which series have already been scraped.
scraped_series_ids_file = "scraped_series_ids.txt"

Function to sanitize filenames and directory names

In [20]:
def sanitize_filename(filename):
    """Return ``filename`` with the characters ``< > : " / \\ | ? *`` removed.

    All other characters are kept as they are; nothing is substituted for the
    removed ones.
    """
    forbidden_chars = r'[<>:"/\\|?*]'
    cleaned = re.sub(forbidden_chars, '', filename)
    return cleaned

Function to load scraped seriesIds from file

In [21]:
def load_scraped_series_ids():
    """Read the recorded series IDs from ``scraped_series_ids_file``.

    Returns:
        list[str]: one entry per line of the file with line endings removed,
        or an empty list when the file does not exist.
    """
    if not os.path.exists(scraped_series_ids_file):
        return []
    with open(scraped_series_ids_file, 'r') as id_file:
        contents = id_file.read()
    return contents.splitlines()

Function to save scraped seriesIds to file

In [22]:
def save_scraped_series_ids(scraped_series_ids):
    """Overwrite ``scraped_series_ids_file`` with the given IDs, one per line.

    Args:
        scraped_series_ids: iterable of IDs; each one is formatted with an
            f-string and followed by a newline.
    """
    with open(scraped_series_ids_file, 'w') as id_file:
        id_file.writelines(f"{series_id}\n" for series_id in scraped_series_ids)

Load scraped seriesIds from file

In [23]:
# In-memory list of already-scraped series IDs, loaded once when this cell runs.
# scrapeTourney and scrapeSeries append to it and write it back to the file.
scraped_series_ids = load_scraped_series_ids()

Function to extract JSON data from HTML

In [24]:
def extract_json_data(response_text):
    """Parse the JSON payload of the ``__NEXT_DATA__`` script tag in an HTML page.

    Args:
        response_text: HTML source of the page, as a string.

    Returns:
        The decoded JSON document (whatever ``json.loads`` produces).

    Raises:
        json.JSONDecodeError: if the ``__NEXT_DATA__`` script tag or its closing
            ``</script>`` cannot be found, or if the payload is not valid JSON.
    """
    open_tag = '<script id="__NEXT_DATA__" type="application/json">'
    close_tag = '</script>'

    tag_index = response_text.find(open_tag)
    if tag_index == -1:
        # str.find signals "not found" with -1; slicing from that offset would
        # feed unrelated HTML to json.loads and hide the real problem.
        raise json.JSONDecodeError(
            '__NEXT_DATA__ script tag not found in page', response_text, 0
        )

    start_index = tag_index + len(open_tag)
    end_index = response_text.find(close_tag, start_index)
    if end_index == -1:
        # Without this check the slice [start:-1] would silently drop the last
        # character of the page instead of reporting the missing tag.
        raise json.JSONDecodeError(
            'Closing </script> tag for __NEXT_DATA__ not found', response_text, start_index
        )

    json_string = response_text[start_index:end_index].strip()
    return json.loads(json_string)

Function to get series header data and match IDs

In [25]:
def SeriesHeader(seriesId):
    """Download a series page and return its embedded data plus its match IDs.

    Args:
        seriesId: ID used to build ``https://www.rib.gg/series/{seriesId}``.

    Returns:
        tuple: ``(stats_data_raw, match_ids)`` where ``stats_data_raw`` is the
        decoded page payload and ``match_ids`` is the list of ``id`` values of
        ``props.pageProps.series.matches``. ``(None, None)`` is returned when
        the page cannot be downloaded or does not have the expected structure.
    """
    print(f"Fetching series header data for series ID: {seriesId}")
    url = f'https://www.rib.gg/series/{seriesId}'
    try:
        # requests waits indefinitely unless a timeout is given; a timeout
        # raises a RequestException subclass, handled below.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f'Failed trying to scrape series header: {e}')
        return None, None

    try:
        stats_data_raw = extract_json_data(response.text)
        matches = stats_data_raw['props']['pageProps']['series']['matches']
        match_ids = [match['id'] for match in matches]
    except (ValueError, KeyError, TypeError) as e:
        # ValueError covers json.JSONDecodeError; KeyError/TypeError cover a
        # payload with missing or null sections. Report it the same way as a
        # download failure so one bad page does not abort a whole scrape.
        print(f'Failed to parse series header for series ID {seriesId}: {e}')
        return None, None
    return stats_data_raw, match_ids

Function to update IGN and ID data

In [26]:
def update_ign_and_id(stats_data):
    """Merge the player IDs and in-game names of a series into ``ignfile.csv``.

    Every ``player`` entry under ``props.pageProps.series.matches[*].players``
    contributes one ``(id, name)`` row, where ``name`` is the player's ``ign``.
    The rows are combined with those already in the CSV (if it exists and is
    not empty), exact duplicate pairs are dropped, and the file is rewritten.

    Args:
        stats_data: decoded series page payload.
    """
    print("Updating IGN and ID data")
    csv_file = 'ignfile.csv'
    dedupe_cols = ['id', 'name']

    series_matches = stats_data['props']['pageProps']['series']['matches']
    player_records = [
        entry['player']
        for match in series_matches
        for entry in match['players']
    ]
    new_rows = pd.DataFrame({
        'id': [player['id'] for player in player_records],
        'name': [player['ign'] for player in player_records],
    }).drop_duplicates(subset=dedupe_cols)

    # Only read the CSV when it exists and has content.
    has_saved_rows = os.path.exists(csv_file) and os.path.getsize(csv_file) > 0
    saved_rows = pd.read_csv(csv_file) if has_saved_rows else pd.DataFrame()

    merged = pd.concat([saved_rows, new_rows], ignore_index=True)
    merged = merged.drop_duplicates(subset=dedupe_cols)
    try:
        merged.to_csv(csv_file, index=False)
        print(f"IGN and ID data saved to {csv_file}")
    except Exception as e:
        print(f"Failed to save IGN and ID data: {e}")

Function to update team data

In [27]:
def update_team(stats_data):
    """Merge the two teams of a series into ``teamfile.csv``.

    The ``id``, ``name`` and ``shortName`` fields of
    ``props.pageProps.series.team1`` and ``team2`` are combined with the rows
    already in the CSV (if it exists and is not empty), exact duplicates are
    dropped, and the file is rewritten.

    Args:
        stats_data: decoded series page payload.
    """
    print("Updating team data")
    csv_file = 'teamfile.csv'
    keys = ['id', 'name', 'shortName']

    series = stats_data['props']['pageProps']['series']
    # One single-row frame per team, restricted to the columns in `keys`.
    team_frames = [
        pd.DataFrame([series[slot]], columns=keys)
        for slot in ('team1', 'team2')
    ]

    csv_has_rows = os.path.exists(csv_file) and os.path.getsize(csv_file) > 0
    saved_teams = pd.read_csv(csv_file) if csv_has_rows else pd.DataFrame()

    all_teams = pd.concat([saved_teams, *team_frames], ignore_index=True)
    all_teams = all_teams.drop_duplicates(subset=keys)
    try:
        all_teams.to_csv(csv_file, index=False)
        print(f"Team data saved to {csv_file}")
    except Exception as e:
        print(f"Failed to save team data: {e}")

Function to update abilities data

In [28]:
def update_abilities(stats_data):
    """Merge the ability catalogue of a page payload into ``abilities.csv``.

    For every entry of ``props.pageProps.content.abilities`` the fields ``id``,
    ``name``, ``type``, ``agentId`` and ``damages`` are collected, combined with
    the rows already in the CSV (if it exists and is not empty), de-duplicated
    on all five columns, and written back.

    List or dict field values are stored as JSON text. pandas cannot
    de-duplicate unhashable values (``drop_duplicates`` raises ``TypeError``),
    and JSON text compares equal to itself after a CSV round trip, so repeated
    runs do not add the same ability again.
    NOTE(review): ``damages`` is assumed to be the field that can be nested;
    confirm against the actual payload.

    Args:
        stats_data: decoded page payload.
    """
    print("Updating abilities data")
    csv_file = 'abilities.csv'
    columns = ['id', 'name', 'type', 'agentId', 'damages']
    abilities = stats_data['props']['pageProps']['content']['abilities']

    def _csv_safe(value):
        # Scalars pass through unchanged; containers become a stable string.
        if isinstance(value, (list, dict)):
            return json.dumps(value, sort_keys=True)
        return value

    df = pd.DataFrame({
        column: [_csv_safe(ability[column]) for ability in abilities]
        for column in columns
    })
    if os.path.exists(csv_file) and os.path.getsize(csv_file) > 0:
        existing_df = pd.read_csv(csv_file)
    else:
        existing_df = pd.DataFrame()
    combined_df = pd.concat([existing_df, df], ignore_index=True).drop_duplicates(subset=columns)
    try:
        combined_df.to_csv(csv_file, index=False)
        print(f"Abilities data saved to {csv_file}")
    except Exception as e:
        print(f"Failed to save abilities data: {e}")

Function to collect the series IDs from a bracketJson, handling each bracket type (weekly, double, single, group)

In [29]:
def process_bracket_json(bracketJson, bracket_title):
    """Collect the series IDs referenced by a bracket description.

    Supported values of ``bracketJson['type']``:

    * ``'weekly'`` - ``id`` of every series in ``weekly.weeks[*].series``
    * ``'double'`` - ``seriesId`` of every seed in the ``winners`` and
      ``losers`` rounds
    * ``'single'`` - ``seriesId`` of every seed in the ``winners`` rounds
    * ``'group'``  - ``id`` of every seed in ``groups[*].seeds``

    Any other type prints a message and yields no IDs. Seeds whose ID is
    ``None`` (missing ``id`` in a group seed, or a null ``seriesId``) are
    skipped so callers never try to scrape a series called ``None``.

    Args:
        bracketJson: dict describing the bracket; must contain ``'type'``.
        bracket_title: not used by this function; kept so existing calls
            keep working.

    Returns:
        tuple: ``(bracket_type, series_ids)`` with the IDs in bracket order.
    """
    bracket_type = bracketJson['type']

    if bracket_type == 'weekly':
        candidates = [
            series['id']
            for week in bracketJson['weekly']['weeks']
            for series in week['series']
        ]
    elif bracket_type in ('double', 'single'):
        # A single-elimination bracket only has the winners side.
        sections = ('winners', 'losers') if bracket_type == 'double' else ('winners',)
        candidates = [
            seed['seriesId']
            for section in sections
            for round_data in bracketJson[section]
            for seed in round_data['seeds']
        ]
    elif bracket_type == 'group':
        candidates = [
            seed.get('id')
            for group in bracketJson.get('groups', [])
            for seed in group.get('seeds', [])
        ]
    else:
        print(f"Unhandled bracket type: {bracket_type}")
        candidates = []

    series_ids = [series_id for series_id in candidates if series_id is not None]
    return bracket_type, series_ids

Function to scrape tournament data

In [30]:
def scrapeTourney(tourney_url):
    """Scrape every series of every child event of a tournament page.

    For each child event that has a ``bracketJson``, the series IDs are taken
    from ``process_bracket_json``. Each series not yet recorded in
    ``scraped_series_ids`` is handled as follows:

    1. its header page is fetched with ``SeriesHeader``;
    2. the abilities, player and team CSVs are updated;
    3. every match's details JSON is saved under
       ``./<event title>/<bracket type>/<series id>/``;
    4. the series ID is recorded and the ID file is rewritten.

    A series whose header cannot be fetched is not recorded, so a later run
    retries it. Failures on individual matches are printed and skipped.

    Args:
        tourney_url: URL of the tournament (event) page.

    Returns:
        None. Results are written to disk; problems are reported with print.
    """
    request_timeout = 30  # seconds; without it requests can wait forever
    print(f"Scraping tournament data from URL: {tourney_url}")
    try:
        response = requests.get(tourney_url, timeout=request_timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f'Failed trying to scrape tournament data: {e}')
        return

    try:
        stats_data_raw = extract_json_data(response.text)
        child_events = stats_data_raw['props']['pageProps']['event']['childEvents'] or []
    except (ValueError, KeyError, TypeError) as e:
        # ValueError covers json.JSONDecodeError; the others cover a payload
        # with missing or null sections.
        print(f'Failed to parse tournament page: {e}')
        return

    # IDs loaded from the text file are strings, while IDs taken from the
    # bracket JSON keep whatever type the JSON used, so compare string forms.
    already_scraped = {str(recorded_id) for recorded_id in scraped_series_ids}

    for event in child_events:
        event_title = sanitize_filename(event.get('name') or 'unknown_event')
        bracketJson = event.get('bracketJson', {})

        if not bracketJson:
            print(f"No bracketJson found for event: {event_title}")
            continue
        bracket_type, series_ids = process_bracket_json(bracketJson, event_title)
        for series_id in series_ids:
            if series_id is None:
                # A bracket slot without a series has nothing to fetch.
                continue
            if str(series_id) in already_scraped:
                print(f"Series ID {series_id} already scraped. Skipping...")
                continue

            header_for_extra_data, match_ids = SeriesHeader(series_id)
            if header_for_extra_data is None:
                # Leave the series unrecorded so it is retried next time.
                continue

            bracket_folder = os.path.abspath(f'./{event_title}/{bracket_type}/{series_id}')
            os.makedirs(bracket_folder, exist_ok=True)
            print(f"Created directory for series ID {series_id} at {bracket_folder}")

            update_abilities(header_for_extra_data)
            update_ign_and_id(header_for_extra_data)
            update_team(header_for_extra_data)
            for match_id in match_ids:
                print(f"Fetching details for match ID: {match_id}")
                try:
                    response = requests.get(
                        f'https://be-prod.rib.gg/v1/matches/{match_id}/details',
                        timeout=request_timeout,
                    )
                    response.raise_for_status()
                    details = response.json()
                    with open(f'{bracket_folder}/{match_id}_details.json', 'w') as json_file:
                        json.dump(details, json_file)
                    print(f"Details for match ID {match_id} saved to {bracket_folder}")
                except requests.RequestException as e:
                    print(f'Failed to fetch details for match ID {match_id}: {e}')
                except json.JSONDecodeError:
                    print(f"Failed to decode JSON for match ID {match_id}")
                # Pause between match requests.
                time.sleep(3)

            already_scraped.add(str(series_id))
            scraped_series_ids.append(str(series_id))
            save_scraped_series_ids(scraped_series_ids)

In [31]:
def scrapeSeries(series_url):
    """Scrape a single series given its page URL.

    The series ID is the last path segment of ``series_url``; a trailing
    slash, query string or fragment is ignored. If the ID is already recorded
    in ``scraped_series_ids`` nothing is done. Otherwise:

    1. the header page is fetched with ``SeriesHeader``;
    2. the abilities, player and team CSVs are updated;
    3. every match's details JSON is saved under ``./<series id>/``;
    4. the series ID is recorded and the ID file is rewritten.

    Nothing is recorded when the header cannot be fetched. Failures on
    individual matches are printed and skipped.

    Args:
        series_url: URL of the series page, e.g.
            ``https://www.rib.gg/series/52003``.

    Returns:
        None. Results are written to disk; problems are reported with print.
    """
    request_timeout = 30  # seconds; without it requests can wait forever

    # Strip "#fragment" and "?query" so they do not end up in the ID.
    url_path = series_url.split('#', 1)[0].split('?', 1)[0]
    series_id = url_path.rstrip('/').split('/')[-1]

    # Recorded IDs may not all be strings (IDs appended during this session
    # keep the type they were appended with), so compare string forms.
    if series_id in {str(recorded_id) for recorded_id in scraped_series_ids}:
        print(f"Series ID {series_id} already scraped. Skipping...")
        return

    print(f"Scraping data for series ID: {series_id}")
    header_for_extra_data, match_ids = SeriesHeader(series_id)
    if header_for_extra_data is None:
        return

    bracket_folder = os.path.abspath(f'./{series_id}')
    os.makedirs(bracket_folder, exist_ok=True)
    print(f"Created directory for series ID {series_id} at {bracket_folder}")
    update_abilities(header_for_extra_data)
    update_ign_and_id(header_for_extra_data)
    update_team(header_for_extra_data)
    for match_id in match_ids:
        print(f"Fetching details for match ID: {match_id}")
        try:
            response = requests.get(
                f'https://be-prod.rib.gg/v1/matches/{match_id}/details',
                timeout=request_timeout,
            )
            response.raise_for_status()
            details = response.json()
            with open(f'{bracket_folder}/{match_id}_details.json', 'w') as json_file:
                json.dump(details, json_file)
            print(f"Details for match ID {match_id} saved to {bracket_folder}")
        except requests.RequestException as e:
            print(f'Failed to fetch details for match ID {match_id}: {e}')
        except json.JSONDecodeError:
            print(f"Failed to decode JSON for match ID {match_id}")
        # Pause between match requests.
        time.sleep(3)
    scraped_series_ids.append(series_id)
    save_scraped_series_ids(scraped_series_ids)

Uncomment to run scraping

In [32]:
# Entry points are left commented out so that running all cells does not start
# network requests (each match download is followed by a 3-second sleep).
# NOTE(review): confirm series_url is defined in an executed code cell before
# uncommenting the second line.
# scrapeTourney(tourney_url)
# scrapeSeries(series_url)