### Author: Paula Abigail Tam
### Project: GBF character dataset

This project scrapes character data from the tier list on the GBF wiki (gbf.wiki). The end goal is to build a visualizer that makes character statistics more digestible.

Example: comparing the number of Dark SSR characters with the number of Light SSR characters.

In [None]:
#imports2
import requests
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selectolax.parser import HTMLParser
from IPython.display import display

In [None]:
#selenium is used because gbfwiki serves the page dynamically
#path of the locally installed Chrome that the driver should launch
CHROME_BINARY = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"

options = webdriver.ChromeOptions()
options.binary_location = CHROME_BINARY
#run Chrome without opening a window
options.add_argument('--headless=new')

In [None]:
#character data url
URL = "https://gbf.wiki/Character_Tier_List/Gamewith/Ratings"

#start Chrome through a driver binary resolved by webdriver_manager, then load the page
chrome_service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)
driver.get(URL)

In [None]:
#parse the page currently loaded in the selenium driver with selectolax
html = driver.page_source
tree = HTMLParser(html)

body = tree.body
#first table with class "tierlist-details"; raises IndexError if the page has none
table = body.css('table.tierlist-details')[0]
#every descendant of the table's first child that carries a data-short-id attribute
#(presumably one element per character row -- confirm against the page markup)
tr = table.child.css('[data-short-id]')

In [None]:
#function to determine rarity of character
_RARITY_BY_DIGIT = {"4": "SSR", "3": "SR", "2": "R"}


def chara_rarity(id_num):
    """Map the leading digit of a character id (as a string) to its rarity.

    Returns "SSR", "SR" or "R"; any other value yields None.
    """
    return _RARITY_BY_DIGIT.get(id_num)

In [None]:
#function to determine a character's element
_ELEMENT_BY_CODE = {
    "01": "Fire",
    "02": "Water",
    "03": "Earth",
    "04": "Wind",
    "05": "Light",
    "06": "Dark",
}


def which_element(element_n):
    """Translate a two-character element code into the element's name.

    Codes "01" through "06" map to Fire, Water, Earth, Wind, Light and Dark;
    every other value is reported as "Any".
    """
    return _ELEMENT_BY_CODE.get(element_n, "Any")

In [None]:
#function to split a comma-separated attribute into its individual entries
def is_multiple(string):
    """Split a comma-separated attribute value into a list of entries.

    string: the raw attribute value, or None when the attribute lookup
        found nothing.

    Returns the list produced by splitting on ",". A single entry (or an
    empty string) gives a one-element list. None is treated like an empty
    value and gives [""], so callers can always index element 0.
    """
    if string is None:
        #a missing attribute comes back from .get() as None; avoid None.split
        return [""]
    return string.split(",")

In [None]:
#function to rename series
_SERIES_LABELS = {
    "": "-",
    "none": "Permanent",
    "tie-in": "Tie-In / Collab",
    "12generals": "Zodiac",
    "evokers": "Evoker",
    "eternals": "Eternal",
}


def which_series(series_name):
    """Return the display label for a raw series name.

    Names with a dedicated label are looked up in the table above; any
    other name is returned capitalized.
    """
    label = _SERIES_LABELS.get(series_name)
    if label is not None:
        return label
    return series_name.capitalize()

In [None]:
#function to get rating of character
def get_rating(item):
    """Return the text of the third sibling after item's first child node."""
    node = item.child
    #step over three siblings to reach the node whose text is the rating
    for _ in range(3):
        node = node.next
    return node.text()

In [None]:
#function to get name of character
def access_name(item):
    """Return the 'title' attribute of the first child of item's second child node.

    Gives None when that node has no 'title' attribute.
    """
    second_node = item.child.next
    link = second_node.child
    return link.attributes.get('title')

In [None]:
#function to get url of character
def access_details(item):
    """Return the full gbf.wiki URL built from the row's link 'href' attribute.

    The href is read from the first child of item's second child node and
    appended to the site root.
    """
    link = item.child.next.child
    return "https://gbf.wiki" + link.attributes.get('href')

In [None]:
#build one record per tier-list row
#the order in which keys are added decides the DataFrame column order later on
def _first_and_second(values, rename=str.capitalize):
    """Return (first, second) display values; second is "-" when there is only one entry."""
    first = rename(values[0])
    second = rename(values[1]) if len(values) > 1 else "-"
    return first, second


list_of_charas = []

try:
    for i in tr:
        item = i.attributes
        chara_id = item.get('data-short-id')
        series = is_multiple(item.get('data-filter-series'))
        wep = is_multiple(item.get('data-filter-weapon'))
        race = is_multiple(item.get('data-filter-race'))

        chara_info = {}
        chara_info['ID'] = chara_id
        chara_info['Rating'] = get_rating(i)
        #first character of the id encodes the rarity
        chara_info['Rarity'] = chara_rarity(chara_id[0])
        #element (need to do more detailed)
        chara_info['Name'] = access_name(i)

        #seasonal/grand/etc.; only the first two entries of each list are kept
        chara_info['Series'], chara_info['2nd Series'] = _first_and_second(series, which_series)
        chara_info['Weapon'], chara_info['2nd Weapon'] = _first_and_second(wep)
        chara_info['Race'], chara_info['2nd Race'] = _first_and_second(race)

        chara_info['Type'] = item.get('data-filter-style').capitalize()
        chara_info['URL'] = access_details(i)

        list_of_charas.append(chara_info)
finally:
    #always close the browser, even when a row fails to parse
    driver.quit()

In [None]:
#one row per scraped character, columns in record key order
df = pd.DataFrame(data=list_of_charas)

In [None]:
#plain list of every character page URL, in row order
url_list = df["URL"].to_list()

In [None]:
#quick visual check of the collected character page URLs
print(url_list)

In [None]:
#function to return only number
def get_number(item):
    """Return the first run of digits in the text of item's first <td>.

    item: a node exposing .css('td'); only the first matching cell is read.

    Returns the digits as a string (not converted to int).
    Raises IndexError when there is no <td> or the cell contains no digit.
    """
    res = item.css('td')[0].text()
    #raw string: "\d" in a normal string literal is an invalid escape sequence
    ans = re.findall(r"\d+", res)[0]
    return ans

In [None]:
def get_element(item, stats):
    """Return the element name read from the "Element" row of a stats table.

    item: the table's header cells; each must provide .text().
    stats: the table's rows. The position of the "Element" header within
        item is used as the row index, which assumes headers and rows line
        up one-to-one -- TODO confirm against the page markup.

    Returns the result of which_element() applied to the text of the first
    <span> inside the row's first <td>.
    """
    #use the first matching header only: adding indices together would point
    #at an unrelated row whenever "Element" shows up more than once
    #n stays 0 when no header matches, keeping the previous fallback row
    n = next((idx for idx, cell in enumerate(item) if cell.text() == "Element"), 0)

    ele = stats[n].css('td')[0].css_first('span').text()
    return which_element(ele)

In [None]:
#visit every character page and collect the values from its Stats table
detail_list = []

#one browser session is reused for all pages instead of launching Chrome per URL
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
try:
    for url in url_list:
        driver.get(url)
        test_html = driver.page_source
        test_tree = HTMLParser(test_html)

        test_body = test_tree.body
        #first element tagged data-title="Stats"; IndexError if the page has none
        test_table = test_body.css('[data-title="Stats"]')[0]
        ele_span = test_table.css('th')
        stats_list = test_table.css('tr')

        chara_info = {}

        #URL is kept so these rows can be joined back onto the tier-list rows
        chara_info['URL'] = url
        #rows 1 and 2 of the table are read as HP and ATK
        chara_info['HP'] = get_number(stats_list[1])
        chara_info['ATK'] = get_number(stats_list[2])
        chara_info['Element'] = get_element(ele_span, stats_list)
        chara_info['Gender'] = test_table.css('[data-icon]')[0].attributes.get('data-icon').capitalize()
        #first cell of the node just before the tbody's last child
        #(presumably the voice actor row -- confirm against the page markup)
        chara_info['VA'] = test_table.css_first('tbody').last_child.prev.css('td')[0].text()

        detail_list.append(chara_info)
finally:
    #always close the browser, even when a page fails to parse
    driver.quit()

In [None]:
#quick visual check of the per-character detail records
print(detail_list)

In [None]:
#one row per character page, columns in record key order
df2 = pd.DataFrame(data=detail_list)

In [None]:
#join tier-list rows with their page details; only URLs present in both are kept
df_merged = df.merge(df2, on='URL', how='inner')

In [None]:
#removed Gender as it got some wrong values
#rearranged columns to have Element after Rarity and URL at end
#columns are picked by name so a change in either record layout raises a
#KeyError instead of silently selecting the wrong positions
OUTPUT_COLUMNS = [
    'ID', 'Rating', 'Rarity', 'Element', 'Name',
    'Series', '2nd Series',
    'Weapon', '2nd Weapon',
    'Race', '2nd Race',
    'Type', 'HP', 'ATK', 'VA', 'URL',
]
df_reordered = df_merged[OUTPUT_COLUMNS]

In [None]:
#lift the row/column display limits only while rendering the full table
show_everything = pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
)
with show_everything:
    display(df_reordered)

In [None]:
#save the final table as CSV, leaving out the DataFrame index
output_csv = 'GBF_character_dataset_04_14_2025.csv'
df_reordered.to_csv(output_csv, index=False)