In [1]:
import requests
from bs4 import BeautifulSoup
import re
from typing import Union

In [2]:
def parse_int(original: str) -> Union[float, str]:
    """Return the first number embedded in ``original`` as a float.

    A number is a run of digits, optionally with one decimal point that has
    digits on both sides. Signs are not part of the pattern, so ``'-42'``
    yields ``42.0``.

    Args:
        original: Text that may contain a number.

    Returns:
        The first number found, converted to ``float``; or ``original``
        itself, unchanged, when the text contains no digits.
    """
    first_number = re.search(r'\d+\.?\d+|\d+', original)
    if first_number is None:
        return original
    return float(first_number.group(0))

In [3]:
def process_value(value: str) -> str:
    """Clean up the raw text of a scraped value.

    Steps, in order:
      1. Trim surrounding whitespace and bullet characters.
      2. Drop parentheses.
      3. Expand ``' mag'`` to ``' Magazine'`` and ``' multiplier'`` to
         ``' Multiplier'``.
      4. Expand a standalone ``' m'`` unit to ``' meters '``.
      5. Remove a percentage token that follows three whitespace characters.

    Args:
        value: Raw text taken from the page.

    Returns:
        The cleaned text. When the unit expansion applies at the end of the
        string the result keeps the trailing space of ``' meters '``.
    """
    result = value.strip('\n \t  ●○')
    result = result.replace('(', '').replace(')', '')
    result = result.replace(' mag', ' Magazine')
    result = result.replace(' multiplier', ' Multiplier ').strip(' ')
    # Require a word boundary after the 'm': a plain substring replace would
    # also rewrite the start of any lowercase word beginning with 'm'
    # (' max' -> ' meters ax').
    result = re.sub(r' m\b', ' meters ', result)
    result = re.sub(r'   \S+%', '', result)
    return result

In [4]:
def _extract_section_fields(section) -> dict[str, str]:
    """Collect the ``label -> value`` pairs found in one ``<section>`` tag.

    Every ``<div>`` under the section that contains an ``<h3>`` is treated as
    a labelled row. Label and value are stored together, so a row whose value
    cannot be read is skipped as a whole instead of shifting later values
    onto the wrong labels.
    """
    fields = {}
    for elem in section.find_all('div'):
        label = elem.find('h3')
        if label is None:
            # Not a labelled row (e.g. a nested value container).
            continue

        link = elem.find('a')
        link_title = link.get('title') if link is not None else None
        if elem.find('img') is not None and link_title in ('Polarity', 'Exilus Weapon Adapter'):
            # Polarities case: the value is carried by the images, so keep
            # the first word of each image's alt text, space-terminated.
            value = ''.join(
                img['alt'].split(' ')[0] + ' '
                for img in elem.find_all('img')
                if img.get('alt') is not None
            )
        else:
            # Normal case: the value is the text of the first nested div.
            value_div = elem.find('div')
            if value_div is None:
                continue
            value = process_value(value_div.text)

        fields[label.text] = value
    return fields


def scrape_page(url: str, timeout: float = 30.0) -> dict[str, Union[str, dict[str, str]]]:
    """Download one page and extract its statistics and section tables.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        A single-entry dict ``{title: sections}`` where ``title`` is the text
        of the page's title ``<h2>`` and ``sections`` maps ``'Statistics'``
        and each section heading to a ``{label: value}`` dict.

    Raises:
        requests.HTTPError: The server answered with an error status.
        IndexError: Fewer than five ``pi-data-value pi-font`` divs were found.
        AttributeError: The page has no title ``<h2>``/``<span>``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on 4xx/5xx rather than trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # The first matching div is skipped; the next four are read positionally.
    statistics = soup.find_all('div', class_='pi-data-value pi-font')[1:5]
    result = {
        'Statistics': {
            'Mastery': statistics[0].text,
            'Slot': statistics[1].text,
            'Type': statistics[2].text,
            'Trigger Type': statistics[3].text
        }
    }

    all_sections = soup.find_all('section', class_='pi-item pi-group pi-border-color pi-collapse pi-collapse-open')
    for section in all_sections:
        header = section.find('h2')
        fields = _extract_section_fields(section)
        # Sections without a heading or without any readable row add nothing.
        if header is not None and fields:
            result[header.text] = fields

    return {soup.find('h2', class_='pi-item pi-item-spacing pi-title').span.text: result}

In [5]:
# Text-mode handle on the URL list; it is consumed by the scraping loop and
# closed explicitly in a later cell.
f = open(file='urls.txt', mode='r')

In [6]:
# Scrape every URL read from `f` (one per line) into a single mapping,
# printing the line index every 50 lines as a progress indicator.
data = {}
for index, line in enumerate(f):
    # Strip all surrounding whitespace so '\r\n' endings and stray spaces do
    # not end up inside the URL.
    url = line.strip()
    if not url:
        # Blank lines (such as a trailing one at end of file) are not URLs.
        continue
    # Merge in place instead of building a new dict for every page.
    data.update(scrape_page(url))
    if index % 50 == 0: print(index)

0
50
100
150
200
250
300
350
400
450
500
550
600


In [7]:
# Release the handle on the URL list opened in an earlier cell.
f.close()

In [8]:
import json
# Write the scraped mapping to disk as JSON, replacing any existing file.
with open('weapons.json', 'w') as out_file:
    json.dump(data, fp=out_file)