In [1]:
from bs4 import BeautifulSoup

# read algs.html 
# Decode explicitly instead of relying on the platform's locale encoding,
# which differs between machines and can mis-decode or fail on the same file.
# NOTE(review): assumes algs.html is UTF-8 encoded -- confirm against the source page.
with open("algs.html", encoding="utf-8") as f:
    html = f.read()

soup = BeautifulSoup(html, "html.parser")

# Collect every element carrying the CSS class "singlealgorithm"
elements = soup.find_all(class_="singlealgorithm")

In [8]:
import re
#data-subgroup="Lightning Shapes" data-alg="OLL 8"
# read a > data-alg-filter prop
# read div > .setup-case and remove a div which is inside it 
# readAll .formatted-alg
def get_alg_data(element):
    """Extract one algorithm record from a parsed element.

    Args:
        element: A parsed tag that supports attribute lookup by subscription
            (``element["data-alg"]``) and ``find`` / ``find_all`` by class.

    Returns:
        dict with four keys, in this order:
            ``name``     -- value of the ``data-alg`` attribute,
            ``setup``    -- text of the first ``setup-case`` descendant with
                            the ``"setup:"`` label removed, or ``""`` when no
                            such descendant exists,
            ``subgroup`` -- value of the ``data-subgroup`` attribute,
            ``algs``     -- text of every ``formatted-alg`` descendant.
        Every string has runs of whitespace collapsed to a single space and
        leading/trailing whitespace stripped.

    Raises:
        Whatever the element raises when the ``data-alg`` or ``data-subgroup``
        attribute is missing (a plain mapping-style lookup is used).
    """

    def _squash(text):
        # Collapse any run of whitespace (spaces, tabs, newlines) into one space.
        return re.sub(r"\s+", " ", text).strip()

    # find() yields None when nothing matches; guard so one element without a
    # setup block does not abort the extraction of all the others.
    setup_node = element.find(class_="setup-case")
    if setup_node is None:
        setup_text = ""
    else:
        setup_text = setup_node.text.replace("setup:", "")

    return {
        "name": _squash(element["data-alg"]),
        "setup": _squash(setup_text),
        "subgroup": _squash(element["data-subgroup"]),
        "algs": [
            _squash(alg.text)
            for alg in element.find_all(class_="formatted-alg")
        ],
    }

# Run the extractor over every matched element; the bare name on the last
# line makes the notebook display the resulting list.
algs = list(map(get_alg_data, elements))
algs

[{'name': 'OLL 1',
  'setup': "F R' F' R U2' F R' F' R2' U2' R'",
  'subgroup': 'Dot Case',
  'algs': ["R U2 R2 F R F' U2 R' F R F'",
   "y R U' R2 D' r U' r' D R2 U R'",
   "f R U R' U' R f' U' r' U' R U M'",
   "R' U' F R' F' R2 U R f' U' f"]},
 {'name': 'OLL 2',
  'setup': "f U R U' R' f' F U R U' R' F'",
  'subgroup': 'Dot Case',
  'algs': ["y' R U' R2 D' r U r' D R2 U R'",
   "F R U R' U' S R U R' U' f'",
   "F R U R' U' F' f R U R' U' f'",
   "y r U r' U2 R U2 R' U2 r U' r'"]},
 {'name': 'OLL 3',
  'setup': "F U R U' R' F' U f U R U' R' f' y",
  'subgroup': 'Dot Case',
  'algs': ["y R' F2 R2 U2 R' F R U2 R2 F2 R",
   "y' f R U R' U' f' U' F R U R' U' F'",
   "r' R2 U R' U r U2 r' U M'",
   "M R U R' U r U2 r' U M'"]},
 {'name': 'OLL 4',
  'setup': "F U R U' R' F' U' f U R U' R' f' y",
  'subgroup': 'Dot Case',
  'algs': ["y' R' F2 R2 U2 R' F' R U2 R2 F2 R",
   "y' f R U R' U' f' U F R U R' U' F'",
   "R' F R F' U' S R' U' R U R S'",
   "y F U R U' R' F' U' F R U R' U' F'"]},
 {'n

In [9]:
# Save the extracted algorithms as JSON
import json 
with open("algs.json", "w") as f:
    # Render the list to indented JSON text, then write it out in one call.
    f.write(json.dumps(algs, indent=2))