In [1]:
import datetime
import requests
from lxml import etree
import re
from dataclasses import dataclass
from typing import Optional, Union
import pickle

In [2]:
# Link `title` attributes of the form '<prefix>/<difficulty>/<song title>':
# group(1) captures one of the four difficulty names, group(2) the song title.
pattern_title = re.compile(r'^収録曲/(かんたん|ふつう|むずかしい|おに)/(.+)$')
# Date written with a four-digit year, e.g. '2023/5/3'.
pattern_date = re.compile(r'^\d{4}/\d{1,2}/\d{1,2}$')
# Date written with a two-digit year, e.g. '23/5/3'.
pattern_date_special = re.compile(r'^\d{2}/\d{1,2}/\d{1,2}$')

In [3]:
# Sanity check: '%y' reads a two-digit year, so '23/5/3' parses as 2023-05-03.
datetime.datetime.strptime('23/5/3', '%y/%m/%d').date()

datetime.date(2023, 5, 3)

In [4]:
@dataclass(frozen=True)
class Song:
    """Immutable record describing one scraped song."""

    # Song title extracted from the link's `title` attribute.
    title: str
    # Date attached to the song's table row; None when the row has no date.
    date: Union[datetime.date, None]
    # Text of the row's "work" cell (presumably the release the song
    # belongs to -- TODO confirm); None when the row has no such cell.
    work: Union[str, None]


def create_songs(line: etree._Element) -> list[Song]:
    """Build the `Song` records described by one table row.

    Titles come from the row's `<a>` elements whose `title` attribute matches
    `pattern_title`. Date and work come from the `<strong>` text of the row's
    `<td>` cells; which cell holds what depends on the number of child
    elements of the row:

    * 7 children: date and work are both left as None.
    * 9 children: the first cell is the work, the second is the date.
    * otherwise: the first cell is used as the date if it looks like one,
      else as the work.

    Args:
        line: a table row element (`<tr>`).

    Returns:
        One `Song` per title kept for the row. The list is empty when the row
        has no matching link or when its date text cannot be parsed.
    """
    matches = (
        pattern_title.match(a.attrib['title'])
        for a in line.xpath('.//a')
        if 'title' in a.attrib
    )
    # Links whose title does not match are skipped rather than crashing on
    # `None.group`. Deduplicating through a dict (not a set) keeps document
    # order, so the result does not vary with string hash randomisation.
    unique_titles = list(dict.fromkeys(m.group(2) for m in matches if m))
    if not unique_titles:
        # No song in this row; avoids indexing into possibly empty cells.
        return []

    # Ura charts are no longer treated as separate songs: keep only the
    # shortest title(s) found in the row.
    shortest = min(map(len, unique_titles))
    titles = [title for title in unique_titles if len(title) == shortest]

    if len(line) == 7:
        return [Song(title, None, None) for title in titles]

    # One string per cell; multiple <strong> fragments are joined with '/'.
    args = [
        '/'.join(td.xpath('.//strong/text()')) for td in line.xpath('.//td')
    ]
    try:
        if len(line) == 9:
            date = get_date(args[1])
            return [Song(title, date, args[0]) for title in titles]
        if pattern_date.match(args[0]) or pattern_date_special.match(args[0]):
            date = get_date(args[0])
            return [Song(title, date, None) for title in titles]
    except ValueError:
        # The date text could not be parsed: report the row and drop it.
        # NOTE(review): the row's songs are lost, and rows after it will
        # inherit date/work from the row before it in `interpolate` --
        # confirm this is intended.
        print(args)
        return []
    return [Song(title, None, args[0]) for title in titles]


def get_date(s: str) -> datetime.date:
    """Parse a slash-separated date string into a `datetime.date`.

    Args:
        s: date text such as '2023/5/3' or, with a two-digit year, '23/5/3'.

    Returns:
        The parsed calendar date (a `date`, not a `datetime`).

    Raises:
        ValueError: if `s` cannot be parsed with either format.
    """
    # Two-digit years need '%y'; everything else is parsed with '%Y'.
    fmt = '%y/%m/%d' if pattern_date_special.match(s) else '%Y/%m/%d'
    return datetime.datetime.strptime(s, fmt).date()


def interpolate(data: list[Song]) -> list[Song]:
    """Forward-fill missing `date` and `work` values.

    A song whose `date` or `work` is None takes that value from the song
    immediately before it in the already-filled result. The first song is
    kept as is, so its fields may remain None.

    Args:
        data: songs in source order.

    Returns:
        A new list with the same length and titles as `data`; an empty list
        when `data` is empty. `data` itself is not modified.
    """
    if not data:
        # An empty input yields an empty result instead of an IndexError.
        return []
    new_data = [data[0]]
    for song in data[1:]:
        previous = new_data[-1]
        date = previous.date if song.date is None else song.date
        work = previous.work if song.work is None else song.work
        new_data.append(Song(song.title, date, work))
    return new_data

In [5]:
def get_songs_from_release(url, min_rows=50, timeout=30):
    """Download one song-list page and return the songs found in it.

    Args:
        url: address of the page to scrape.
        min_rows: only tables with more than this many `tbody` rows are read.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The page's songs in document order, with missing date/work values
        filled in by `interpolate`; an empty list if no song was found.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page.
    response.raise_for_status()
    html = etree.HTML(response.text)

    songs = []
    for table in html.xpath('//table'):
        rows = table.xpath('./tbody/tr')
        if len(rows) <= min_rows:
            # Small tables are skipped.
            continue
        for row in rows:
            # Rows with at most one child element are skipped.
            if len(row) > 1:
                songs.extend(create_songs(row))
    # Guard the empty case so a page without songs returns [] cleanly.
    return interpolate(songs) if songs else []

In [6]:
# Pages to scrape on the taiko-fumen wiki. The paths are percent-encoded
# Japanese page names: the song list in order of first appearance, followed by
# its two continuation pages ("new cabinet onward" and "new cabinet 2 onward").
urls = [
    'https://wikiwiki.jp/taiko-fumen/%E5%8F%8E%E9%8C%B2%E6%9B%B2/%E5%88%9D%E5%87%BA%E9%A0%86',
    'https://wikiwiki.jp/taiko-fumen/%E5%8F%8E%E9%8C%B2%E6%9B%B2/%E5%88%9D%E5%87%BA%E9%A0%86/%E6%96%B0%E7%AD%90%E4%BD%93%E4%BB%A5%E9%99%8D',
    'https://wikiwiki.jp/taiko-fumen/%E5%8F%8E%E9%8C%B2%E6%9B%B2/%E5%88%9D%E5%87%BA%E9%A0%86/%E6%96%B0%E7%AD%90%E4%BD%932%E4%BB%A5%E9%99%8D',
]

In [7]:
# Scrape each page in order and flatten the per-page lists into one list.
songs = [song for url in urls for song in get_songs_from_release(url)]

['RT', '2004/11/中旬', '春夏秋冬ドンドコドン', '', '', '', '', '', '']


In [8]:
# Order songs chronologically. `Song.date` is Optional and comparing None
# with a date raises TypeError, so undated songs are grouped after the dated
# ones instead of aborting the sort. The sort is stable, so songs sharing a
# date keep their scraped order.
songs.sort(
    key=lambda song: (song.date is None, song.date or datetime.date.min)
)
# NOTE: pickle stores the `Song` class by reference, so code that loads
# songs.pkl needs a matching `Song` definition available under the same name.
with open('songs.pkl', 'wb') as f:
    pickle.dump(songs, f)