### Setup

In [1]:
from concurrent.futures import ThreadPoolExecutor
import requests
import gzip
import tqdm
import os

In [2]:
# Number of download workers. Kept at 1 because multithreading is technically
# not allowed based on Tenhou's use terms.
THREAD_TOTAL = 1

# Output locations: downloaded replay pages go into the raw dataset directory,
# and the extracted game-id list is stored inside that same directory.
DATASET_RAW_HTML_PATH = "dataset/raw"
ID_LIST_PATH = f"{DATASET_RAW_HTML_PATH}/id_list.txt"

# Game-mode labels (as written to the id list) whose replays get downloaded.
GAME_MODES = [
    "四鳳東喰赤－",
    "四鳳東喰赤速",
]

# URL template for a single replay page; `{log}` is filled with the game id.
LOG_HTML_FORMAT = "https://tenhou.net/0/log/?{log}"

# Request headers sent with every HTTP call made by this notebook.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
}

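Tenhou raw-log file name prefixes:<br>
sca=個室 (private rooms)<br>
scb=段位戦 (ranked matches)<br>
scc=鳳凰卓(牌譜あり) (Phoenix tables, replays available)<br>
scd=雀荘戦 (parlor matches)<br>
sce=技能戦+琥珀卓(牌譜あり) (skill matches + Amber tables, replays available)<br>
<br>
File index (archive URL = "https://tenhou.net/sc/raw/dat/" + list[i].file)<br>
https://tenhou.net/sc/raw/list.cgi (last 7 days)<br>
https://tenhou.net/sc/raw/list.cgi?old (old archives)<br>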

### Extracting game IDs

In [None]:
def _scc_archive_urls(index_text, prefix_slice):
    """Return download URLs for the "scc" archives listed in an index response.

    Args:
        index_text: Body of a `list.cgi` response.
        prefix_slice: Slice of the file name that must equal "scc". The old
            index is checked at [5:8] and the recent index at [0:3].

    Returns:
        List of "https://tenhou.net/sc/raw/dat/<file>" URLs, in index order.
    """
    # Drop the first line and the last two pieces of the CRLF-split body; only
    # the lines in between are treated as file entries.
    entries = index_text.split("\r\n")[1:-2]
    # The file name starts at offset 7 of each entry and runs up to the next
    # single quote (presumably the entry begins with `{file:'` -- TODO confirm).
    file_names = (entry[7:].split("'")[0] for entry in entries)
    return [
        f"https://tenhou.net/sc/raw/dat/{file_name}"
        for file_name in file_names
        if file_name[prefix_slice] == "scc"
    ]


url_old = "https://tenhou.net/sc/raw/list.cgi?old"
url_new = "https://tenhou.net/sc/raw/list.cgi"
response_old = requests.get(url_old, headers=HEADERS)
response_old.raise_for_status()  # fail loudly instead of parsing an error page
response_new = requests.get(url_new, headers=HEADERS)
response_new.raise_for_status()

time_log_old = _scc_archive_urls(response_old.text, slice(5, 8))
time_log_new = _scc_archive_urls(response_new.text, slice(0, 3))
time_log = time_log_old + time_log_new

os.makedirs(os.path.dirname(ID_LIST_PATH), exist_ok=True)
# NOTE: the id list is written with the platform default text encoding, so
# whatever reads ID_LIST_PATH back must use the same default.
with open(ID_LIST_PATH, "w") as f:
    # Iterating over tqdm advances the bar once per archive and closes it when
    # the loop ends.
    for time_log_hour in tqdm.tqdm(time_log, desc="Downloading game ids"):
        response = requests.get(time_log_hour, headers=HEADERS)
        response.raise_for_status()
        archive_text = gzip.decompress(response.content).decode("utf-8")
        # The piece after the final newline is discarded, as before.
        for log_line in archive_text.split("\n")[:-1]:
            # maxsplit=4 keeps the last field in one piece even if it contains
            # " | " itself, so the 5-way unpack cannot fail on such a line.
            _, _, game_mode, game_log, _ = log_line.split(" | ", 4)
            # The id is the text after the first "=" inside the first
            # double-quoted value of the field.
            game_log = game_log.split('"')[1].split("=")[1]
            f.write(f"{game_mode} {game_log}\n")

### Downloading game data

In [5]:
os.makedirs(DATASET_RAW_HTML_PATH, exist_ok=True)
# NOTE: read with the platform default text encoding, matching how the id list
# is written by the extraction cell.
with open(ID_LIST_PATH, "r") as f:
    # Each line is "<game_mode> <game_id>"; keep only the wanted game modes.
    # rstrip("\n") (rather than chopping the last character) also handles a
    # final line that has no trailing newline.
    game_list = [
        entry
        for entry in (line.rstrip("\n").split(" ") for line in f)
        if entry[0] in GAME_MODES
    ]

p_bar = tqdm.tqdm(total=len(game_list), desc="Downloading replays")


def dl_thread(thread_num):
    """Download replay pages until the shared `game_list` is empty.

    Pops (game_mode, game_id) entries off the end of `game_list`, fetches the
    replay page for each id and writes the response text to
    "<DATASET_RAW_HTML_PATH>/<game_id>.html", advancing the progress bar once
    per file.

    Args:
        thread_num: Worker number. Unused by the body; kept so the existing
            call signature stays the same.
    """
    while True:
        # Pop-and-catch instead of "check, then pop": with several workers the
        # list can be emptied between the check and the pop, which would raise
        # IndexError inside the worker.
        try:
            _game_mode, game_id = game_list.pop()
        except IndexError:
            return
        response = requests.get(LOG_HTML_FORMAT.format(log=game_id), headers=HEADERS)
        with open(f"{DATASET_RAW_HTML_PATH}/{game_id}.html", "w") as html_file:
            html_file.write(response.text)
        p_bar.update(1)


try:
    # Size the pool from the configured worker count (at least one worker so
    # the executor can always be constructed). Leaving the `with` block waits
    # for all workers to finish.
    with ThreadPoolExecutor(max_workers=max(1, THREAD_TOTAL)) as pool:
        futures = [
            pool.submit(dl_thread, thread_num)
            for thread_num in range(1, THREAD_TOTAL + 1)
        ]
    # Re-raise any exception from a worker; otherwise a failed download would
    # be swallowed silently and the cell would appear to succeed.
    for future in futures:
        future.result()
finally:
    p_bar.close()

Downloading replays: 100%|██████████████████████████████████████████████████████| 35544/35544 [18:01<00:00, 43.03it/s]

### TEST

In [None]:
from mahjong.tile import TilesConverter

In [None]:
# Smoke test for the converter: build the array form of a hand given as
# per-suit digit strings (presumably one count per tile kind -- TODO confirm).
print(
    TilesConverter.string_to_34_array(
        man="789",
        pin="456",
        sou="123",
        honors="123",
    )
)

In [None]:
from mahjong.shanten import Shanten

# Hand under test, given as per-suit digit strings and converted to array form.
tiles = TilesConverter.string_to_34_array(
    man="123567",
    pin="123456",
    sou="44",
)

# Compute the shanten value of that hand and show it.
shanten = Shanten()
result = shanten.calculate_shanten(tiles)
print(result)