In [1]:
from pymongo import MongoClient
from lxml import html

import os
import json

# Accumulators filled by the loops below:
#   dataset   - one record per contest (what finally gets exported)
#   temp_dict - contest_id -> list of parsed task records
dataset = []
temp_dict = {}

# The connection string is read from the environment.
# NOTE(review): if ME_CONFIG_MONGODB_URL is unset this passes None to
# MongoClient - confirm that whatever fallback it applies is intended.
mongo_connection_uri = os.getenv("ME_CONFIG_MONGODB_URL")
client = MongoClient(mongo_connection_uri)
db = client["atcoder"]

# Handles for the three collections this notebook reads.
contest_collection = db["contest"]
submission_raw_collection = db["submission_raw"]
task_info_raw_collection = db["task_info_raw"]

# Parse every stored task page into a structured record and group the records
# by contest in `temp_dict`. Pages that do not have the expected layout are
# skipped rather than aborting the whole run.
for task_item in task_info_raw_collection.find():
    task_url = task_item["task_url"]
    # assumes task_url looks like "/contests/<contest_id>/tasks/<task_id>"
    # (segment 2 = contest id, last segment = task id) - TODO confirm
    url_segments = task_url.split("/")
    contest_id = url_segments[2]
    task_id = url_segments[-1]

    raw_html = task_item["raw_html"]
    tree = html.fromstring(raw_html)

    # Only the English statement is used; skip pages that lack it.
    english_spans = tree.xpath("//div[@id='task-statement']//span[@class='lang-en']")
    if not english_spans:
        continue
    main_part = english_spans[0]

    # The score is read from the first <var> inside a direct <p> child.
    score_nodes = main_part.xpath("./p/var")
    if not score_nodes:
        continue
    score = score_nodes[0].text_content()

    # Sections preceding the io-style block hold the statement and constraints.
    before_io_style = main_part.xpath(
        ".//div[@class='io-style']/preceding-sibling::div[@class='part']"
    )
    problem_statement = ""
    constraints = ""
    for part_item in before_io_style:
        text_content = html.tostring(part_item).decode("utf-8")
        if "Problem Statement" in text_content:
            problem_statement = text_content
        elif "Constraints" in text_content:
            constraints = text_content
    if problem_statement == "":
        continue

    # Restrict the lookup to sections *inside* the io-style block. Searching
    # all of main_part would return the statement/constraints sections first,
    # so index 0/1 would not be the input/output format.
    io_style = main_part.xpath(".//div[@class='io-style']//div[@class='part']")
    if len(io_style) < 2:
        continue
    input_format = html.tostring(io_style[0]).decode("utf-8")
    output_format = html.tostring(io_style[1]).decode("utf-8")

    # Sections after the io-style block are consumed as (input, output) pairs;
    # a trailing unpaired section is ignored.
    samples = []
    after_io_style = main_part.xpath(
        ".//div[@class='io-style']/following-sibling::div[@class='part']"
    )
    try:
        for input_part, output_part in zip(after_io_style[0::2], after_io_style[1::2]):
            sample_input = input_part.xpath(".//pre")[0].text_content()
            sample_output = output_part.xpath(".//pre")[0].text_content()
            samples.append({"sample_input": sample_input, "sample_output": sample_output})
    except IndexError:
        # A sample section without a <pre> means the page does not follow the
        # expected layout; drop the whole task.
        continue

    # Collect the source code of every stored submission for this task.
    submissions = []
    for submission_item in submission_raw_collection.find(
        {"contest_id": contest_id, "task_id": task_id}
    ):
        submission_tree = html.fromstring(submission_item["raw_html"])
        code_nodes = submission_tree.xpath("//pre[@id='submission-code']")
        if not code_nodes:
            # Submission page without a code block: skip just this submission.
            continue
        submissions.append(code_nodes[0].text_content())

    task_dict = {
        "contest_id": contest_id,
        "task_id": task_id,
        "score": score,
        "problem_statement": problem_statement,
        "constraints": constraints,
        "input_format": input_format,
        "output_format": output_format,
        "samples": samples,
        "submissions": submissions,
    }

    temp_dict.setdefault(contest_id, []).append(task_dict)

# Attach the parsed tasks to their contest metadata. Contests for which no
# task was parsed are left out of the dataset.
for contest_doc in contest_collection.find():
    start_time = contest_doc["start_time"]
    # assumes contest_url looks like "/contests/<contest_id>" - TODO confirm
    contest_id = contest_doc["contest_url"].split("/")[2]
    contest_name = contest_doc["contest_name"]

    tasks = temp_dict.get(contest_id)
    if tasks is None:
        continue

    contest_record = {
        "start_time": start_time,
        "contest_id": contest_id,
        "contest_name": contest_name,
        "tasks": tasks,
    }
    dataset.append(contest_record)

In [2]:
# Export the dataset as JSON Lines: one contest record per line.
# ensure_ascii=False writes non-ASCII characters verbatim, so the file is
# opened as UTF-8 explicitly instead of relying on the locale's default
# encoding (which may be unable to encode them).
with open("/app/atcoder.jsonl", "w", encoding="utf-8") as f:
    for contest in dataset:
        json_line = json.dumps(contest, ensure_ascii=False)
        f.write(json_line + "\n")

In [3]:
# Debugging aid: display the kernel's current working directory.
import os  # already imported in the first cell; repeated so this cell runs on its own
os.getcwd()

'/'