In [4]:
import requests
from bs4 import BeautifulSoup

# Request headers sent with the page fetch; the User-Agent is a desktop
# Chrome string rather than the default python-requests one.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.132 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive"
}
url = 'https://www.dlsite.com/maniax/work/=/product_id/RJ438625.html'

# requests has no default timeout, so without one a stalled connection would
# block the kernel indefinitely.
response = requests.get(url, headers=headers, timeout=30)

# Fail here on 4xx/5xx instead of handing an error page to the parsing cell,
# where it would surface as an unrelated AttributeError.
response.raise_for_status()

In [5]:
# Parse the HTML response
soup = BeautifulSoup(response.text, "html.parser")

# Find the table by its ID; stop with a clear message if the page layout is
# not what this cell expects (soup.find returns None when nothing matches).
analysis_element = soup.find("table", id="work_outline")
if analysis_element is None:
    raise ValueError(
        f"Table with id 'work_outline' not found in response from {response.url}"
    )

# Pair each header with the cell from the SAME row. Zipping two flat lists of
# all <th> and all <td> elements would shift every following field if a single
# row lacked one of them.
outline_rows = [
    (row.find("th"), row.find("td"))
    for row in analysis_element.find_all("tr")
]
outline_rows = [
    (th, td) for th, td in outline_rows
    if th is not None and td is not None
]

# Kept as separate names so any cell that reads them keeps working.
table_headers = [th.text.strip() for th, _ in outline_rows]
detail_links = [td.find_all("a") for _, td in outline_rows]

# Header text -> list of link texts found in that row's cell
work_details = {
    header: [link.text.strip() for link in links]
    for header, links in zip(table_headers, detail_links)
}


def _first_or_default(values, default=''):
    """Return the first item of ``values``, or ``default`` when it is empty.

    ``dict.get(key, [''])[0]`` only protects against a missing key; a header
    that is present but whose cell holds no links yields ``[]`` and indexing
    it raises IndexError.
    """
    return values[0] if values else default


# Map the work details to a structured dictionary with defaults:
# single-valued fields fall back to '', multi-valued fields to [].
work_details_mapped = {
    'release_date': _first_or_default(work_details.get('販売日', [])),
    'series_name': _first_or_default(work_details.get('シリーズ名', [])),
    'scenario': work_details.get('シナリオ', []),
    'illustration': work_details.get('イラスト', []),
    'voice_actor': work_details.get('声優', []),
    'age_rating': _first_or_default(work_details.get('年齢指定', [])),
    'product_format': _first_or_default(work_details.get('作品形式', [])),
    'genre': work_details.get('ジャンル', []),
}

print(work_details_mapped)

{'release_date': '2022年12月23日', 'series_name': '', 'scenario': ['乾右京'], 'illustration': ['なぎは'], 'voice_actor': ['秋野かえで', '逢坂成美'], 'age_rating': 'R18', 'product_format': 'ボイス・ASMR', 'genre': ['バイノーラル/ダミヘ', 'ASMR', '妹', 'サキュバス/淫魔', 'ラブラブ/あまあま', 'ハーレム', '中出し', '男性受け']}


In [1]:
from dlsite_analyzer.config import RAW_JSON_DATA_DIR
from dlsite_analyzer import (
    DatabaseInitializer,
    archive_and_cleanup,
    fetch_and_save_voice_works,
    import_voice_works_to_db,
)

In [5]:
# Archive and clean up: per the logged output below, this writes a timestamped
# ZIP under data\archives and then recreates the data\raw_json directory.
# NOTE(review): the execution counts show this cell ran last (In [5]), after
# the import cell (In [4]) — confirm the intended position in the run order.
archive_and_cleanup()

[90m2025-01-06 01:15:14 [94mINFO     [35mdlsite_analyzer.utils.file_util [37mFiles archived and saved as ZIP: d:\workspace\python\DLsite-Analyzer\data\archives\2025-01-06-011503.zip[0m
[90m2025-01-06 01:15:14 [94mINFO     [35mdlsite_analyzer.utils.file_util [37mCleaned up and recreated directory: data\raw_json[0m


In [3]:
# Initialize the database. The logged output below reports tables, indexes,
# initial data and views being created.
# NOTE(review): whether this is safe to re-run on an existing database is not
# visible from this notebook — confirm before running it twice.
DatabaseInitializer().initialize()

[90m2025-01-06 01:14:55 [94mINFO     [35mdlsite_analyzer.database_initializer [37mTables created.[0m
[90m2025-01-06 01:14:55 [94mINFO     [35mdlsite_analyzer.database_initializer [37mIndexes created.[0m
[90m2025-01-06 01:14:55 [94mINFO     [35mdlsite_analyzer.database_initializer [37mInitial data inserted.[0m
[90m2025-01-06 01:14:55 [94mINFO     [35mdlsite_analyzer.database_initializer [37mViews created.[0m


In [2]:
# Save the voice-work data to JSON files under RAW_JSON_DATA_DIR.
# Long-running: the captured output shows 567 pages at roughly 4-6 s each,
# with failed pages being retried (logged as "Retrying (1/3)").
fetch_and_save_voice_works(RAW_JSON_DATA_DIR)

[90m2025-01-06 00:21:52 [94mINFO     [35mdlsite_analyzer [37mTotal pages to process: 567[0m
Fetching pages:   1%|▏         | 8/567 [00:36<40:55,  4.39s/it][90m2025-01-06 00:22:33 [31mERROR    [35mdlsite_analyzer [37mFailed to fetch page 9. Retrying (1/3)...[0m
Fetching pages:   3%|▎         | 15/567 [01:18<50:27,  5.48s/it] [90m2025-01-06 00:23:17 [31mERROR    [35mdlsite_analyzer [37mFailed to fetch page 16. Retrying (1/3)...[0m
Fetching pages:   4%|▎         | 20/567 [01:52<54:46,  6.01s/it]  [90m2025-01-06 00:23:52 [31mERROR    [35mdlsite_analyzer [37mFailed to fetch page 21. Retrying (1/3)...[0m
Fetching pages:   6%|▌         | 32/567 [03:04<47:22,  5.31s/it]  [90m2025-01-06 00:25:04 [31mERROR    [35mdlsite_analyzer [37mFailed to fetch page 33. Retrying (1/3)...[0m
Fetching pages:  17%|█▋        | 99/567 [08:38<39:27,  5.06s/it]  [90m2025-01-06 00:30:37 [31mERROR    [35mdlsite_analyzer [37mFailed to fetch page 100. Retrying (1/3)...[0m
Fetching pages:  

In [4]:
# Import the data from the JSON files in RAW_JSON_DATA_DIR into the database.
import_voice_works_to_db(RAW_JSON_DATA_DIR)

Importing JSON to DB: 100%|██████████| 567/567 [00:02<00:00, 256.61it/s]
[90m2025-01-06 01:15:01 [94mINFO     [35mdlsite_analyzer [37mAll JSON data imported to the database.[0m
