In [4]:
import requests
from bs4 import BeautifulSoup

# Browser-like request headers sent with the product-page request.
# NOTE(review): presumably the site serves different content to the default
# python-requests User-Agent; confirm these are actually required.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.132 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    # Only advertise encodings Requests can always decode. "br" is decoded
    # only when a Brotli library is installed; without one a Brotli-compressed
    # body would come back as undecoded bytes in response.text.
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive"
}
url = 'https://www.dlsite.com/maniax/work/=/product_id/RJ438625.html'

# (connect, read) timeouts in seconds: without a timeout a stalled connection
# would block this cell (and the kernel) indefinitely.
response = requests.get(url, headers=headers, timeout=(5, 30))

# Fail here on 4xx/5xx responses instead of parsing an error page downstream.
response.raise_for_status()

In [5]:
# Parse the HTML response
soup = BeautifulSoup(response.text, "html.parser")

# Find the table by its ID and fail loudly if the page layout is not what we
# expect (otherwise the next line would raise an opaque AttributeError on None).
analysis_element = soup.find("table", id="work_outline")
if analysis_element is None:
    raise ValueError(
        f"Table with id 'work_outline' not found in the response for {response.url}"
    )

# Pair each header cell with the data cell of the SAME row. Zipping every <th>
# against every <td> of the whole table would silently shift all following
# values under the wrong header as soon as one row lacks either cell.
outline_rows = [
    (row.find("th"), row.find("td"))
    for row in analysis_element.find_all("tr")
]
outline_rows = [
    (th, td) for th, td in outline_rows if th is not None and td is not None
]
table_headers = [th.text.strip() for th, _ in outline_rows]
detail_links = [td.find_all("a") for _, td in outline_rows]

# Header text -> list of the link texts found in that row's data cell
work_details = {
    header: [link.text.strip() for link in links]
    for header, links in zip(table_headers, detail_links)
}


def _first_or_blank(key):
    """Return the first link text stored under ``key`` in ``work_details``.

    Returns '' both when the header is missing and when the header exists but
    its cell contained no links. A plain ``.get(key, [''])[0]`` only covers the
    missing-header case and raises IndexError on an empty list.
    """
    values = work_details.get(key)
    return values[0] if values else ''


# Map the work details to a structured dictionary with defaults:
# single-valued fields fall back to '', multi-valued fields to [].
work_details_mapped = {
    'release_date': _first_or_blank('販売日'),
    'series_name': _first_or_blank('シリーズ名'),
    'scenario': work_details.get('シナリオ', []),
    'illustration': work_details.get('イラスト', []),
    'voice_actor': work_details.get('声優', []),
    'age_rating': _first_or_blank('年齢指定'),
    'product_format': _first_or_blank('作品形式'),
    'genre': work_details.get('ジャンル', []),
}

print(work_details_mapped)

{'release_date': '2022年12月23日', 'series_name': '', 'scenario': ['乾右京'], 'illustration': ['なぎは'], 'voice_actor': ['秋野かえで', '逢坂成美'], 'age_rating': 'R18', 'product_format': 'ボイス・ASMR', 'genre': ['バイノーラル/ダミヘ', 'ASMR', '妹', 'サキュバス/淫魔', 'ラブラブ/あまあま', 'ハーレム', '中出し', '男性受け']}


In [1]:
from dlsite_analyzer.config import RAW_JSON_DATA_DIR
from dlsite_analyzer import (
    DatabaseInitializer,
    fetch_and_save_voice_works,
    import_voice_works_to_db,
)

In [4]:
# Initialize the database if required.
# The log output recorded for this cell reports tables, indexes, initial data
# and views being created.
# NOTE(review): "if required" suggests this is safe to re-run against an
# existing database — confirm against DatabaseInitializer before relying on it.
DatabaseInitializer().initialize()

[90m2025-01-05 21:10:48 [94mINFO     [35mdlsite_analyzer.database_initializer [37mTables created.[0m
[90m2025-01-05 21:10:48 [94mINFO     [35mdlsite_analyzer.database_initializer [37mIndexes created.[0m
[90m2025-01-05 21:10:48 [94mINFO     [35mdlsite_analyzer.database_initializer [37mInitial data inserted.[0m
[90m2025-01-05 21:10:48 [94mINFO     [35mdlsite_analyzer.database_initializer [37mViews created.[0m


In [None]:
# Save voice work data to JSON files.
# RAW_JSON_DATA_DIR is passed as the only argument; presumably it is the
# directory the JSON files are written to — confirm.
# NOTE(review): the run recorded below processed 545 pages in roughly 44
# minutes, so avoid re-running this cell unless fresh data is needed.
fetch_and_save_voice_works(RAW_JSON_DATA_DIR)

[90m2024-11-08 19:00:31 [94mINFO     [35mdlsite_analyzer [37mTotal pages to process: 545[0m
100%|██████████| 545/545 [43:58<00:00,  4.84s/it]
[90m2024-11-08 19:44:30 [94mINFO     [35mdlsite_analyzer [37mFinished saving voice works to JSON files.[0m


In [5]:
# Import data from the JSON files into the database.
# Receives the same RAW_JSON_DATA_DIR that is passed to
# fetch_and_save_voice_works; presumably it reads the files that step wrote,
# so that step must have completed first — confirm.
import_voice_works_to_db(RAW_JSON_DATA_DIR)

Importing JSON to DB:   0%|          | 0/545 [00:00<?, ?it/s]

Importing JSON to DB: 100%|██████████| 545/545 [00:10<00:00, 53.53it/s]
[90m2025-01-05 21:11:09 [94mINFO     [35mdlsite_analyzer [37mAll JSON data imported to the database.[0m
