In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import functools
import json
import os
import re

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 400)
pd.options.display.max_rows = 100

In [2]:
# Directories used by this notebook, written as relative paths.
# DATA_PATH: holds the input CSV and receives the Excel files written by later cells.
DATA_PATH = "../data/drug"
# RESULT_PATH: not referenced by the cells visible in this part of the notebook.
RESULT_PATH = "../result/drug"
# MASTER_PATH: holds the JSON master files (column rename map, therapeutic and ATC trees).
MASTER_PATH = "../master/"

In [4]:
# Download the MHLW (Ministry of Health, Labour and Welfare) drug master and save it as follows.
# Download URL: https://shinryohoshu.mhlw.go.jp/shinryohoshu/downloadMenu/
# Save as: ../data/drug/drug_df.csv (the file name opened by the read_csv call below)
# Original file name: y_YYYYMMDD.csv
# NOTE(review): these instructions previously named the saved file drug_master.csv,
# which does not match the name read below — confirm which name is intended.

# Read the CSV: cp932-encoded, no header row, every column kept as str.
drug_df = pd.read_csv(os.path.join(DATA_PATH, 'drug_df.csv'), encoding='cp932', dtype=str, header=None)

In [5]:
# JSON master definitions, all read from MASTER_PATH as UTF-8.

# Column-rename master
with open(os.path.join(MASTER_PATH, "drug_df_rename_master.json"), encoding="utf-8") as f:
    drug_df_rename_master_dict = json.load(f)

# Therapeutic classification master
with open(os.path.join(MASTER_PATH, "therapeutic_code_master.json"), encoding="utf-8") as f:
    therapeutic_code_master_dict = json.load(f)

# ATC classification master
with open(os.path.join(MASTER_PATH, "atc_code_master.json"), encoding="utf-8") as f:
    atc_code_master_dict = json.load(f)

In [14]:
# ATCコード
"""
code_re = re.compile(r'^(?:D[0-9A-Z]+|[A-Z][0-9A-Z]*)(?=\s|　|$)')

def extract_code(name: str):
    if not isinstance(name, str):
        return None
    m = code_re.match(name.strip())
    return m.group(0) if m else None

rows = []  

def add_row(numeric_codes, d_code=""):
    pad = (numeric_codes + ["", "", "", ""])[:5]
    rows.append(pad + [d_code])

def walk(node, numeric_path):
    name = node.get("name", "")
    code = extract_code(name)
    if not code:
        for ch in (node.get("children") or []):
            walk(ch, numeric_path)
        return

    if code.startswith("D"):
        add_row(numeric_path, d_code=code)
        return

    next_path = (numeric_path + [code])[:5]
    add_row(next_path, d_code="")

    for ch in (node.get("children") or []):
        walk(ch, next_path)

if isinstance(atc_code_master_dict, dict):
    for child in (atc_code_master_dict.get("children") or []):
        walk(child, [])
elif isinstance(atc_code_master_dict, list):
    for node in atc_code_master_dict:
        walk(node, [])
else:
    raise TypeError("Unsupported JSON root type")

seen = set()
deduped = []
for r in rows:
    key = tuple(r)
    if key not in seen:
        seen.add(key)
        deduped.append(r)

atc_df = pd.DataFrame(deduped, columns=["atc_level1", "atc_level2", "atc_level3", "atc_level4", "atc_level5", "drug_code"])

# Excelに出力（DATA_PATH下に atc_code_master.xlsxとして保存）
atc_df.to_excel(os.path.join(DATA_PATH, "atc_code_master.xlsx"), index=False)
"""

# A well-formed ATC code grows level by level, so the optional parts are nested:
#   level 1: one letter            (A)
#   level 2: + two digits          (A01)
#   level 3: + one letter          (A01A)
#   level 4: + one letter          (A01AA)
#   level 5: + two digits          (A01AA01)
# A flat sequence of independent optional groups would also accept shapes such as
# "AB", "ABC01" or "A0101", which are not ATC codes.
# The second alternative admits drug codes (D + 5 digits) so that callers can
# recognise them and decide what to do with them.
_ATC_CODE_RE = re.compile(r'^(?:[A-Z](?:\d{2}(?:[A-Z](?:[A-Z](?:\d{2})?)?)?)?|D\d{5})$')
_DRUG_CODE_RE = re.compile(r'^D\d{5}$')  # drug code (e.g. D00202)

def extract_code_and_label(name: str):
    """Split a node name into its leading code and a normalised "code name" label.

    Example: 'A01AA01 ナイスタチン' -> ('A01AA01', 'A01AA01 ナイスタチン')

    Args:
        name: Raw node name; the code is expected to be the first
            whitespace-delimited token (half-width or full-width spaces).

    Returns:
        ``(code, label)`` where ``label`` is the code and the remaining text joined
        by a single space. ``(None, None)`` when ``name`` is not a str, is blank,
        or its first token is neither a well-formed ATC code nor a drug code.
    """
    if not isinstance(name, str):
        return None, None
    s = name.strip().replace('\xa0', ' ')
    if not s:
        return None, None
    # Split once on any run of half-width / full-width whitespace.
    parts = re.split(r'[\s\u3000]+', s, maxsplit=1)
    code = parts[0]
    rest = parts[1] if len(parts) > 1 else ""
    if not _ATC_CODE_RE.match(code):
        return None, None
    label = f"{code} {rest}".strip()
    return code, label

# Accumulates one five-element list per visited ATC hierarchy node.
rows = []

def add_row(level_labels):
    """Append one row of exactly five level cells to the module-level ``rows`` list.

    Args:
        level_labels: "code name" labels from level 1 downwards. Missing levels are
            filled with empty strings; anything beyond the fifth level is dropped.
    """
    # Pad with five blanks (not four) so that even an empty path produces a full
    # five-column row and the DataFrame built from ``rows`` never gets a short row.
    padded = (list(level_labels) + [""] * 5)[:5]
    rows.append(padded)

def walk(node, level_path):
    """Recursively visit ``node`` and record one row per ATC hierarchy node.

    Args:
        node: Mapping read with ``.get("name")`` and ``.get("children")``.
        level_path: Labels of the ATC ancestors collected so far (not mutated).

    Behaviour by node type:
        * drug code (D + 5 digits): ignored, recursion stops at this node;
        * no recognisable code: nothing is recorded, children inherit the path;
        * ATC code: the label is appended to the path (capped at five levels),
          a row is recorded, and the children are visited with the longer path.
    """
    code, label = extract_code_and_label(node.get("name", ""))

    # Drug-level leaves are deliberately left out of the output.
    if code and _DRUG_CODE_RE.match(code):
        return

    if code:
        # Rebinding creates a new list, so the caller's path is left untouched.
        level_path = (level_path + [label])[:5]
        add_row(level_path)

    for child in (node.get("children") or []):
        walk(child, level_path)

# Root traversal: a dict root contributes its children, a list root its elements.
if isinstance(atc_code_master_dict, dict):
    top_level_nodes = atc_code_master_dict.get("children") or []
elif isinstance(atc_code_master_dict, list):
    top_level_nodes = atc_code_master_dict
else:
    raise TypeError("Unsupported JSON root type")

for top_node in top_level_nodes:
    walk(top_node, [])

# Remove duplicate rows while keeping the first occurrence and the original order
# (dict keys preserve insertion order).
deduped = [list(unique_row) for unique_row in dict.fromkeys(tuple(r) for r in rows)]

# DataFrame holding level 1-5 only
atc_df = pd.DataFrame(
    deduped,
    columns=["atc_level1", "atc_level2", "atc_level3", "atc_level4", "atc_level5"],
)

# Example: save the result under DATA_PATH
atc_df.to_excel(os.path.join(DATA_PATH, "atc_code_master1.xlsx"), index=False)

In [120]:
# Therapeutic classification codes: a leading run of digits, or a D-prefixed drug code.
code_re = re.compile(r'^([0-9]+|D[0-9A-Z]+)\b')

def extract_code(name: str):
    """Return the code at the start of ``name`` (after stripping), or None.

    None is returned when ``name`` is not a str or does not begin with a code.
    """
    if isinstance(name, str):
        hit = code_re.match(name.strip())
        if hit:
            return hit.group(1)
    return None

# Accumulates [level1, level2, level3, level4, drug_code] lists.
rows = []

def add_row(numeric_codes, d_code=""):
    """Append the path (padded/truncated to four levels) plus ``d_code`` to ``rows``."""
    levels = (numeric_codes + [""] * 4)[:4]
    rows.append([*levels, d_code])

def walk(node, numeric_path):
    """Recursively visit ``node`` and record therapeutic hierarchy and drug rows.

    Args:
        node: Mapping read with ``.get("name")`` and ``.get("children")``.
        numeric_path: Numeric codes of the ancestors collected so far (not mutated).

    Behaviour by node type:
        * code starting with "D": a row with the current path and that drug code
          is recorded and recursion stops;
        * no code: nothing is recorded, children inherit the path;
        * numeric code: it is appended to the path (capped at four levels), a row
          with an empty drug code is recorded, and the children are visited.
    """
    code = extract_code(node.get("name", ""))

    if code and code.startswith("D"):
        add_row(numeric_path, d_code=code)
        return

    if code:
        # Rebinding creates a new list, so the caller's path is left untouched.
        numeric_path = (numeric_path + [code])[:4]
        add_row(numeric_path, d_code="")

    for child in (node.get("children") or []):
        walk(child, numeric_path)

# Root traversal: a dict root contributes its children, a list root its elements.
if isinstance(therapeutic_code_master_dict, dict):
    top_level_nodes = therapeutic_code_master_dict.get("children") or []
elif isinstance(therapeutic_code_master_dict, list):
    top_level_nodes = therapeutic_code_master_dict
else:
    raise TypeError("Unsupported JSON root type")

for top_node in top_level_nodes:
    walk(top_node, [])

# Remove duplicate rows while keeping the first occurrence and the original order
# (dict keys preserve insertion order).
deduped = [list(unique_row) for unique_row in dict.fromkeys(tuple(r) for r in rows)]

therapeutic_df = pd.DataFrame(
    deduped,
    columns=[
        "therapeutic_level1",
        "therapeutic_level2",
        "therapeutic_level3",
        "therapeutic_level4",
        "drug_code",
    ],
)

# Write to Excel (saved under DATA_PATH as therapeutic_code_master.xlsx)
therapeutic_df.to_excel(os.path.join(DATA_PATH, "therapeutic_code_master.xlsx"), index=False)

In [124]:
# Build the ATC <-> therapeutic classification cross-reference, joined on drug_code.

# Both inputs must carry a "drug_code" column. An ATC frame built with only the
# atc_level1..atc_level5 columns (drug-level rows skipped) cannot be joined, so
# fail up front with an explanatory message instead of a bare KeyError raised
# from inside the boolean filter below.
for frame_name, frame in (("atc_df", atc_df), ("therapeutic_df", therapeutic_df)):
    if "drug_code" not in frame.columns:
        raise KeyError(
            f"{frame_name} has no 'drug_code' column; rebuild it with drug-level rows "
            "(one row per drug code) before creating the ATC/therapeutic cross-reference"
        )

# Keep only the rows that actually refer to a drug (non-null, non-empty drug_code).
atc_df_filtered = atc_df[atc_df["drug_code"].notna() & (atc_df["drug_code"] != "")]
therapeutic_df_filtered = therapeutic_df[therapeutic_df["drug_code"].notna() & (therapeutic_df["drug_code"] != "")]

# Outer join: drugs present in only one of the two classifications are kept.
merged_df = pd.merge(
    atc_df_filtered,
    therapeutic_df_filtered,
    on="drug_code",
    how="outer"
)

# Write to Excel (saved under DATA_PATH as therapeutic_atc_master.xlsx)
merged_df.to_excel(os.path.join(DATA_PATH, "therapeutic_atc_master.xlsx"), index=False)

In [107]:
# Extract the part shared by all strings
def common_prefix(strings):
    """Return the longest leading substring shared by every non-null item.

    Items are converted with ``str`` and stripped first; null items (per
    ``pd.notna``) are ignored. An empty/falsy input, or one containing only
    null items, gives "".
    """
    if strings:
        cleaned = [str(value).strip() for value in strings if pd.notna(value)]
        if cleaned:
            # commonprefix compares character by character, not by path component.
            return os.path.commonprefix(cleaned)
    return ""

# Lookup table filled by index_tree: leading numeric code -> node name without the code.
code_index = {}
code_head_re = re.compile(r'^(\d+)\s*')

def index_tree(node):
    """Recursively register every node whose name starts with a numeric code.

    For a name such as "11 <label>", ``code_index["11"]`` is set to ``<label>``.
    The code and the label may be separated by any whitespace (half-width space,
    full-width space, tab); surrounding whitespace is ignored. When the name has
    no whitespace-separated remainder, the whole stripped name is stored.

    Args:
        node: Mapping read with ``.get("name")`` and ``.get("children")``. Nodes
            whose name is not a str are skipped, but their children are still visited.
    """
    name = node.get("name", "")
    if isinstance(name, str):
        name = name.strip()
        m = code_head_re.match(name)
        if m:
            # Split on any whitespace so the presence check and the split agree;
            # testing only for an ASCII space would leave the code inside the
            # stored label for full-width-space or tab separated names.
            parts = name.split(None, 1)
            code_index[m.group(1)] = parts[1] if len(parts) > 1 else name
    for child in node.get("children") or []:
        index_tree(child)

# Fill code_index from the therapeutic master: a list root is indexed element by
# element, a dict root is indexed as a single tree; any other type is ignored.
if isinstance(therapeutic_code_master_dict, list):
    for n in therapeutic_code_master_dict:
        index_tree(n)
elif isinstance(therapeutic_code_master_dict, dict):
    index_tree(therapeutic_code_master_dict)

def find_name_by_code_prefix(prefix: str):
    """Return the name stored in ``code_index`` for exactly ``prefix``, or None."""
    if prefix in code_index:
        return code_index[prefix]
    return None

# Preprocessing
def filter_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Rename, normalise and enrich the raw drug master and keep selected columns.

    Steps:
        1. Stringify the column labels and map them through
           ``drug_df_rename_master_dict``.
        2. Remove "【般】" from ``generic_name`` and NFKC-normalise ``generic_name``
           and ``product_name``.
        3. Derive ``nddb_code_7`` (first seven characters of ``nddb_code``).
        4. Derive ``generic_base_name``: the common prefix of ``generic_name`` within
           each ``nddb_code_7`` group, with dosage-form / unit fragments, trailing
           digits and a trailing "散" removed.
        5. Look up the therapeutic classification names for the 1- to 4-character
           prefixes of ``nddb_code_7`` ("" when no name is registered).
        6. Derive ``dose_route`` from characters 5-7 of ``nddb_code_7``.

    Args:
        df: Raw drug master. After renaming it must provide ``generic_name``,
            ``product_name``, ``nddb_code`` and ``receipt_code``.

    Returns:
        A new DataFrame with the selected columns. The input frame is not modified.
    """
    # rename() returns a new frame, so neither the caller's column labels nor its
    # data are touched (assigning to df.columns would mutate the caller's frame).
    df = df.rename(columns=str).rename(columns=drug_df_rename_master_dict)

    # Remove "【般】" from generic_name
    df["generic_name"] = df["generic_name"].str.replace("【般】", "", regex=False)
    # Convert digits and Latin letters in generic_name / product_name to half-width
    df["generic_name"] = df["generic_name"].str.normalize("NFKC")
    df["product_name"] = df["product_name"].str.normalize("NFKC")

    # First seven characters of nddb_code
    df["nddb_code_7"] = df["nddb_code"].str[:7]

    # Common leading part of generic_name within each nddb_code_7 group
    df["generic_base_name"] = df.groupby("nddb_code_7")["generic_name"].transform(
        lambda x: common_prefix(x.tolist())
    )

    # Fragments (dosage forms, units, ...) removed from generic_base_name in one pass
    remove_list = [
        "錠", "0.", "mg", "顆粒", "%", "点眼液", ":", "坐剤", "テープ",
        "カプセル", "点鼻液", "μg", "細粒", "配合", "徐放", "FF",
        "GL", ".5mg", "点眼", "mL", "口腔内崩壊", "経口液", "含嗽用", "・B6()・B12",
        "シロップ用", "クリーム", "トローチ", "原末", "外用液", "シロップ",
        "点耳液", "50(制酸剤)", "経口ゼリー", "吸入粉末剤", "シリンジ", "吸入液",
        "軟膏", "(鉄として)", "点耳", "g", "(35×50mm)", "2.", "口腔用",
        "点鼻", "スプレー", "(温感)", "(1日6回)", "(非温感)", "・B2・B6・B",
        "注腸剤", "含嗽剤", "舌下", "・B6(25)・B", "3.", "(制酸剤)", "等配合パップ",
        "1000等消化酵素", "2000等消化酵素"]
    # Longest first, so that e.g. "mg" is tried before "g" in the alternation.
    remove_list_sorted = sorted(remove_list, key=len, reverse=True)
    pattern = "|".join(map(re.escape, remove_list_sorted))
    df["generic_base_name"] = df["generic_base_name"].str.replace(pattern, "", regex=True)
    # Drop digits at the end of generic_base_name
    df["generic_base_name"] = df["generic_base_name"].str.replace(r"\d+$", "", regex=True)
    # Drop a trailing "散"
    df["generic_base_name"] = df["generic_base_name"].str.replace(r"散$", "", regex=True)

    # Therapeutic classification name for each prefix length of nddb_code_7
    col_names = {
        1: "therapeutic_code_level1",
        2: "therapeutic_code_level2",
        3: "therapeutic_code_level3",
        4: "therapeutic_code_level4"
    }
    for length, col_name in col_names.items():
        df[col_name] = df["nddb_code_7"].str[:length] \
            .apply(lambda x: find_name_by_code_prefix(x) or "")

    # Characters 5-7 of nddb_code_7 give the route:
    # 001-399 oral, 400-699 injection, 700-999 external use.
    codes = pd.to_numeric(df["nddb_code_7"].str[4:7], errors="coerce")
    dose_route_ranges = (
        ("内服薬", 1, 399),
        ("注射薬", 400, 699),
        ("外用薬", 700, 999),
    )
    # Build an object array pre-filled with a real NaN. Passing string choices and
    # a float NaN default to np.select either turns the default into the text
    # 'nan' or fails dtype promotion, depending on the NumPy version, so missing
    # routes would not be recognised by isna().
    dose_route = np.full(len(df), np.nan, dtype=object)
    for route_label, low, high in dose_route_ranges:
        # between() is inclusive on both ends and False for NaN codes.
        dose_route[codes.between(low, high).to_numpy()] = route_label
    df["dose_route"] = dose_route

    # Columns to keep
    columns = ["generic_base_name", "generic_name", "product_name", "nddb_code_7", "nddb_code", "receipt_code", "therapeutic_code_level1", "therapeutic_code_level2", "therapeutic_code_level3", "therapeutic_code_level4", "dose_route"]
    return df[columns]

In [108]:
drug_df = filter_columns(drug_df)

In [103]:
drug_df["therapeutic_code_level4"].value_counts()

therapeutic_code_level4
生薬                    1796
その他の精神神経用剤            1091
その他の血圧降下剤             1056
他に分類されないその他の代謝性医薬品     686
その他のアレルギー用薬            630
                      ... 
アクチノマイシンＤ製剤              1
メルカプトプリン系製剤              1
エチレンイミン系製剤               1
コンドロイチン製剤                1
ハロゲン炭化水素製剤；ハロタン等         1
Name: count, Length: 401, dtype: int64