<a href="https://colab.research.google.com/github/Serena-G-LEE/25-2_Project/blob/main/openAlex_API_Data_Scrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install aiohttp pandas rapidfuzz pyarrow orjson tqdm

Collecting rapidfuzz
  Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.1


In [None]:
# Colab 1-cell script (institutions.id only, year=2025, top-10000 by citations per institution)
# Adds authorship fields + abstract reconstruction + verbose prints

import os, time, requests, pandas as pd

# ===================== User settings =====================
SAVE_DIR = "/content/drive/MyDrive/6. 덕성여자대학교/25학년도 2학기/비정형데이터분석/프로젝트/openalex_top10000_2025_with_authors"
YEAR = 2025
TOP_K = 10000        # at most 10000 works per institution
PER_PAGE = 200     # page size (the maximum OpenAlex recommends)
SLEEP = 0.2        # pause between requests in seconds (to avoid HTTP 429)
RETRY = 5          # number of attempts per request
TIMEOUT = 60       # request timeout in seconds

# Target universities (23, including Duksung Women's University)
TARGET_UNIVS = [
    "Duksung Women's University",
    "Seoul National University",
    "Korea University",
    "Yonsei University",
    "Korea Advanced Institute of Science and Technology",  # KAIST
    "Sogang University",
    "Sungkyunkwan University",
    "Hanyang University",
    "Pohang University of Science and Technology", #POSTECH
    "Gwangju Institute of Science and Technology", #GIST
    "Daegu Gyeongbuk Institute of Science and Technology", #DGIST
    "Ulsan National Institute of Science and Technology",   #UNIST
    "Chung-Ang University",
    "Kyung Hee University",
    "Kangwon National University",
    "Kyungpook National University",
    "Gyeongsang National University",
    "Pusan National University",
    "Chonnam National University",
    "Jeonbuk National University",
    "Jeju National University",
    "Chungbuk National University",
    "Chungnam National University"
]

# Simple aliases to fall back on when the English name does not match.
# Keys must be spelled exactly like the TARGET_UNIVS entries, because the
# aliases are looked up by that name.
# NOTE(review): short aliases such as "KU" / "KNU" may be ambiguous between
# institutions; confirm the resolved display_name in the mapping output.
ALT_NAMES = {
    "Duksung Women's University": ["덕성여자대학교", "덕성여대"],
    "Seoul National University": ["서울대학교", "서울대", "SNU"],
    "Korea University": ["고려대학교", "고려대", "KU"],
    "Yonsei University": ["연세대학교", "연세대"],
    "Korea Advanced Institute of Science and Technology": ["한국과학기술원", "KAIST"],
    "Sogang University": ["서강대학교", "서강대"],
    "Sungkyunkwan University": ["성균관대학교", "성균관대"],
    "Hanyang University": ["한양대학교", "한양대"],
    "Pohang University of Science and Technology": ["포항공과대학교", "포항공대", "POSTECH"],
    "Gwangju Institute of Science and Technology": ["광주과학기술원", "GIST"],
    "Daegu Gyeongbuk Institute of Science and Technology": ["대구경북과학기술원", "DGIST"],
    "Ulsan National Institute of Science and Technology": ["울산과학기술원", "UNIST"],
    "Chung-Ang University": ["중앙대학교", "중앙대", "Chungang University"],
    "Kyung Hee University": ["경희대학교", "경희대", "Kyunghee University"],
    "Kangwon National University": ["강원대학교", "강원대", "KNU"],
    "Kyungpook National University": ["경북대학교", "경북대"],
    "Gyeongsang National University": ["경상국립대학교", "경상대"],
    "Pusan National University": ["부산대학교", "부산대"],
    "Chonnam National University": ["전남대학교", "전남대"],
    "Jeonbuk National University": ["전북대학교", "전북대"],
    "Jeju National University": ["제주대학교", "제주대"],
    "Chungbuk National University": ["충북대학교", "충북대"],
    "Chungnam National University": ["충남대학교", "충남대"]
}

# Create the output directory up front; exist_ok=True keeps re-running the cell harmless.
os.makedirs(SAVE_DIR, exist_ok=True)

# ===================== 공통 유틸 =====================
def http_get(url, params, retry=RETRY, backoff=1.4, timeout=TIMEOUT):
    """GET ``url`` with ``params``, retrying until an HTTP 200 arrives.

    Makes up to ``retry`` attempts. After every failed attempt (a non-200
    status or a ``requests.RequestException``) the failure is printed and the
    function sleeps ``backoff ** attempt_index + 0.2`` seconds.

    Returns the ``requests`` response on success, or ``None`` once all
    attempts have failed.
    """
    for attempt in range(retry):
        try:
            resp = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException as exc:
            print(f"[HTTP:EXC] {exc} retry {attempt+1}/{retry} url={url}")
        else:
            if resp.status_code == 200:
                return resp
            print(f"[HTTP:{resp.status_code}] retry {attempt+1}/{retry} url={url} params={params}")
        # Both failure paths share the same growing delay before the next attempt.
        time.sleep((backoff ** attempt) + 0.2)
    print("[HTTP] giving up after retries.")
    return None

def reconstruct_abstract(inv_idx: dict | None) -> str | None:
    """
    OpenAlex abstract_inverted_index(토큰 -> 위치 리스트)를 실제 텍스트로 복원.
    """
    if not inv_idx:
        return None
    try:
        max_pos = max(p for positions in inv_idx.values() for p in positions)
        words = [""] * (max_pos + 1)
        for token, positions in inv_idx.items():
            for p in positions:
                words[p] = token
        text = " ".join(w for w in words if w is not None)
        return " ".join(text.split())
    except Exception as e:
        print(f"[ABSTRACT] reconstruction failed: {e}")
        return None

# ===================== (1) 기관명 → institutions.id =====================
def search_institution_top1(name: str, relax=False):
    """Return the first OpenAlex institution hit for ``name``, or ``None``.

    Queries the ``/institutions`` endpoint through its ``search`` parameter
    with a page size of 1 (``per-page``, hyphenated). By default results are
    filtered to Korean institutions of type ``education``; ``relax=True``
    drops the type filter and keeps only the country filter.
    """
    if relax:
        flt = "country_code:KR"
    else:
        flt = "country_code:KR,type:education"

    print(f"[MAP]   -> searching: '{name}' | filter='{flt}'")
    resp = http_get(
        "https://api.openalex.org/institutions",
        {"search": name, "filter": flt, "per-page": 1},
    )
    if not resp:
        print(f"[MAP]   -> no response for '{name}'")
        return None

    hits = resp.json().get("results", [])
    if not hits:
        print(f"[MAP]   -> no candidates for '{name}' (relax={relax})")
        return None

    top = hits[0]
    print(f"[MAP]   -> candidate: {top.get('display_name')} | id={top.get('id')}")
    return top

def resolve_institution_id(qname: str):
    """Resolve a university name to its short OpenAlex institution id.

    Tries ``qname`` first, then each alias listed for it in ``ALT_NAMES``.
    Every term is searched with the strict filter and, if that yields
    nothing, again with the relaxed one. Returns a dict with ``query_name``,
    ``institution_id`` (last path segment of the OpenAlex id URL) and
    ``display_name``; the latter two are ``None`` when nothing matched.
    """
    def _lookup(term):
        # Strict search first, relaxed search only as a fallback.
        return search_institution_top1(term) or search_institution_top1(term, relax=True)

    def _row(hit, short_id):
        return {"query_name": qname, "institution_id": short_id, "display_name": hit.get("display_name")}

    print(f"[MAP] Try EN: {qname}")
    hit = _lookup(qname)
    if hit and hit.get("id"):
        inst_id = hit["id"].rsplit("/", 1)[-1]
        print(f"[MAP] OK (EN): {qname} -> {hit.get('display_name')} | institutions.id={inst_id}")
        return _row(hit, inst_id)

    for alias in ALT_NAMES.get(qname, []):
        print(f"[MAP] Try ALT: {qname} -> '{alias}'")
        hit = _lookup(alias)
        if hit and hit.get("id"):
            inst_id = hit["id"].rsplit("/", 1)[-1]
            print(f"[MAP] OK (ALT): {qname} -> {hit.get('display_name')} | institutions.id={inst_id}")
            return _row(hit, inst_id)

    print(f"[MAP] FAIL: {qname} (could not resolve institutions.id)")
    return {"query_name": qname, "institution_id": None, "display_name": None}

def build_institution_map(names):
    """Resolve every name in ``names`` and persist the result.

    Calls ``resolve_institution_id`` for each name (pausing ``SLEEP`` seconds
    after each one), writes the rows to ``institutions_mapping.csv`` inside
    ``SAVE_DIR``, prints the table and returns it as a DataFrame.
    """
    print("[MAP] ==== start institution mapping ====")
    resolved = []
    for position, univ in enumerate(names, start=1):
        print(f"[MAP] [{position}/{len(names)}] resolving '{univ}'")
        resolved.append(resolve_institution_id(univ))
        time.sleep(SLEEP)

    mapping = pd.DataFrame(resolved)
    csv_path = os.path.join(SAVE_DIR, "institutions_mapping.csv")
    mapping.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"[MAP] saved mapping -> {csv_path}")
    print(mapping)
    return mapping

# ===================== (2) Fetch the top-K most-cited works for YEAR + abstracts/authors =====================
def extract_authors(authorships: list | None):
    """
    authorships 리스트에서 저자 관련 필드 평탄화.
    - author_ids: A... 꼬리만 추출
    - author_names: display_name
    - author_positions: first/middle/last
    - author_orcids: 0000-0002-...
    - author_affiliations: 저자별 기관 display_name들을 '|'로 묶고, 저자 간은 ';'로 연결
    """
    if not authorships:
        return {
            "authors_count": 0,
            "author_ids": None,
            "author_names": None,
            "author_positions": None,
            "author_orcids": None,
            "author_affiliations": None,
        }
    ids, names, poss, orcids, affs = [], [], [], [], []
    for a in authorships:
        author = a.get("author") or {}
        aid = author.get("id") or ""
        aid_tail = aid.split("/")[-1] if aid else None
        ids.append(aid_tail)
        names.append(author.get("display_name"))
        poss.append(a.get("author_position"))
        # orcid 위치가 author.orcid 또는 author.ids.orcid로 있을 수 있음
        orcid = author.get("orcid")
        if not orcid:
            orcid = (author.get("ids") or {}).get("orcid")
        orcids.append(orcid)

        insts = a.get("institutions") or []
        inst_names = [ (inst or {}).get("display_name") for inst in insts if inst ]
        affs.append("|".join([n for n in inst_names if n]))

    return {
        "authors_count": len(names),
        "author_ids": ";".join([x for x in ids if x]),
        "author_names": ";".join([x for x in names if x]),
        "author_positions": ";".join([x for x in poss if x]),
        "author_orcids": ";".join([x for x in orcids if x]),
        "author_affiliations": ";".join([x for x in affs if x]),
    }

def fetch_topk_works_for_institution(inst_id: str, qname: str, dname: str, year: int = YEAR, topk: int = TOP_K):
    """Collect up to ``topk`` works of one institution for ``year``.

    Pages through the OpenAlex ``/works`` endpoint with cursor paging, sorted
    by ``cited_by_count`` descending, and flattens each work into a dict
    (basic metadata, reconstructed abstract, flattened author columns).
    Paging stops when ``topk`` rows are collected, a request fails, a page is
    empty, or no ``next_cursor`` is returned. ``qname`` is used for log lines
    only; ``dname`` is accepted but not used in the body.

    Returns the list of row dicts (at most ``topk`` entries).
    """
    def _flatten(work):
        # Abstract first, then authors: same call order as the log output expects.
        abstract_text = reconstruct_abstract(work.get("abstract_inverted_index"))
        author_cols = extract_authors(work.get("authorships"))
        source = (work.get("primary_location") or {}).get("source") or {}
        open_access = work.get("open_access") or {}
        row = {
            "openalex_id": work.get("id"),
            "title": work.get("title"),
            "publication_year": work.get("publication_year"),
            "publication_date": work.get("publication_date"),
            "type": work.get("type"),
            "host_venue": source.get("display_name"),
            "cited_by_count": work.get("cited_by_count"),
            "is_oa": open_access.get("is_oa"),
            "oa_status": open_access.get("oa_status"),
            "language": work.get("language"),
            "doi": (work.get("ids") or {}).get("doi") or work.get("doi"),
            "abstract": abstract_text,
        }
        row.update(author_cols)
        return row

    endpoint = "https://api.openalex.org/works"
    query = {
        "filter": f"institutions.id:{inst_id},publication_year:{year}",
        "per-page": PER_PAGE,          # hyphenated parameter name
        "cursor": "*",
        "sort": "cited_by_count:desc"
    }
    collected = []
    page_no = 0
    while True:
        page_no += 1
        print(f"[GET] {qname} | page={page_no} cursor={query.get('cursor')}")
        resp = http_get(endpoint, query)
        if not resp:
            print("[GET]   -> no response (stop).")
            break

        payload = resp.json()
        works = payload.get("results", [])
        if not works:
            print("[GET]   -> empty batch (stop).")
            break

        rep_title = (works[0].get("title") or "")[:80]
        print(f"[GET]   -> fetched {len(works)} (rep: '{rep_title}...')")

        for work in works:
            collected.append(_flatten(work))
            if len(collected) >= topk:
                print(f"[GET]   -> reached top-{topk}; stopping early.")
                return collected[:topk]

        next_cursor = payload.get("meta", {}).get("next_cursor")
        print(f"[GET]   -> total accumulated: {len(collected)} | next_cursor={next_cursor}")
        if not next_cursor:
            print("[GET]   -> no next_cursor; finished.")
            break
        query["cursor"] = next_cursor
        time.sleep(SLEEP)
    return collected[:topk]

# ===================== (3) 실행(매핑 → 수집 → 저장) =====================
# Resolve every target university to its OpenAlex institution id (this also
# writes institutions_mapping.csv into SAVE_DIR) and print the full table.
inst_map = build_institution_map(TARGET_UNIVS)
print("=== Institution mapping result ===")
print(inst_map.to_string(index=False))

# One DataFrame per institution; they are concatenated after the loop.
all_dfs = []
for _, row in inst_map.iterrows():
    inst_id = row["institution_id"]
    qname   = row["query_name"]
    dname   = row["display_name"]
    # Rows whose lookup failed carry no institution id; there is nothing to fetch.
    if not inst_id:
        print(f"[SKIP] No institutions.id for '{qname}'")
        continue

    print(f"[RUN] Fetch top-{TOP_K} (year={YEAR}) for {qname} ({dname}) → institutions.id={inst_id}")
    works = fetch_topk_works_for_institution(inst_id, qname, dname, year=YEAR, topk=TOP_K)
    df_inst = pd.DataFrame(works)
    # Tag every row with the institution it was fetched for.
    df_inst["query_name"] = qname
    df_inst["display_name"] = dname
    df_inst["institution_id"] = inst_id

    # Build a file-name-friendly version of the query name ('/' and spaces -> '_').
    safe = qname.replace("/", "_").replace(" ", "_")
    path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_{safe}.csv")
    df_inst.to_csv(path, index=False, encoding="utf-8")
    print(f"[SAVE] {qname} -> {path} (n={len(df_inst)})")
    all_dfs.append(df_inst)
    # Extra pause between institutions.
    time.sleep(0.4)

# Combine all institutions into one table; fall back to an empty DataFrame
# when nothing was fetched, so the ALL file is still written.
df_all = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
all_path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_ALL.csv")
df_all.to_csv(all_path, index=False, encoding="utf-8")
print(f"[DONE] Total rows: {len(df_all)}")
print(f"[DONE] Saved to: {SAVE_DIR}")

# Preview (last expression of the cell, so the notebook renders it)
df_all.head()


[MAP] ==== start institution mapping ====
[MAP] [1/23] resolving 'Duksung Women's University'
[MAP] Try EN: Duksung Women's University
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Duksung Women's University' (relax=False)
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR'
[MAP]   -> candidate: Duksung Women's University | id=https://openalex.org/I65832422
[MAP] OK (EN): Duksung Women's University -> Duksung Women's University | institutions.id=I65832422
[MAP] [2/23] resolving 'Seoul National University'
[MAP] Try EN: Seoul National University
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Seoul National University' (relax=False)
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR'
[MAP]   -> candidate: Seoul National University | id=https://openalex.org/I139264467
[MAP] OK (EN): Seoul 

Unnamed: 0,openalex_id,title,publication_year,publication_date,type,host_venue,cited_by_count,is_oa,oa_status,language,...,abstract,authors_count,author_ids,author_names,author_positions,author_orcids,author_affiliations,query_name,display_name,institution_id
0,https://openalex.org/W4410789811,Molecular targets and therapies associated wit...,2025,2025-05-27,review,International Journal of Oncology,4,False,closed,en,...,Triple‑negative breast cancer (TNBC) is a high...,1,A5074700953,Eun‐Sook Kim,first,https://orcid.org/0009-0003-6205-1794,Duksung Women's University,Duksung Women's University,Duksung Women's University,I65832422
1,https://openalex.org/W4406237236,Enhanced oral bioavailability of pranlukast by...,2025,2025-01-10,article,Journal of Pharmaceutical Investigation,3,False,closed,en,...,,9,A5007187569;A5106685919;A5102742775;A505425133...,Dong‐Hyeon Ha;Eun-Sol Ha;Heejun Park;Seon-Kwan...,first;middle;middle;middle;middle;middle;middl...,https://orcid.org/0000-0002-3663-9258;https://...,Pusan National University;Pusan National Unive...,Duksung Women's University,Duksung Women's University,I65832422
2,https://openalex.org/W4407835592,Simultaneous analysis of 203 drugs of abuse an...,2025,2025-02-21,article,Journal of Chromatography B,3,False,closed,en,...,,16,A5107397243;A5100413483;A5113339993;A510169577...,Sangeun Lee;Jihyun Lee;Dain Jang;Hee‐Jung Cho;...,first;middle;middle;middle;middle;middle;middl...,https://orcid.org/0000-0001-5693-0109;https://...,Duksung Women's University;Shimadzu (Japan);Du...,Duksung Women's University,Duksung Women's University,I65832422
3,https://openalex.org/W4409997504,Synthesis of versatile fluorescent Isoquinolin...,2025,2025-01-01,review,Journal of Materials Chemistry B,3,False,closed,en,...,Isoquinolinium salts are well-known N-heterocy...,2,A5102498849;A5100626451,Ye Ri Han;Sang Bong Lee,first;last,https://orcid.org/0000-0002-5415-3008;https://...,Duksung Women's University;Chonnam National Un...,Duksung Women's University,Duksung Women's University,I65832422
4,https://openalex.org/W4406134280,Autonomic Readiness for Social Threats in Pati...,2025,2025-01-08,article,Clinical Psychopharmacology and Neuroscience,1,True,gold,en,...,Pathological anxiety is characterized by dysre...,7,A5032793075;A5114348369;A5100314956;A505100864...,S.W. Kim;Dasom Lee;Jae Hyun Kim;Joongsuk Lee;D...,first;middle;middle;middle;middle;middle;last,https://orcid.org/0000-0003-3776-2263;https://...,Seoul National University Hospital;Seoul Natio...,Duksung Women's University,Duksung Women's University,I65832422


In [None]:
# Colab 1-cell script (institutions.id only, year=2024, top-10000 by citations per institution)
# Adds authorship fields + abstract reconstruction + verbose prints

import os, time, requests, pandas as pd

# ===================== User settings =====================
SAVE_DIR = "/content/drive/MyDrive/6. 덕성여자대학교/25학년도 2학기/비정형데이터분석/프로젝트/openalex_top10000_2024_with_authors"
YEAR = 2024
TOP_K = 10000        # at most 10000 works per institution
PER_PAGE = 200     # page size (the maximum OpenAlex recommends)
SLEEP = 0.2        # pause between requests in seconds (to avoid HTTP 429)
RETRY = 5          # number of attempts per request
TIMEOUT = 60       # request timeout in seconds

# Target universities (23, including Duksung Women's University)
TARGET_UNIVS = [
    "Duksung Women's University",
    "Seoul National University",
    "Korea University",
    "Yonsei University",
    "Korea Advanced Institute of Science and Technology",  # KAIST
    "Sogang University",
    "Sungkyunkwan University",
    "Hanyang University",
    "Pohang University of Science and Technology", #POSTECH
    "Gwangju Institute of Science and Technology", #GIST
    "Daegu Gyeongbuk Institute of Science and Technology", #DGIST
    "Ulsan National Institute of Science and Technology",   #UNIST
    "Chung-Ang University",
    "Kyung Hee University",
    "Kangwon National University",
    "Kyungpook National University",
    "Gyeongsang National University",
    "Pusan National University",
    "Chonnam National University",
    "Jeonbuk National University",
    "Jeju National University",
    "Chungbuk National University",
    "Chungnam National University"
]

# Simple aliases to fall back on when the English name does not match.
# Keys must be spelled exactly like the TARGET_UNIVS entries, because the
# aliases are looked up by that name.
# NOTE(review): short aliases such as "KU" / "KNU" may be ambiguous between
# institutions; confirm the resolved display_name in the mapping output.
ALT_NAMES = {
    "Duksung Women's University": ["덕성여자대학교", "덕성여대"],
    "Seoul National University": ["서울대학교", "서울대", "SNU"],
    "Korea University": ["고려대학교", "고려대", "KU"],
    "Yonsei University": ["연세대학교", "연세대"],
    "Korea Advanced Institute of Science and Technology": ["한국과학기술원", "KAIST"],
    "Sogang University": ["서강대학교", "서강대"],
    "Sungkyunkwan University": ["성균관대학교", "성균관대"],
    "Hanyang University": ["한양대학교", "한양대"],
    "Pohang University of Science and Technology": ["포항공과대학교", "포항공대", "POSTECH"],
    "Gwangju Institute of Science and Technology": ["광주과학기술원", "GIST"],
    "Daegu Gyeongbuk Institute of Science and Technology": ["대구경북과학기술원", "DGIST"],
    "Ulsan National Institute of Science and Technology": ["울산과학기술원", "UNIST"],
    "Chung-Ang University": ["중앙대학교", "중앙대", "Chungang University"],
    "Kyung Hee University": ["경희대학교", "경희대", "Kyunghee University"],
    "Kangwon National University": ["강원대학교", "강원대", "KNU"],
    "Kyungpook National University": ["경북대학교", "경북대"],
    "Gyeongsang National University": ["경상국립대학교", "경상대"],
    "Pusan National University": ["부산대학교", "부산대"],
    "Chonnam National University": ["전남대학교", "전남대"],
    "Jeonbuk National University": ["전북대학교", "전북대"],
    "Jeju National University": ["제주대학교", "제주대"],
    "Chungbuk National University": ["충북대학교", "충북대"],
    "Chungnam National University": ["충남대학교", "충남대"]
}

# Create the output directory up front; exist_ok=True keeps re-running the cell harmless.
os.makedirs(SAVE_DIR, exist_ok=True)

# ===================== 공통 유틸 =====================
def http_get(url, params, retry=RETRY, backoff=1.4, timeout=TIMEOUT):
    """GET ``url`` with ``params``, retrying until an HTTP 200 arrives.

    Makes up to ``retry`` attempts. After every failed attempt (a non-200
    status or a ``requests.RequestException``) the failure is printed and the
    function sleeps ``backoff ** attempt_index + 0.2`` seconds.

    Returns the ``requests`` response on success, or ``None`` once all
    attempts have failed.
    """
    for attempt in range(retry):
        try:
            resp = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException as exc:
            print(f"[HTTP:EXC] {exc} retry {attempt+1}/{retry} url={url}")
        else:
            if resp.status_code == 200:
                return resp
            print(f"[HTTP:{resp.status_code}] retry {attempt+1}/{retry} url={url} params={params}")
        # Both failure paths share the same growing delay before the next attempt.
        time.sleep((backoff ** attempt) + 0.2)
    print("[HTTP] giving up after retries.")
    return None

def reconstruct_abstract(inv_idx: dict | None) -> str | None:
    """
    OpenAlex abstract_inverted_index(토큰 -> 위치 리스트)를 실제 텍스트로 복원.
    """
    if not inv_idx:
        return None
    try:
        max_pos = max(p for positions in inv_idx.values() for p in positions)
        words = [""] * (max_pos + 1)
        for token, positions in inv_idx.items():
            for p in positions:
                words[p] = token
        text = " ".join(w for w in words if w is not None)
        return " ".join(text.split())
    except Exception as e:
        print(f"[ABSTRACT] reconstruction failed: {e}")
        return None

# ===================== (1) 기관명 → institutions.id =====================
def search_institution_top1(name: str, relax=False):
    """Return the first OpenAlex institution hit for ``name``, or ``None``.

    Queries the ``/institutions`` endpoint through its ``search`` parameter
    with a page size of 1 (``per-page``, hyphenated). By default results are
    filtered to Korean institutions of type ``education``; ``relax=True``
    drops the type filter and keeps only the country filter.
    """
    if relax:
        flt = "country_code:KR"
    else:
        flt = "country_code:KR,type:education"

    print(f"[MAP]   -> searching: '{name}' | filter='{flt}'")
    resp = http_get(
        "https://api.openalex.org/institutions",
        {"search": name, "filter": flt, "per-page": 1},
    )
    if not resp:
        print(f"[MAP]   -> no response for '{name}'")
        return None

    hits = resp.json().get("results", [])
    if not hits:
        print(f"[MAP]   -> no candidates for '{name}' (relax={relax})")
        return None

    top = hits[0]
    print(f"[MAP]   -> candidate: {top.get('display_name')} | id={top.get('id')}")
    return top

def resolve_institution_id(qname: str):
    """Resolve a university name to its short OpenAlex institution id.

    Tries ``qname`` first, then each alias listed for it in ``ALT_NAMES``.
    Every term is searched with the strict filter and, if that yields
    nothing, again with the relaxed one. Returns a dict with ``query_name``,
    ``institution_id`` (last path segment of the OpenAlex id URL) and
    ``display_name``; the latter two are ``None`` when nothing matched.
    """
    def _lookup(term):
        # Strict search first, relaxed search only as a fallback.
        return search_institution_top1(term) or search_institution_top1(term, relax=True)

    def _row(hit, short_id):
        return {"query_name": qname, "institution_id": short_id, "display_name": hit.get("display_name")}

    print(f"[MAP] Try EN: {qname}")
    hit = _lookup(qname)
    if hit and hit.get("id"):
        inst_id = hit["id"].rsplit("/", 1)[-1]
        print(f"[MAP] OK (EN): {qname} -> {hit.get('display_name')} | institutions.id={inst_id}")
        return _row(hit, inst_id)

    for alias in ALT_NAMES.get(qname, []):
        print(f"[MAP] Try ALT: {qname} -> '{alias}'")
        hit = _lookup(alias)
        if hit and hit.get("id"):
            inst_id = hit["id"].rsplit("/", 1)[-1]
            print(f"[MAP] OK (ALT): {qname} -> {hit.get('display_name')} | institutions.id={inst_id}")
            return _row(hit, inst_id)

    print(f"[MAP] FAIL: {qname} (could not resolve institutions.id)")
    return {"query_name": qname, "institution_id": None, "display_name": None}

def build_institution_map(names):
    """Resolve every name in ``names`` and persist the result.

    Calls ``resolve_institution_id`` for each name (pausing ``SLEEP`` seconds
    after each one), writes the rows to ``institutions_mapping.csv`` inside
    ``SAVE_DIR``, prints the table and returns it as a DataFrame.
    """
    print("[MAP] ==== start institution mapping ====")
    resolved = []
    for position, univ in enumerate(names, start=1):
        print(f"[MAP] [{position}/{len(names)}] resolving '{univ}'")
        resolved.append(resolve_institution_id(univ))
        time.sleep(SLEEP)

    mapping = pd.DataFrame(resolved)
    csv_path = os.path.join(SAVE_DIR, "institutions_mapping.csv")
    mapping.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"[MAP] saved mapping -> {csv_path}")
    print(mapping)
    return mapping

# ===================== (2) Fetch the top-K most-cited works for YEAR + abstracts/authors =====================
def extract_authors(authorships: list | None):
    """
    authorships 리스트에서 저자 관련 필드 평탄화.
    - author_ids: A... 꼬리만 추출
    - author_names: display_name
    - author_positions: first/middle/last
    - author_orcids: 0000-0002-...
    - author_affiliations: 저자별 기관 display_name들을 '|'로 묶고, 저자 간은 ';'로 연결
    """
    if not authorships:
        return {
            "authors_count": 0,
            "author_ids": None,
            "author_names": None,
            "author_positions": None,
            "author_orcids": None,
            "author_affiliations": None,
        }
    ids, names, poss, orcids, affs = [], [], [], [], []
    for a in authorships:
        author = a.get("author") or {}
        aid = author.get("id") or ""
        aid_tail = aid.split("/")[-1] if aid else None
        ids.append(aid_tail)
        names.append(author.get("display_name"))
        poss.append(a.get("author_position"))
        # orcid 위치가 author.orcid 또는 author.ids.orcid로 있을 수 있음
        orcid = author.get("orcid")
        if not orcid:
            orcid = (author.get("ids") or {}).get("orcid")
        orcids.append(orcid)

        insts = a.get("institutions") or []
        inst_names = [ (inst or {}).get("display_name") for inst in insts if inst ]
        affs.append("|".join([n for n in inst_names if n]))

    return {
        "authors_count": len(names),
        "author_ids": ";".join([x for x in ids if x]),
        "author_names": ";".join([x for x in names if x]),
        "author_positions": ";".join([x for x in poss if x]),
        "author_orcids": ";".join([x for x in orcids if x]),
        "author_affiliations": ";".join([x for x in affs if x]),
    }

def fetch_topk_works_for_institution(inst_id: str, qname: str, dname: str, year: int = YEAR, topk: int = TOP_K):
    """Collect up to ``topk`` works of one institution for ``year``.

    Pages through the OpenAlex ``/works`` endpoint with cursor paging, sorted
    by ``cited_by_count`` descending, and flattens each work into a dict
    (basic metadata, reconstructed abstract, flattened author columns).
    Paging stops when ``topk`` rows are collected, a request fails, a page is
    empty, or no ``next_cursor`` is returned. ``qname`` is used for log lines
    only; ``dname`` is accepted but not used in the body.

    Returns the list of row dicts (at most ``topk`` entries).
    """
    def _flatten(work):
        # Abstract first, then authors: same call order as the log output expects.
        abstract_text = reconstruct_abstract(work.get("abstract_inverted_index"))
        author_cols = extract_authors(work.get("authorships"))
        source = (work.get("primary_location") or {}).get("source") or {}
        open_access = work.get("open_access") or {}
        row = {
            "openalex_id": work.get("id"),
            "title": work.get("title"),
            "publication_year": work.get("publication_year"),
            "publication_date": work.get("publication_date"),
            "type": work.get("type"),
            "host_venue": source.get("display_name"),
            "cited_by_count": work.get("cited_by_count"),
            "is_oa": open_access.get("is_oa"),
            "oa_status": open_access.get("oa_status"),
            "language": work.get("language"),
            "doi": (work.get("ids") or {}).get("doi") or work.get("doi"),
            "abstract": abstract_text,
        }
        row.update(author_cols)
        return row

    endpoint = "https://api.openalex.org/works"
    query = {
        "filter": f"institutions.id:{inst_id},publication_year:{year}",
        "per-page": PER_PAGE,          # hyphenated parameter name
        "cursor": "*",
        "sort": "cited_by_count:desc"
    }
    collected = []
    page_no = 0
    while True:
        page_no += 1
        print(f"[GET] {qname} | page={page_no} cursor={query.get('cursor')}")
        resp = http_get(endpoint, query)
        if not resp:
            print("[GET]   -> no response (stop).")
            break

        payload = resp.json()
        works = payload.get("results", [])
        if not works:
            print("[GET]   -> empty batch (stop).")
            break

        rep_title = (works[0].get("title") or "")[:80]
        print(f"[GET]   -> fetched {len(works)} (rep: '{rep_title}...')")

        for work in works:
            collected.append(_flatten(work))
            if len(collected) >= topk:
                print(f"[GET]   -> reached top-{topk}; stopping early.")
                return collected[:topk]

        next_cursor = payload.get("meta", {}).get("next_cursor")
        print(f"[GET]   -> total accumulated: {len(collected)} | next_cursor={next_cursor}")
        if not next_cursor:
            print("[GET]   -> no next_cursor; finished.")
            break
        query["cursor"] = next_cursor
        time.sleep(SLEEP)
    return collected[:topk]

# ===================== (3) 실행(매핑 → 수집 → 저장) =====================
# Resolve every target university to its OpenAlex institution id (this also
# writes institutions_mapping.csv into SAVE_DIR) and print the full table.
inst_map = build_institution_map(TARGET_UNIVS)
print("=== Institution mapping result ===")
print(inst_map.to_string(index=False))

# One DataFrame per institution; they are concatenated after the loop.
all_dfs = []
for _, row in inst_map.iterrows():
    inst_id = row["institution_id"]
    qname   = row["query_name"]
    dname   = row["display_name"]
    # Rows whose lookup failed carry no institution id; there is nothing to fetch.
    if not inst_id:
        print(f"[SKIP] No institutions.id for '{qname}'")
        continue

    print(f"[RUN] Fetch top-{TOP_K} (year={YEAR}) for {qname} ({dname}) → institutions.id={inst_id}")
    works = fetch_topk_works_for_institution(inst_id, qname, dname, year=YEAR, topk=TOP_K)
    df_inst = pd.DataFrame(works)
    # Tag every row with the institution it was fetched for.
    df_inst["query_name"] = qname
    df_inst["display_name"] = dname
    df_inst["institution_id"] = inst_id

    # Build a file-name-friendly version of the query name ('/' and spaces -> '_').
    safe = qname.replace("/", "_").replace(" ", "_")
    path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_{safe}.csv")
    df_inst.to_csv(path, index=False, encoding="utf-8")
    print(f"[SAVE] {qname} -> {path} (n={len(df_inst)})")
    all_dfs.append(df_inst)
    # Extra pause between institutions.
    time.sleep(0.4)

# Combine all institutions into one table; fall back to an empty DataFrame
# when nothing was fetched, so the ALL file is still written.
df_all = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
all_path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_ALL.csv")
df_all.to_csv(all_path, index=False, encoding="utf-8")
print(f"[DONE] Total rows: {len(df_all)}")
print(f"[DONE] Saved to: {SAVE_DIR}")

# Preview (last expression of the cell, so the notebook renders it)
df_all.head()


[MAP] ==== start institution mapping ====
[MAP] [1/23] resolving 'Duksung Women's University'
[MAP] Try EN: Duksung Women's University
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Duksung Women's University' (relax=False)
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR'
[MAP]   -> candidate: Duksung Women's University | id=https://openalex.org/I65832422
[MAP] OK (EN): Duksung Women's University -> Duksung Women's University | institutions.id=I65832422
[MAP] [2/23] resolving 'Seoul National University'
[MAP] Try EN: Seoul National University
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Seoul National University' (relax=False)
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR'
[MAP]   -> candidate: Seoul National University | id=https://openalex.org/I139264467
[MAP] OK (EN): Seoul 

Unnamed: 0,openalex_id,title,publication_year,publication_date,type,host_venue,cited_by_count,is_oa,oa_status,language,...,abstract,authors_count,author_ids,author_names,author_positions,author_orcids,author_affiliations,query_name,display_name,institution_id
0,https://openalex.org/W4392882830,Attitudes towards artificial intelligence at w...,2024,2024-03-15,article,Journal of Occupational and Organizational Psy...,41,True,green,en,...,Abstract Research suggests that understanding ...,3,A5100409607;A5065878227;A5048198213,Jiyoung Park;Sang Eun Woo;JeongJin Kim,first;middle;last,https://orcid.org/0000-0003-4397-9645;https://...,Duksung Women's University;Purdue University W...,Duksung Women's University,Duksung Women's University,I65832422
1,https://openalex.org/W4391750334,Viscometry-based prediction of structural prop...,2024,2024-02-12,article,Food Hydrocolloids,27,False,closed,en,...,,5,A5061802294;A5102799508;A5012862211;A502605833...,Hyun Woo Choi;Minji Choi;Chaerin Ryoo;Jungwoo ...,first;middle;middle;middle;last,https://orcid.org/0000-0002-1365-1721;https://...,Seoul National University;Seoul National Unive...,Duksung Women's University,Duksung Women's University,I65832422
2,https://openalex.org/W4392627262,ANXA2 (annexin A2) is crucial to ATG7-mediated...,2024,2024-01-30,article,Autophagy,26,False,closed,en,...,Triple-negative breast cancer (TNBC) is associ...,18,A5109463778;A5076925914;A5071943179;A508273845...,Minsoo Koh;Hyesol Lim;Hao Jin;Minjoo Kim;Yeji ...,first;middle;middle;middle;middle;middle;middl...,https://orcid.org/0000-0003-1262-1675;https://...,Duksung Women's University|Yonsei University;D...,Duksung Women's University,Duksung Women's University,I65832422
3,https://openalex.org/W4393992300,High-set curdlan emulsion gel fortified by tra...,2024,2024-04-05,article,Food Hydrocolloids,22,False,closed,en,...,,5,A5102799508;A5061802294;A5046803667;A502605833...,Minji Choi;Hyun Woo Choi;Myeongsu Jo;Jungwoo H...,first;middle;middle;middle;last,https://orcid.org/0000-0001-6533-3760;https://...,Seoul National University;Seoul National Unive...,Duksung Women's University,Duksung Women's University,I65832422
4,https://openalex.org/W4402824362,<scp>EPR</scp> spectroscopy: A versatile tool ...,2024,2024-09-25,article,Bulletin of the Korean Chemical Society,21,True,hybrid,en,...,"Abstract Paramagnetic molecules, such as main‐...",3,A5108334853;A5100380881;A5079028591,Minyoung Ju;Jin Kim;Jeongcheol Shin,first;middle;last,https://orcid.org/0000-0003-4006-8897;https://...,Duksung Women's University;Sunchon National Un...,Duksung Women's University,Duksung Women's University,I65832422


In [None]:
# Colab 1-cell script (institutions.id only, year=2023, top-10000 by citations per institution)
# Adds authorship fields + abstract reconstruction + verbose prints

import os, time, requests, pandas as pd

# ===================== User settings =====================
# Output folder on the mounted Google Drive; every CSV written by this cell is joined onto it.
SAVE_DIR = "/content/drive/MyDrive/6. 덕성여자대학교/25학년도 2학기/비정형데이터분석/프로젝트/openalex_top10000_2023_with_authors"
YEAR = 2023  # publication_year used in the works filter
TOP_K = 10000        # keep at most 10000 works per institution
PER_PAGE = 200     # page size per request (noted as the OpenAlex recommended maximum)
SLEEP = 0.2        # pause between requests, in seconds (to avoid HTTP 429)
RETRY = 5          # number of attempts per request
TIMEOUT = 60       # request timeout (seconds)

# Target institutions: 23 universities (Duksung Women's University included).
# Each name is used as the search text for the institution lookup and as the
# key into ALT_NAMES, so spelling must match the ALT_NAMES keys exactly.
TARGET_UNIVS = [
    "Duksung Women's University",
    "Seoul National University",
    "Korea University",
    "Yonsei University",
    "Korea Advanced Institute of Science and Technology",  # KAIST
    "Sogang University",
    "Sungkyunkwan University",
    "Hanyang University",
    "Pohang University of Science and Technology", #POSTECH
    "Gwangju Institute of Science and Technology", #GIST
    "Daegu Gyeongbuk Institute of Science and Technology", #DGIST
    "Ulsan National Institute of Science and Technology",   #UNIST
    "Chung-Ang University",
    "Kyung Hee University",
    "Kangwon National University",
    "Kyungpook National University",
    "Gyeongsang National University",
    "Pusan National University",
    "Chonnam National University",
    "Jeonbuk National University",
    "Jeju National University",
    "Chungbuk National University",
    "Chungnam National University"
]

# Fallback aliases tried when the English name cannot be resolved.
# Keys must be spelled exactly like the entries of TARGET_UNIVS, because the
# lookup is ALT_NAMES.get(<target name>, []); a misspelled key is silently
# ignored and that institution gets no fallback.
ALT_NAMES = {
    "Duksung Women's University": ["덕성여자대학교", "덕성여대"],
    "Seoul National University": ["서울대학교", "서울대", "SNU"],
    "Korea University": ["고려대학교", "고려대", "KU"],
    "Yonsei University": ["연세대학교", "연세대"],
    "Korea Advanced Institute of Science and Technology": ["한국과학기술원", "KAIST"],
    "Sogang University": ["서강대학교", "서강대"],
    "Sungkyunkwan University": ["성균관대학교", "성균관대"],
    "Hanyang University": ["한양대학교", "한양대"],
    "Pohang University of Science and Technology": ["포항공과대학교", "포항공대", "POSTECH"],
    "Gwangju Institute of Science and Technology": ["광주과학기술원", "GIST"],
    "Daegu Gyeongbuk Institute of Science and Technology": ["대구경북과학기술원", "DGIST"],
    "Ulsan National Institute of Science and Technology": ["울산과학기술원", "UNIST"],
    "Chung-Ang University": ["중앙대학교", "중앙대", "Chungang University"],
    "Kyung Hee University": ["경희대학교", "경희대", "Kyunghee University"],
    "Kangwon National University": ["강원대학교", "강원대", "KNU"],
    "Kyungpook National University": ["경북대학교", "경북대"],
    "Gyeongsang National University": ["경상국립대학교", "경상대"],
    "Pusan National University": ["부산대학교", "부산대"],
    "Chonnam National University": ["전남대학교", "전남대"],
    "Jeonbuk National University": ["전북대학교", "전북대"],
    "Jeju National University": ["제주대학교", "제주대"],
    "Chungbuk National University": ["충북대학교", "충북대"],
    "Chungnam National University": ["충남대학교", "충남대"],
}

# Make sure the output folder exists before any CSV is written; exist_ok=True
# keeps a re-run of this cell from raising when the folder is already there.
os.makedirs(SAVE_DIR, exist_ok=True)

# ===================== 공통 유틸 =====================
def http_get(url, params, retry=RETRY, backoff=1.4, timeout=TIMEOUT):
    """
    GET ``url`` with ``params``, retrying up to ``retry`` times.

    Returns the response as soon as one comes back with status 200. Any other
    status, or a ``requests.RequestException``, is printed and followed by a
    sleep of ``backoff ** attempt_index + 0.2`` seconds before the next try.
    Returns None once every attempt has failed.
    """
    for attempt in range(1, retry + 1):
        try:
            resp = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException as e:
            print(f"[HTTP:EXC] {e} retry {attempt}/{retry} url={url}")
        else:
            if resp.status_code == 200:
                return resp
            print(f"[HTTP:{resp.status_code}] retry {attempt}/{retry} url={url} params={params}")
        # Exponential wait; the exponent is the zero-based attempt index.
        time.sleep((backoff ** (attempt - 1)) + 0.2)
    print("[HTTP] giving up after retries.")
    return None

def reconstruct_abstract(inv_idx: dict | None) -> str | None:
    """
    OpenAlex abstract_inverted_index(토큰 -> 위치 리스트)를 실제 텍스트로 복원.
    """
    if not inv_idx:
        return None
    try:
        max_pos = max(p for positions in inv_idx.values() for p in positions)
        words = [""] * (max_pos + 1)
        for token, positions in inv_idx.items():
            for p in positions:
                words[p] = token
        text = " ".join(w for w in words if w is not None)
        return " ".join(text.split())
    except Exception as e:
        print(f"[ABSTRACT] reconstruction failed: {e}")
        return None

# ===================== (1) 기관명 → institutions.id =====================
def search_institution_top1(name: str, relax=False):
    """
    Return the first OpenAlex institution hit for ``name``, or None.

    Uses the ``search`` parameter of the institutions endpoint with
    ``per-page`` (hyphenated) set to 1. By default the query is restricted to
    country KR and type ``education``; with ``relax=True`` only the country
    restriction is kept.

    Returns None when the request got no response or the result list is empty.
    """
    flt = "country_code:KR" if relax else "country_code:KR,type:education"
    query = {"search": name, "filter": flt, "per-page": 1}
    print(f"[MAP]   -> searching: '{name}' | filter='{flt}'")

    resp = http_get("https://api.openalex.org/institutions", query)
    if not resp:
        print(f"[MAP]   -> no response for '{name}'")
        return None

    hits = resp.json().get("results", [])
    if not hits:
        print(f"[MAP]   -> no candidates for '{name}' (relax={relax})")
        return None

    best = hits[0]
    print(f"[MAP]   -> candidate: {best.get('display_name')} | id={best.get('id')}")
    return best

def resolve_institution_id(qname: str):
    """
    Resolve an institution name to its short OpenAlex id (last URL segment).

    The English name is tried first, then each alias from ALT_NAMES in order.
    Every term is searched with the strict filter and, if that yields nothing,
    again with the relaxed filter. The first hit carrying an ``id`` wins.

    Returns a dict with ``query_name``, ``institution_id`` and
    ``display_name``; the last two are None when nothing could be resolved.
    """
    def _lookup(term):
        # Strict search first; relaxed search only when strict found nothing.
        return search_institution_top1(term) or search_institution_top1(term, relax=True)

    print(f"[MAP] Try EN: {qname}")
    label = "EN"
    hit = _lookup(qname)

    if not (hit and hit.get("id")):
        label = "ALT"
        for alt in ALT_NAMES.get(qname, []):
            print(f"[MAP] Try ALT: {qname} -> '{alt}'")
            hit = _lookup(alt)
            if hit and hit.get("id"):
                break
        else:
            # No alias produced a usable hit (or there were no aliases).
            print(f"[MAP] FAIL: {qname} (could not resolve institutions.id)")
            return {"query_name": qname, "institution_id": None, "display_name": None}

    inst_id = hit["id"].split("/")[-1]
    print(f"[MAP] OK ({label}): {qname} -> {hit.get('display_name')} | institutions.id={inst_id}")
    return {"query_name": qname, "institution_id": inst_id, "display_name": hit.get("display_name")}

def build_institution_map(names):
    """
    Resolve every name in ``names`` and return the results as a DataFrame.

    One row per name, with the columns produced by resolve_institution_id.
    The table is also written to ``institutions_mapping.csv`` under SAVE_DIR
    and printed. Sleeps SLEEP seconds after each lookup.
    """
    print("[MAP] ==== start institution mapping ====")
    resolved = []
    position = 0
    for uni in names:
        position += 1
        print(f"[MAP] [{position}/{len(names)}] resolving '{uni}'")
        resolved.append(resolve_institution_id(uni))
        time.sleep(SLEEP)

    mapping = pd.DataFrame(resolved)
    csv_path = os.path.join(SAVE_DIR, "institutions_mapping.csv")
    mapping.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"[MAP] saved mapping -> {csv_path}")
    print(mapping)
    return mapping

# ===================== (2) Collect up to TOP_K works for YEAR + abstracts/authors =====================
def extract_authors(authorships: list | None):
    """
    authorships 리스트에서 저자 관련 필드 평탄화.
    - author_ids: A... 꼬리만 추출
    - author_names: display_name
    - author_positions: first/middle/last
    - author_orcids: 0000-0002-...
    - author_affiliations: 저자별 기관 display_name들을 '|'로 묶고, 저자 간은 ';'로 연결
    """
    if not authorships:
        return {
            "authors_count": 0,
            "author_ids": None,
            "author_names": None,
            "author_positions": None,
            "author_orcids": None,
            "author_affiliations": None,
        }
    ids, names, poss, orcids, affs = [], [], [], [], []
    for a in authorships:
        author = a.get("author") or {}
        aid = author.get("id") or ""
        aid_tail = aid.split("/")[-1] if aid else None
        ids.append(aid_tail)
        names.append(author.get("display_name"))
        poss.append(a.get("author_position"))
        # orcid 위치가 author.orcid 또는 author.ids.orcid로 있을 수 있음
        orcid = author.get("orcid")
        if not orcid:
            orcid = (author.get("ids") or {}).get("orcid")
        orcids.append(orcid)

        insts = a.get("institutions") or []
        inst_names = [ (inst or {}).get("display_name") for inst in insts if inst ]
        affs.append("|".join([n for n in inst_names if n]))

    return {
        "authors_count": len(names),
        "author_ids": ";".join([x for x in ids if x]),
        "author_names": ";".join([x for x in names if x]),
        "author_positions": ";".join([x for x in poss if x]),
        "author_orcids": ";".join([x for x in orcids if x]),
        "author_affiliations": ";".join([x for x in affs if x]),
    }

def _work_to_row(w):
    """Flatten one OpenAlex work record into the flat dict stored per CSV row."""
    abstract_text = reconstruct_abstract(w.get("abstract_inverted_index"))
    auth = extract_authors(w.get("authorships"))

    source = (w.get("primary_location") or {}).get("source") or {}
    open_access = w.get("open_access") or {}
    row = {
        "openalex_id": w.get("id"),
        "title": w.get("title"),
        "publication_year": w.get("publication_year"),
        "publication_date": w.get("publication_date"),
        "type": w.get("type"),
        "host_venue": source.get("display_name"),
        "cited_by_count": w.get("cited_by_count"),
        "is_oa": open_access.get("is_oa"),
        "oa_status": open_access.get("oa_status"),
        "language": w.get("language"),
        "doi": (w.get("ids") or {}).get("doi") or w.get("doi"),
        "abstract": abstract_text,
    }
    # Author columns go last, after the work-level columns.
    row.update(auth)
    return row


def fetch_topk_works_for_institution(inst_id: str, qname: str, dname: str, year: int = YEAR, topk: int = TOP_K):
    """
    Collect up to ``topk`` works of one institution for one publication year.

    Pages through the OpenAlex works endpoint with cursor paging, PER_PAGE
    results per request, sorted by ``cited_by_count`` descending. Each work is
    flattened to a dict that includes the reconstructed abstract and the
    author columns.

    ``qname`` is used only in progress messages; ``dname`` is accepted but not
    used here. Paging stops when ``topk`` rows are collected, when a request
    gets no response, when a page is empty, or when no next cursor is
    returned; whatever was collected up to that point is returned.
    """
    query = {
        "filter": f"institutions.id:{inst_id},publication_year:{year}",
        "per-page": PER_PAGE,          # hyphenated parameter name
        "cursor": "*",
        "sort": "cited_by_count:desc"
    }
    collected = []
    page_no = 0
    while True:
        page_no += 1
        print(f"[GET] {qname} | page={page_no} cursor={query.get('cursor')}")
        resp = http_get("https://api.openalex.org/works", query)
        if not resp:
            print("[GET]   -> no response (stop).")
            break

        payload = resp.json()
        works = payload.get("results", [])
        if not works:
            print("[GET]   -> empty batch (stop).")
            break

        # Show the first title of the page (truncated) as a progress hint.
        sample_title = (works[0].get("title") or "")[:80]
        print(f"[GET]   -> fetched {len(works)} (rep: '{sample_title}...')")

        for w in works:
            collected.append(_work_to_row(w))
            if len(collected) >= topk:
                print(f"[GET]   -> reached top-{topk}; stopping early.")
                return collected[:topk]

        cursor = payload.get("meta", {}).get("next_cursor")
        print(f"[GET]   -> total accumulated: {len(collected)} | next_cursor={cursor}")
        if not cursor:
            print("[GET]   -> no next_cursor; finished.")
            break
        query["cursor"] = cursor
        time.sleep(SLEEP)
    return collected[:topk]

# ===================== (3) Run: mapping -> collection -> save =====================
# Step 1: resolve every target name to an OpenAlex institution id.
inst_map = build_institution_map(TARGET_UNIVS)
print("=== Institution mapping result ===")
print(inst_map.to_string(index=False))

# Step 2: fetch and save one CSV per institution, keeping the frames for the merge.
all_dfs = []
for _, row in inst_map.iterrows():
    inst_id, qname, dname = row["institution_id"], row["query_name"], row["display_name"]
    if not inst_id:
        print(f"[SKIP] No institutions.id for '{qname}'")
        continue

    print(f"[RUN] Fetch top-{TOP_K} (year={YEAR}) for {qname} ({dname}) → institutions.id={inst_id}")
    works = fetch_topk_works_for_institution(inst_id, qname, dname, year=YEAR, topk=TOP_K)
    # Tag every row with the institution it was collected for.
    df_inst = pd.DataFrame(works).assign(
        query_name=qname,
        display_name=dname,
        institution_id=inst_id,
    )

    # File-name-safe form of the query name: '/' and ' ' both become '_'.
    safe = qname.translate(str.maketrans("/ ", "__"))
    path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_{safe}.csv")
    df_inst.to_csv(path, index=False, encoding="utf-8")
    print(f"[SAVE] {qname} -> {path} (n={len(df_inst)})")
    all_dfs.append(df_inst)
    time.sleep(0.4)

# Step 3: merge all institutions into one table and save it.
if all_dfs:
    df_all = pd.concat(all_dfs, ignore_index=True)
else:
    df_all = pd.DataFrame()
all_path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_ALL.csv")
df_all.to_csv(all_path, index=False, encoding="utf-8")
print(f"[DONE] Total rows: {len(df_all)}")
print(f"[DONE] Saved to: {SAVE_DIR}")

# Preview
df_all.head()


[MAP] ==== start institution mapping ====
[MAP] [1/23] resolving 'Duksung Women's University'
[MAP] Try EN: Duksung Women's University
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Duksung Women's University' (relax=False)
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR'
[MAP]   -> candidate: Duksung Women's University | id=https://openalex.org/I65832422
[MAP] OK (EN): Duksung Women's University -> Duksung Women's University | institutions.id=I65832422
[MAP] [2/23] resolving 'Seoul National University'
[MAP] Try EN: Seoul National University
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Seoul National University' (relax=False)
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR'
[MAP]   -> candidate: Seoul National University | id=https://openalex.org/I139264467
[MAP] OK (EN): Seoul 

Unnamed: 0,openalex_id,title,publication_year,publication_date,type,host_venue,cited_by_count,is_oa,oa_status,language,...,abstract,authors_count,author_ids,author_names,author_positions,author_orcids,author_affiliations,query_name,display_name,institution_id
0,https://openalex.org/W4318478298,Modulating Glycolysis to Improve Cancer Therapy,2023,2023-01-30,review,International Journal of Molecular Sciences,230,True,gold,en,...,Cancer cells undergo metabolic reprogramming a...,4,A5047462797;A5038662502;A5002243126;A5010982531,Chaithanya Chelakkot;Vipin Shankar Chelakkot;Y...,first;middle;middle;last,https://orcid.org/0000-0002-4548-5510;https://...,Seoul National University;Cleveland Clinic Ler...,Duksung Women's University,Duksung Women's University,I65832422
1,https://openalex.org/W4386697415,Humans as Creativity Gatekeepers: Are We Biase...,2023,2023-09-14,article,Journal of Business and Psychology,92,True,hybrid,en,...,Abstract With artificial intelligence (AI) inc...,3,A5085030247;A5100409607;A5061711155,Federico Magni;Jiyoung Park;Melody Manchi Chao,first;middle;last,https://orcid.org/0000-0002-3797-8155;https://...,ETH Zurich;Duksung Women's University;Hong Kon...,Duksung Women's University,Duksung Women's University,I65832422
2,https://openalex.org/W4323295433,Antibody–drug conjugates and bispecific antibo...,2023,2023-03-01,review,Archives of Pharmacal Research,44,False,closed,en,...,,3,A5030158170;A5090640494;A5083115378,Yeji Hong;Su-Min Nam;Aree Moon,first;middle;last,https://orcid.org/0000-0001-7182-9789,Duksung Women's University;Duksung Women's Uni...,Duksung Women's University,Duksung Women's University,I65832422
3,https://openalex.org/W4383904229,Mimicking animal adipose tissue using a hybrid...,2023,2023-07-11,article,Food Hydrocolloids,40,False,closed,en,...,,5,A5102799508;A5061802294;A5033946983;A502605833...,Minji Choi;Hyun Woo Choi;Haeun Kim;Jungwoo Hah...,first;middle;middle;middle;last,https://orcid.org/0000-0001-6533-3760;https://...,Seoul National University;Seoul National Unive...,Duksung Women's University,Duksung Women's University,I65832422
4,https://openalex.org/W4385728455,C-Reactive Protein Signaling Pathways in Tumor...,2023,2023-08-11,review,Biomolecules & Therapeutics,36,True,hybrid,en,...,Many cancers arise from sites of chronic infla...,3,A5074700953;A5100335623;A5083115378,Eun‐Sook Kim;Sun Young Kim;Aree Moon,first;middle;last,https://orcid.org/0009-0003-6205-1794;https://...,Duksung Women's University;Duksung Women's Uni...,Duksung Women's University,Duksung Women's University,I65832422


In [None]:
# Colab 1-cell script (institutions.id only; publication year and per-institution cap come from YEAR / TOP_K below)
# Adds authorship fields + abstract reconstruction + verbose prints

import os, time, requests, pandas as pd

# ===================== User settings =====================
# Output folder on the mounted Google Drive; every CSV written by this cell is joined onto it.
SAVE_DIR = "/content/drive/MyDrive/6. 덕성여자대학교/25학년도 2학기/비정형데이터분석/프로젝트/openalex_top10000_2022_with_authors"
YEAR = 2022  # publication_year used in the works filter
TOP_K = 10000        # keep at most 10000 works per institution
PER_PAGE = 200     # page size per request (noted as the OpenAlex recommended maximum)
SLEEP = 0.2        # pause between requests, in seconds (to avoid HTTP 429)
RETRY = 5          # number of attempts per request
TIMEOUT = 60       # request timeout (seconds)

# Target institutions: 23 universities (Duksung Women's University included).
# Each name is used as the search text for the institution lookup and as the
# key into ALT_NAMES, so spelling must match the ALT_NAMES keys exactly.
TARGET_UNIVS = [
    "Duksung Women's University",
    "Seoul National University",
    "Korea University",
    "Yonsei University",
    "Korea Advanced Institute of Science and Technology",  # KAIST
    "Sogang University",
    "Sungkyunkwan University",
    "Hanyang University",
    "Pohang University of Science and Technology", #POSTECH
    "Gwangju Institute of Science and Technology", #GIST
    "Daegu Gyeongbuk Institute of Science and Technology", #DGIST
    "Ulsan National Institute of Science and Technology",   #UNIST
    "Chung-Ang University",
    "Kyung Hee University",
    "Kangwon National University",
    "Kyungpook National University",
    "Gyeongsang National University",
    "Pusan National University",
    "Chonnam National University",
    "Jeonbuk National University",
    "Jeju National University",
    "Chungbuk National University",
    "Chungnam National University"
]

# Fallback aliases tried when the English name cannot be resolved.
# Keys must be spelled exactly like the entries of TARGET_UNIVS, because the
# lookup is ALT_NAMES.get(<target name>, []); a misspelled key is silently
# ignored and that institution gets no fallback.
ALT_NAMES = {
    "Duksung Women's University": ["덕성여자대학교", "덕성여대"],
    "Seoul National University": ["서울대학교", "서울대", "SNU"],
    "Korea University": ["고려대학교", "고려대", "KU"],
    "Yonsei University": ["연세대학교", "연세대"],
    "Korea Advanced Institute of Science and Technology": ["한국과학기술원", "KAIST"],
    "Sogang University": ["서강대학교", "서강대"],
    "Sungkyunkwan University": ["성균관대학교", "성균관대"],
    "Hanyang University": ["한양대학교", "한양대"],
    "Pohang University of Science and Technology": ["포항공과대학교", "포항공대", "POSTECH"],
    "Gwangju Institute of Science and Technology": ["광주과학기술원", "GIST"],
    "Daegu Gyeongbuk Institute of Science and Technology": ["대구경북과학기술원", "DGIST"],
    "Ulsan National Institute of Science and Technology": ["울산과학기술원", "UNIST"],
    "Chung-Ang University": ["중앙대학교", "중앙대", "Chungang University"],
    "Kyung Hee University": ["경희대학교", "경희대", "Kyunghee University"],
    "Kangwon National University": ["강원대학교", "강원대", "KNU"],
    "Kyungpook National University": ["경북대학교", "경북대"],
    "Gyeongsang National University": ["경상국립대학교", "경상대"],
    "Pusan National University": ["부산대학교", "부산대"],
    "Chonnam National University": ["전남대학교", "전남대"],
    "Jeonbuk National University": ["전북대학교", "전북대"],
    "Jeju National University": ["제주대학교", "제주대"],
    "Chungbuk National University": ["충북대학교", "충북대"],
    "Chungnam National University": ["충남대학교", "충남대"],
}

# Make sure the output folder exists before any CSV is written; exist_ok=True
# keeps a re-run of this cell from raising when the folder is already there.
os.makedirs(SAVE_DIR, exist_ok=True)

# ===================== 공통 유틸 =====================
def http_get(url, params, retry=RETRY, backoff=1.4, timeout=TIMEOUT):
    """
    GET ``url`` with ``params``, retrying up to ``retry`` times.

    Returns the response as soon as one comes back with status 200. Any other
    status, or a ``requests.RequestException``, is printed and followed by a
    sleep of ``backoff ** attempt_index + 0.2`` seconds before the next try.
    Returns None once every attempt has failed.
    """
    for attempt in range(1, retry + 1):
        try:
            resp = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException as e:
            print(f"[HTTP:EXC] {e} retry {attempt}/{retry} url={url}")
        else:
            if resp.status_code == 200:
                return resp
            print(f"[HTTP:{resp.status_code}] retry {attempt}/{retry} url={url} params={params}")
        # Exponential wait; the exponent is the zero-based attempt index.
        time.sleep((backoff ** (attempt - 1)) + 0.2)
    print("[HTTP] giving up after retries.")
    return None

def reconstruct_abstract(inv_idx: dict | None) -> str | None:
    """
    OpenAlex abstract_inverted_index(토큰 -> 위치 리스트)를 실제 텍스트로 복원.
    """
    if not inv_idx:
        return None
    try:
        max_pos = max(p for positions in inv_idx.values() for p in positions)
        words = [""] * (max_pos + 1)
        for token, positions in inv_idx.items():
            for p in positions:
                words[p] = token
        text = " ".join(w for w in words if w is not None)
        return " ".join(text.split())
    except Exception as e:
        print(f"[ABSTRACT] reconstruction failed: {e}")
        return None

# ===================== (1) 기관명 → institutions.id =====================
def search_institution_top1(name: str, relax=False):
    """
    Return the first OpenAlex institution hit for ``name``, or None.

    Uses the ``search`` parameter of the institutions endpoint with
    ``per-page`` (hyphenated) set to 1. By default the query is restricted to
    country KR and type ``education``; with ``relax=True`` only the country
    restriction is kept.

    Returns None when the request got no response or the result list is empty.
    """
    flt = "country_code:KR" if relax else "country_code:KR,type:education"
    query = {"search": name, "filter": flt, "per-page": 1}
    print(f"[MAP]   -> searching: '{name}' | filter='{flt}'")

    resp = http_get("https://api.openalex.org/institutions", query)
    if not resp:
        print(f"[MAP]   -> no response for '{name}'")
        return None

    hits = resp.json().get("results", [])
    if not hits:
        print(f"[MAP]   -> no candidates for '{name}' (relax={relax})")
        return None

    best = hits[0]
    print(f"[MAP]   -> candidate: {best.get('display_name')} | id={best.get('id')}")
    return best

def resolve_institution_id(qname: str):
    """
    Resolve an institution name to its short OpenAlex id (last URL segment).

    The English name is tried first, then each alias from ALT_NAMES in order.
    Every term is searched with the strict filter and, if that yields nothing,
    again with the relaxed filter. The first hit carrying an ``id`` wins.

    Returns a dict with ``query_name``, ``institution_id`` and
    ``display_name``; the last two are None when nothing could be resolved.
    """
    def _lookup(term):
        # Strict search first; relaxed search only when strict found nothing.
        return search_institution_top1(term) or search_institution_top1(term, relax=True)

    print(f"[MAP] Try EN: {qname}")
    label = "EN"
    hit = _lookup(qname)

    if not (hit and hit.get("id")):
        label = "ALT"
        for alt in ALT_NAMES.get(qname, []):
            print(f"[MAP] Try ALT: {qname} -> '{alt}'")
            hit = _lookup(alt)
            if hit and hit.get("id"):
                break
        else:
            # No alias produced a usable hit (or there were no aliases).
            print(f"[MAP] FAIL: {qname} (could not resolve institutions.id)")
            return {"query_name": qname, "institution_id": None, "display_name": None}

    inst_id = hit["id"].split("/")[-1]
    print(f"[MAP] OK ({label}): {qname} -> {hit.get('display_name')} | institutions.id={inst_id}")
    return {"query_name": qname, "institution_id": inst_id, "display_name": hit.get("display_name")}

def build_institution_map(names):
    """
    Resolve every name in ``names`` and return the results as a DataFrame.

    One row per name, with the columns produced by resolve_institution_id.
    The table is also written to ``institutions_mapping.csv`` under SAVE_DIR
    and printed. Sleeps SLEEP seconds after each lookup.
    """
    print("[MAP] ==== start institution mapping ====")
    resolved = []
    position = 0
    for uni in names:
        position += 1
        print(f"[MAP] [{position}/{len(names)}] resolving '{uni}'")
        resolved.append(resolve_institution_id(uni))
        time.sleep(SLEEP)

    mapping = pd.DataFrame(resolved)
    csv_path = os.path.join(SAVE_DIR, "institutions_mapping.csv")
    mapping.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"[MAP] saved mapping -> {csv_path}")
    print(mapping)
    return mapping

# ===================== (2) Collect up to TOP_K works for YEAR + abstracts/authors =====================
def extract_authors(authorships: list | None):
    """
    authorships 리스트에서 저자 관련 필드 평탄화.
    - author_ids: A... 꼬리만 추출
    - author_names: display_name
    - author_positions: first/middle/last
    - author_orcids: 0000-0002-...
    - author_affiliations: 저자별 기관 display_name들을 '|'로 묶고, 저자 간은 ';'로 연결
    """
    if not authorships:
        return {
            "authors_count": 0,
            "author_ids": None,
            "author_names": None,
            "author_positions": None,
            "author_orcids": None,
            "author_affiliations": None,
        }
    ids, names, poss, orcids, affs = [], [], [], [], []
    for a in authorships:
        author = a.get("author") or {}
        aid = author.get("id") or ""
        aid_tail = aid.split("/")[-1] if aid else None
        ids.append(aid_tail)
        names.append(author.get("display_name"))
        poss.append(a.get("author_position"))
        # orcid 위치가 author.orcid 또는 author.ids.orcid로 있을 수 있음
        orcid = author.get("orcid")
        if not orcid:
            orcid = (author.get("ids") or {}).get("orcid")
        orcids.append(orcid)

        insts = a.get("institutions") or []
        inst_names = [ (inst or {}).get("display_name") for inst in insts if inst ]
        affs.append("|".join([n for n in inst_names if n]))

    return {
        "authors_count": len(names),
        "author_ids": ";".join([x for x in ids if x]),
        "author_names": ";".join([x for x in names if x]),
        "author_positions": ";".join([x for x in poss if x]),
        "author_orcids": ";".join([x for x in orcids if x]),
        "author_affiliations": ";".join([x for x in affs if x]),
    }

def _work_to_row(w):
    """Flatten one OpenAlex work record into the flat dict stored per CSV row."""
    abstract_text = reconstruct_abstract(w.get("abstract_inverted_index"))
    auth = extract_authors(w.get("authorships"))

    source = (w.get("primary_location") or {}).get("source") or {}
    open_access = w.get("open_access") or {}
    row = {
        "openalex_id": w.get("id"),
        "title": w.get("title"),
        "publication_year": w.get("publication_year"),
        "publication_date": w.get("publication_date"),
        "type": w.get("type"),
        "host_venue": source.get("display_name"),
        "cited_by_count": w.get("cited_by_count"),
        "is_oa": open_access.get("is_oa"),
        "oa_status": open_access.get("oa_status"),
        "language": w.get("language"),
        "doi": (w.get("ids") or {}).get("doi") or w.get("doi"),
        "abstract": abstract_text,
    }
    # Author columns go last, after the work-level columns.
    row.update(auth)
    return row


def fetch_topk_works_for_institution(inst_id: str, qname: str, dname: str, year: int = YEAR, topk: int = TOP_K):
    """
    Collect up to ``topk`` works of one institution for one publication year.

    Pages through the OpenAlex works endpoint with cursor paging, PER_PAGE
    results per request, sorted by ``cited_by_count`` descending. Each work is
    flattened to a dict that includes the reconstructed abstract and the
    author columns.

    ``qname`` is used only in progress messages; ``dname`` is accepted but not
    used here. Paging stops when ``topk`` rows are collected, when a request
    gets no response, when a page is empty, or when no next cursor is
    returned; whatever was collected up to that point is returned.
    """
    query = {
        "filter": f"institutions.id:{inst_id},publication_year:{year}",
        "per-page": PER_PAGE,          # hyphenated parameter name
        "cursor": "*",
        "sort": "cited_by_count:desc"
    }
    collected = []
    page_no = 0
    while True:
        page_no += 1
        print(f"[GET] {qname} | page={page_no} cursor={query.get('cursor')}")
        resp = http_get("https://api.openalex.org/works", query)
        if not resp:
            print("[GET]   -> no response (stop).")
            break

        payload = resp.json()
        works = payload.get("results", [])
        if not works:
            print("[GET]   -> empty batch (stop).")
            break

        # Show the first title of the page (truncated) as a progress hint.
        sample_title = (works[0].get("title") or "")[:80]
        print(f"[GET]   -> fetched {len(works)} (rep: '{sample_title}...')")

        for w in works:
            collected.append(_work_to_row(w))
            if len(collected) >= topk:
                print(f"[GET]   -> reached top-{topk}; stopping early.")
                return collected[:topk]

        cursor = payload.get("meta", {}).get("next_cursor")
        print(f"[GET]   -> total accumulated: {len(collected)} | next_cursor={cursor}")
        if not cursor:
            print("[GET]   -> no next_cursor; finished.")
            break
        query["cursor"] = cursor
        time.sleep(SLEEP)
    return collected[:topk]

# ===================== (3) Run: mapping -> collection -> save =====================
# Step 1: resolve every target name to an OpenAlex institution id.
inst_map = build_institution_map(TARGET_UNIVS)
print("=== Institution mapping result ===")
print(inst_map.to_string(index=False))

# Step 2: fetch and save one CSV per institution, keeping the frames for the merge.
all_dfs = []
for _, row in inst_map.iterrows():
    inst_id, qname, dname = row["institution_id"], row["query_name"], row["display_name"]
    if not inst_id:
        print(f"[SKIP] No institutions.id for '{qname}'")
        continue

    print(f"[RUN] Fetch top-{TOP_K} (year={YEAR}) for {qname} ({dname}) → institutions.id={inst_id}")
    works = fetch_topk_works_for_institution(inst_id, qname, dname, year=YEAR, topk=TOP_K)
    # Tag every row with the institution it was collected for.
    df_inst = pd.DataFrame(works).assign(
        query_name=qname,
        display_name=dname,
        institution_id=inst_id,
    )

    # File-name-safe form of the query name: '/' and ' ' both become '_'.
    safe = qname.translate(str.maketrans("/ ", "__"))
    path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_{safe}.csv")
    df_inst.to_csv(path, index=False, encoding="utf-8")
    print(f"[SAVE] {qname} -> {path} (n={len(df_inst)})")
    all_dfs.append(df_inst)
    time.sleep(0.4)

# Step 3: merge all institutions into one table and save it.
if all_dfs:
    df_all = pd.concat(all_dfs, ignore_index=True)
else:
    df_all = pd.DataFrame()
all_path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_ALL.csv")
df_all.to_csv(all_path, index=False, encoding="utf-8")
print(f"[DONE] Total rows: {len(df_all)}")
print(f"[DONE] Saved to: {SAVE_DIR}")

# Preview
df_all.head()


[MAP] ==== start institution mapping ====
[MAP] [1/23] resolving 'Duksung Women's University'
[MAP] Try EN: Duksung Women's University
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Duksung Women's University' (relax=False)
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR'
[MAP]   -> candidate: Duksung Women's University | id=https://openalex.org/I65832422
[MAP] OK (EN): Duksung Women's University -> Duksung Women's University | institutions.id=I65832422
[MAP] [2/23] resolving 'Seoul National University'
[MAP] Try EN: Seoul National University
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Seoul National University' (relax=False)
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR'
[MAP]   -> candidate: Seoul National University | id=https://openalex.org/I139264467
[MAP] OK (EN): Seoul 

Unnamed: 0,openalex_id,title,publication_year,publication_date,type,host_venue,cited_by_count,is_oa,oa_status,language,...,abstract,authors_count,author_ids,author_names,author_positions,author_orcids,author_affiliations,query_name,display_name,institution_id
0,https://openalex.org/W4292548987,"A Comparative Study of Hesperetin, Hesperidin ...",2022,2022-08-20,article,Antioxidants,140,True,gold,en,...,"The antioxidant, anti-inflammatory and antibac...",3,A5066636038;A5050952182;A5113608328,Sung-Sook Choi;Sunhyung Lee;Kyung‐Ae Lee,first;middle;last,https://orcid.org/0000-0003-2814-6346,Duksung Women's University;Anyang University,Duksung Women's University,Duksung Women's University,I65832422
1,https://openalex.org/W4206550328,Who Likes Artificial Intelligence? Personality...,2022,2022-01-02,article,The Journal of Psychology,131,False,closed,en,...,We examined how individuals' personality relat...,2,A5100409607;A5065878227,Jiyoung Park;Sang Eun Woo,first;last,https://orcid.org/0000-0003-4397-9645;https://...,Duksung Women's University;Purdue University W...,Duksung Women's University,Duksung Women's University,I65832422
2,https://openalex.org/W4313335837,Immersive interactive technologies and virtual...,2022,2022-12-31,article,Telematics and Informatics,116,False,closed,en,...,,4,A5100360487;A5100416219;A5101894678;A5102740803,Jung-Hwan Kim;Minjeong Kim;Minjung Park;Jungmi...,first;middle;middle;last,https://orcid.org/0000-0001-9350-1231;https://...,University of South Carolina;Indiana Universit...,Duksung Women's University,Duksung Women's University,I65832422
3,https://openalex.org/W4303685506,Molecular Basis of Non-β-Lactam Antibiotics Re...,2022,2022-10-08,review,Antibiotics,41,True,gold,en,...,Methicillin-resistant Staphylococcus aureus (M...,3,A5025786310;A5003249495;A5029539402,Harshad Lade;Hwang‐Soo Joo;Jae‐Seok Kim,first;middle;last,https://orcid.org/0000-0002-9809-2921;https://...,Kangdong Sacred Heart Hospital|Hallym Universi...,Duksung Women's University,Duksung Women's University,I65832422
4,https://openalex.org/W4210440897,Gelatin Coating for the Improvement of Stabili...,2022,2022-02-03,article,Molecules,34,True,gold,en,...,Most therapeutic agents have limitations owing...,5,A5067093226;A5053519788;A5043728037;A510274277...,Gantumur Battogtokh;Yechan Joo;S. ABUZAR;Heeju...,first;middle;middle;middle;last,https://orcid.org/0000-0002-8215-9992;https://...,Yonsei University;Yonsei University;Yonsei Uni...,Duksung Women's University,Duksung Women's University,I65832422


In [None]:
# Colab 1-cell script (institutions.id only; publication year and per-institution cap come from YEAR / TOP_K below)
# Adds authorship fields + abstract reconstruction + verbose prints

import os, time, requests, pandas as pd

# ===================== User settings =====================
# Folder that receives every CSV written by this cell.
SAVE_DIR = "/content/drive/MyDrive/6. 덕성여자대학교/25학년도 2학기/비정형데이터분석/프로젝트/openalex_top10000_2021_with_authors"
# Publication year to collect.
YEAR = 2021
# Maximum number of works kept per institution.
TOP_K = 10_000
# Page size for OpenAlex requests (original note: OpenAlex recommended maximum).
PER_PAGE = 200
# Pause between requests, in seconds (to avoid HTTP 429).
SLEEP = 0.2
# Retry count for a single HTTP request.
RETRY = 5
# Per-request timeout, in seconds.
TIMEOUT = 60

# Target institutions: 23 universities (Duksung Women's University included).
TARGET_UNIVS = [
    "Duksung Women's University",
    "Seoul National University",
    "Korea University",
    "Yonsei University",
    "Korea Advanced Institute of Science and Technology",  # KAIST
    "Sogang University",
    "Sungkyunkwan University",
    "Hanyang University",
    "Pohang University of Science and Technology",  # POSTECH
    "Gwangju Institute of Science and Technology",  # GIST
    "Daegu Gyeongbuk Institute of Science and Technology",  # DGIST
    "Ulsan National Institute of Science and Technology",  # UNIST
    "Chung-Ang University",
    "Kyung Hee University",
    "Kangwon National University",
    "Kyungpook National University",
    "Gyeongsang National University",
    "Pusan National University",
    "Chonnam National University",
    "Jeonbuk National University",
    "Jeju National University",
    "Chungbuk National University",
    "Chungnam National University",
]

# Fallback aliases tried when the English name cannot be resolved.
# Keys are looked up with ALT_NAMES.get(<TARGET_UNIVS entry>), so each key
# must match its TARGET_UNIVS spelling exactly.
# NOTE(review): short acronyms ("SNU", "KU", "KNU") go through a top-1 search
# and may match a different institution -- confirm before relying on them.
ALT_NAMES = {
    "Duksung Women's University": ["덕성여자대학교", "덕성여대"],
    "Seoul National University": ["서울대학교", "서울대", "SNU"],
    "Korea University": ["고려대학교", "고려대", "KU"],
    "Yonsei University": ["연세대학교", "연세대"],
    "Korea Advanced Institute of Science and Technology": ["한국과학기술원", "KAIST"],
    "Sogang University": ["서강대학교", "서강대"],
    "Sungkyunkwan University": ["성균관대학교", "성균관대"],
    # Key was misspelled "Hanyang Unviersity", which made these aliases unreachable.
    "Hanyang University": ["한양대학교", "한양대"],
    "Pohang University of Science and Technology": ["포항공과대학교", "포항공대", "POSTECH"],
    "Gwangju Institute of Science and Technology": ["광주과학기술원", "GIST"],
    "Daegu Gyeongbuk Institute of Science and Technology": ["대구경북과학기술원", "DGIST"],
    "Ulsan National Institute of Science and Technology": ["울산과학기술원", "UNIST"],
    "Chung-Ang University": ["중앙대학교", "중앙대", "Chungang University"],
    "Kyung Hee University": ["경희대학교", "경희대", "Kyunghee University"],
    "Kangwon National University": ["강원대학교", "강원대", "KNU"],
    "Kyungpook National University": ["경북대학교", "경북대"],
    "Gyeongsang National University": ["경상국립대학교", "경상대"],
    "Pusan National University": ["부산대학교", "부산대"],
    "Chonnam National University": ["전남대학교", "전남대"],
    "Jeonbuk National University": ["전북대학교", "전북대"],
    "Jeju National University": ["제주대학교", "제주대"],
    "Chungbuk National University": ["충북대학교", "충북대"],
    "Chungnam National University": ["충남대학교", "충남대"],
}

# Create the output folder up front; exist_ok=True makes a re-run harmless.
os.makedirs(SAVE_DIR, exist_ok=True)

# ===================== 공통 유틸 =====================
def http_get(url, params, retry=RETRY, backoff=1.4, timeout=TIMEOUT):
    """GET ``url`` with ``params``, retrying on failure.

    Makes up to ``retry`` attempts. An attempt fails when the response status
    is not 200 or when ``requests`` raises a ``RequestException``. Between
    attempts the function sleeps ``backoff ** i + 0.2`` seconds, where ``i``
    is the zero-based index of the attempt that just failed.

    Returns:
        The ``requests.Response`` with status 200, or ``None`` once every
        attempt has failed.
    """
    for i in range(retry):
        try:
            r = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException as e:
            print(f"[HTTP:EXC] {e} retry {i+1}/{retry} url={url}")
        else:
            if r.status_code == 200:
                return r
            print(f"[HTTP:{r.status_code}] retry {i+1}/{retry} url={url} params={params}")
        # Back off only when another attempt follows; sleeping after the
        # final failure would merely delay the "giving up" result.
        if i < retry - 1:
            time.sleep((backoff ** i) + 0.2)
    print("[HTTP] giving up after retries.")
    return None

def reconstruct_abstract(inv_idx: dict | None) -> str | None:
    """
    OpenAlex abstract_inverted_index(토큰 -> 위치 리스트)를 실제 텍스트로 복원.
    """
    if not inv_idx:
        return None
    try:
        max_pos = max(p for positions in inv_idx.values() for p in positions)
        words = [""] * (max_pos + 1)
        for token, positions in inv_idx.items():
            for p in positions:
                words[p] = token
        text = " ".join(w for w in words if w is not None)
        return " ".join(text.split())
    except Exception as e:
        print(f"[ABSTRACT] reconstruction failed: {e}")
        return None

# ===================== (1) 기관명 → institutions.id =====================
def search_institution_top1(name: str, relax=False):
    """Return the first OpenAlex institution found for ``name``, or ``None``.

    Queries the ``/institutions`` endpoint through the ``search`` parameter
    with ``per-page`` (hyphenated spelling) set to 1. By default results are
    filtered to ``country_code:KR,type:education``; with ``relax=True`` the
    type filter is dropped and only the country filter remains.
    """
    flt = "country_code:KR" if relax else "country_code:KR,type:education"
    print(f"[MAP]   -> searching: '{name}' | filter='{flt}'")
    resp = http_get(
        "https://api.openalex.org/institutions",
        {"search": name, "filter": flt, "per-page": 1},
    )
    if not resp:
        print(f"[MAP]   -> no response for '{name}'")
        return None
    results = resp.json().get("results", [])
    if not results:
        print(f"[MAP]   -> no candidates for '{name}' (relax={relax})")
        return None
    top = results[0]
    print(f"[MAP]   -> candidate: {top.get('display_name')} | id={top.get('id')}")
    return top

def resolve_institution_id(qname: str):
    """Map a university name to its OpenAlex institution id.

    The English name is tried first, then each alias listed for it in
    ``ALT_NAMES``, in order. Every term is searched with the strict filter
    and, if that yields nothing, again with the relaxed one.

    Returns:
        A dict with ``query_name``, ``institution_id`` (last path segment of
        the OpenAlex id URL) and ``display_name``; the last two are ``None``
        when no term resolved.
    """
    def _lookup(term):
        # Strict search first, relaxed search as the fallback.
        return search_institution_top1(term) or search_institution_top1(term, relax=True)

    attempts = [("EN", qname)] + [("ALT", alias) for alias in ALT_NAMES.get(qname, [])]
    for tag, term in attempts:
        if tag == "EN":
            print(f"[MAP] Try EN: {qname}")
        else:
            print(f"[MAP] Try ALT: {qname} -> '{term}'")
        hit = _lookup(term)
        if hit and hit.get("id"):
            inst_id = hit["id"].split("/")[-1]
            print(f"[MAP] OK ({tag}): {qname} -> {hit.get('display_name')} | institutions.id={inst_id}")
            return {
                "query_name": qname,
                "institution_id": inst_id,
                "display_name": hit.get("display_name"),
            }

    print(f"[MAP] FAIL: {qname} (could not resolve institutions.id)")
    return {"query_name": qname, "institution_id": None, "display_name": None}

def build_institution_map(names):
    """Resolve every name in ``names`` and persist the mapping.

    Each name goes through ``resolve_institution_id``, with a ``SLEEP``-second
    pause after every lookup. The resulting table is written to
    ``institutions_mapping.csv`` under ``SAVE_DIR``, printed, and returned as
    a DataFrame.
    """
    print("[MAP] ==== start institution mapping ====")
    total = len(names)
    resolved = []
    for pos, univ in enumerate(names, start=1):
        print(f"[MAP] [{pos}/{total}] resolving '{univ}'")
        resolved.append(resolve_institution_id(univ))
        time.sleep(SLEEP)

    mapping = pd.DataFrame(resolved)
    csv_path = os.path.join(SAVE_DIR, "institutions_mapping.csv")
    mapping.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"[MAP] saved mapping -> {csv_path}")
    print(mapping)
    return mapping

# ===================== (2) Collect the top-TOP_K works for YEAR + abstracts/authors =====================
def extract_authors(authorships: list | None):
    """
    authorships 리스트에서 저자 관련 필드 평탄화.
    - author_ids: A... 꼬리만 추출
    - author_names: display_name
    - author_positions: first/middle/last
    - author_orcids: 0000-0002-...
    - author_affiliations: 저자별 기관 display_name들을 '|'로 묶고, 저자 간은 ';'로 연결
    """
    if not authorships:
        return {
            "authors_count": 0,
            "author_ids": None,
            "author_names": None,
            "author_positions": None,
            "author_orcids": None,
            "author_affiliations": None,
        }
    ids, names, poss, orcids, affs = [], [], [], [], []
    for a in authorships:
        author = a.get("author") or {}
        aid = author.get("id") or ""
        aid_tail = aid.split("/")[-1] if aid else None
        ids.append(aid_tail)
        names.append(author.get("display_name"))
        poss.append(a.get("author_position"))
        # orcid 위치가 author.orcid 또는 author.ids.orcid로 있을 수 있음
        orcid = author.get("orcid")
        if not orcid:
            orcid = (author.get("ids") or {}).get("orcid")
        orcids.append(orcid)

        insts = a.get("institutions") or []
        inst_names = [ (inst or {}).get("display_name") for inst in insts if inst ]
        affs.append("|".join([n for n in inst_names if n]))

    return {
        "authors_count": len(names),
        "author_ids": ";".join([x for x in ids if x]),
        "author_names": ";".join([x for x in names if x]),
        "author_positions": ";".join([x for x in poss if x]),
        "author_orcids": ";".join([x for x in orcids if x]),
        "author_affiliations": ";".join([x for x in affs if x]),
    }

def fetch_topk_works_for_institution(inst_id: str, qname: str, dname: str, year: int = YEAR, topk: int = TOP_K):
    """Collect up to ``topk`` works of one institution for one year.

    Pages through the OpenAlex ``/works`` endpoint, filtered by
    ``institutions.id`` and ``publication_year`` and sorted by
    ``cited_by_count`` descending, using cursor pagination. Paging stops when
    ``topk`` rows are collected, a page comes back empty, no ``next_cursor``
    is returned, or ``http_get`` gives up; in that last case the rows
    gathered so far are returned, so the result may be incomplete.

    ``qname`` is used only in log lines; ``dname`` is accepted but not used.

    Returns:
        A list of flat dicts (one per work) with bibliographic fields, the
        reconstructed abstract and the flattened author fields.
    """
    endpoint = "https://api.openalex.org/works"
    query = {
        "filter": f"institutions.id:{inst_id},publication_year:{year}",
        "per-page": PER_PAGE,          # hyphenated spelling
        "cursor": "*",
        "sort": "cited_by_count:desc"
    }
    rows = []
    page_no = 1
    while True:
        print(f"[GET] {qname} | page={page_no} cursor={query.get('cursor')}")
        page_no += 1
        resp = http_get(endpoint, query)
        if not resp:
            print("[GET]   -> no response (stop).")
            break

        payload = resp.json()
        works = payload.get("results", [])
        if not works:
            print("[GET]   -> empty batch (stop).")
            break

        first_title = (works[0].get("title") or "")[:80]
        print(f"[GET]   -> fetched {len(works)} (rep: '{first_title}...')")

        for work in works:
            open_access = work.get("open_access") or {}
            source = (work.get("primary_location") or {}).get("source") or {}
            row = {
                "openalex_id": work.get("id"),
                "title": work.get("title"),
                "publication_year": work.get("publication_year"),
                "publication_date": work.get("publication_date"),
                "type": work.get("type"),
                "host_venue": source.get("display_name"),
                "cited_by_count": work.get("cited_by_count"),
                "is_oa": open_access.get("is_oa"),
                "oa_status": open_access.get("oa_status"),
                "language": work.get("language"),
                "doi": (work.get("ids") or {}).get("doi") or work.get("doi"),
                "abstract": reconstruct_abstract(work.get("abstract_inverted_index")),
            }
            # Author fields follow the abstract column, in the helper's key order.
            row.update(extract_authors(work.get("authorships")))
            rows.append(row)
            if len(rows) >= topk:
                print(f"[GET]   -> reached top-{topk}; stopping early.")
                return rows[:topk]

        next_cursor = payload.get("meta", {}).get("next_cursor")
        print(f"[GET]   -> total accumulated: {len(rows)} | next_cursor={next_cursor}")
        if not next_cursor:
            print("[GET]   -> no next_cursor; finished.")
            break
        query["cursor"] = next_cursor
        time.sleep(SLEEP)
    return rows[:topk]

# ===================== (3) Run (mapping -> collection -> saving) =====================
# Resolve every target university to an OpenAlex institutions.id.
inst_map = build_institution_map(TARGET_UNIVS)
print("=== Institution mapping result ===")
print(inst_map.to_string(index=False))

# One DataFrame per institution, concatenated after the loop.
all_dfs = []
for _, row in inst_map.iterrows():
    inst_id = row["institution_id"]
    qname   = row["query_name"]
    dname   = row["display_name"]
    # Institutions whose id could not be resolved are skipped.
    if not inst_id:
        print(f"[SKIP] No institutions.id for '{qname}'")
        continue

    print(f"[RUN] Fetch top-{TOP_K} (year={YEAR}) for {qname} ({dname}) → institutions.id={inst_id}")
    works = fetch_topk_works_for_institution(inst_id, qname, dname, year=YEAR, topk=TOP_K)
    df_inst = pd.DataFrame(works)
    # Tag every row with the institution it was fetched for.
    df_inst["query_name"] = qname
    df_inst["display_name"] = dname
    df_inst["institution_id"] = inst_id

    # File-name-friendly variant of the query name ('/' and spaces become '_').
    safe = qname.replace("/", "_").replace(" ", "_")
    path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_{safe}.csv")
    df_inst.to_csv(path, index=False, encoding="utf-8")
    print(f"[SAVE] {qname} -> {path} (n={len(df_inst)})")
    all_dfs.append(df_inst)
    # Short fixed pause before the next institution.
    time.sleep(0.4)

# Combined file for all institutions; an empty frame is written if nothing was fetched.
df_all = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
all_path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_ALL.csv")
df_all.to_csv(all_path, index=False, encoding="utf-8")
print(f"[DONE] Total rows: {len(df_all)}")
print(f"[DONE] Saved to: {SAVE_DIR}")

# Preview
df_all.head()


[MAP] ==== start institution mapping ====
[MAP] [1/23] resolving 'Duksung Women's University'
[MAP] Try EN: Duksung Women's University
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Duksung Women's University' (relax=False)
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR'
[MAP]   -> candidate: Duksung Women's University | id=https://openalex.org/I65832422
[MAP] OK (EN): Duksung Women's University -> Duksung Women's University | institutions.id=I65832422
[MAP] [2/23] resolving 'Seoul National University'
[MAP] Try EN: Seoul National University
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Seoul National University' (relax=False)
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR'
[MAP]   -> candidate: Seoul National University | id=https://openalex.org/I139264467
[MAP] OK (EN): Seoul 

Unnamed: 0,openalex_id,title,publication_year,publication_date,type,host_venue,cited_by_count,is_oa,oa_status,language,...,abstract,authors_count,author_ids,author_names,author_positions,author_orcids,author_affiliations,query_name,display_name,institution_id
0,https://openalex.org/W3177014436,How interactivity and vividness influence cons...,2021,2021-06-20,article,Journal of Research in Interactive Marketing,192,False,closed,en,...,Purpose The purpose of the study is to investi...,4,A5100360487;A5100416219;A5101894678;A5102740803,Jung-Hwan Kim;Minjeong Kim;Minjung Park;Jungmi...,first;middle;middle;last,https://orcid.org/0000-0001-9350-1231;https://...,University of South Carolina;Indiana Universit...,Duksung Women's University,Duksung Women's University,I65832422
1,https://openalex.org/W3208569491,A Comparative Study of Rutin and Rutin Glycosi...,2021,2021-10-27,article,Antioxidants,124,True,gold,en,...,The effects of rutin and rutin glycoside with ...,3,A5066636038;A5038576846;A5113608328,Sung-Sook Choi;Hye-Ryung Park;Kyung‐Ae Lee,first;middle;last,,Duksung Women's University;Kyung Hee Universit...,Duksung Women's University,Duksung Women's University,I65832422
2,https://openalex.org/W3139356424,Fibronectin regulates anoikis resistance via c...,2021,2021-03-23,article,Cancer Letters,107,False,closed,en,...,,14,A5109349079;A5111921858;A5100765070;A508531445...,Hyeong‐jun Han;Jee Young Sung;Su-Hyeon Kim;Un-...,first;middle;middle;middle;middle;middle;middl...,https://orcid.org/0000-0001-9144-689X;https://...,National Cancer Center;National Cancer Center;...,Duksung Women's University,Duksung Women's University,I65832422
3,https://openalex.org/W3185567511,Recent progress and challenges in microbial po...,2021,2021-07-21,review,Bioresource Technology,74,False,closed,en,...,,5,A5101448050;A5017278958;A5069333187;A510199774...,Jiye Lee;Hyun June Park;Myounghoon Moon;Jin-Su...,first;middle;middle;middle;last,https://orcid.org/0000-0001-6723-4010;https://...,Korea Institute of Energy Research;Duksung Wom...,Duksung Women's University,Duksung Women's University,I65832422
4,https://openalex.org/W3201227628,Mixed or Segregated: Toward Efficient and Stab...,2021,2021-09-08,review,ACS Omega,62,True,gold,en,...,Convenient modulation of bandgap for the mixed...,4,A5052391760;A5112877857;A5102973598;A5030026591,Hyejin Choe;Dohyun Jeon;Seon Joo Lee;Junsang Cho,first;middle;middle;last,https://orcid.org/0000-0002-5307-4471;https://...,Duksung Women's University;Korea Research Inst...,Duksung Women's University,Duksung Women's University,I65832422


In [None]:
# Colab 1-cell script (institutions.id only; year and top-K by citations are set via YEAR / TOP_K below)
# Adds authorship fields + abstract reconstruction + verbose prints

import os, time, requests, pandas as pd

# ===================== User settings =====================
# Folder that receives every CSV written by this cell.
SAVE_DIR = "/content/drive/MyDrive/6. 덕성여자대학교/25학년도 2학기/비정형데이터분석/프로젝트/openalex_top10000_2020_with_authors"
# Publication year to collect.
YEAR = 2020
# Maximum number of works kept per institution.
TOP_K = 10_000
# Page size for OpenAlex requests (original note: OpenAlex recommended maximum).
PER_PAGE = 200
# Pause between requests, in seconds (to avoid HTTP 429).
SLEEP = 0.2
# Retry count for a single HTTP request.
RETRY = 5
# Per-request timeout, in seconds.
TIMEOUT = 60

# Target institutions: 23 universities (Duksung Women's University included).
TARGET_UNIVS = [
    "Duksung Women's University",
    "Seoul National University",
    "Korea University",
    "Yonsei University",
    "Korea Advanced Institute of Science and Technology",  # KAIST
    "Sogang University",
    "Sungkyunkwan University",
    "Hanyang University",
    "Pohang University of Science and Technology",  # POSTECH
    "Gwangju Institute of Science and Technology",  # GIST
    "Daegu Gyeongbuk Institute of Science and Technology",  # DGIST
    "Ulsan National Institute of Science and Technology",  # UNIST
    "Chung-Ang University",
    "Kyung Hee University",
    "Kangwon National University",
    "Kyungpook National University",
    "Gyeongsang National University",
    "Pusan National University",
    "Chonnam National University",
    "Jeonbuk National University",
    "Jeju National University",
    "Chungbuk National University",
    "Chungnam National University",
]

# Fallback aliases tried when the English name cannot be resolved.
# Keys are looked up with ALT_NAMES.get(<TARGET_UNIVS entry>), so each key
# must match its TARGET_UNIVS spelling exactly.
# NOTE(review): short acronyms ("SNU", "KU", "KNU") go through a top-1 search
# and may match a different institution -- confirm before relying on them.
ALT_NAMES = {
    "Duksung Women's University": ["덕성여자대학교", "덕성여대"],
    "Seoul National University": ["서울대학교", "서울대", "SNU"],
    "Korea University": ["고려대학교", "고려대", "KU"],
    "Yonsei University": ["연세대학교", "연세대"],
    "Korea Advanced Institute of Science and Technology": ["한국과학기술원", "KAIST"],
    "Sogang University": ["서강대학교", "서강대"],
    "Sungkyunkwan University": ["성균관대학교", "성균관대"],
    # Key was misspelled "Hanyang Unviersity", which made these aliases unreachable.
    "Hanyang University": ["한양대학교", "한양대"],
    "Pohang University of Science and Technology": ["포항공과대학교", "포항공대", "POSTECH"],
    "Gwangju Institute of Science and Technology": ["광주과학기술원", "GIST"],
    "Daegu Gyeongbuk Institute of Science and Technology": ["대구경북과학기술원", "DGIST"],
    "Ulsan National Institute of Science and Technology": ["울산과학기술원", "UNIST"],
    "Chung-Ang University": ["중앙대학교", "중앙대", "Chungang University"],
    "Kyung Hee University": ["경희대학교", "경희대", "Kyunghee University"],
    "Kangwon National University": ["강원대학교", "강원대", "KNU"],
    "Kyungpook National University": ["경북대학교", "경북대"],
    "Gyeongsang National University": ["경상국립대학교", "경상대"],
    "Pusan National University": ["부산대학교", "부산대"],
    "Chonnam National University": ["전남대학교", "전남대"],
    "Jeonbuk National University": ["전북대학교", "전북대"],
    "Jeju National University": ["제주대학교", "제주대"],
    "Chungbuk National University": ["충북대학교", "충북대"],
    "Chungnam National University": ["충남대학교", "충남대"],
}

# Create the output folder up front; exist_ok=True makes a re-run harmless.
os.makedirs(SAVE_DIR, exist_ok=True)

# ===================== 공통 유틸 =====================
def http_get(url, params, retry=RETRY, backoff=1.4, timeout=TIMEOUT):
    """GET ``url`` with ``params``, retrying on failure.

    Makes up to ``retry`` attempts. An attempt fails when the response status
    is not 200 or when ``requests`` raises a ``RequestException``. Between
    attempts the function sleeps ``backoff ** i + 0.2`` seconds, where ``i``
    is the zero-based index of the attempt that just failed.

    Returns:
        The ``requests.Response`` with status 200, or ``None`` once every
        attempt has failed.
    """
    for i in range(retry):
        try:
            r = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException as e:
            print(f"[HTTP:EXC] {e} retry {i+1}/{retry} url={url}")
        else:
            if r.status_code == 200:
                return r
            print(f"[HTTP:{r.status_code}] retry {i+1}/{retry} url={url} params={params}")
        # Back off only when another attempt follows; sleeping after the
        # final failure would merely delay the "giving up" result.
        if i < retry - 1:
            time.sleep((backoff ** i) + 0.2)
    print("[HTTP] giving up after retries.")
    return None

def reconstruct_abstract(inv_idx: dict | None) -> str | None:
    """
    OpenAlex abstract_inverted_index(토큰 -> 위치 리스트)를 실제 텍스트로 복원.
    """
    if not inv_idx:
        return None
    try:
        max_pos = max(p for positions in inv_idx.values() for p in positions)
        words = [""] * (max_pos + 1)
        for token, positions in inv_idx.items():
            for p in positions:
                words[p] = token
        text = " ".join(w for w in words if w is not None)
        return " ".join(text.split())
    except Exception as e:
        print(f"[ABSTRACT] reconstruction failed: {e}")
        return None

# ===================== (1) 기관명 → institutions.id =====================
def search_institution_top1(name: str, relax=False):
    """Return the first OpenAlex institution found for ``name``, or ``None``.

    Queries the ``/institutions`` endpoint through the ``search`` parameter
    with ``per-page`` (hyphenated spelling) set to 1. By default results are
    filtered to ``country_code:KR,type:education``; with ``relax=True`` the
    type filter is dropped and only the country filter remains.
    """
    flt = "country_code:KR" if relax else "country_code:KR,type:education"
    print(f"[MAP]   -> searching: '{name}' | filter='{flt}'")
    resp = http_get(
        "https://api.openalex.org/institutions",
        {"search": name, "filter": flt, "per-page": 1},
    )
    if not resp:
        print(f"[MAP]   -> no response for '{name}'")
        return None
    results = resp.json().get("results", [])
    if not results:
        print(f"[MAP]   -> no candidates for '{name}' (relax={relax})")
        return None
    top = results[0]
    print(f"[MAP]   -> candidate: {top.get('display_name')} | id={top.get('id')}")
    return top

def resolve_institution_id(qname: str):
    """Map a university name to its OpenAlex institution id.

    The English name is tried first, then each alias listed for it in
    ``ALT_NAMES``, in order. Every term is searched with the strict filter
    and, if that yields nothing, again with the relaxed one.

    Returns:
        A dict with ``query_name``, ``institution_id`` (last path segment of
        the OpenAlex id URL) and ``display_name``; the last two are ``None``
        when no term resolved.
    """
    def _lookup(term):
        # Strict search first, relaxed search as the fallback.
        return search_institution_top1(term) or search_institution_top1(term, relax=True)

    attempts = [("EN", qname)] + [("ALT", alias) for alias in ALT_NAMES.get(qname, [])]
    for tag, term in attempts:
        if tag == "EN":
            print(f"[MAP] Try EN: {qname}")
        else:
            print(f"[MAP] Try ALT: {qname} -> '{term}'")
        hit = _lookup(term)
        if hit and hit.get("id"):
            inst_id = hit["id"].split("/")[-1]
            print(f"[MAP] OK ({tag}): {qname} -> {hit.get('display_name')} | institutions.id={inst_id}")
            return {
                "query_name": qname,
                "institution_id": inst_id,
                "display_name": hit.get("display_name"),
            }

    print(f"[MAP] FAIL: {qname} (could not resolve institutions.id)")
    return {"query_name": qname, "institution_id": None, "display_name": None}

def build_institution_map(names):
    """Resolve every name in ``names`` and persist the mapping.

    Each name goes through ``resolve_institution_id``, with a ``SLEEP``-second
    pause after every lookup. The resulting table is written to
    ``institutions_mapping.csv`` under ``SAVE_DIR``, printed, and returned as
    a DataFrame.
    """
    print("[MAP] ==== start institution mapping ====")
    total = len(names)
    resolved = []
    for pos, univ in enumerate(names, start=1):
        print(f"[MAP] [{pos}/{total}] resolving '{univ}'")
        resolved.append(resolve_institution_id(univ))
        time.sleep(SLEEP)

    mapping = pd.DataFrame(resolved)
    csv_path = os.path.join(SAVE_DIR, "institutions_mapping.csv")
    mapping.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"[MAP] saved mapping -> {csv_path}")
    print(mapping)
    return mapping

# ===================== (2) Collect the top-TOP_K works for YEAR + abstracts/authors =====================
def extract_authors(authorships: list | None):
    """
    authorships 리스트에서 저자 관련 필드 평탄화.
    - author_ids: A... 꼬리만 추출
    - author_names: display_name
    - author_positions: first/middle/last
    - author_orcids: 0000-0002-...
    - author_affiliations: 저자별 기관 display_name들을 '|'로 묶고, 저자 간은 ';'로 연결
    """
    if not authorships:
        return {
            "authors_count": 0,
            "author_ids": None,
            "author_names": None,
            "author_positions": None,
            "author_orcids": None,
            "author_affiliations": None,
        }
    ids, names, poss, orcids, affs = [], [], [], [], []
    for a in authorships:
        author = a.get("author") or {}
        aid = author.get("id") or ""
        aid_tail = aid.split("/")[-1] if aid else None
        ids.append(aid_tail)
        names.append(author.get("display_name"))
        poss.append(a.get("author_position"))
        # orcid 위치가 author.orcid 또는 author.ids.orcid로 있을 수 있음
        orcid = author.get("orcid")
        if not orcid:
            orcid = (author.get("ids") or {}).get("orcid")
        orcids.append(orcid)

        insts = a.get("institutions") or []
        inst_names = [ (inst or {}).get("display_name") for inst in insts if inst ]
        affs.append("|".join([n for n in inst_names if n]))

    return {
        "authors_count": len(names),
        "author_ids": ";".join([x for x in ids if x]),
        "author_names": ";".join([x for x in names if x]),
        "author_positions": ";".join([x for x in poss if x]),
        "author_orcids": ";".join([x for x in orcids if x]),
        "author_affiliations": ";".join([x for x in affs if x]),
    }

def fetch_topk_works_for_institution(inst_id: str, qname: str, dname: str, year: int = YEAR, topk: int = TOP_K):
    """Collect up to ``topk`` works of one institution for one year.

    Pages through the OpenAlex ``/works`` endpoint, filtered by
    ``institutions.id`` and ``publication_year`` and sorted by
    ``cited_by_count`` descending, using cursor pagination. Paging stops when
    ``topk`` rows are collected, a page comes back empty, no ``next_cursor``
    is returned, or ``http_get`` gives up; in that last case the rows
    gathered so far are returned, so the result may be incomplete.

    ``qname`` is used only in log lines; ``dname`` is accepted but not used.

    Returns:
        A list of flat dicts (one per work) with bibliographic fields, the
        reconstructed abstract and the flattened author fields.
    """
    endpoint = "https://api.openalex.org/works"
    query = {
        "filter": f"institutions.id:{inst_id},publication_year:{year}",
        "per-page": PER_PAGE,          # hyphenated spelling
        "cursor": "*",
        "sort": "cited_by_count:desc"
    }
    rows = []
    page_no = 1
    while True:
        print(f"[GET] {qname} | page={page_no} cursor={query.get('cursor')}")
        page_no += 1
        resp = http_get(endpoint, query)
        if not resp:
            print("[GET]   -> no response (stop).")
            break

        payload = resp.json()
        works = payload.get("results", [])
        if not works:
            print("[GET]   -> empty batch (stop).")
            break

        first_title = (works[0].get("title") or "")[:80]
        print(f"[GET]   -> fetched {len(works)} (rep: '{first_title}...')")

        for work in works:
            open_access = work.get("open_access") or {}
            source = (work.get("primary_location") or {}).get("source") or {}
            row = {
                "openalex_id": work.get("id"),
                "title": work.get("title"),
                "publication_year": work.get("publication_year"),
                "publication_date": work.get("publication_date"),
                "type": work.get("type"),
                "host_venue": source.get("display_name"),
                "cited_by_count": work.get("cited_by_count"),
                "is_oa": open_access.get("is_oa"),
                "oa_status": open_access.get("oa_status"),
                "language": work.get("language"),
                "doi": (work.get("ids") or {}).get("doi") or work.get("doi"),
                "abstract": reconstruct_abstract(work.get("abstract_inverted_index")),
            }
            # Author fields follow the abstract column, in the helper's key order.
            row.update(extract_authors(work.get("authorships")))
            rows.append(row)
            if len(rows) >= topk:
                print(f"[GET]   -> reached top-{topk}; stopping early.")
                return rows[:topk]

        next_cursor = payload.get("meta", {}).get("next_cursor")
        print(f"[GET]   -> total accumulated: {len(rows)} | next_cursor={next_cursor}")
        if not next_cursor:
            print("[GET]   -> no next_cursor; finished.")
            break
        query["cursor"] = next_cursor
        time.sleep(SLEEP)
    return rows[:topk]

# ===================== (3) Run (mapping -> collection -> saving) =====================
# Resolve every target university to an OpenAlex institutions.id.
inst_map = build_institution_map(TARGET_UNIVS)
print("=== Institution mapping result ===")
print(inst_map.to_string(index=False))

# One DataFrame per institution, concatenated after the loop.
all_dfs = []
for _, row in inst_map.iterrows():
    inst_id = row["institution_id"]
    qname   = row["query_name"]
    dname   = row["display_name"]
    # Institutions whose id could not be resolved are skipped.
    if not inst_id:
        print(f"[SKIP] No institutions.id for '{qname}'")
        continue

    print(f"[RUN] Fetch top-{TOP_K} (year={YEAR}) for {qname} ({dname}) → institutions.id={inst_id}")
    works = fetch_topk_works_for_institution(inst_id, qname, dname, year=YEAR, topk=TOP_K)
    df_inst = pd.DataFrame(works)
    # Tag every row with the institution it was fetched for.
    df_inst["query_name"] = qname
    df_inst["display_name"] = dname
    df_inst["institution_id"] = inst_id

    # File-name-friendly variant of the query name ('/' and spaces become '_').
    safe = qname.replace("/", "_").replace(" ", "_")
    path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_{safe}.csv")
    df_inst.to_csv(path, index=False, encoding="utf-8")
    print(f"[SAVE] {qname} -> {path} (n={len(df_inst)})")
    all_dfs.append(df_inst)
    # Short fixed pause before the next institution.
    time.sleep(0.4)

# Combined file for all institutions; an empty frame is written if nothing was fetched.
df_all = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
all_path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_ALL.csv")
df_all.to_csv(all_path, index=False, encoding="utf-8")
print(f"[DONE] Total rows: {len(df_all)}")
print(f"[DONE] Saved to: {SAVE_DIR}")

# Preview
df_all.head()


[MAP] ==== start institution mapping ====
[MAP] [1/23] resolving 'Duksung Women's University'
[MAP] Try EN: Duksung Women's University
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Duksung Women's University' (relax=False)
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR'
[MAP]   -> candidate: Duksung Women's University | id=https://openalex.org/I65832422
[MAP] OK (EN): Duksung Women's University -> Duksung Women's University | institutions.id=I65832422
[MAP] [2/23] resolving 'Seoul National University'
[MAP] Try EN: Seoul National University
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Seoul National University' (relax=False)
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR'
[MAP]   -> candidate: Seoul National University | id=https://openalex.org/I139264467
[MAP] OK (EN): Seoul 

Unnamed: 0,openalex_id,title,publication_year,publication_date,type,host_venue,cited_by_count,is_oa,oa_status,language,...,abstract,authors_count,author_ids,author_names,author_positions,author_orcids,author_affiliations,query_name,display_name,institution_id
0,https://openalex.org/W3095726951,Antimicrobial and Immunomodulatory Effects of ...,2020,2020-11-04,review,Journal of Microbiology and Biotechnology,64,True,bronze,en,...,,2,A5112388283;A5018446601,Hyun Jung Lim;Hea Soon Shin,first;last,,Duksung Women's University;Duksung Women's Uni...,Duksung Women's University,Duksung Women's University,I65832422
1,https://openalex.org/W3082772483,Elucidating the Mechanistic Origins of Photoca...,2020,2020-08-31,article,ACS Applied Materials & Interfaces,58,False,closed,en,...,Solar fuel generation mediated by semiconducto...,7,A5030026591;A5074609470;A5089646746;A500424741...,Junsang Cho;Nuwanthi Suwandaratne;Sara Abdel R...,first;middle;middle;middle;middle;middle;last,https://orcid.org/0000-0003-4211-4113;https://...,Texas A&M University|Duksung Women's Universit...,Duksung Women's University,Duksung Women's University,I65832422
2,https://openalex.org/W3005776191,Multidimensional Cognitive Behavioral Therapy ...,2020,2020-02-10,article,JMIR mhealth and uhealth,55,True,gold,en,...,"Background Developing effective, widely useful...",8,A5100580073;A5003266453;A5032571771;A504086106...,Meelim Kim;Young‐In Kim;Yoonjeong Go;Seokoh Le...,first;middle;middle;middle;middle;middle;middl...,https://orcid.org/0000-0002-0507-1727;https://...,Duksung Women's University|Seoul National Univ...,Duksung Women's University,Duksung Women's University,I65832422
3,https://openalex.org/W3024968792,The Effects of Perceived Quality of Augmented ...,2020,2020-05-15,article,Informatics,52,True,gold,en,...,Augmented reality (AR) enables consumers to br...,1,A5102740803,Jungmin Yoo,first,https://orcid.org/0000-0002-8398-955X,Duksung Women's University,Duksung Women's University,Duksung Women's University,I65832422
4,https://openalex.org/W3108559236,Consumer decision-making in a retail store: th...,2020,2020-11-30,article,International Journal of Retail & Distribution...,47,False,closed,en,...,Purpose The purpose of the study is to investi...,4,A5100360487;A5100416219;A5102740803;A5072514254,Jung-Hwan Kim;Minjeong Kim;Jungmin Yoo;Minjung...,first;middle;middle;last,https://orcid.org/0000-0001-9350-1231;https://...,University of South Carolina;Indiana Universit...,Duksung Women's University,Duksung Women's University,I65832422


In [None]:
# Colab 1-cell script (institutions.id only; year and top-K by citations are set via YEAR / TOP_K below)
# Adds authorship fields + abstract reconstruction + verbose prints

import os, time, requests, pandas as pd

# ===================== User settings =====================
SAVE_DIR = "/content/drive/MyDrive/6. 덕성여자대학교/25학년도 2학기/비정형데이터분석/프로젝트/openalex_top10000_2019_with_authors"
YEAR = 2019
TOP_K = 10000        # maximum number of works kept per institution
PER_PAGE = 200     # results per page (OpenAlex's maximum page size)
SLEEP = 0.2        # pause between requests, in seconds (helps avoid HTTP 429)
RETRY = 5          # number of attempts per request
TIMEOUT = 60       # request timeout (seconds)

# Target universities (23, including Duksung Women's University)
TARGET_UNIVS = [
    "Duksung Women's University",
    "Seoul National University",
    "Korea University",
    "Yonsei University",
    "Korea Advanced Institute of Science and Technology",  # KAIST
    "Sogang University",
    "Sungkyunkwan University",
    "Hanyang University",
    "Pohang University of Science and Technology", #POSTECH
    "Gwangju Institute of Science and Technology", #GIST
    "Daegu Gyeongbuk Institute of Science and Technology", #DGIST
    "Ulsan National Institute of Science and Technology",   #UNIST
    "Chung-Ang University",
    "Kyung Hee University",
    "Kangwon National University",
    "Kyungpook National University",
    "Gyeongsang National University",
    "Pusan National University",
    "Chonnam National University",
    "Jeonbuk National University",
    "Jeju National University",
    "Chungbuk National University",
    "Chungnam National University"
]

# Fallback search terms, tried when the English name does not resolve.
# Keys must match the TARGET_UNIVS entries exactly; a key that differs by even
# one character is never looked up.
# NOTE(review): short acronyms such as "KU" or "KNU" may be ambiguous between
# institutions -- confirm the resolved display_name when an alias is used.
ALT_NAMES = {
    "Duksung Women's University": ["덕성여자대학교", "덕성여대"],
    "Seoul National University": ["서울대학교", "서울대", "SNU"],
    "Korea University": ["고려대학교", "고려대", "KU"],
    "Yonsei University": ["연세대학교", "연세대"],
    "Korea Advanced Institute of Science and Technology": ["한국과학기술원", "KAIST"],
    "Sogang University": ["서강대학교", "서강대"],
    "Sungkyunkwan University": ["성균관대학교", "성균관대"],
    "Hanyang University": ["한양대학교", "한양대"],
    "Pohang University of Science and Technology": ["포항공과대학교", "포항공대", "POSTECH"],
    "Gwangju Institute of Science and Technology": ["광주과학기술원", "GIST"],
    "Daegu Gyeongbuk Institute of Science and Technology": ["대구경북과학기술원", "DGIST"],
    "Ulsan National Institute of Science and Technology": ["울산과학기술원", "UNIST"],
    "Chung-Ang University": ["중앙대학교", "중앙대", "Chungang University"],
    "Kyung Hee University": ["경희대학교", "경희대", "Kyunghee University"],
    "Kangwon National University": ["강원대학교", "강원대", "KNU"],
    "Kyungpook National University": ["경북대학교", "경북대"],
    "Gyeongsang National University": ["경상국립대학교", "경상대"],
    "Pusan National University": ["부산대학교", "부산대"],
    "Chonnam National University": ["전남대학교", "전남대"],
    "Jeonbuk National University": ["전북대학교", "전북대"],
    "Jeju National University": ["제주대학교", "제주대"],
    "Chungbuk National University": ["충북대학교", "충북대"],
    "Chungnam National University": ["충남대학교", "충남대"]
}

os.makedirs(SAVE_DIR, exist_ok=True)

# ===================== 공통 유틸 =====================
def http_get(url, params, retry=RETRY, backoff=1.4, timeout=TIMEOUT):
    """
    Issue a GET request, retrying with exponential backoff on failure.

    Args:
        url: Endpoint to request.
        params: Query-string parameters forwarded to ``requests.get``.
        retry: Maximum number of attempts.
        backoff: Base of the exponential delay; the wait after attempt ``i``
            (0-based) is ``backoff ** i + 0.2`` seconds.
        timeout: Per-request timeout in seconds.

    Returns:
        The response of the first attempt that answers with HTTP 200, or
        ``None`` once every attempt has failed (non-200 status or a
        ``requests.RequestException``).
    """
    for i in range(retry):
        try:
            r = requests.get(url, params=params, timeout=timeout)
            if r.status_code == 200:
                return r
            print(f"[HTTP:{r.status_code}] retry {i+1}/{retry} url={url} params={params}")
        except requests.RequestException as e:
            print(f"[HTTP:EXC] {e} retry {i+1}/{retry} url={url}")
        # Back off only when another attempt follows; sleeping after the final
        # failure would just delay the caller before returning None.
        if i < retry - 1:
            time.sleep((backoff ** i) + 0.2)
    print("[HTTP] giving up after retries.")
    return None

def reconstruct_abstract(inv_idx: dict | None) -> str | None:
    """
    OpenAlex abstract_inverted_index(토큰 -> 위치 리스트)를 실제 텍스트로 복원.
    """
    if not inv_idx:
        return None
    try:
        max_pos = max(p for positions in inv_idx.values() for p in positions)
        words = [""] * (max_pos + 1)
        for token, positions in inv_idx.items():
            for p in positions:
                words[p] = token
        text = " ".join(w for w in words if w is not None)
        return " ".join(text.split())
    except Exception as e:
        print(f"[ABSTRACT] reconstruction failed: {e}")
        return None

# ===================== (1) 기관명 → institutions.id =====================
def search_institution_top1(name: str, relax=False):
    """
    Search OpenAlex institutions by name and return the first hit.

    The query is sent through the ``search`` parameter and the page size uses
    the hyphenated ``per-page`` key. By default results are limited to
    ``country_code:KR`` plus ``type:education``; with ``relax=True`` the type
    restriction is dropped and only the country filter remains.

    Returns the first result dict, or ``None`` when the request failed or
    nothing matched.
    """
    if relax:
        flt = "country_code:KR"
    else:
        flt = "country_code:KR,type:education"

    print(f"[MAP]   -> searching: '{name}' | filter='{flt}'")
    resp = http_get(
        "https://api.openalex.org/institutions",
        {"search": name, "filter": flt, "per-page": 1},
    )
    if not resp:
        print(f"[MAP]   -> no response for '{name}'")
        return None

    hits = resp.json().get("results", [])
    if not hits:
        print(f"[MAP]   -> no candidates for '{name}' (relax={relax})")
        return None

    top = hits[0]
    print(f"[MAP]   -> candidate: {top.get('display_name')} | id={top.get('id')}")
    return top

def resolve_institution_id(qname: str):
    """
    Resolve a university name to its OpenAlex institution id.

    The English name is searched first; if that yields no usable hit, each
    alias listed for ``qname`` in ``ALT_NAMES`` is tried in order. Every
    search term is queried with the strict filter and then, if nothing came
    back, with the relaxed one.

    Returns a dict with ``query_name``, ``institution_id`` (the part of the
    OpenAlex id after the last '/') and ``display_name``; the last two are
    ``None`` when nothing could be resolved.
    """
    def _lookup(term):
        # Strict search first, relaxed search only when the strict one is empty.
        return search_institution_top1(term) or search_institution_top1(term, relax=True)

    def _as_row(hit, tag):
        inst_id = hit["id"].split("/")[-1]
        print(f"[MAP] OK ({tag}): {qname} -> {hit.get('display_name')} | institutions.id={inst_id}")
        return {"query_name": qname, "institution_id": inst_id, "display_name": hit.get("display_name")}

    print(f"[MAP] Try EN: {qname}")
    hit = _lookup(qname)
    if hit and hit.get("id"):
        return _as_row(hit, "EN")

    for alt in ALT_NAMES.get(qname, []):
        print(f"[MAP] Try ALT: {qname} -> '{alt}'")
        hit = _lookup(alt)
        if hit and hit.get("id"):
            return _as_row(hit, "ALT")

    print(f"[MAP] FAIL: {qname} (could not resolve institutions.id)")
    return {"query_name": qname, "institution_id": None, "display_name": None}

def build_institution_map(names):
    """
    Resolve every name in ``names`` and return the results as a DataFrame.

    Each name is resolved with ``resolve_institution_id`` with a ``SLEEP``
    pause after every lookup. The resulting table is also written to
    ``institutions_mapping.csv`` inside ``SAVE_DIR`` and printed.
    """
    print("[MAP] ==== start institution mapping ====")
    resolved = []
    for pos, univ in enumerate(names, start=1):
        print(f"[MAP] [{pos}/{len(names)}] resolving '{univ}'")
        resolved.append(resolve_institution_id(univ))
        time.sleep(SLEEP)

    mapping = pd.DataFrame(resolved)
    csv_path = os.path.join(SAVE_DIR, "institutions_mapping.csv")
    mapping.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"[MAP] saved mapping -> {csv_path}")
    print(mapping)
    return mapping

# ===================== (2) Fetch top-K works for YEAR + abstracts/authors =====================
def extract_authors(authorships: list | None):
    """
    authorships 리스트에서 저자 관련 필드 평탄화.
    - author_ids: A... 꼬리만 추출
    - author_names: display_name
    - author_positions: first/middle/last
    - author_orcids: 0000-0002-...
    - author_affiliations: 저자별 기관 display_name들을 '|'로 묶고, 저자 간은 ';'로 연결
    """
    if not authorships:
        return {
            "authors_count": 0,
            "author_ids": None,
            "author_names": None,
            "author_positions": None,
            "author_orcids": None,
            "author_affiliations": None,
        }
    ids, names, poss, orcids, affs = [], [], [], [], []
    for a in authorships:
        author = a.get("author") or {}
        aid = author.get("id") or ""
        aid_tail = aid.split("/")[-1] if aid else None
        ids.append(aid_tail)
        names.append(author.get("display_name"))
        poss.append(a.get("author_position"))
        # orcid 위치가 author.orcid 또는 author.ids.orcid로 있을 수 있음
        orcid = author.get("orcid")
        if not orcid:
            orcid = (author.get("ids") or {}).get("orcid")
        orcids.append(orcid)

        insts = a.get("institutions") or []
        inst_names = [ (inst or {}).get("display_name") for inst in insts if inst ]
        affs.append("|".join([n for n in inst_names if n]))

    return {
        "authors_count": len(names),
        "author_ids": ";".join([x for x in ids if x]),
        "author_names": ";".join([x for x in names if x]),
        "author_positions": ";".join([x for x in poss if x]),
        "author_orcids": ";".join([x for x in orcids if x]),
        "author_affiliations": ";".join([x for x in affs if x]),
    }

def _work_to_row(w: dict) -> dict:
    """Flatten one OpenAlex work record into a single CSV-ready row."""
    open_access = w.get("open_access") or {}
    source = (w.get("primary_location") or {}).get("source") or {}
    return {
        "openalex_id": w.get("id"),
        "title": w.get("title"),
        "publication_year": w.get("publication_year"),
        "publication_date": w.get("publication_date"),
        "type": w.get("type"),
        "host_venue": source.get("display_name"),
        "cited_by_count": w.get("cited_by_count"),
        "is_oa": open_access.get("is_oa"),
        "oa_status": open_access.get("oa_status"),
        "language": w.get("language"),
        "doi": (w.get("ids") or {}).get("doi") or w.get("doi"),
        "abstract": reconstruct_abstract(w.get("abstract_inverted_index")),
        **extract_authors(w.get("authorships")),
    }


def fetch_topk_works_for_institution(inst_id: str, qname: str, dname: str, year: int = YEAR, topk: int = TOP_K):
    """
    Collect up to ``topk`` works of one institution for one publication year.

    Works are requested from the OpenAlex ``/works`` endpoint sorted by
    ``cited_by_count`` descending and paged with the ``cursor`` parameter
    until ``topk`` rows are collected, a page comes back empty, no
    ``next_cursor`` is returned, or a request fails.

    Args:
        inst_id: OpenAlex institution id used in the ``institutions.id`` filter.
        qname: Query name, used only in progress messages.
        dname: Display name; accepted for call compatibility, not used here.
        year: Publication year to filter on.
        topk: Maximum number of rows to return.

    Returns:
        A list of flattened row dicts (at most ``topk``). If a page request
        fails, the rows gathered so far are returned.
    """
    if topk <= 0:
        # Nothing can be returned, so do not spend a request on it.
        return []

    url = "https://api.openalex.org/works"
    params = {
        "filter": f"institutions.id:{inst_id},publication_year:{year}",
        "per-page": PER_PAGE,          # hyphenated key
        "cursor": "*",
        "sort": "cited_by_count:desc"
    }
    out = []
    page = 0
    while True:
        page += 1
        print(f"[GET] {qname} | page={page} cursor={params.get('cursor')}")
        r = http_get(url, params)
        if not r:
            print("[GET]   -> no response (stop).")
            break

        j = r.json()
        batch = j.get("results") or []
        if not batch:
            print("[GET]   -> empty batch (stop).")
            break

        rep_title = (batch[0].get("title") or "")[:80]
        print(f"[GET]   -> fetched {len(batch)} (rep: '{rep_title}...')")

        # Convert only as many works as are still needed to reach topk.
        out.extend(_work_to_row(w) for w in batch[: topk - len(out)])
        if len(out) >= topk:
            print(f"[GET]   -> reached top-{topk}; stopping early.")
            return out

        # "meta" may be absent or null; treat both as "no further pages".
        nxt = (j.get("meta") or {}).get("next_cursor")
        print(f"[GET]   -> total accumulated: {len(out)} | next_cursor={nxt}")
        if not nxt:
            print("[GET]   -> no next_cursor; finished.")
            break
        params["cursor"] = nxt
        time.sleep(SLEEP)
    return out

# ===================== (3) Run: map institutions -> fetch works -> save =====================
# Resolve every target university to an OpenAlex institutions.id.
inst_map = build_institution_map(TARGET_UNIVS)
print("=== Institution mapping result ===")
print(inst_map.to_string(index=False))

# One DataFrame per institution; they are concatenated after the loop.
all_dfs = []
for _, row in inst_map.iterrows():
    inst_id = row["institution_id"]
    qname   = row["query_name"]
    dname   = row["display_name"]
    # Institutions that could not be resolved have no id, so there is nothing to fetch.
    if not inst_id:
        print(f"[SKIP] No institutions.id for '{qname}'")
        continue

    print(f"[RUN] Fetch top-{TOP_K} (year={YEAR}) for {qname} ({dname}) → institutions.id={inst_id}")
    works = fetch_topk_works_for_institution(inst_id, qname, dname, year=YEAR, topk=TOP_K)
    df_inst = pd.DataFrame(works)
    # Tag every row with the institution it was fetched for.
    df_inst["query_name"] = qname
    df_inst["display_name"] = dname
    df_inst["institution_id"] = inst_id

    # File-name form of the query name: '/' and spaces become '_'.
    safe = qname.replace("/", "_").replace(" ", "_")
    path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_{safe}.csv")
    df_inst.to_csv(path, index=False, encoding="utf-8")
    print(f"[SAVE] {qname} -> {path} (n={len(df_inst)})")
    all_dfs.append(df_inst)
    time.sleep(0.4)

# Combined file across all institutions (an empty frame when nothing was fetched).
df_all = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
all_path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_ALL.csv")
df_all.to_csv(all_path, index=False, encoding="utf-8")
print(f"[DONE] Total rows: {len(df_all)}")
print(f"[DONE] Saved to: {SAVE_DIR}")

# Preview
df_all.head()


[MAP] ==== start institution mapping ====
[MAP] [1/23] resolving 'Duksung Women's University'
[MAP] Try EN: Duksung Women's University
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Duksung Women's University' (relax=False)
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR'
[MAP]   -> candidate: Duksung Women's University | id=https://openalex.org/I65832422
[MAP] OK (EN): Duksung Women's University -> Duksung Women's University | institutions.id=I65832422
[MAP] [2/23] resolving 'Seoul National University'
[MAP] Try EN: Seoul National University
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Seoul National University' (relax=False)
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR'
[MAP]   -> candidate: Seoul National University | id=https://openalex.org/I139264467
[MAP] OK (EN): Seoul 

Unnamed: 0,openalex_id,title,publication_year,publication_date,type,host_venue,cited_by_count,is_oa,oa_status,language,...,abstract,authors_count,author_ids,author_names,author_positions,author_orcids,author_affiliations,query_name,display_name,institution_id
0,https://openalex.org/W2968846733,Effects of perceived interactivity of augmente...,2019,2019-08-17,article,Journal of Retailing and Consumer Services,311,False,closed,en,...,,2,A5072514254;A5102740803,Minjung Park;Jungmin Yoo,first;last,https://orcid.org/0000-0003-3040-2759;https://...,Ewha Womans University;Duksung Women's University,Duksung Women's University,Duksung Women's University,I65832422
1,https://openalex.org/W2987873067,Biofilm Formation by Staphylococcus aureus Cli...,2019,2019-11-02,article,Journal of Clinical Medicine,99,True,gold,en,...,Staphylococcus aureus (S. aureus) causes persi...,7,A5025786310;A5018827803;A5101948147;A500491728...,Harshad Lade;Joon Hyun Park;Sung Hee Chung;In ...,first;middle;middle;middle;middle;middle;last,https://orcid.org/0000-0002-9809-2921;https://...,Kangdong Sacred Heart Hospital|Hallym Universi...,Duksung Women's University,Duksung Women's University,I65832422
2,https://openalex.org/W2983005260,"Concentrations of THC, CBD, and CBN in commerc...",2019,2019-11-15,article,Forensic Science International,84,False,closed,en,...,,9,A5020314332;A5101581762;A5020198852;A508393339...,Eunyoung Jang;Hyo‐Jeong Kim;Seojeong Jang;Jaes...,first;middle;middle;middle;middle;middle;middl...,https://orcid.org/0000-0003-3714-6495;https://...,Duksung Women's University;Duksung Women's Uni...,Duksung Women's University,Duksung Women's University,I65832422
3,https://openalex.org/W2955577995,Antitumor and Anti-Invasive Effect of Apigenin...,2019,2019-06-27,article,International Journal of Molecular Sciences,82,True,gold,en,...,Interleukin (IL)-6 plays a crucial role in the...,5,A5110478953;A5100625726;A5083115378;A506312125...,Hwan Hee Lee;Joohee Jung;Aree Moon;Hyojeung Ka...,first;middle;middle;middle;last,https://orcid.org/0000-0001-9124-9052;https://...,Duksung Women's University;Duksung Women's Uni...,Duksung Women's University,Duksung Women's University,I65832422
4,https://openalex.org/W2928455896,"Effects of temperature, light, and pH on the s...",2019,2019-04-01,article,Food Chemistry,80,False,closed,en,...,,8,A5102958189;A5081989199;A5017686656;A506986092...,Dong Zhao;Daeung Yu;Moojoong Kim;Ming‐Yao Gu;S...,first;middle;middle;middle;middle;middle;middl...,https://orcid.org/0000-0002-9018-3191;https://...,Gangneung–Wonju National University|Korea Univ...,Duksung Women's University,Duksung Women's University,I65832422


In [None]:
# Colab 1-cell script (institutions.id only, year=2018, top-10000 by citations)
# Adds authorship fields + abstract reconstruction + verbose prints

import os, time, requests, pandas as pd

# ===================== User settings =====================
SAVE_DIR = "/content/drive/MyDrive/6. 덕성여자대학교/25학년도 2학기/비정형데이터분석/프로젝트/openalex_top10000_2018_with_authors"
YEAR = 2018
TOP_K = 10000        # maximum number of works kept per institution
PER_PAGE = 200     # results per page (OpenAlex's maximum page size)
SLEEP = 0.2        # pause between requests, in seconds (helps avoid HTTP 429)
RETRY = 5          # number of attempts per request
TIMEOUT = 60       # request timeout (seconds)

# Target universities (23, including Duksung Women's University)
TARGET_UNIVS = [
    "Duksung Women's University",
    "Seoul National University",
    "Korea University",
    "Yonsei University",
    "Korea Advanced Institute of Science and Technology",  # KAIST
    "Sogang University",
    "Sungkyunkwan University",
    "Hanyang University",
    "Pohang University of Science and Technology", #POSTECH
    "Gwangju Institute of Science and Technology", #GIST
    "Daegu Gyeongbuk Institute of Science and Technology", #DGIST
    "Ulsan National Institute of Science and Technology",   #UNIST
    "Chung-Ang University",
    "Kyung Hee University",
    "Kangwon National University",
    "Kyungpook National University",
    "Gyeongsang National University",
    "Pusan National University",
    "Chonnam National University",
    "Jeonbuk National University",
    "Jeju National University",
    "Chungbuk National University",
    "Chungnam National University"
]

# Fallback search terms, tried when the English name does not resolve.
# Keys must match the TARGET_UNIVS entries exactly; a key that differs by even
# one character is never looked up.
# NOTE(review): short acronyms such as "KU" or "KNU" may be ambiguous between
# institutions -- confirm the resolved display_name when an alias is used.
ALT_NAMES = {
    "Duksung Women's University": ["덕성여자대학교", "덕성여대"],
    "Seoul National University": ["서울대학교", "서울대", "SNU"],
    "Korea University": ["고려대학교", "고려대", "KU"],
    "Yonsei University": ["연세대학교", "연세대"],
    "Korea Advanced Institute of Science and Technology": ["한국과학기술원", "KAIST"],
    "Sogang University": ["서강대학교", "서강대"],
    "Sungkyunkwan University": ["성균관대학교", "성균관대"],
    "Hanyang University": ["한양대학교", "한양대"],
    "Pohang University of Science and Technology": ["포항공과대학교", "포항공대", "POSTECH"],
    "Gwangju Institute of Science and Technology": ["광주과학기술원", "GIST"],
    "Daegu Gyeongbuk Institute of Science and Technology": ["대구경북과학기술원", "DGIST"],
    "Ulsan National Institute of Science and Technology": ["울산과학기술원", "UNIST"],
    "Chung-Ang University": ["중앙대학교", "중앙대", "Chungang University"],
    "Kyung Hee University": ["경희대학교", "경희대", "Kyunghee University"],
    "Kangwon National University": ["강원대학교", "강원대", "KNU"],
    "Kyungpook National University": ["경북대학교", "경북대"],
    "Gyeongsang National University": ["경상국립대학교", "경상대"],
    "Pusan National University": ["부산대학교", "부산대"],
    "Chonnam National University": ["전남대학교", "전남대"],
    "Jeonbuk National University": ["전북대학교", "전북대"],
    "Jeju National University": ["제주대학교", "제주대"],
    "Chungbuk National University": ["충북대학교", "충북대"],
    "Chungnam National University": ["충남대학교", "충남대"]
}

os.makedirs(SAVE_DIR, exist_ok=True)

# ===================== 공통 유틸 =====================
def http_get(url, params, retry=RETRY, backoff=1.4, timeout=TIMEOUT):
    """
    Issue a GET request, retrying with exponential backoff on failure.

    Args:
        url: Endpoint to request.
        params: Query-string parameters forwarded to ``requests.get``.
        retry: Maximum number of attempts.
        backoff: Base of the exponential delay; the wait after attempt ``i``
            (0-based) is ``backoff ** i + 0.2`` seconds.
        timeout: Per-request timeout in seconds.

    Returns:
        The response of the first attempt that answers with HTTP 200, or
        ``None`` once every attempt has failed (non-200 status or a
        ``requests.RequestException``).
    """
    for i in range(retry):
        try:
            r = requests.get(url, params=params, timeout=timeout)
            if r.status_code == 200:
                return r
            print(f"[HTTP:{r.status_code}] retry {i+1}/{retry} url={url} params={params}")
        except requests.RequestException as e:
            print(f"[HTTP:EXC] {e} retry {i+1}/{retry} url={url}")
        # Back off only when another attempt follows; sleeping after the final
        # failure would just delay the caller before returning None.
        if i < retry - 1:
            time.sleep((backoff ** i) + 0.2)
    print("[HTTP] giving up after retries.")
    return None

def reconstruct_abstract(inv_idx: dict | None) -> str | None:
    """
    OpenAlex abstract_inverted_index(토큰 -> 위치 리스트)를 실제 텍스트로 복원.
    """
    if not inv_idx:
        return None
    try:
        max_pos = max(p for positions in inv_idx.values() for p in positions)
        words = [""] * (max_pos + 1)
        for token, positions in inv_idx.items():
            for p in positions:
                words[p] = token
        text = " ".join(w for w in words if w is not None)
        return " ".join(text.split())
    except Exception as e:
        print(f"[ABSTRACT] reconstruction failed: {e}")
        return None

# ===================== (1) 기관명 → institutions.id =====================
def search_institution_top1(name: str, relax=False):
    """
    Search OpenAlex institutions by name and return the first hit.

    The query is sent through the ``search`` parameter and the page size uses
    the hyphenated ``per-page`` key. By default results are limited to
    ``country_code:KR`` plus ``type:education``; with ``relax=True`` the type
    restriction is dropped and only the country filter remains.

    Returns the first result dict, or ``None`` when the request failed or
    nothing matched.
    """
    if relax:
        flt = "country_code:KR"
    else:
        flt = "country_code:KR,type:education"

    print(f"[MAP]   -> searching: '{name}' | filter='{flt}'")
    resp = http_get(
        "https://api.openalex.org/institutions",
        {"search": name, "filter": flt, "per-page": 1},
    )
    if not resp:
        print(f"[MAP]   -> no response for '{name}'")
        return None

    hits = resp.json().get("results", [])
    if not hits:
        print(f"[MAP]   -> no candidates for '{name}' (relax={relax})")
        return None

    top = hits[0]
    print(f"[MAP]   -> candidate: {top.get('display_name')} | id={top.get('id')}")
    return top

def resolve_institution_id(qname: str):
    """
    Resolve a university name to its OpenAlex institution id.

    The English name is searched first; if that yields no usable hit, each
    alias listed for ``qname`` in ``ALT_NAMES`` is tried in order. Every
    search term is queried with the strict filter and then, if nothing came
    back, with the relaxed one.

    Returns a dict with ``query_name``, ``institution_id`` (the part of the
    OpenAlex id after the last '/') and ``display_name``; the last two are
    ``None`` when nothing could be resolved.
    """
    def _lookup(term):
        # Strict search first, relaxed search only when the strict one is empty.
        return search_institution_top1(term) or search_institution_top1(term, relax=True)

    def _as_row(hit, tag):
        inst_id = hit["id"].split("/")[-1]
        print(f"[MAP] OK ({tag}): {qname} -> {hit.get('display_name')} | institutions.id={inst_id}")
        return {"query_name": qname, "institution_id": inst_id, "display_name": hit.get("display_name")}

    print(f"[MAP] Try EN: {qname}")
    hit = _lookup(qname)
    if hit and hit.get("id"):
        return _as_row(hit, "EN")

    for alt in ALT_NAMES.get(qname, []):
        print(f"[MAP] Try ALT: {qname} -> '{alt}'")
        hit = _lookup(alt)
        if hit and hit.get("id"):
            return _as_row(hit, "ALT")

    print(f"[MAP] FAIL: {qname} (could not resolve institutions.id)")
    return {"query_name": qname, "institution_id": None, "display_name": None}

def build_institution_map(names):
    """
    Resolve every name in ``names`` and return the results as a DataFrame.

    Each name is resolved with ``resolve_institution_id`` with a ``SLEEP``
    pause after every lookup. The resulting table is also written to
    ``institutions_mapping.csv`` inside ``SAVE_DIR`` and printed.
    """
    print("[MAP] ==== start institution mapping ====")
    resolved = []
    for pos, univ in enumerate(names, start=1):
        print(f"[MAP] [{pos}/{len(names)}] resolving '{univ}'")
        resolved.append(resolve_institution_id(univ))
        time.sleep(SLEEP)

    mapping = pd.DataFrame(resolved)
    csv_path = os.path.join(SAVE_DIR, "institutions_mapping.csv")
    mapping.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"[MAP] saved mapping -> {csv_path}")
    print(mapping)
    return mapping

# ===================== (2) Fetch top-K works for YEAR + abstracts/authors =====================
def extract_authors(authorships: list | None):
    """
    authorships 리스트에서 저자 관련 필드 평탄화.
    - author_ids: A... 꼬리만 추출
    - author_names: display_name
    - author_positions: first/middle/last
    - author_orcids: 0000-0002-...
    - author_affiliations: 저자별 기관 display_name들을 '|'로 묶고, 저자 간은 ';'로 연결
    """
    if not authorships:
        return {
            "authors_count": 0,
            "author_ids": None,
            "author_names": None,
            "author_positions": None,
            "author_orcids": None,
            "author_affiliations": None,
        }
    ids, names, poss, orcids, affs = [], [], [], [], []
    for a in authorships:
        author = a.get("author") or {}
        aid = author.get("id") or ""
        aid_tail = aid.split("/")[-1] if aid else None
        ids.append(aid_tail)
        names.append(author.get("display_name"))
        poss.append(a.get("author_position"))
        # orcid 위치가 author.orcid 또는 author.ids.orcid로 있을 수 있음
        orcid = author.get("orcid")
        if not orcid:
            orcid = (author.get("ids") or {}).get("orcid")
        orcids.append(orcid)

        insts = a.get("institutions") or []
        inst_names = [ (inst or {}).get("display_name") for inst in insts if inst ]
        affs.append("|".join([n for n in inst_names if n]))

    return {
        "authors_count": len(names),
        "author_ids": ";".join([x for x in ids if x]),
        "author_names": ";".join([x for x in names if x]),
        "author_positions": ";".join([x for x in poss if x]),
        "author_orcids": ";".join([x for x in orcids if x]),
        "author_affiliations": ";".join([x for x in affs if x]),
    }

def _work_to_row(w: dict) -> dict:
    """Flatten one OpenAlex work record into a single CSV-ready row."""
    open_access = w.get("open_access") or {}
    source = (w.get("primary_location") or {}).get("source") or {}
    return {
        "openalex_id": w.get("id"),
        "title": w.get("title"),
        "publication_year": w.get("publication_year"),
        "publication_date": w.get("publication_date"),
        "type": w.get("type"),
        "host_venue": source.get("display_name"),
        "cited_by_count": w.get("cited_by_count"),
        "is_oa": open_access.get("is_oa"),
        "oa_status": open_access.get("oa_status"),
        "language": w.get("language"),
        "doi": (w.get("ids") or {}).get("doi") or w.get("doi"),
        "abstract": reconstruct_abstract(w.get("abstract_inverted_index")),
        **extract_authors(w.get("authorships")),
    }


def fetch_topk_works_for_institution(inst_id: str, qname: str, dname: str, year: int = YEAR, topk: int = TOP_K):
    """
    Collect up to ``topk`` works of one institution for one publication year.

    Works are requested from the OpenAlex ``/works`` endpoint sorted by
    ``cited_by_count`` descending and paged with the ``cursor`` parameter
    until ``topk`` rows are collected, a page comes back empty, no
    ``next_cursor`` is returned, or a request fails.

    Args:
        inst_id: OpenAlex institution id used in the ``institutions.id`` filter.
        qname: Query name, used only in progress messages.
        dname: Display name; accepted for call compatibility, not used here.
        year: Publication year to filter on.
        topk: Maximum number of rows to return.

    Returns:
        A list of flattened row dicts (at most ``topk``). If a page request
        fails, the rows gathered so far are returned.
    """
    if topk <= 0:
        # Nothing can be returned, so do not spend a request on it.
        return []

    url = "https://api.openalex.org/works"
    params = {
        "filter": f"institutions.id:{inst_id},publication_year:{year}",
        "per-page": PER_PAGE,          # hyphenated key
        "cursor": "*",
        "sort": "cited_by_count:desc"
    }
    out = []
    page = 0
    while True:
        page += 1
        print(f"[GET] {qname} | page={page} cursor={params.get('cursor')}")
        r = http_get(url, params)
        if not r:
            print("[GET]   -> no response (stop).")
            break

        j = r.json()
        batch = j.get("results") or []
        if not batch:
            print("[GET]   -> empty batch (stop).")
            break

        rep_title = (batch[0].get("title") or "")[:80]
        print(f"[GET]   -> fetched {len(batch)} (rep: '{rep_title}...')")

        # Convert only as many works as are still needed to reach topk.
        out.extend(_work_to_row(w) for w in batch[: topk - len(out)])
        if len(out) >= topk:
            print(f"[GET]   -> reached top-{topk}; stopping early.")
            return out

        # "meta" may be absent or null; treat both as "no further pages".
        nxt = (j.get("meta") or {}).get("next_cursor")
        print(f"[GET]   -> total accumulated: {len(out)} | next_cursor={nxt}")
        if not nxt:
            print("[GET]   -> no next_cursor; finished.")
            break
        params["cursor"] = nxt
        time.sleep(SLEEP)
    return out

# ===================== (3) Run: map institutions -> fetch works -> save =====================
# Resolve every target university to an OpenAlex institutions.id.
inst_map = build_institution_map(TARGET_UNIVS)
print("=== Institution mapping result ===")
print(inst_map.to_string(index=False))

# One DataFrame per institution; they are concatenated after the loop.
all_dfs = []
for _, row in inst_map.iterrows():
    inst_id = row["institution_id"]
    qname   = row["query_name"]
    dname   = row["display_name"]
    # Institutions that could not be resolved have no id, so there is nothing to fetch.
    if not inst_id:
        print(f"[SKIP] No institutions.id for '{qname}'")
        continue

    print(f"[RUN] Fetch top-{TOP_K} (year={YEAR}) for {qname} ({dname}) → institutions.id={inst_id}")
    works = fetch_topk_works_for_institution(inst_id, qname, dname, year=YEAR, topk=TOP_K)
    df_inst = pd.DataFrame(works)
    # Tag every row with the institution it was fetched for.
    df_inst["query_name"] = qname
    df_inst["display_name"] = dname
    df_inst["institution_id"] = inst_id

    # File-name form of the query name: '/' and spaces become '_'.
    safe = qname.replace("/", "_").replace(" ", "_")
    path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_{safe}.csv")
    df_inst.to_csv(path, index=False, encoding="utf-8")
    print(f"[SAVE] {qname} -> {path} (n={len(df_inst)})")
    all_dfs.append(df_inst)
    time.sleep(0.4)

# Combined file across all institutions (an empty frame when nothing was fetched).
df_all = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
all_path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_ALL.csv")
df_all.to_csv(all_path, index=False, encoding="utf-8")
print(f"[DONE] Total rows: {len(df_all)}")
print(f"[DONE] Saved to: {SAVE_DIR}")

# Preview
df_all.head()


[MAP] ==== start institution mapping ====
[MAP] [1/23] resolving 'Duksung Women's University'
[MAP] Try EN: Duksung Women's University
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Duksung Women's University' (relax=False)
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR'
[MAP]   -> candidate: Duksung Women's University | id=https://openalex.org/I65832422
[MAP] OK (EN): Duksung Women's University -> Duksung Women's University | institutions.id=I65832422
[MAP] [2/23] resolving 'Seoul National University'
[MAP] Try EN: Seoul National University
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Seoul National University' (relax=False)
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR'
[MAP]   -> candidate: Seoul National University | id=https://openalex.org/I139264467
[MAP] OK (EN): Seoul 

Unnamed: 0,openalex_id,title,publication_year,publication_date,type,host_venue,cited_by_count,is_oa,oa_status,language,...,abstract,authors_count,author_ids,author_names,author_positions,author_orcids,author_affiliations,query_name,display_name,institution_id
0,https://openalex.org/W2810454451,"Sex Differences in Cancer: Epidemiology, Genet...",2018,2018-06-27,review,Biomolecules & Therapeutics,293,True,green,en,...,Growing evidence shows sex-specific difference...,3,A5105519402;A5076925914;A5083115378,Hae-In Kim;Hyesol Lim;Aree Moon,first;middle;last,https://orcid.org/0009-0008-4844-3587,Duksung Women's University;Duksung Women's Uni...,Duksung Women's University,Duksung Women's University,I65832422
1,https://openalex.org/W2897911253,Biowaste-to-bioenergy using biological methods...,2018,2018-10-10,article,Energy Conversion and Management,269,False,closed,en,...,,3,A5082323415;A5003249495;A5026721642,Shashi Kant Bhatia;Hwang‐Soo Joo;Yung‐Hun Yang,first;middle;last,https://orcid.org/0000-0002-7688-6069;https://...,Konkuk University;Duksung Women's University;K...,Duksung Women's University,Duksung Women's University,I65832422
2,https://openalex.org/W2794212562,Creative destruction of the sharing economy in...,2018,2018-03-20,article,Transportation Research Part A Policy and Prac...,113,False,closed,en,...,,3,A5100655034;A5025283796;A5091261738,Ki‐Bum Kim;Chulwoo Baek;Jeong‐Dong Lee,first;middle;last,https://orcid.org/0000-0003-1392-5021;https://...,Seoul National University;Duksung Women's Univ...,Duksung Women's University,Duksung Women's University,I65832422
3,https://openalex.org/W2783837782,A Scenario-Based Cognitive Behavioral Therapy ...,2018,2018-01-11,article,Telemedicine Journal and e-Health,82,False,closed,en,...,While self-administered mobile app-based cogni...,4,A5005773862;A5100417355;A5069646864;A5068174685,Ji‐Won Hur;Boram Kim;Dasom Park;Sungwon Choi,first;middle;middle;last,https://orcid.org/0000-0002-1939-7365;https://...,Chung-Ang University;Duksung Women's Universit...,Duksung Women's University,Duksung Women's University,I65832422
4,https://openalex.org/W2809889057,Crosstalk between cancer cells and endothelial...,2018,2018-06-30,review,Archives of Pharmacal Research,78,False,closed,en,...,,2,A5034950429;A5083115378,Hyojeong Choi;Aree Moon,first;last,,Duksung Women's University;Duksung Women's Uni...,Duksung Women's University,Duksung Women's University,I65832422


In [None]:
# Colab 1-cell script (institutions.id only, year=2017, top-10000 by citations)
# Adds authorship fields + abstract reconstruction + verbose prints

import os, time, requests, pandas as pd

# ===================== User settings =====================
SAVE_DIR = "/content/drive/MyDrive/6. 덕성여자대학교/25학년도 2학기/비정형데이터분석/프로젝트/openalex_top10000_2017_with_authors"
YEAR = 2017
TOP_K = 10000      # maximum number of works kept per institution
PER_PAGE = 200     # page size sent as "per-page" (noted by the author as the OpenAlex recommended maximum)
SLEEP = 0.2        # pause between requests, in seconds (to avoid HTTP 429)
RETRY = 5          # number of attempts per request
TIMEOUT = 60       # request timeout, in seconds

# Target institutions: 23 in total (Duksung Women's University included).
TARGET_UNIVS = [
    "Duksung Women's University",
    "Seoul National University",
    "Korea University",
    "Yonsei University",
    "Korea Advanced Institute of Science and Technology",  # KAIST
    "Sogang University",
    "Sungkyunkwan University",
    "Hanyang University",
    "Pohang University of Science and Technology",  # POSTECH
    "Gwangju Institute of Science and Technology",  # GIST
    "Daegu Gyeongbuk Institute of Science and Technology",  # DGIST
    "Ulsan National Institute of Science and Technology",  # UNIST
    "Chung-Ang University",
    "Kyung Hee University",
    "Kangwon National University",
    "Kyungpook National University",
    "Gyeongsang National University",
    "Pusan National University",
    "Chonnam National University",
    "Jeonbuk National University",
    "Jeju National University",
    "Chungbuk National University",
    "Chungnam National University",
]

# Fallback aliases tried when the English name cannot be resolved.
# Keys are looked up with the exact TARGET_UNIVS string, so every key must
# match its TARGET_UNIVS entry character for character.
ALT_NAMES = {
    "Duksung Women's University": ["덕성여자대학교", "덕성여대"],
    "Seoul National University": ["서울대학교", "서울대", "SNU"],
    "Korea University": ["고려대학교", "고려대", "KU"],
    "Yonsei University": ["연세대학교", "연세대"],
    "Korea Advanced Institute of Science and Technology": ["한국과학기술원", "KAIST"],
    "Sogang University": ["서강대학교", "서강대"],
    "Sungkyunkwan University": ["성균관대학교", "성균관대"],
    # Key was previously misspelled ("Unviersity") and therefore never matched.
    "Hanyang University": ["한양대학교", "한양대"],
    "Pohang University of Science and Technology": ["포항공과대학교", "포항공대", "POSTECH"],
    "Gwangju Institute of Science and Technology": ["광주과학기술원", "GIST"],
    "Daegu Gyeongbuk Institute of Science and Technology": ["대구경북과학기술원", "DGIST"],
    "Ulsan National Institute of Science and Technology": ["울산과학기술원", "UNIST"],
    "Chung-Ang University": ["중앙대학교", "중앙대", "Chungang University"],
    "Kyung Hee University": ["경희대학교", "경희대", "Kyunghee University"],
    "Kangwon National University": ["강원대학교", "강원대", "KNU"],
    "Kyungpook National University": ["경북대학교", "경북대"],
    "Gyeongsang National University": ["경상국립대학교", "경상대"],
    "Pusan National University": ["부산대학교", "부산대"],
    "Chonnam National University": ["전남대학교", "전남대"],
    "Jeonbuk National University": ["전북대학교", "전북대"],
    "Jeju National University": ["제주대학교", "제주대"],
    "Chungbuk National University": ["충북대학교", "충북대"],
    "Chungnam National University": ["충남대학교", "충남대"],
}

# Create the output directory up front (no error if it already exists).
os.makedirs(SAVE_DIR, exist_ok=True)

# ===================== 공통 유틸 =====================
def http_get(url, params, retry=RETRY, backoff=1.4, timeout=TIMEOUT):
    """
    GET *url* with *params*, retrying until an HTTP 200 response arrives.

    Up to *retry* attempts are made. After every failed attempt (any non-200
    status or a requests.RequestException) the function sleeps for
    (backoff ** attempt_index) + 0.2 seconds, attempt_index starting at 0;
    this also happens after the final failed attempt.

    Returns the response object on HTTP 200, or None once all attempts failed.
    """
    attempt = 0
    while attempt < retry:
        # Delay is derived from the 0-based attempt index.
        pause = (backoff ** attempt) + 0.2
        attempt += 1
        try:
            resp = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException as e:
            print(f"[HTTP:EXC] {e} retry {attempt}/{retry} url={url}")
        else:
            if resp.status_code == 200:
                return resp
            print(f"[HTTP:{resp.status_code}] retry {attempt}/{retry} url={url} params={params}")
        time.sleep(pause)
    print("[HTTP] giving up after retries.")
    return None

def reconstruct_abstract(inv_idx: dict | None) -> str | None:
    """
    OpenAlex abstract_inverted_index(토큰 -> 위치 리스트)를 실제 텍스트로 복원.
    """
    if not inv_idx:
        return None
    try:
        max_pos = max(p for positions in inv_idx.values() for p in positions)
        words = [""] * (max_pos + 1)
        for token, positions in inv_idx.items():
            for p in positions:
                words[p] = token
        text = " ".join(w for w in words if w is not None)
        return " ".join(text.split())
    except Exception as e:
        print(f"[ABSTRACT] reconstruction failed: {e}")
        return None

# ===================== (1) 기관명 → institutions.id =====================
def search_institution_top1(name: str, relax=False):
    """
    Return the first OpenAlex institution search hit for *name*, or None.

    The /institutions endpoint is queried with the 'search' parameter and the
    hyphenated 'per-page' parameter set to 1. By default results are filtered
    to country_code KR and type education; with relax=True only the country
    filter is kept. None is returned when the request fails or when the
    result list is empty.
    """
    endpoint = "https://api.openalex.org/institutions"
    if relax:
        flt = "country_code:KR"
    else:
        flt = "country_code:KR,type:education"
    print(f"[MAP]   -> searching: '{name}' | filter='{flt}'")

    resp = http_get(endpoint, {"search": name, "filter": flt, "per-page": 1})
    if not resp:
        print(f"[MAP]   -> no response for '{name}'")
        return None

    hits = resp.json().get("results", [])
    if not hits:
        print(f"[MAP]   -> no candidates for '{name}' (relax={relax})")
        return None

    top = hits[0]
    print(f"[MAP]   -> candidate: {top.get('display_name')} | id={top.get('id')}")
    return top

def resolve_institution_id(qname: str):
    """
    Resolve *qname* to an OpenAlex institution id.

    The English name is tried first, then each alias listed under
    ALT_NAMES[qname]. Every term is searched with the strict filter and, if
    that yields nothing, again with the relaxed filter.

    Returns a dict with keys query_name, institution_id (last path segment of
    the OpenAlex id) and display_name; the latter two are None when nothing
    could be resolved.
    """
    def _attempt(term, tag):
        # Strict search first, relaxed search as fallback.
        cand = search_institution_top1(term) or search_institution_top1(term, relax=True)
        if not cand or not cand.get("id"):
            return None
        short_id = cand["id"].split("/")[-1]
        print(f"[MAP] OK ({tag}): {qname} -> {cand.get('display_name')} | institutions.id={short_id}")
        return {"query_name": qname, "institution_id": short_id, "display_name": cand.get("display_name")}

    print(f"[MAP] Try EN: {qname}")
    resolved = _attempt(qname, "EN")
    if resolved is not None:
        return resolved

    for alias in ALT_NAMES.get(qname, []):
        print(f"[MAP] Try ALT: {qname} -> '{alias}'")
        resolved = _attempt(alias, "ALT")
        if resolved is not None:
            return resolved

    print(f"[MAP] FAIL: {qname} (could not resolve institutions.id)")
    return {"query_name": qname, "institution_id": None, "display_name": None}

def build_institution_map(names):
    """
    Resolve every name in *names* and return the results as a DataFrame.

    One row per name (columns: query_name, institution_id, display_name).
    The table is also written to SAVE_DIR/institutions_mapping.csv and
    printed. The function sleeps SLEEP seconds after each lookup.
    """
    print("[MAP] ==== start institution mapping ====")
    records = []
    for position, uni in enumerate(names, start=1):
        print(f"[MAP] [{position}/{len(names)}] resolving '{uni}'")
        records.append(resolve_institution_id(uni))
        time.sleep(SLEEP)

    mapping = pd.DataFrame(records)
    csv_path = os.path.join(SAVE_DIR, "institutions_mapping.csv")
    mapping.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"[MAP] saved mapping -> {csv_path}")
    print(mapping)
    return mapping

# ===================== (2) Collect top-TOP_K works for YEAR + abstracts/authors =====================
def extract_authors(authorships: list | None):
    """
    authorships 리스트에서 저자 관련 필드 평탄화.
    - author_ids: A... 꼬리만 추출
    - author_names: display_name
    - author_positions: first/middle/last
    - author_orcids: 0000-0002-...
    - author_affiliations: 저자별 기관 display_name들을 '|'로 묶고, 저자 간은 ';'로 연결
    """
    if not authorships:
        return {
            "authors_count": 0,
            "author_ids": None,
            "author_names": None,
            "author_positions": None,
            "author_orcids": None,
            "author_affiliations": None,
        }
    ids, names, poss, orcids, affs = [], [], [], [], []
    for a in authorships:
        author = a.get("author") or {}
        aid = author.get("id") or ""
        aid_tail = aid.split("/")[-1] if aid else None
        ids.append(aid_tail)
        names.append(author.get("display_name"))
        poss.append(a.get("author_position"))
        # orcid 위치가 author.orcid 또는 author.ids.orcid로 있을 수 있음
        orcid = author.get("orcid")
        if not orcid:
            orcid = (author.get("ids") or {}).get("orcid")
        orcids.append(orcid)

        insts = a.get("institutions") or []
        inst_names = [ (inst or {}).get("display_name") for inst in insts if inst ]
        affs.append("|".join([n for n in inst_names if n]))

    return {
        "authors_count": len(names),
        "author_ids": ";".join([x for x in ids if x]),
        "author_names": ";".join([x for x in names if x]),
        "author_positions": ";".join([x for x in poss if x]),
        "author_orcids": ";".join([x for x in orcids if x]),
        "author_affiliations": ";".join([x for x in affs if x]),
    }

def fetch_topk_works_for_institution(inst_id: str, qname: str, dname: str, year: int = YEAR, topk: int = TOP_K):
    """
    Collect up to *topk* works of one institution for one publication year.

    The /works endpoint is queried with the filter
    institutions.id:<inst_id>,publication_year:<year>, sorted by
    cited_by_count descending, and paged with the cursor returned in
    meta.next_cursor (PER_PAGE results per request, SLEEP seconds between
    pages). Paging stops when *topk* rows were collected, a request fails,
    a page is empty, or no next cursor is returned.

    *qname* is only used in progress messages; *dname* is accepted but not
    used inside this function.

    Returns a list of flat dicts (work metadata, reconstructed abstract and
    the author columns from extract_authors).
    """
    def _flatten(work):
        # Abstract first, then authors: same call order as the row layout.
        abstract_text = reconstruct_abstract(work.get("abstract_inverted_index"))
        author_cols = extract_authors(work.get("authorships"))
        source = (work.get("primary_location") or {}).get("source") or {}
        open_access = work.get("open_access") or {}
        return {
            "openalex_id": work.get("id"),
            "title": work.get("title"),
            "publication_year": work.get("publication_year"),
            "publication_date": work.get("publication_date"),
            "type": work.get("type"),
            "host_venue": source.get("display_name"),
            "cited_by_count": work.get("cited_by_count"),
            "is_oa": open_access.get("is_oa"),
            "oa_status": open_access.get("oa_status"),
            "language": work.get("language"),
            "doi": (work.get("ids") or {}).get("doi") or work.get("doi"),
            "abstract": abstract_text,
            **author_cols,
        }

    endpoint = "https://api.openalex.org/works"
    query = {
        "filter": f"institutions.id:{inst_id},publication_year:{year}",
        "per-page": PER_PAGE,          # hyphenated parameter name
        "cursor": "*",
        "sort": "cited_by_count:desc"
    }
    collected = []
    page_no = 0
    while True:
        page_no += 1
        print(f"[GET] {qname} | page={page_no} cursor={query.get('cursor')}")
        resp = http_get(endpoint, query)
        if not resp:
            print("[GET]   -> no response (stop).")
            break

        payload = resp.json()
        works = payload.get("results", [])
        if not works:
            print("[GET]   -> empty batch (stop).")
            break

        # Show the first title of the page as a progress indicator.
        rep_title = (works[0].get("title") or "")[:80]
        print(f"[GET]   -> fetched {len(works)} (rep: '{rep_title}...')")

        for work in works:
            collected.append(_flatten(work))
            if len(collected) >= topk:
                print(f"[GET]   -> reached top-{topk}; stopping early.")
                return collected[:topk]

        next_cursor = payload.get("meta", {}).get("next_cursor")
        print(f"[GET]   -> total accumulated: {len(collected)} | next_cursor={next_cursor}")
        if not next_cursor:
            print("[GET]   -> no next_cursor; finished.")
            break
        query["cursor"] = next_cursor
        time.sleep(SLEEP)
    return collected[:topk]

# ===================== (3) Run (mapping → fetch → save) =====================
# Resolve every target name to an OpenAlex institution id
# (build_institution_map also writes institutions_mapping.csv).
inst_map = build_institution_map(TARGET_UNIVS)
print("=== Institution mapping result ===")
print(inst_map.to_string(index=False))

# One DataFrame per institution; concatenated after the loop.
all_dfs = []
for _, row in inst_map.iterrows():
    inst_id = row["institution_id"]
    qname   = row["query_name"]
    dname   = row["display_name"]
    # Names that could not be resolved carry no id; skip them.
    if not inst_id:
        print(f"[SKIP] No institutions.id for '{qname}'")
        continue

    print(f"[RUN] Fetch top-{TOP_K} (year={YEAR}) for {qname} ({dname}) → institutions.id={inst_id}")
    works = fetch_topk_works_for_institution(inst_id, qname, dname, year=YEAR, topk=TOP_K)
    df_inst = pd.DataFrame(works)
    # Tag every row with the institution it was fetched for.
    df_inst["query_name"] = qname
    df_inst["display_name"] = dname
    df_inst["institution_id"] = inst_id

    # Build a file-name-safe variant of the query name ('/' and ' ' -> '_').
    safe = qname.replace("/", "_").replace(" ", "_")
    path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_{safe}.csv")
    df_inst.to_csv(path, index=False, encoding="utf-8")
    print(f"[SAVE] {qname} -> {path} (n={len(df_inst)})")
    all_dfs.append(df_inst)
    time.sleep(0.4)

# Combined table over all institutions (empty DataFrame if nothing was fetched).
df_all = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
all_path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_ALL.csv")
df_all.to_csv(all_path, index=False, encoding="utf-8")
print(f"[DONE] Total rows: {len(df_all)}")
print(f"[DONE] Saved to: {SAVE_DIR}")

# Preview
df_all.head()


[MAP] ==== start institution mapping ====
[MAP] [1/23] resolving 'Duksung Women's University'
[MAP] Try EN: Duksung Women's University
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Duksung Women's University' (relax=False)
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR'
[MAP]   -> candidate: Duksung Women's University | id=https://openalex.org/I65832422
[MAP] OK (EN): Duksung Women's University -> Duksung Women's University | institutions.id=I65832422
[MAP] [2/23] resolving 'Seoul National University'
[MAP] Try EN: Seoul National University
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Seoul National University' (relax=False)
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR'
[MAP]   -> candidate: Seoul National University | id=https://openalex.org/I139264467
[MAP] OK (EN): Seoul 

Unnamed: 0,openalex_id,title,publication_year,publication_date,type,host_venue,cited_by_count,is_oa,oa_status,language,...,abstract,authors_count,author_ids,author_names,author_positions,author_orcids,author_affiliations,query_name,display_name,institution_id
0,https://openalex.org/W2604154346,Psychometric properties of the Turkish version...,2017,2017-01-02,article,Psychiatry and Clinical Psychopharmacology,117,True,gold,en,...,Objective: The shortage of cross-culturally va...,9,A5089545346;A5009201584;A5073661627;A504587486...,Vahdet Görmez;Ayşe Kılınçaslan;Abdurrahman Cah...,first;middle;middle;middle;middle;middle;middl...,https://orcid.org/0000-0002-2704-9520;https://...,Bezmiâlem Vakıf Üniversitesi;Istanbul Universi...,Duksung Women's University,Duksung Women's University,I65832422
1,https://openalex.org/W2746615960,The persistency and volatility of the firm R&a...,2017,2017-08-14,article,Research Policy,96,False,closed,en,...,,3,A5004478892;A5025283796;A5091261738,Taewon Kang;Chulwoo Baek;Jeong‐Dong Lee,first;middle;last,https://orcid.org/0000-0001-5388-1604;https://...,Seoul National University;Duksung Women's Univ...,Duksung Women's University,Duksung Women's University,I65832422
2,https://openalex.org/W2747068989,"Sources of social support, job satisfaction, a...",2017,2017-08-18,article,The Social Science Journal,96,False,closed,en,...,Although literature has suggested a link betwe...,2,A5075950841;A5048198293,Jongil Yuh;Sungsil Choi,first;last,https://orcid.org/0000-0001-7482-4182,Duksung Women's University;Duksung Women's Uni...,Duksung Women's University,Duksung Women's University,I65832422
3,https://openalex.org/W2767815049,DJ-1 controls bone homeostasis through the reg...,2017,2017-11-09,article,Nature Communications,96,True,gold,en,...,Receptor activator of NF-kB ligand (RANKL) gen...,14,A5043210533;A5071611789;A5029720644;A501791781...,Hyuk Soon Kim;Seung Taek Nam;Se Hwan Mun;Sun‐K...,first;middle;middle;middle;middle;middle;middl...,https://orcid.org/0000-0002-0566-0142;https://...,Konkuk University;Konkuk University;Konkuk Uni...,Duksung Women's University,Duksung Women's University,I65832422
4,https://openalex.org/W2735321801,Programmed Cell Death 1 (PD-1) and Cytotoxic T...,2017,2017-07-13,review,International Journal of Molecular Sciences,95,True,gold,en,...,Virus-specific cluster of differentiation 8 (C...,4,A5010214504;A5063121258;A5023697227;A5067074791,Hyosun Cho;Hyojeung Kang;Hwan Lee;Chang Wook Kim,first;middle;middle;last,https://orcid.org/0000-0001-9131-5742;https://...,Duksung Women's University;Kyungpook National ...,Duksung Women's University,Duksung Women's University,I65832422


In [None]:
# Colab 1-cell script (institutions.id only, year=2016, top-10000 by citations)
# Adds authorship fields + abstract reconstruction + verbose prints

import os, time, requests, pandas as pd

# ===================== User settings =====================
SAVE_DIR = "/content/drive/MyDrive/6. 덕성여자대학교/25학년도 2학기/비정형데이터분석/프로젝트/openalex_top10000_2016_with_authors"
YEAR = 2016
TOP_K = 10000      # maximum number of works kept per institution
PER_PAGE = 200     # page size sent as "per-page" (noted by the author as the OpenAlex recommended maximum)
SLEEP = 0.2        # pause between requests, in seconds (to avoid HTTP 429)
RETRY = 5          # number of attempts per request
TIMEOUT = 60       # request timeout, in seconds

# Target institutions: 23 in total (Duksung Women's University included).
TARGET_UNIVS = [
    "Duksung Women's University",
    "Seoul National University",
    "Korea University",
    "Yonsei University",
    "Korea Advanced Institute of Science and Technology",  # KAIST
    "Sogang University",
    "Sungkyunkwan University",
    "Hanyang University",
    "Pohang University of Science and Technology",  # POSTECH
    "Gwangju Institute of Science and Technology",  # GIST
    "Daegu Gyeongbuk Institute of Science and Technology",  # DGIST
    "Ulsan National Institute of Science and Technology",  # UNIST
    "Chung-Ang University",
    "Kyung Hee University",
    "Kangwon National University",
    "Kyungpook National University",
    "Gyeongsang National University",
    "Pusan National University",
    "Chonnam National University",
    "Jeonbuk National University",
    "Jeju National University",
    "Chungbuk National University",
    "Chungnam National University",
]

# Fallback aliases tried when the English name cannot be resolved.
# Keys are looked up with the exact TARGET_UNIVS string, so every key must
# match its TARGET_UNIVS entry character for character.
ALT_NAMES = {
    "Duksung Women's University": ["덕성여자대학교", "덕성여대"],
    "Seoul National University": ["서울대학교", "서울대", "SNU"],
    "Korea University": ["고려대학교", "고려대", "KU"],
    "Yonsei University": ["연세대학교", "연세대"],
    "Korea Advanced Institute of Science and Technology": ["한국과학기술원", "KAIST"],
    "Sogang University": ["서강대학교", "서강대"],
    "Sungkyunkwan University": ["성균관대학교", "성균관대"],
    # Key was previously misspelled ("Unviersity") and therefore never matched.
    "Hanyang University": ["한양대학교", "한양대"],
    "Pohang University of Science and Technology": ["포항공과대학교", "포항공대", "POSTECH"],
    "Gwangju Institute of Science and Technology": ["광주과학기술원", "GIST"],
    "Daegu Gyeongbuk Institute of Science and Technology": ["대구경북과학기술원", "DGIST"],
    "Ulsan National Institute of Science and Technology": ["울산과학기술원", "UNIST"],
    "Chung-Ang University": ["중앙대학교", "중앙대", "Chungang University"],
    "Kyung Hee University": ["경희대학교", "경희대", "Kyunghee University"],
    "Kangwon National University": ["강원대학교", "강원대", "KNU"],
    "Kyungpook National University": ["경북대학교", "경북대"],
    "Gyeongsang National University": ["경상국립대학교", "경상대"],
    "Pusan National University": ["부산대학교", "부산대"],
    "Chonnam National University": ["전남대학교", "전남대"],
    "Jeonbuk National University": ["전북대학교", "전북대"],
    "Jeju National University": ["제주대학교", "제주대"],
    "Chungbuk National University": ["충북대학교", "충북대"],
    "Chungnam National University": ["충남대학교", "충남대"],
}

# Create the output directory up front (no error if it already exists).
os.makedirs(SAVE_DIR, exist_ok=True)

# ===================== 공통 유틸 =====================
def http_get(url, params, retry=RETRY, backoff=1.4, timeout=TIMEOUT):
    """
    GET *url* with *params*, retrying until an HTTP 200 response arrives.

    Up to *retry* attempts are made. After every failed attempt (any non-200
    status or a requests.RequestException) the function sleeps for
    (backoff ** attempt_index) + 0.2 seconds, attempt_index starting at 0;
    this also happens after the final failed attempt.

    Returns the response object on HTTP 200, or None once all attempts failed.
    """
    attempt = 0
    while attempt < retry:
        # Delay is derived from the 0-based attempt index.
        pause = (backoff ** attempt) + 0.2
        attempt += 1
        try:
            resp = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException as e:
            print(f"[HTTP:EXC] {e} retry {attempt}/{retry} url={url}")
        else:
            if resp.status_code == 200:
                return resp
            print(f"[HTTP:{resp.status_code}] retry {attempt}/{retry} url={url} params={params}")
        time.sleep(pause)
    print("[HTTP] giving up after retries.")
    return None

def reconstruct_abstract(inv_idx: dict | None) -> str | None:
    """
    OpenAlex abstract_inverted_index(토큰 -> 위치 리스트)를 실제 텍스트로 복원.
    """
    if not inv_idx:
        return None
    try:
        max_pos = max(p for positions in inv_idx.values() for p in positions)
        words = [""] * (max_pos + 1)
        for token, positions in inv_idx.items():
            for p in positions:
                words[p] = token
        text = " ".join(w for w in words if w is not None)
        return " ".join(text.split())
    except Exception as e:
        print(f"[ABSTRACT] reconstruction failed: {e}")
        return None

# ===================== (1) 기관명 → institutions.id =====================
def search_institution_top1(name: str, relax=False):
    """
    Return the first OpenAlex institution search hit for *name*, or None.

    The /institutions endpoint is queried with the 'search' parameter and the
    hyphenated 'per-page' parameter set to 1. By default results are filtered
    to country_code KR and type education; with relax=True only the country
    filter is kept. None is returned when the request fails or when the
    result list is empty.
    """
    endpoint = "https://api.openalex.org/institutions"
    if relax:
        flt = "country_code:KR"
    else:
        flt = "country_code:KR,type:education"
    print(f"[MAP]   -> searching: '{name}' | filter='{flt}'")

    resp = http_get(endpoint, {"search": name, "filter": flt, "per-page": 1})
    if not resp:
        print(f"[MAP]   -> no response for '{name}'")
        return None

    hits = resp.json().get("results", [])
    if not hits:
        print(f"[MAP]   -> no candidates for '{name}' (relax={relax})")
        return None

    top = hits[0]
    print(f"[MAP]   -> candidate: {top.get('display_name')} | id={top.get('id')}")
    return top

def resolve_institution_id(qname: str):
    """
    Resolve *qname* to an OpenAlex institution id.

    The English name is tried first, then each alias listed under
    ALT_NAMES[qname]. Every term is searched with the strict filter and, if
    that yields nothing, again with the relaxed filter.

    Returns a dict with keys query_name, institution_id (last path segment of
    the OpenAlex id) and display_name; the latter two are None when nothing
    could be resolved.
    """
    def _attempt(term, tag):
        # Strict search first, relaxed search as fallback.
        cand = search_institution_top1(term) or search_institution_top1(term, relax=True)
        if not cand or not cand.get("id"):
            return None
        short_id = cand["id"].split("/")[-1]
        print(f"[MAP] OK ({tag}): {qname} -> {cand.get('display_name')} | institutions.id={short_id}")
        return {"query_name": qname, "institution_id": short_id, "display_name": cand.get("display_name")}

    print(f"[MAP] Try EN: {qname}")
    resolved = _attempt(qname, "EN")
    if resolved is not None:
        return resolved

    for alias in ALT_NAMES.get(qname, []):
        print(f"[MAP] Try ALT: {qname} -> '{alias}'")
        resolved = _attempt(alias, "ALT")
        if resolved is not None:
            return resolved

    print(f"[MAP] FAIL: {qname} (could not resolve institutions.id)")
    return {"query_name": qname, "institution_id": None, "display_name": None}

def build_institution_map(names):
    """
    Resolve every name in *names* and return the results as a DataFrame.

    One row per name (columns: query_name, institution_id, display_name).
    The table is also written to SAVE_DIR/institutions_mapping.csv and
    printed. The function sleeps SLEEP seconds after each lookup.
    """
    print("[MAP] ==== start institution mapping ====")
    records = []
    for position, uni in enumerate(names, start=1):
        print(f"[MAP] [{position}/{len(names)}] resolving '{uni}'")
        records.append(resolve_institution_id(uni))
        time.sleep(SLEEP)

    mapping = pd.DataFrame(records)
    csv_path = os.path.join(SAVE_DIR, "institutions_mapping.csv")
    mapping.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"[MAP] saved mapping -> {csv_path}")
    print(mapping)
    return mapping

# ===================== (2) Collect top-TOP_K works for YEAR + abstracts/authors =====================
def extract_authors(authorships: list | None):
    """
    authorships 리스트에서 저자 관련 필드 평탄화.
    - author_ids: A... 꼬리만 추출
    - author_names: display_name
    - author_positions: first/middle/last
    - author_orcids: 0000-0002-...
    - author_affiliations: 저자별 기관 display_name들을 '|'로 묶고, 저자 간은 ';'로 연결
    """
    if not authorships:
        return {
            "authors_count": 0,
            "author_ids": None,
            "author_names": None,
            "author_positions": None,
            "author_orcids": None,
            "author_affiliations": None,
        }
    ids, names, poss, orcids, affs = [], [], [], [], []
    for a in authorships:
        author = a.get("author") or {}
        aid = author.get("id") or ""
        aid_tail = aid.split("/")[-1] if aid else None
        ids.append(aid_tail)
        names.append(author.get("display_name"))
        poss.append(a.get("author_position"))
        # orcid 위치가 author.orcid 또는 author.ids.orcid로 있을 수 있음
        orcid = author.get("orcid")
        if not orcid:
            orcid = (author.get("ids") or {}).get("orcid")
        orcids.append(orcid)

        insts = a.get("institutions") or []
        inst_names = [ (inst or {}).get("display_name") for inst in insts if inst ]
        affs.append("|".join([n for n in inst_names if n]))

    return {
        "authors_count": len(names),
        "author_ids": ";".join([x for x in ids if x]),
        "author_names": ";".join([x for x in names if x]),
        "author_positions": ";".join([x for x in poss if x]),
        "author_orcids": ";".join([x for x in orcids if x]),
        "author_affiliations": ";".join([x for x in affs if x]),
    }

def fetch_topk_works_for_institution(inst_id: str, qname: str, dname: str, year: int = YEAR, topk: int = TOP_K):
    """
    Collect up to *topk* works of one institution for one publication year.

    The /works endpoint is queried with the filter
    institutions.id:<inst_id>,publication_year:<year>, sorted by
    cited_by_count descending, and paged with the cursor returned in
    meta.next_cursor (PER_PAGE results per request, SLEEP seconds between
    pages). Paging stops when *topk* rows were collected, a request fails,
    a page is empty, or no next cursor is returned.

    *qname* is only used in progress messages; *dname* is accepted but not
    used inside this function.

    Returns a list of flat dicts (work metadata, reconstructed abstract and
    the author columns from extract_authors).
    """
    def _flatten(work):
        # Abstract first, then authors: same call order as the row layout.
        abstract_text = reconstruct_abstract(work.get("abstract_inverted_index"))
        author_cols = extract_authors(work.get("authorships"))
        source = (work.get("primary_location") or {}).get("source") or {}
        open_access = work.get("open_access") or {}
        return {
            "openalex_id": work.get("id"),
            "title": work.get("title"),
            "publication_year": work.get("publication_year"),
            "publication_date": work.get("publication_date"),
            "type": work.get("type"),
            "host_venue": source.get("display_name"),
            "cited_by_count": work.get("cited_by_count"),
            "is_oa": open_access.get("is_oa"),
            "oa_status": open_access.get("oa_status"),
            "language": work.get("language"),
            "doi": (work.get("ids") or {}).get("doi") or work.get("doi"),
            "abstract": abstract_text,
            **author_cols,
        }

    endpoint = "https://api.openalex.org/works"
    query = {
        "filter": f"institutions.id:{inst_id},publication_year:{year}",
        "per-page": PER_PAGE,          # hyphenated parameter name
        "cursor": "*",
        "sort": "cited_by_count:desc"
    }
    collected = []
    page_no = 0
    while True:
        page_no += 1
        print(f"[GET] {qname} | page={page_no} cursor={query.get('cursor')}")
        resp = http_get(endpoint, query)
        if not resp:
            print("[GET]   -> no response (stop).")
            break

        payload = resp.json()
        works = payload.get("results", [])
        if not works:
            print("[GET]   -> empty batch (stop).")
            break

        # Show the first title of the page as a progress indicator.
        rep_title = (works[0].get("title") or "")[:80]
        print(f"[GET]   -> fetched {len(works)} (rep: '{rep_title}...')")

        for work in works:
            collected.append(_flatten(work))
            if len(collected) >= topk:
                print(f"[GET]   -> reached top-{topk}; stopping early.")
                return collected[:topk]

        next_cursor = payload.get("meta", {}).get("next_cursor")
        print(f"[GET]   -> total accumulated: {len(collected)} | next_cursor={next_cursor}")
        if not next_cursor:
            print("[GET]   -> no next_cursor; finished.")
            break
        query["cursor"] = next_cursor
        time.sleep(SLEEP)
    return collected[:topk]

# ===================== (3) Run (mapping → fetch → save) =====================
# Resolve every target name to an OpenAlex institution id
# (build_institution_map also writes institutions_mapping.csv).
inst_map = build_institution_map(TARGET_UNIVS)
print("=== Institution mapping result ===")
print(inst_map.to_string(index=False))

# One DataFrame per institution; concatenated after the loop.
all_dfs = []
for _, row in inst_map.iterrows():
    inst_id = row["institution_id"]
    qname   = row["query_name"]
    dname   = row["display_name"]
    # Names that could not be resolved carry no id; skip them.
    if not inst_id:
        print(f"[SKIP] No institutions.id for '{qname}'")
        continue

    print(f"[RUN] Fetch top-{TOP_K} (year={YEAR}) for {qname} ({dname}) → institutions.id={inst_id}")
    works = fetch_topk_works_for_institution(inst_id, qname, dname, year=YEAR, topk=TOP_K)
    df_inst = pd.DataFrame(works)
    # Tag every row with the institution it was fetched for.
    df_inst["query_name"] = qname
    df_inst["display_name"] = dname
    df_inst["institution_id"] = inst_id

    # Build a file-name-safe variant of the query name ('/' and ' ' -> '_').
    safe = qname.replace("/", "_").replace(" ", "_")
    path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_{safe}.csv")
    df_inst.to_csv(path, index=False, encoding="utf-8")
    print(f"[SAVE] {qname} -> {path} (n={len(df_inst)})")
    all_dfs.append(df_inst)
    time.sleep(0.4)

# Combined table over all institutions (empty DataFrame if nothing was fetched).
df_all = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
all_path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_ALL.csv")
df_all.to_csv(all_path, index=False, encoding="utf-8")
print(f"[DONE] Total rows: {len(df_all)}")
print(f"[DONE] Saved to: {SAVE_DIR}")

# Preview
df_all.head()


[MAP] ==== start institution mapping ====
[MAP] [1/23] resolving 'Duksung Women's University'
[MAP] Try EN: Duksung Women's University
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Duksung Women's University' (relax=False)
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR'
[MAP]   -> candidate: Duksung Women's University | id=https://openalex.org/I65832422
[MAP] OK (EN): Duksung Women's University -> Duksung Women's University | institutions.id=I65832422
[MAP] [2/23] resolving 'Seoul National University'
[MAP] Try EN: Seoul National University
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Seoul National University' (relax=False)
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR'
[MAP]   -> candidate: Seoul National University | id=https://openalex.org/I139264467
[MAP] OK (EN): Seoul 

Unnamed: 0,openalex_id,title,publication_year,publication_date,type,host_venue,cited_by_count,is_oa,oa_status,language,...,abstract,authors_count,author_ids,author_names,author_positions,author_orcids,author_affiliations,query_name,display_name,institution_id
0,https://openalex.org/W2415913580,The effects of e-mass customization on consume...,2016,2016-06-08,article,Journal of Business Research,204,False,closed,en,...,,2,A5102740803;A5072514254,Jungmin Yoo;Minjung Park,first;last,https://orcid.org/0000-0002-8398-955X;https://...,Duksung Women's University;Ewha Womans University,Duksung Women's University,Duksung Women's University,I65832422
1,https://openalex.org/W2277062722,Training children’s theory-of-mind: A meta-ana...,2016,2016-02-20,review,Cognition,131,True,green,en,...,,8,A5063304307;A5071760398;A5009692359;A510143914...,Stefan G. Hofmann;Stacey N. Doan;Manuel Sprung...,first;middle;middle;middle;middle;middle;middl...,https://orcid.org/0000-0002-3548-9681;https://...,Boston University;Claremont McKenna College;Th...,Duksung Women's University,Duksung Women's University,I65832422
2,https://openalex.org/W2265352325,The Revised Child Anxiety and Depression Scale...,2016,2016-02-01,article,Assessment,101,False,closed,en,...,To help facilitate the dissemination and imple...,5,A5045874864;A5030074615;A5031035006;A504327620...,Chad Ebesutani;Priya Korathu-Larson;Brad J. Na...,first;middle;middle;middle;last,https://orcid.org/0000-0002-8133-6440;https://...,Duksung Women's University;University of Hawai...,Duksung Women's University,Duksung Women's University,I65832422
3,https://openalex.org/W2345532848,Chronotype Differences in Health Behaviors and...,2016,2016-05-05,article,Behavioral Sleep Medicine,84,False,closed,en,...,"This study investigates health behaviors, heal...",7,A5010591064;A5108512690;A5055753007;A501434999...,Sooyeon Suh;Hae-Chung Yang;Nan Hee Kim;Ji Hee ...,first;middle;middle;middle;middle;middle;last,https://orcid.org/0000-0003-0644-8634;https://...,Sungshin Women's University|Palo Alto Universi...,Duksung Women's University,Duksung Women's University,I65832422
4,https://openalex.org/W2278097313,Hyperoside and rutin of Nelumbo nucifera induc...,2016,2016-02-18,article,Oncology Letters,73,True,gold,en,...,The present study demonstrates the mechanism o...,2,A5109513922;A5101596150,Tae Eun Guon;Ha Sook Chung,first;last,https://orcid.org/0000-0001-9983-6489,Duksung Women's University;Duksung Women's Uni...,Duksung Women's University,Duksung Women's University,I65832422


In [None]:
# Colab 1-cell script (institutions.id only, year=2015, top-10000 by citations)
# Adds authorship fields + abstract reconstruction + verbose prints

import os, time, requests, pandas as pd

# ===================== User settings =====================
# Output folder on the mounted Google Drive; all CSVs produced by this cell are written here.
SAVE_DIR = "/content/drive/MyDrive/6. 덕성여자대학교/25학년도 2학기/비정형데이터분석/프로젝트/openalex_top10000_2015_with_authors"
YEAR = 2015
TOP_K = 10000        # maximum number of works kept per institution
PER_PAGE = 200     # results requested per page (noted by the author as the OpenAlex maximum)
SLEEP = 0.2        # pause between requests, in seconds (to avoid HTTP 429)
RETRY = 5          # number of attempts per request
TIMEOUT = 60       # request timeout (seconds)

# Target institutions: 23 universities (Duksung Women's University included)
TARGET_UNIVS = [
    "Duksung Women's University",
    "Seoul National University",
    "Korea University",
    "Yonsei University",
    "Korea Advanced Institute of Science and Technology",  # KAIST
    "Sogang University",
    "Sungkyunkwan University",
    "Hanyang University",
    "Pohang University of Science and Technology", #POSTECH
    "Gwangju Institute of Science and Technology", #GIST
    "Daegu Gyeongbuk Institute of Science and Technology", #DGIST
    "Ulsan National Institute of Science and Technology",   #UNIST
    "Chung-Ang University",
    "Kyung Hee University",
    "Kangwon National University",
    "Kyungpook National University",
    "Gyeongsang National University",
    "Pusan National University",
    "Chonnam National University",
    "Jeonbuk National University",
    "Jeju National University",
    "Chungbuk National University",
    "Chungnam National University"
]

# Fallback aliases tried when the English name cannot be resolved.
# Keys must match the entries of TARGET_UNIVS exactly, because the lookup is
# ALT_NAMES.get(<target name>, []).
# NOTE(review): short acronyms such as "KNU" or "KU" may be shared by several
# Korean universities — confirm they resolve to the intended institution.
ALT_NAMES = {
    "Duksung Women's University": ["덕성여자대학교", "덕성여대"],
    "Seoul National University": ["서울대학교", "서울대", "SNU"],
    "Korea University": ["고려대학교", "고려대", "KU"],
    "Yonsei University": ["연세대학교", "연세대"],
    "Korea Advanced Institute of Science and Technology": ["한국과학기술원", "KAIST"],
    "Sogang University": ["서강대학교", "서강대"],
    "Sungkyunkwan University": ["성균관대학교", "성균관대"],
    # Key was misspelled ("Unviersity"), so these aliases were never found.
    "Hanyang University": ["한양대학교", "한양대"],
    "Pohang University of Science and Technology": ["포항공과대학교", "포항공대", "POSTECH"],
    "Gwangju Institute of Science and Technology": ["광주과학기술원", "GIST"],
    "Daegu Gyeongbuk Institute of Science and Technology": ["대구경북과학기술원", "DGIST"],
    "Ulsan National Institute of Science and Technology": ["울산과학기술원", "UNIST"],
    "Chung-Ang University": ["중앙대학교", "중앙대", "Chungang University"],
    "Kyung Hee University": ["경희대학교", "경희대", "Kyunghee University"],
    "Kangwon National University": ["강원대학교", "강원대", "KNU"],
    "Kyungpook National University": ["경북대학교", "경북대"],
    "Gyeongsang National University": ["경상국립대학교", "경상대"],
    "Pusan National University": ["부산대학교", "부산대"],
    "Chonnam National University": ["전남대학교", "전남대"],
    "Jeonbuk National University": ["전북대학교", "전북대"],
    "Jeju National University": ["제주대학교", "제주대"],
    "Chungbuk National University": ["충북대학교", "충북대"],
    "Chungnam National University": ["충남대학교", "충남대"],
}

# Create the output folder before any CSV is written; exist_ok=True keeps re-runs from failing.
os.makedirs(SAVE_DIR, exist_ok=True)

# ===================== 공통 유틸 =====================
def http_get(url, params, retry=RETRY, backoff=1.4, timeout=TIMEOUT):
    """GET ``url`` with query ``params``, retrying with exponential backoff.

    Args:
        url: Endpoint to request.
        params: Query-string parameters passed to ``requests.get``.
        retry: Total number of attempts.
        backoff: Base of the exponential wait; attempt ``i`` (0-based) waits
            ``backoff ** i + 0.2`` seconds before the next attempt.
        timeout: Per-request timeout in seconds.

    Returns:
        The ``requests.Response`` when the status code is 200, otherwise
        ``None`` once every attempt has failed (non-200 status or a
        ``requests.RequestException``).
    """
    for i in range(retry):
        try:
            r = requests.get(url, params=params, timeout=timeout)
            if r.status_code == 200:
                return r
            print(f"[HTTP:{r.status_code}] retry {i+1}/{retry} url={url} params={params}")
        except requests.RequestException as e:
            print(f"[HTTP:EXC] {e} retry {i+1}/{retry} url={url}")
        # Wait only when another attempt follows: sleeping after the final
        # failure just delays the caller without any chance of success.
        if i < retry - 1:
            time.sleep((backoff ** i) + 0.2)
    print("[HTTP] giving up after retries.")
    return None

def reconstruct_abstract(inv_idx: dict | None) -> str | None:
    """
    OpenAlex abstract_inverted_index(토큰 -> 위치 리스트)를 실제 텍스트로 복원.
    """
    if not inv_idx:
        return None
    try:
        max_pos = max(p for positions in inv_idx.values() for p in positions)
        words = [""] * (max_pos + 1)
        for token, positions in inv_idx.items():
            for p in positions:
                words[p] = token
        text = " ".join(w for w in words if w is not None)
        return " ".join(text.split())
    except Exception as e:
        print(f"[ABSTRACT] reconstruction failed: {e}")
        return None

# ===================== (1) 기관명 → institutions.id =====================
def search_institution_top1(name: str, relax=False):
    """Return the first OpenAlex institution search hit for ``name``.

    The request uses the ``search`` parameter and the hyphenated ``per-page``
    key with a page size of 1.

    Args:
        name: Institution name to search for.
        relax: When False the filter is ``country_code:KR,type:education``;
            when True the type restriction is dropped (``country_code:KR``).

    Returns:
        The first result dict, or ``None`` when the request failed or the
        result list is empty.
    """
    endpoint = "https://api.openalex.org/institutions"
    if relax:
        flt = "country_code:KR"
    else:
        flt = "country_code:KR,type:education"

    print(f"[MAP]   -> searching: '{name}' | filter='{flt}'")
    resp = http_get(endpoint, {"search": name, "filter": flt, "per-page": 1})
    if not resp:
        print(f"[MAP]   -> no response for '{name}'")
        return None

    hits = resp.json().get("results", [])
    if not hits:
        print(f"[MAP]   -> no candidates for '{name}' (relax={relax})")
        return None

    best = hits[0]
    print(f"[MAP]   -> candidate: {best.get('display_name')} | id={best.get('id')}")
    return best

def resolve_institution_id(qname: str):
    """Resolve a university name to its short OpenAlex institution id.

    The English name is tried first, then each alias listed for it in
    ``ALT_NAMES``. Every term is searched with the strict filter and, if that
    yields nothing, with the relaxed one.

    Returns:
        A dict with ``query_name``, ``institution_id`` (last path segment of
        the OpenAlex id URL) and ``display_name``; the last two are ``None``
        when nothing could be resolved.
    """
    def _lookup(term):
        # strict search first; fall back to the relaxed filter on a miss
        return search_institution_top1(term) or search_institution_top1(term, relax=True)

    print(f"[MAP] Try EN: {qname}")
    hit = _lookup(qname)
    if hit and hit.get("id"):
        short_id = hit["id"].split("/")[-1]
        shown = hit.get("display_name")
        print(f"[MAP] OK (EN): {qname} -> {shown} | institutions.id={short_id}")
        return {"query_name": qname, "institution_id": short_id, "display_name": shown}

    for alias in ALT_NAMES.get(qname, []):
        print(f"[MAP] Try ALT: {qname} -> '{alias}'")
        hit = _lookup(alias)
        if not (hit and hit.get("id")):
            continue
        short_id = hit["id"].split("/")[-1]
        shown = hit.get("display_name")
        print(f"[MAP] OK (ALT): {qname} -> {shown} | institutions.id={short_id}")
        return {"query_name": qname, "institution_id": short_id, "display_name": shown}

    print(f"[MAP] FAIL: {qname} (could not resolve institutions.id)")
    return {"query_name": qname, "institution_id": None, "display_name": None}

def build_institution_map(names):
    """Resolve every name in ``names`` and persist the mapping.

    Each name is resolved with ``resolve_institution_id`` (pausing ``SLEEP``
    seconds after each one). The resulting table is written to
    ``institutions_mapping.csv`` inside ``SAVE_DIR``, printed, and returned
    as a DataFrame.
    """
    print("[MAP] ==== start institution mapping ====")
    records = []
    for pos, uni in enumerate(names, start=1):
        print(f"[MAP] [{pos}/{len(names)}] resolving '{uni}'")
        records.append(resolve_institution_id(uni))
        time.sleep(SLEEP)

    mapping = pd.DataFrame(records)
    csv_path = os.path.join(SAVE_DIR, "institutions_mapping.csv")
    mapping.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"[MAP] saved mapping -> {csv_path}")
    print(mapping)
    return mapping

# ===================== (2) Collect the top-TOP_K works for YEAR + abstracts/authors =====================
def extract_authors(authorships: list | None):
    """
    authorships 리스트에서 저자 관련 필드 평탄화.
    - author_ids: A... 꼬리만 추출
    - author_names: display_name
    - author_positions: first/middle/last
    - author_orcids: 0000-0002-...
    - author_affiliations: 저자별 기관 display_name들을 '|'로 묶고, 저자 간은 ';'로 연결
    """
    if not authorships:
        return {
            "authors_count": 0,
            "author_ids": None,
            "author_names": None,
            "author_positions": None,
            "author_orcids": None,
            "author_affiliations": None,
        }
    ids, names, poss, orcids, affs = [], [], [], [], []
    for a in authorships:
        author = a.get("author") or {}
        aid = author.get("id") or ""
        aid_tail = aid.split("/")[-1] if aid else None
        ids.append(aid_tail)
        names.append(author.get("display_name"))
        poss.append(a.get("author_position"))
        # orcid 위치가 author.orcid 또는 author.ids.orcid로 있을 수 있음
        orcid = author.get("orcid")
        if not orcid:
            orcid = (author.get("ids") or {}).get("orcid")
        orcids.append(orcid)

        insts = a.get("institutions") or []
        inst_names = [ (inst or {}).get("display_name") for inst in insts if inst ]
        affs.append("|".join([n for n in inst_names if n]))

    return {
        "authors_count": len(names),
        "author_ids": ";".join([x for x in ids if x]),
        "author_names": ";".join([x for x in names if x]),
        "author_positions": ";".join([x for x in poss if x]),
        "author_orcids": ";".join([x for x in orcids if x]),
        "author_affiliations": ";".join([x for x in affs if x]),
    }

def fetch_topk_works_for_institution(inst_id: str, qname: str, dname: str, year: int = YEAR, topk: int = TOP_K):
    """Collect up to ``topk`` works of one institution for ``year``.

    Works are requested from the OpenAlex ``/works`` endpoint sorted by
    ``cited_by_count`` descending and paged with the cursor mechanism
    (``PER_PAGE`` results per page, ``SLEEP`` seconds between pages).

    Args:
        inst_id: Short OpenAlex institution id used in the filter.
        qname: Query name, used only in progress messages.
        dname: Display name; not used inside this function.
        year: Publication year to filter on.
        topk: Maximum number of rows to return.

    Returns:
        A list of flat dicts (one per work). Paging stops when ``topk`` is
        reached, a request fails, a page is empty, or no next cursor is
        given; whatever was collected so far is returned.
    """
    endpoint = "https://api.openalex.org/works"
    query = {
        "filter": f"institutions.id:{inst_id},publication_year:{year}",
        "per-page": PER_PAGE,  # hyphenated parameter name
        "cursor": "*",
        "sort": "cited_by_count:desc",
    }

    def _to_row(w):
        # Flatten one work record into the CSV row layout.
        abstract_text = reconstruct_abstract(w.get("abstract_inverted_index"))
        auth = extract_authors(w.get("authorships"))
        source = (w.get("primary_location") or {}).get("source") or {}
        open_access = w.get("open_access") or {}
        row = {
            "openalex_id": w.get("id"),
            "title": w.get("title"),
            "publication_year": w.get("publication_year"),
            "publication_date": w.get("publication_date"),
            "type": w.get("type"),
            "host_venue": source.get("display_name"),
            "cited_by_count": w.get("cited_by_count"),
            "is_oa": open_access.get("is_oa"),
            "oa_status": open_access.get("oa_status"),
            "language": w.get("language"),
            "doi": (w.get("ids") or {}).get("doi") or w.get("doi"),
            "abstract": abstract_text,
        }
        row.update(auth)
        return row

    collected = []
    page_no = 0
    while True:
        page_no += 1
        print(f"[GET] {qname} | page={page_no} cursor={query.get('cursor')}")
        resp = http_get(endpoint, query)
        if not resp:
            print(f"[GET]   -> no response (stop).")
            break

        payload = resp.json()
        batch = payload.get("results", [])
        if not batch:
            print(f"[GET]   -> empty batch (stop).")
            break

        first_title = (batch[0].get("title") or "")[:80]
        print(f"[GET]   -> fetched {len(batch)} (rep: '{first_title}...')")

        for work in batch:
            collected.append(_to_row(work))
            if len(collected) >= topk:
                print(f"[GET]   -> reached top-{topk}; stopping early.")
                return collected[:topk]

        next_cursor = payload.get("meta", {}).get("next_cursor")
        print(f"[GET]   -> total accumulated: {len(collected)} | next_cursor={next_cursor}")
        if not next_cursor:
            print("[GET]   -> no next_cursor; finished.")
            break
        query["cursor"] = next_cursor
        time.sleep(SLEEP)
    return collected[:topk]

# ===================== (3) Run (mapping -> collection -> save) =====================
# Step 1: resolve every target university to an OpenAlex institution id.
inst_map = build_institution_map(TARGET_UNIVS)
print("=== Institution mapping result ===")
print(inst_map.to_string(index=False))

# Step 2: fetch works per institution and write one CSV per institution.
all_dfs = []
for _, row in inst_map.iterrows():
    inst_id = row["institution_id"]
    qname   = row["query_name"]
    dname   = row["display_name"]
    # Institutions that could not be resolved carry no id and are skipped.
    if not inst_id:
        print(f"[SKIP] No institutions.id for '{qname}'")
        continue

    print(f"[RUN] Fetch top-{TOP_K} (year={YEAR}) for {qname} ({dname}) → institutions.id={inst_id}")
    works = fetch_topk_works_for_institution(inst_id, qname, dname, year=YEAR, topk=TOP_K)
    df_inst = pd.DataFrame(works)
    # Tag every row with the institution it was fetched for.
    df_inst["query_name"] = qname
    df_inst["display_name"] = dname
    df_inst["institution_id"] = inst_id

    # Build a file name from the query name: '/' and spaces become '_'.
    safe = qname.replace("/", "_").replace(" ", "_")
    path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_{safe}.csv")
    df_inst.to_csv(path, index=False, encoding="utf-8")
    print(f"[SAVE] {qname} -> {path} (n={len(df_inst)})")
    all_dfs.append(df_inst)
    time.sleep(0.4)  # extra pause between institutions

# Step 3: concatenate all institutions into one table (empty frame if nothing was fetched) and save it.
df_all = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
all_path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_ALL.csv")
df_all.to_csv(all_path, index=False, encoding="utf-8")
print(f"[DONE] Total rows: {len(df_all)}")
print(f"[DONE] Saved to: {SAVE_DIR}")

# Preview (last expression of the cell, so the notebook displays it)
df_all.head()


[MAP] ==== start institution mapping ====
[MAP] [1/23] resolving 'Duksung Women's University'
[MAP] Try EN: Duksung Women's University
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Duksung Women's University' (relax=False)
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR'
[MAP]   -> candidate: Duksung Women's University | id=https://openalex.org/I65832422
[MAP] OK (EN): Duksung Women's University -> Duksung Women's University | institutions.id=I65832422
[MAP] [2/23] resolving 'Seoul National University'
[MAP] Try EN: Seoul National University
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Seoul National University' (relax=False)
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR'
[MAP]   -> candidate: Seoul National University | id=https://openalex.org/I139264467
[MAP] OK (EN): Seoul 

Unnamed: 0,openalex_id,title,publication_year,publication_date,type,host_venue,cited_by_count,is_oa,oa_status,language,...,abstract,authors_count,author_ids,author_names,author_positions,author_orcids,author_affiliations,query_name,display_name,institution_id
0,https://openalex.org/W2132922452,A microengineered pathophysiological model of ...,2015,2015-01-01,article,Lab on a Chip,214,True,green,en,...,A mounting body of evidence in cancer research...,10,A5107006393;A5011047099;A5072839636;A502742460...,Yoon‐Seok Choi;Eunjeh Hyun;Jeongyun Seo;Cassid...,first;middle;middle;middle;middle;middle;middl...,https://orcid.org/0000-0003-4794-4263;https://...,Government of the Republic of Korea|Seoul Nati...,Duksung Women's University,Duksung Women's University,I65832422
1,https://openalex.org/W2019507092,"Irisin, a Novel Myokine, Regulates Glucose Upt...",2015,2015-04-01,article,Molecular Endocrinology,193,True,bronze,en,...,Irisin is a novel myokine produced by skeletal...,13,A5100743829;A5110004190;A5052092637;A503170196...,Hye Jeong Lee;Jung Ok Lee;Nami Kim;Joong Kwan ...,first;middle;middle;middle;middle;middle;middl...,https://orcid.org/0000-0003-4535-9511;https://...,Korea University;Korea University;Korea Univer...,Duksung Women's University,Duksung Women's University,I65832422
2,https://openalex.org/W2036439337,RViz: a toolkit for real domain data visualiza...,2015,2015-04-15,article,Telecommunication Systems,165,False,closed,en,...,,4,A5001183035;A5100402780;A5061685064;A5053313621,Hyeong Ryeol Kam;Sung-Ho Lee;Taejung Park;Chan...,first;middle;middle;last,https://orcid.org/0000-0001-9114-2319;https://...,Korea University;Duksung Women's University;Ko...,Duksung Women's University,Duksung Women's University,I65832422
3,https://openalex.org/W2206531137,Mutual Authentication Scheme in Secure Interne...,2015,2015-12-24,article,Sensors,142,True,gold,en,...,"The Internet of Things (IoT), which can be reg...",2,A5032079311;A5110295646,Namje Park;Namhi Kang,first;last,https://orcid.org/0000-0003-4434-8933,Jeju National University;Duksung Women's Unive...,Duksung Women's University,Duksung Women's University,I65832422
4,https://openalex.org/W2230175145,Natural Products for Chemoprevention of Breast...,2015,2015-12-30,review,Journal of Cancer Prevention,133,True,gold,en,...,Breast cancer is the primary cause of cancer d...,2,A5065134845;A5083115378,EunYi Ko;Aree Moon,first;last,,Duksung Women's University;Duksung Women's Uni...,Duksung Women's University,Duksung Women's University,I65832422


In [None]:
# Colab 1-cell script (institutions.id only; year and top-K by citations are set via YEAR / TOP_K below)
# Adds authorship fields + abstract reconstruction + verbose prints

import os, time, requests, pandas as pd

# ===================== User settings =====================
# Output folder on the mounted Google Drive; all CSVs produced by this cell are written here.
SAVE_DIR = "/content/drive/MyDrive/6. 덕성여자대학교/25학년도 2학기/비정형데이터분석/프로젝트/openalex_top10000_2014_with_authors"
YEAR = 2014
TOP_K = 10000        # maximum number of works kept per institution
PER_PAGE = 200     # results requested per page (noted by the author as the OpenAlex maximum)
SLEEP = 0.2        # pause between requests, in seconds (to avoid HTTP 429)
RETRY = 5          # number of attempts per request
TIMEOUT = 60       # request timeout (seconds)

# Target institutions: 23 universities (Duksung Women's University included)
TARGET_UNIVS = [
    "Duksung Women's University",
    "Seoul National University",
    "Korea University",
    "Yonsei University",
    "Korea Advanced Institute of Science and Technology",  # KAIST
    "Sogang University",
    "Sungkyunkwan University",
    "Hanyang University",
    "Pohang University of Science and Technology", #POSTECH
    "Gwangju Institute of Science and Technology", #GIST
    "Daegu Gyeongbuk Institute of Science and Technology", #DGIST
    "Ulsan National Institute of Science and Technology",   #UNIST
    "Chung-Ang University",
    "Kyung Hee University",
    "Kangwon National University",
    "Kyungpook National University",
    "Gyeongsang National University",
    "Pusan National University",
    "Chonnam National University",
    "Jeonbuk National University",
    "Jeju National University",
    "Chungbuk National University",
    "Chungnam National University"
]

# Fallback aliases tried when the English name cannot be resolved.
# Keys must match the entries of TARGET_UNIVS exactly, because the lookup is
# ALT_NAMES.get(<target name>, []).
# NOTE(review): short acronyms such as "KNU" or "KU" may be shared by several
# Korean universities — confirm they resolve to the intended institution.
ALT_NAMES = {
    "Duksung Women's University": ["덕성여자대학교", "덕성여대"],
    "Seoul National University": ["서울대학교", "서울대", "SNU"],
    "Korea University": ["고려대학교", "고려대", "KU"],
    "Yonsei University": ["연세대학교", "연세대"],
    "Korea Advanced Institute of Science and Technology": ["한국과학기술원", "KAIST"],
    "Sogang University": ["서강대학교", "서강대"],
    "Sungkyunkwan University": ["성균관대학교", "성균관대"],
    # Key was misspelled ("Unviersity"), so these aliases were never found.
    "Hanyang University": ["한양대학교", "한양대"],
    "Pohang University of Science and Technology": ["포항공과대학교", "포항공대", "POSTECH"],
    "Gwangju Institute of Science and Technology": ["광주과학기술원", "GIST"],
    "Daegu Gyeongbuk Institute of Science and Technology": ["대구경북과학기술원", "DGIST"],
    "Ulsan National Institute of Science and Technology": ["울산과학기술원", "UNIST"],
    "Chung-Ang University": ["중앙대학교", "중앙대", "Chungang University"],
    "Kyung Hee University": ["경희대학교", "경희대", "Kyunghee University"],
    "Kangwon National University": ["강원대학교", "강원대", "KNU"],
    "Kyungpook National University": ["경북대학교", "경북대"],
    "Gyeongsang National University": ["경상국립대학교", "경상대"],
    "Pusan National University": ["부산대학교", "부산대"],
    "Chonnam National University": ["전남대학교", "전남대"],
    "Jeonbuk National University": ["전북대학교", "전북대"],
    "Jeju National University": ["제주대학교", "제주대"],
    "Chungbuk National University": ["충북대학교", "충북대"],
    "Chungnam National University": ["충남대학교", "충남대"],
}

# Create the output folder before any CSV is written; exist_ok=True keeps re-runs from failing.
os.makedirs(SAVE_DIR, exist_ok=True)

# ===================== 공통 유틸 =====================
def http_get(url, params, retry=RETRY, backoff=1.4, timeout=TIMEOUT):
    """GET ``url`` with query ``params``, retrying with exponential backoff.

    Args:
        url: Endpoint to request.
        params: Query-string parameters passed to ``requests.get``.
        retry: Total number of attempts.
        backoff: Base of the exponential wait; attempt ``i`` (0-based) waits
            ``backoff ** i + 0.2`` seconds before the next attempt.
        timeout: Per-request timeout in seconds.

    Returns:
        The ``requests.Response`` when the status code is 200, otherwise
        ``None`` once every attempt has failed (non-200 status or a
        ``requests.RequestException``).
    """
    for i in range(retry):
        try:
            r = requests.get(url, params=params, timeout=timeout)
            if r.status_code == 200:
                return r
            print(f"[HTTP:{r.status_code}] retry {i+1}/{retry} url={url} params={params}")
        except requests.RequestException as e:
            print(f"[HTTP:EXC] {e} retry {i+1}/{retry} url={url}")
        # Wait only when another attempt follows: sleeping after the final
        # failure just delays the caller without any chance of success.
        if i < retry - 1:
            time.sleep((backoff ** i) + 0.2)
    print("[HTTP] giving up after retries.")
    return None

def reconstruct_abstract(inv_idx: dict | None) -> str | None:
    """
    OpenAlex abstract_inverted_index(토큰 -> 위치 리스트)를 실제 텍스트로 복원.
    """
    if not inv_idx:
        return None
    try:
        max_pos = max(p for positions in inv_idx.values() for p in positions)
        words = [""] * (max_pos + 1)
        for token, positions in inv_idx.items():
            for p in positions:
                words[p] = token
        text = " ".join(w for w in words if w is not None)
        return " ".join(text.split())
    except Exception as e:
        print(f"[ABSTRACT] reconstruction failed: {e}")
        return None

# ===================== (1) 기관명 → institutions.id =====================
def search_institution_top1(name: str, relax=False):
    """Return the first OpenAlex institution search hit for ``name``.

    The request uses the ``search`` parameter and the hyphenated ``per-page``
    key with a page size of 1.

    Args:
        name: Institution name to search for.
        relax: When False the filter is ``country_code:KR,type:education``;
            when True the type restriction is dropped (``country_code:KR``).

    Returns:
        The first result dict, or ``None`` when the request failed or the
        result list is empty.
    """
    endpoint = "https://api.openalex.org/institutions"
    if relax:
        flt = "country_code:KR"
    else:
        flt = "country_code:KR,type:education"

    print(f"[MAP]   -> searching: '{name}' | filter='{flt}'")
    resp = http_get(endpoint, {"search": name, "filter": flt, "per-page": 1})
    if not resp:
        print(f"[MAP]   -> no response for '{name}'")
        return None

    hits = resp.json().get("results", [])
    if not hits:
        print(f"[MAP]   -> no candidates for '{name}' (relax={relax})")
        return None

    best = hits[0]
    print(f"[MAP]   -> candidate: {best.get('display_name')} | id={best.get('id')}")
    return best

def resolve_institution_id(qname: str):
    """Resolve a university name to its short OpenAlex institution id.

    The English name is tried first, then each alias listed for it in
    ``ALT_NAMES``. Every term is searched with the strict filter and, if that
    yields nothing, with the relaxed one.

    Returns:
        A dict with ``query_name``, ``institution_id`` (last path segment of
        the OpenAlex id URL) and ``display_name``; the last two are ``None``
        when nothing could be resolved.
    """
    def _lookup(term):
        # strict search first; fall back to the relaxed filter on a miss
        return search_institution_top1(term) or search_institution_top1(term, relax=True)

    print(f"[MAP] Try EN: {qname}")
    hit = _lookup(qname)
    if hit and hit.get("id"):
        short_id = hit["id"].split("/")[-1]
        shown = hit.get("display_name")
        print(f"[MAP] OK (EN): {qname} -> {shown} | institutions.id={short_id}")
        return {"query_name": qname, "institution_id": short_id, "display_name": shown}

    for alias in ALT_NAMES.get(qname, []):
        print(f"[MAP] Try ALT: {qname} -> '{alias}'")
        hit = _lookup(alias)
        if not (hit and hit.get("id")):
            continue
        short_id = hit["id"].split("/")[-1]
        shown = hit.get("display_name")
        print(f"[MAP] OK (ALT): {qname} -> {shown} | institutions.id={short_id}")
        return {"query_name": qname, "institution_id": short_id, "display_name": shown}

    print(f"[MAP] FAIL: {qname} (could not resolve institutions.id)")
    return {"query_name": qname, "institution_id": None, "display_name": None}

def build_institution_map(names):
    """Resolve every name in ``names`` and persist the mapping.

    Each name is resolved with ``resolve_institution_id`` (pausing ``SLEEP``
    seconds after each one). The resulting table is written to
    ``institutions_mapping.csv`` inside ``SAVE_DIR``, printed, and returned
    as a DataFrame.
    """
    print("[MAP] ==== start institution mapping ====")
    records = []
    for pos, uni in enumerate(names, start=1):
        print(f"[MAP] [{pos}/{len(names)}] resolving '{uni}'")
        records.append(resolve_institution_id(uni))
        time.sleep(SLEEP)

    mapping = pd.DataFrame(records)
    csv_path = os.path.join(SAVE_DIR, "institutions_mapping.csv")
    mapping.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"[MAP] saved mapping -> {csv_path}")
    print(mapping)
    return mapping

# ===================== (2) Collect the top-TOP_K works for YEAR + abstracts/authors =====================
def extract_authors(authorships: list | None):
    """
    authorships 리스트에서 저자 관련 필드 평탄화.
    - author_ids: A... 꼬리만 추출
    - author_names: display_name
    - author_positions: first/middle/last
    - author_orcids: 0000-0002-...
    - author_affiliations: 저자별 기관 display_name들을 '|'로 묶고, 저자 간은 ';'로 연결
    """
    if not authorships:
        return {
            "authors_count": 0,
            "author_ids": None,
            "author_names": None,
            "author_positions": None,
            "author_orcids": None,
            "author_affiliations": None,
        }
    ids, names, poss, orcids, affs = [], [], [], [], []
    for a in authorships:
        author = a.get("author") or {}
        aid = author.get("id") or ""
        aid_tail = aid.split("/")[-1] if aid else None
        ids.append(aid_tail)
        names.append(author.get("display_name"))
        poss.append(a.get("author_position"))
        # orcid 위치가 author.orcid 또는 author.ids.orcid로 있을 수 있음
        orcid = author.get("orcid")
        if not orcid:
            orcid = (author.get("ids") or {}).get("orcid")
        orcids.append(orcid)

        insts = a.get("institutions") or []
        inst_names = [ (inst or {}).get("display_name") for inst in insts if inst ]
        affs.append("|".join([n for n in inst_names if n]))

    return {
        "authors_count": len(names),
        "author_ids": ";".join([x for x in ids if x]),
        "author_names": ";".join([x for x in names if x]),
        "author_positions": ";".join([x for x in poss if x]),
        "author_orcids": ";".join([x for x in orcids if x]),
        "author_affiliations": ";".join([x for x in affs if x]),
    }

def fetch_topk_works_for_institution(inst_id: str, qname: str, dname: str, year: int = YEAR, topk: int = TOP_K):
    """Collect up to ``topk`` works of one institution for ``year``.

    Works are requested from the OpenAlex ``/works`` endpoint sorted by
    ``cited_by_count`` descending and paged with the cursor mechanism
    (``PER_PAGE`` results per page, ``SLEEP`` seconds between pages).

    Args:
        inst_id: Short OpenAlex institution id used in the filter.
        qname: Query name, used only in progress messages.
        dname: Display name; not used inside this function.
        year: Publication year to filter on.
        topk: Maximum number of rows to return.

    Returns:
        A list of flat dicts (one per work). Paging stops when ``topk`` is
        reached, a request fails, a page is empty, or no next cursor is
        given; whatever was collected so far is returned.
    """
    endpoint = "https://api.openalex.org/works"
    query = {
        "filter": f"institutions.id:{inst_id},publication_year:{year}",
        "per-page": PER_PAGE,  # hyphenated parameter name
        "cursor": "*",
        "sort": "cited_by_count:desc",
    }

    def _to_row(w):
        # Flatten one work record into the CSV row layout.
        abstract_text = reconstruct_abstract(w.get("abstract_inverted_index"))
        auth = extract_authors(w.get("authorships"))
        source = (w.get("primary_location") or {}).get("source") or {}
        open_access = w.get("open_access") or {}
        row = {
            "openalex_id": w.get("id"),
            "title": w.get("title"),
            "publication_year": w.get("publication_year"),
            "publication_date": w.get("publication_date"),
            "type": w.get("type"),
            "host_venue": source.get("display_name"),
            "cited_by_count": w.get("cited_by_count"),
            "is_oa": open_access.get("is_oa"),
            "oa_status": open_access.get("oa_status"),
            "language": w.get("language"),
            "doi": (w.get("ids") or {}).get("doi") or w.get("doi"),
            "abstract": abstract_text,
        }
        row.update(auth)
        return row

    collected = []
    page_no = 0
    while True:
        page_no += 1
        print(f"[GET] {qname} | page={page_no} cursor={query.get('cursor')}")
        resp = http_get(endpoint, query)
        if not resp:
            print(f"[GET]   -> no response (stop).")
            break

        payload = resp.json()
        batch = payload.get("results", [])
        if not batch:
            print(f"[GET]   -> empty batch (stop).")
            break

        first_title = (batch[0].get("title") or "")[:80]
        print(f"[GET]   -> fetched {len(batch)} (rep: '{first_title}...')")

        for work in batch:
            collected.append(_to_row(work))
            if len(collected) >= topk:
                print(f"[GET]   -> reached top-{topk}; stopping early.")
                return collected[:topk]

        next_cursor = payload.get("meta", {}).get("next_cursor")
        print(f"[GET]   -> total accumulated: {len(collected)} | next_cursor={next_cursor}")
        if not next_cursor:
            print("[GET]   -> no next_cursor; finished.")
            break
        query["cursor"] = next_cursor
        time.sleep(SLEEP)
    return collected[:topk]

# ===================== (3) Run (mapping -> collection -> save) =====================
# Step 1: resolve every target university to an OpenAlex institution id.
inst_map = build_institution_map(TARGET_UNIVS)
print("=== Institution mapping result ===")
print(inst_map.to_string(index=False))

# Step 2: fetch works per institution and write one CSV per institution.
all_dfs = []
for _, row in inst_map.iterrows():
    inst_id = row["institution_id"]
    qname   = row["query_name"]
    dname   = row["display_name"]
    # Institutions that could not be resolved carry no id and are skipped.
    if not inst_id:
        print(f"[SKIP] No institutions.id for '{qname}'")
        continue

    print(f"[RUN] Fetch top-{TOP_K} (year={YEAR}) for {qname} ({dname}) → institutions.id={inst_id}")
    works = fetch_topk_works_for_institution(inst_id, qname, dname, year=YEAR, topk=TOP_K)
    df_inst = pd.DataFrame(works)
    # Tag every row with the institution it was fetched for.
    df_inst["query_name"] = qname
    df_inst["display_name"] = dname
    df_inst["institution_id"] = inst_id

    # Build a file name from the query name: '/' and spaces become '_'.
    safe = qname.replace("/", "_").replace(" ", "_")
    path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_{safe}.csv")
    df_inst.to_csv(path, index=False, encoding="utf-8")
    print(f"[SAVE] {qname} -> {path} (n={len(df_inst)})")
    all_dfs.append(df_inst)
    time.sleep(0.4)  # extra pause between institutions

# Step 3: concatenate all institutions into one table (empty frame if nothing was fetched) and save it.
df_all = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
all_path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_ALL.csv")
df_all.to_csv(all_path, index=False, encoding="utf-8")
print(f"[DONE] Total rows: {len(df_all)}")
print(f"[DONE] Saved to: {SAVE_DIR}")

# Preview (last expression of the cell, so the notebook displays it)
df_all.head()


[MAP] ==== start institution mapping ====
[MAP] [1/23] resolving 'Duksung Women's University'
[MAP] Try EN: Duksung Women's University
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Duksung Women's University' (relax=False)
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR'
[MAP]   -> candidate: Duksung Women's University | id=https://openalex.org/I65832422
[MAP] OK (EN): Duksung Women's University -> Duksung Women's University | institutions.id=I65832422
[MAP] [2/23] resolving 'Seoul National University'
[MAP] Try EN: Seoul National University
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Seoul National University' (relax=False)
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR'
[MAP]   -> candidate: Seoul National University | id=https://openalex.org/I139264467
[MAP] OK (EN): Seoul 

Unnamed: 0,openalex_id,title,publication_year,publication_date,type,host_venue,cited_by_count,is_oa,oa_status,language,...,abstract,authors_count,author_ids,author_names,author_positions,author_orcids,author_affiliations,query_name,display_name,institution_id
0,https://openalex.org/W2156018831,Neuropilin-1 functions as a VEGFR2 co-receptor...,2014,2014-09-22,article,eLife,137,True,gold,en,...,"During development, tissue repair, and tumor g...",10,A5045726685;A5083496533;A5012056698;A504671838...,Maria V. Gelfand;Nellwyn Hagan;Aleksandra Tata...,first;middle;middle;middle;middle;middle;middl...,https://orcid.org/0000-0003-3270-0485;https://...,Harvard University;Harvard University;Harvard ...,Duksung Women's University,Duksung Women's University,I65832422
1,https://openalex.org/W1980361000,Human Tumor Xenograft Models for Preclinical A...,2014,2014-03-31,article,Toxicological Research,134,True,bronze,en,...,,1,A5100625726,Joohee Jung,first,https://orcid.org/0000-0001-9124-9052,Duksung Women's University,Duksung Women's University,Duksung Women's University,I65832422
2,https://openalex.org/W2157098018,The Development and Psychometric Investigation...,2014,2014-01-14,article,Journal of Interpersonal Violence,115,False,closed,en,...,Accurate assessment of cyberbullying is essent...,5,A5090348990;A5039183276;A5090372662;A504587486...,Regan W. Stewart;Christopher F. Drescher;Danie...,first;middle;middle;middle;last,https://orcid.org/0000-0002-0994-2227;https://...,University of Mississippi;University of Missis...,Duksung Women's University,Duksung Women's University,I65832422
3,https://openalex.org/W2051756570,Endothelium-derived Relaxing Factors of Small ...,2014,2014-09-30,review,Toxicological Research,110,True,bronze,en,...,,1,A5012379871,Kyu‐Tae Kang,first,https://orcid.org/0000-0001-5163-0262,Duksung Women's University,Duksung Women's University,Duksung Women's University,I65832422
4,https://openalex.org/W2143680134,Self-Expressiveness in Sport Tourism,2014,2014-05-19,article,Journal of Travel Research,82,False,closed,en,...,This study develops and tests a model of self-...,5,A5037969281;A5089454855;A5014183639;A501889468...,Michael Bošnjak;Carroll A. Brown;Dong‐Jin Lee;...,first;middle;middle;middle;last,https://orcid.org/0000-0002-1431-8461;https://...,Free University of Bozen-Bolzano;Western Carol...,Duksung Women's University,Duksung Women's University,I65832422


In [None]:
# Colab 1-cell script (institutions.id only; year and top-K by citations are set via YEAR / TOP_K below)
# Adds authorship fields + abstract reconstruction + verbose prints

import os, time, requests, pandas as pd

# ===================== 사용자 설정 =====================
SAVE_DIR = "/content/drive/MyDrive/6. 덕성여자대학교/25학년도 2학기/비정형데이터분석/프로젝트/openalex_top10000_2013_with_authors"
YEAR = 2013
TOP_K = 10000        # 기관별 최대 10000편
PER_PAGE = 200     # OpenAlex 최대 권장
SLEEP = 0.2        # 요청 간 대기(429 예방)
RETRY = 5          # 재시도 횟수
TIMEOUT = 60       # 요청 타임아웃(초)

# Target universities to collect (23 entries, including Duksung Women's University)
TARGET_UNIVS = [
    "Duksung Women's University",
    "Seoul National University",
    "Korea University",
    "Yonsei University",
    "Korea Advanced Institute of Science and Technology",  # KAIST
    "Sogang University",
    "Sungkyunkwan University",
    "Hanyang University",
    "Pohang University of Science and Technology", #POSTECH
    "Gwangju Institute of Science and Technology", #GIST
    "Daegu Gyeongbuk Institute of Science and Technology", #DGIST
    "Ulsan National Institute of Science and Technology",   #UNIST
    "Chung-Ang University",
    "Kyung Hee University",
    "Kangwon National University",
    "Kyungpook National University",
    "Gyeongsang National University",
    "Pusan National University",
    "Chonnam National University",
    "Jeonbuk National University",
    "Jeju National University",
    "Chungbuk National University",
    "Chungnam National University"
]

# Fallback aliases tried when the English name cannot be resolved.
# Keys must match the TARGET_UNIVS entries exactly, because the lookup is
# ALT_NAMES.get(<target name>, []).
ALT_NAMES = {
    "Duksung Women's University": ["덕성여자대학교", "덕성여대"],
    "Seoul National University": ["서울대학교", "서울대", "SNU"],
    "Korea University": ["고려대학교", "고려대", "KU"],
    "Yonsei University": ["연세대학교", "연세대"],
    "Korea Advanced Institute of Science and Technology": ["한국과학기술원", "KAIST"],
    "Sogang University": ["서강대학교", "서강대"],
    "Sungkyunkwan University": ["성균관대학교", "성균관대"],
    # Key was misspelled "Hanyang Unviersity", so its aliases were never found.
    "Hanyang University": ["한양대학교", "한양대"],
    "Pohang University of Science and Technology": ["포항공과대학교", "포항공대", "POSTECH"],
    "Gwangju Institute of Science and Technology": ["광주과학기술원", "GIST"],
    "Daegu Gyeongbuk Institute of Science and Technology": ["대구경북과학기술원", "DGIST"],
    "Ulsan National Institute of Science and Technology": ["울산과학기술원", "UNIST"],
    "Chung-Ang University": ["중앙대학교", "중앙대", "Chungang University"],
    "Kyung Hee University": ["경희대학교", "경희대", "Kyunghee University"],
    "Kangwon National University": ["강원대학교", "강원대", "KNU"],
    "Kyungpook National University": ["경북대학교", "경북대"],
    "Gyeongsang National University": ["경상국립대학교", "경상대"],
    "Pusan National University": ["부산대학교", "부산대"],
    "Chonnam National University": ["전남대학교", "전남대"],
    "Jeonbuk National University": ["전북대학교", "전북대"],
    "Jeju National University": ["제주대학교", "제주대"],
    "Chungbuk National University": ["충북대학교", "충북대"],
    "Chungnam National University": ["충남대학교", "충남대"]
}

# Create the output directory up front; exist_ok=True makes re-running the cell harmless.
os.makedirs(SAVE_DIR, exist_ok=True)

# ===================== Shared utilities =====================
def http_get(url, params, retry=RETRY, backoff=1.4, timeout=TIMEOUT):
    """Issue a GET request, retrying with exponential backoff.

    Args:
        url: Endpoint to request.
        params: Query parameters handed to ``requests.get``.
        retry: Maximum number of attempts.
        backoff: Base of the exponential wait; after failed attempt ``i``
            (0-based) the function waits ``backoff ** i + 0.2`` seconds.
        timeout: Per-request timeout in seconds.

    Returns:
        The response of the first attempt answered with HTTP 200, or
        ``None`` when every attempt fails (non-200 status or a
        ``requests.RequestException``).
    """
    for i in range(retry):
        try:
            r = requests.get(url, params=params, timeout=timeout)
            if r.status_code == 200:
                return r
            print(f"[HTTP:{r.status_code}] retry {i+1}/{retry} url={url} params={params}")
        except requests.RequestException as e:
            print(f"[HTTP:EXC] {e} retry {i+1}/{retry} url={url}")
        # Back off only when another attempt follows; sleeping after the
        # final failure would merely delay the "giving up" result.
        if i < retry - 1:
            time.sleep((backoff ** i) + 0.2)
    print("[HTTP] giving up after retries.")
    return None

def reconstruct_abstract(inv_idx: dict | None) -> str | None:
    """
    OpenAlex abstract_inverted_index(토큰 -> 위치 리스트)를 실제 텍스트로 복원.
    """
    if not inv_idx:
        return None
    try:
        max_pos = max(p for positions in inv_idx.values() for p in positions)
        words = [""] * (max_pos + 1)
        for token, positions in inv_idx.items():
            for p in positions:
                words[p] = token
        text = " ".join(w for w in words if w is not None)
        return " ".join(text.split())
    except Exception as e:
        print(f"[ABSTRACT] reconstruction failed: {e}")
        return None

# ===================== (1) Institution name -> institutions.id =====================
def search_institution_top1(name: str, relax=False):
    """Return the first OpenAlex institution search hit for ``name``.

    The request uses the ``search`` parameter and asks for a single result
    through the hyphenated ``per-page`` key. By default the filter is
    ``country_code:KR,type:education``; with ``relax=True`` the type
    restriction is dropped and only ``country_code:KR`` is applied.

    Returns the first result dict, or ``None`` when the request fails or
    no result comes back.
    """
    flt = "country_code:KR" if relax else "country_code:KR,type:education"
    print(f"[MAP]   -> searching: '{name}' | filter='{flt}'")
    response = http_get(
        "https://api.openalex.org/institutions",
        {"search": name, "filter": flt, "per-page": 1},
    )
    if not response:
        print(f"[MAP]   -> no response for '{name}'")
        return None
    hits = response.json().get("results", [])
    if not hits:
        print(f"[MAP]   -> no candidates for '{name}' (relax={relax})")
        return None
    top = hits[0]
    print(f"[MAP]   -> candidate: {top.get('display_name')} | id={top.get('id')}")
    return top

def resolve_institution_id(qname: str):
    """Resolve a university name to its OpenAlex institution id.

    The English name is tried first, then each alias listed for it in
    ``ALT_NAMES``. Every term is searched with the strict filter and, if
    that yields nothing, again with the relaxed one. The first hit that
    carries an ``id`` wins.

    Returns a dict with ``query_name``, ``institution_id`` (the last path
    segment of the OpenAlex id URL) and ``display_name``; the latter two are
    ``None`` when nothing could be resolved.
    """
    attempts = [("EN", qname)]
    attempts.extend(("ALT", alias) for alias in ALT_NAMES.get(qname, []))

    for label, term in attempts:
        if label == "EN":
            print(f"[MAP] Try EN: {qname}")
        else:
            print(f"[MAP] Try ALT: {qname} -> '{term}'")

        # Strict search first; the relaxed search runs only when it finds nothing.
        hit = search_institution_top1(term) or search_institution_top1(term, relax=True)
        if hit and hit.get("id"):
            short_id = hit["id"].split("/")[-1]
            print(f"[MAP] OK ({label}): {qname} -> {hit.get('display_name')} | institutions.id={short_id}")
            return {
                "query_name": qname,
                "institution_id": short_id,
                "display_name": hit.get("display_name"),
            }

    print(f"[MAP] FAIL: {qname} (could not resolve institutions.id)")
    return {"query_name": qname, "institution_id": None, "display_name": None}

def build_institution_map(names):
    """Resolve every name in ``names`` and return the mapping as a DataFrame.

    Each name goes through ``resolve_institution_id`` with a ``SLEEP``-second
    pause afterwards. The resulting table is written to
    ``institutions_mapping.csv`` inside ``SAVE_DIR``, printed, and returned.
    """
    print("[MAP] ==== start institution mapping ====")
    total = len(names)
    resolved = []
    for position, univ in enumerate(names, start=1):
        print(f"[MAP] [{position}/{total}] resolving '{univ}'")
        resolved.append(resolve_institution_id(univ))
        time.sleep(SLEEP)

    mapping = pd.DataFrame(resolved)
    csv_path = os.path.join(SAVE_DIR, "institutions_mapping.csv")
    mapping.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"[MAP] saved mapping -> {csv_path}")
    print(mapping)
    return mapping

# ===================== (2) 2025 상위 500편 수집 + 초록/저자 =====================
def extract_authors(authorships: list | None):
    """
    authorships 리스트에서 저자 관련 필드 평탄화.
    - author_ids: A... 꼬리만 추출
    - author_names: display_name
    - author_positions: first/middle/last
    - author_orcids: 0000-0002-...
    - author_affiliations: 저자별 기관 display_name들을 '|'로 묶고, 저자 간은 ';'로 연결
    """
    if not authorships:
        return {
            "authors_count": 0,
            "author_ids": None,
            "author_names": None,
            "author_positions": None,
            "author_orcids": None,
            "author_affiliations": None,
        }
    ids, names, poss, orcids, affs = [], [], [], [], []
    for a in authorships:
        author = a.get("author") or {}
        aid = author.get("id") or ""
        aid_tail = aid.split("/")[-1] if aid else None
        ids.append(aid_tail)
        names.append(author.get("display_name"))
        poss.append(a.get("author_position"))
        # orcid 위치가 author.orcid 또는 author.ids.orcid로 있을 수 있음
        orcid = author.get("orcid")
        if not orcid:
            orcid = (author.get("ids") or {}).get("orcid")
        orcids.append(orcid)

        insts = a.get("institutions") or []
        inst_names = [ (inst or {}).get("display_name") for inst in insts if inst ]
        affs.append("|".join([n for n in inst_names if n]))

    return {
        "authors_count": len(names),
        "author_ids": ";".join([x for x in ids if x]),
        "author_names": ";".join([x for x in names if x]),
        "author_positions": ";".join([x for x in poss if x]),
        "author_orcids": ";".join([x for x in orcids if x]),
        "author_affiliations": ";".join([x for x in affs if x]),
    }

def fetch_topk_works_for_institution(inst_id: str, qname: str, dname: str, year: int = YEAR, topk: int = TOP_K):
    """Fetch up to ``topk`` works of one institution for one publication year.

    Works are requested from the OpenAlex ``/works`` endpoint filtered by
    ``institutions.id`` and ``publication_year``, sorted by
    ``cited_by_count`` descending, and paged with the ``cursor`` parameter.
    Each work becomes a flat dict including the reconstructed abstract and
    the flattened author columns.

    Paging stops when a request fails, a page is empty, no ``next_cursor``
    is returned, or ``topk`` rows have been collected. ``qname`` is used for
    log lines only; ``dname`` is accepted but not used in the body.

    Returns:
        A list of row dicts, at most ``topk`` long.
    """
    def as_row(work):
        # Abstract is rebuilt before authors are flattened.
        abstract_text = reconstruct_abstract(work.get("abstract_inverted_index"))
        author_cols = extract_authors(work.get("authorships"))
        source = (work.get("primary_location") or {}).get("source") or {}
        access = work.get("open_access") or {}
        identifiers = work.get("ids") or {}
        row = {
            "openalex_id": work.get("id"),
            "title": work.get("title"),
            "publication_year": work.get("publication_year"),
            "publication_date": work.get("publication_date"),
            "type": work.get("type"),
            "host_venue": source.get("display_name"),
            "cited_by_count": work.get("cited_by_count"),
            "is_oa": access.get("is_oa"),
            "oa_status": access.get("oa_status"),
            "language": work.get("language"),
            "doi": identifiers.get("doi") or work.get("doi"),
            "abstract": abstract_text,
        }
        row.update(author_cols)
        return row

    endpoint = "https://api.openalex.org/works"
    query = {
        "filter": f"institutions.id:{inst_id},publication_year:{year}",
        "per-page": PER_PAGE,          # hyphenated key
        "cursor": "*",
        "sort": "cited_by_count:desc"
    }
    collected = []
    page_no = 1
    while True:
        print(f"[GET] {qname} | page={page_no} cursor={query.get('cursor')}")
        resp = http_get(endpoint, query)
        if not resp:
            print("[GET]   -> no response (stop).")
            break

        payload = resp.json()
        works = payload.get("results", [])
        if not works:
            print("[GET]   -> empty batch (stop).")
            break

        headline = (works[0].get("title") or "")[:80]
        print(f"[GET]   -> fetched {len(works)} (rep: '{headline}...')")

        for work in works:
            collected.append(as_row(work))
            if len(collected) >= topk:
                print(f"[GET]   -> reached top-{topk}; stopping early.")
                return collected[:topk]

        next_cursor = payload.get("meta", {}).get("next_cursor")
        print(f"[GET]   -> total accumulated: {len(collected)} | next_cursor={next_cursor}")
        if not next_cursor:
            print("[GET]   -> no next_cursor; finished.")
            break
        query["cursor"] = next_cursor
        page_no += 1
        time.sleep(SLEEP)
    return collected[:topk]

# ===================== (3) Run (mapping -> collection -> save) =====================
# Resolve every target name to an institution id; the mapping is also
# written to institutions_mapping.csv by build_institution_map.
inst_map = build_institution_map(TARGET_UNIVS)
print("=== Institution mapping result ===")
print(inst_map.to_string(index=False))

# One DataFrame per institution that could be fetched.
all_dfs = []
for _, row in inst_map.iterrows():
    inst_id = row["institution_id"]
    qname   = row["query_name"]
    dname   = row["display_name"]
    # Names that could not be resolved carry no id; skip them.
    if not inst_id:
        print(f"[SKIP] No institutions.id for '{qname}'")
        continue

    print(f"[RUN] Fetch top-{TOP_K} (year={YEAR}) for {qname} ({dname}) → institutions.id={inst_id}")
    works = fetch_topk_works_for_institution(inst_id, qname, dname, year=YEAR, topk=TOP_K)
    df_inst = pd.DataFrame(works)
    # Tag every row with the institution it was collected for.
    df_inst["query_name"] = qname
    df_inst["display_name"] = dname
    df_inst["institution_id"] = inst_id

    # File-name form of the query name: '/' and spaces become '_'.
    safe = qname.replace("/", "_").replace(" ", "_")
    path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_{safe}.csv")
    df_inst.to_csv(path, index=False, encoding="utf-8")
    print(f"[SAVE] {qname} -> {path} (n={len(df_inst)})")
    all_dfs.append(df_inst)
    # Short pause before moving on to the next institution.
    time.sleep(0.4)

# Combine all institutions into one table (an empty DataFrame if nothing was fetched).
df_all = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
all_path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_ALL.csv")
df_all.to_csv(all_path, index=False, encoding="utf-8")
print(f"[DONE] Total rows: {len(df_all)}")
print(f"[DONE] Saved to: {SAVE_DIR}")

# Preview
df_all.head()


[MAP] ==== start institution mapping ====
[MAP] [1/23] resolving 'Duksung Women's University'
[MAP] Try EN: Duksung Women's University
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Duksung Women's University' (relax=False)
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR'
[MAP]   -> candidate: Duksung Women's University | id=https://openalex.org/I65832422
[MAP] OK (EN): Duksung Women's University -> Duksung Women's University | institutions.id=I65832422
[MAP] [2/23] resolving 'Seoul National University'
[MAP] Try EN: Seoul National University
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Seoul National University' (relax=False)
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR'
[MAP]   -> candidate: Seoul National University | id=https://openalex.org/I139264467
[MAP] OK (EN): Seoul 

Unnamed: 0,openalex_id,title,publication_year,publication_date,type,host_venue,cited_by_count,is_oa,oa_status,language,...,abstract,authors_count,author_ids,author_names,author_positions,author_orcids,author_affiliations,query_name,display_name,institution_id
0,https://openalex.org/W1968493165,Pif1 helicase and Polδ promote recombination-c...,2013,2013-09-10,article,Nature,300,True,green,en,...,,11,A5015042419;A5005189404;A5101871800;A508735167...,Marenda A. Wilson;Youngho Kwon;Yuanyuan Xu;Woo...,first;middle;middle;middle;middle;middle;middl...,https://orcid.org/0000-0003-1610-1044;https://...,Baylor College of Medicine;Yale University;Yal...,Duksung Women's University,Duksung Women's University,I65832422
1,https://openalex.org/W2105414208,<scp>BLH</scp>1 and <scp>KNAT</scp>3 modulate ...,2013,2013-05-11,article,The Plant Journal,120,False,closed,en,...,The signal transduction pathway governed by th...,6,A5043050739;A5045288847;A5078991413;A510149432...,Dachan Kim;Young‐hyun Cho;Hojin Ryu;Yoonhee Ki...,first;middle;middle;middle;middle;last,https://orcid.org/0000-0001-5196-8482;https://...,Pohang University of Science and Technology;Po...,Duksung Women's University,Duksung Women's University,I65832422
2,https://openalex.org/W2015706279,The antioxidant activity of daidzein metabolit...,2013,2013-10-23,article,Molecular Medicine Reports,95,True,bronze,en,...,"Daidzein and its glycoside form daidzin, are k...",2,A5089574455;A5100664725,Eun Jeong Choi;Gun‐Hee Kim,first;last,https://orcid.org/0000-0002-7698-3424;https://...,Duksung Women's University;Duksung Women's Uni...,Duksung Women's University,Duksung Women's University,I65832422
3,https://openalex.org/W2087330966,Inflammatory and microenvironmental factors in...,2013,2013-11-12,review,Archives of Pharmacal Research,90,False,closed,en,...,,2,A5110901763;A5083115378,Mina Ham;Aree Moon,first;last,,Duksung Women's University;Duksung Women's Uni...,Duksung Women's University,Duksung Women's University,I65832422
4,https://openalex.org/W2007313876,2-Hydroxychalcone and xanthohumol inhibit inva...,2013,2013-04-03,article,Chemico-Biological Interactions,78,False,closed,en,...,,3,A5100335623;A5103159070;A5083115378,Sun Young Kim;Ik‐Soo Lee;Aree Moon,first;middle;last,https://orcid.org/0000-0002-5994-8889;https://...,Duksung Women's University;Chonnam National Un...,Duksung Women's University,Duksung Women's University,I65832422


In [None]:
# Colab single-cell script: looks works up by institutions.id for the YEAR set
# below and keeps up to TOP_K works per institution, sorted by citation count.
# Adds authorship fields + abstract reconstruction + verbose prints

import os, time, requests, pandas as pd

# ===================== User settings =====================
SAVE_DIR = "/content/drive/MyDrive/6. 덕성여자대학교/25학년도 2학기/비정형데이터분석/프로젝트/openalex_top10000_2012_with_authors"
YEAR = 2012
TOP_K = 10000        # at most 10000 works per institution
PER_PAGE = 200     # page size; noted by the original author as OpenAlex's recommended maximum
SLEEP = 0.2        # pause between requests, in seconds (to avoid HTTP 429)
RETRY = 5          # number of attempts per request
TIMEOUT = 60       # request timeout (seconds)

# Target universities to collect (23 entries, including Duksung Women's University)
TARGET_UNIVS = [
    "Duksung Women's University",
    "Seoul National University",
    "Korea University",
    "Yonsei University",
    "Korea Advanced Institute of Science and Technology",  # KAIST
    "Sogang University",
    "Sungkyunkwan University",
    "Hanyang University",
    "Pohang University of Science and Technology", #POSTECH
    "Gwangju Institute of Science and Technology", #GIST
    "Daegu Gyeongbuk Institute of Science and Technology", #DGIST
    "Ulsan National Institute of Science and Technology",   #UNIST
    "Chung-Ang University",
    "Kyung Hee University",
    "Kangwon National University",
    "Kyungpook National University",
    "Gyeongsang National University",
    "Pusan National University",
    "Chonnam National University",
    "Jeonbuk National University",
    "Jeju National University",
    "Chungbuk National University",
    "Chungnam National University"
]

# Fallback aliases tried when the English name cannot be resolved.
# Keys must match the TARGET_UNIVS entries exactly, because the lookup is
# ALT_NAMES.get(<target name>, []).
ALT_NAMES = {
    "Duksung Women's University": ["덕성여자대학교", "덕성여대"],
    "Seoul National University": ["서울대학교", "서울대", "SNU"],
    "Korea University": ["고려대학교", "고려대", "KU"],
    "Yonsei University": ["연세대학교", "연세대"],
    "Korea Advanced Institute of Science and Technology": ["한국과학기술원", "KAIST"],
    "Sogang University": ["서강대학교", "서강대"],
    "Sungkyunkwan University": ["성균관대학교", "성균관대"],
    # Key was misspelled "Hanyang Unviersity", so its aliases were never found.
    "Hanyang University": ["한양대학교", "한양대"],
    "Pohang University of Science and Technology": ["포항공과대학교", "포항공대", "POSTECH"],
    "Gwangju Institute of Science and Technology": ["광주과학기술원", "GIST"],
    "Daegu Gyeongbuk Institute of Science and Technology": ["대구경북과학기술원", "DGIST"],
    "Ulsan National Institute of Science and Technology": ["울산과학기술원", "UNIST"],
    "Chung-Ang University": ["중앙대학교", "중앙대", "Chungang University"],
    "Kyung Hee University": ["경희대학교", "경희대", "Kyunghee University"],
    "Kangwon National University": ["강원대학교", "강원대", "KNU"],
    "Kyungpook National University": ["경북대학교", "경북대"],
    "Gyeongsang National University": ["경상국립대학교", "경상대"],
    "Pusan National University": ["부산대학교", "부산대"],
    "Chonnam National University": ["전남대학교", "전남대"],
    "Jeonbuk National University": ["전북대학교", "전북대"],
    "Jeju National University": ["제주대학교", "제주대"],
    "Chungbuk National University": ["충북대학교", "충북대"],
    "Chungnam National University": ["충남대학교", "충남대"]
}

# Create the output directory up front; exist_ok=True makes re-running the cell harmless.
os.makedirs(SAVE_DIR, exist_ok=True)

# ===================== Shared utilities =====================
def http_get(url, params, retry=RETRY, backoff=1.4, timeout=TIMEOUT):
    """Issue a GET request, retrying with exponential backoff.

    Args:
        url: Endpoint to request.
        params: Query parameters handed to ``requests.get``.
        retry: Maximum number of attempts.
        backoff: Base of the exponential wait; after failed attempt ``i``
            (0-based) the function waits ``backoff ** i + 0.2`` seconds.
        timeout: Per-request timeout in seconds.

    Returns:
        The response of the first attempt answered with HTTP 200, or
        ``None`` when every attempt fails (non-200 status or a
        ``requests.RequestException``).
    """
    for i in range(retry):
        try:
            r = requests.get(url, params=params, timeout=timeout)
            if r.status_code == 200:
                return r
            print(f"[HTTP:{r.status_code}] retry {i+1}/{retry} url={url} params={params}")
        except requests.RequestException as e:
            print(f"[HTTP:EXC] {e} retry {i+1}/{retry} url={url}")
        # Back off only when another attempt follows; sleeping after the
        # final failure would merely delay the "giving up" result.
        if i < retry - 1:
            time.sleep((backoff ** i) + 0.2)
    print("[HTTP] giving up after retries.")
    return None

def reconstruct_abstract(inv_idx: dict | None) -> str | None:
    """
    OpenAlex abstract_inverted_index(토큰 -> 위치 리스트)를 실제 텍스트로 복원.
    """
    if not inv_idx:
        return None
    try:
        max_pos = max(p for positions in inv_idx.values() for p in positions)
        words = [""] * (max_pos + 1)
        for token, positions in inv_idx.items():
            for p in positions:
                words[p] = token
        text = " ".join(w for w in words if w is not None)
        return " ".join(text.split())
    except Exception as e:
        print(f"[ABSTRACT] reconstruction failed: {e}")
        return None

# ===================== (1) Institution name -> institutions.id =====================
def search_institution_top1(name: str, relax=False):
    """Return the first OpenAlex institution search hit for ``name``.

    The request uses the ``search`` parameter and asks for a single result
    through the hyphenated ``per-page`` key. By default the filter is
    ``country_code:KR,type:education``; with ``relax=True`` the type
    restriction is dropped and only ``country_code:KR`` is applied.

    Returns the first result dict, or ``None`` when the request fails or
    no result comes back.
    """
    flt = "country_code:KR" if relax else "country_code:KR,type:education"
    print(f"[MAP]   -> searching: '{name}' | filter='{flt}'")
    response = http_get(
        "https://api.openalex.org/institutions",
        {"search": name, "filter": flt, "per-page": 1},
    )
    if not response:
        print(f"[MAP]   -> no response for '{name}'")
        return None
    hits = response.json().get("results", [])
    if not hits:
        print(f"[MAP]   -> no candidates for '{name}' (relax={relax})")
        return None
    top = hits[0]
    print(f"[MAP]   -> candidate: {top.get('display_name')} | id={top.get('id')}")
    return top

def resolve_institution_id(qname: str):
    """Resolve a university name to its OpenAlex institution id.

    The English name is tried first, then each alias listed for it in
    ``ALT_NAMES``. Every term is searched with the strict filter and, if
    that yields nothing, again with the relaxed one. The first hit that
    carries an ``id`` wins.

    Returns a dict with ``query_name``, ``institution_id`` (the last path
    segment of the OpenAlex id URL) and ``display_name``; the latter two are
    ``None`` when nothing could be resolved.
    """
    attempts = [("EN", qname)]
    attempts.extend(("ALT", alias) for alias in ALT_NAMES.get(qname, []))

    for label, term in attempts:
        if label == "EN":
            print(f"[MAP] Try EN: {qname}")
        else:
            print(f"[MAP] Try ALT: {qname} -> '{term}'")

        # Strict search first; the relaxed search runs only when it finds nothing.
        hit = search_institution_top1(term) or search_institution_top1(term, relax=True)
        if hit and hit.get("id"):
            short_id = hit["id"].split("/")[-1]
            print(f"[MAP] OK ({label}): {qname} -> {hit.get('display_name')} | institutions.id={short_id}")
            return {
                "query_name": qname,
                "institution_id": short_id,
                "display_name": hit.get("display_name"),
            }

    print(f"[MAP] FAIL: {qname} (could not resolve institutions.id)")
    return {"query_name": qname, "institution_id": None, "display_name": None}

def build_institution_map(names):
    """Resolve every name in ``names`` and return the mapping as a DataFrame.

    Each name goes through ``resolve_institution_id`` with a ``SLEEP``-second
    pause afterwards. The resulting table is written to
    ``institutions_mapping.csv`` inside ``SAVE_DIR``, printed, and returned.
    """
    print("[MAP] ==== start institution mapping ====")
    total = len(names)
    resolved = []
    for position, univ in enumerate(names, start=1):
        print(f"[MAP] [{position}/{total}] resolving '{univ}'")
        resolved.append(resolve_institution_id(univ))
        time.sleep(SLEEP)

    mapping = pd.DataFrame(resolved)
    csv_path = os.path.join(SAVE_DIR, "institutions_mapping.csv")
    mapping.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"[MAP] saved mapping -> {csv_path}")
    print(mapping)
    return mapping

# ===================== (2) 2025 상위 500편 수집 + 초록/저자 =====================
def extract_authors(authorships: list | None):
    """
    authorships 리스트에서 저자 관련 필드 평탄화.
    - author_ids: A... 꼬리만 추출
    - author_names: display_name
    - author_positions: first/middle/last
    - author_orcids: 0000-0002-...
    - author_affiliations: 저자별 기관 display_name들을 '|'로 묶고, 저자 간은 ';'로 연결
    """
    if not authorships:
        return {
            "authors_count": 0,
            "author_ids": None,
            "author_names": None,
            "author_positions": None,
            "author_orcids": None,
            "author_affiliations": None,
        }
    ids, names, poss, orcids, affs = [], [], [], [], []
    for a in authorships:
        author = a.get("author") or {}
        aid = author.get("id") or ""
        aid_tail = aid.split("/")[-1] if aid else None
        ids.append(aid_tail)
        names.append(author.get("display_name"))
        poss.append(a.get("author_position"))
        # orcid 위치가 author.orcid 또는 author.ids.orcid로 있을 수 있음
        orcid = author.get("orcid")
        if not orcid:
            orcid = (author.get("ids") or {}).get("orcid")
        orcids.append(orcid)

        insts = a.get("institutions") or []
        inst_names = [ (inst or {}).get("display_name") for inst in insts if inst ]
        affs.append("|".join([n for n in inst_names if n]))

    return {
        "authors_count": len(names),
        "author_ids": ";".join([x for x in ids if x]),
        "author_names": ";".join([x for x in names if x]),
        "author_positions": ";".join([x for x in poss if x]),
        "author_orcids": ";".join([x for x in orcids if x]),
        "author_affiliations": ";".join([x for x in affs if x]),
    }

def fetch_topk_works_for_institution(inst_id: str, qname: str, dname: str, year: int = YEAR, topk: int = TOP_K):
    """Fetch up to ``topk`` works of one institution for one publication year.

    Works are requested from the OpenAlex ``/works`` endpoint filtered by
    ``institutions.id`` and ``publication_year``, sorted by
    ``cited_by_count`` descending, and paged with the ``cursor`` parameter.
    Each work becomes a flat dict including the reconstructed abstract and
    the flattened author columns.

    Paging stops when a request fails, a page is empty, no ``next_cursor``
    is returned, or ``topk`` rows have been collected. ``qname`` is used for
    log lines only; ``dname`` is accepted but not used in the body.

    Returns:
        A list of row dicts, at most ``topk`` long.
    """
    def as_row(work):
        # Abstract is rebuilt before authors are flattened.
        abstract_text = reconstruct_abstract(work.get("abstract_inverted_index"))
        author_cols = extract_authors(work.get("authorships"))
        source = (work.get("primary_location") or {}).get("source") or {}
        access = work.get("open_access") or {}
        identifiers = work.get("ids") or {}
        row = {
            "openalex_id": work.get("id"),
            "title": work.get("title"),
            "publication_year": work.get("publication_year"),
            "publication_date": work.get("publication_date"),
            "type": work.get("type"),
            "host_venue": source.get("display_name"),
            "cited_by_count": work.get("cited_by_count"),
            "is_oa": access.get("is_oa"),
            "oa_status": access.get("oa_status"),
            "language": work.get("language"),
            "doi": identifiers.get("doi") or work.get("doi"),
            "abstract": abstract_text,
        }
        row.update(author_cols)
        return row

    endpoint = "https://api.openalex.org/works"
    query = {
        "filter": f"institutions.id:{inst_id},publication_year:{year}",
        "per-page": PER_PAGE,          # hyphenated key
        "cursor": "*",
        "sort": "cited_by_count:desc"
    }
    collected = []
    page_no = 1
    while True:
        print(f"[GET] {qname} | page={page_no} cursor={query.get('cursor')}")
        resp = http_get(endpoint, query)
        if not resp:
            print("[GET]   -> no response (stop).")
            break

        payload = resp.json()
        works = payload.get("results", [])
        if not works:
            print("[GET]   -> empty batch (stop).")
            break

        headline = (works[0].get("title") or "")[:80]
        print(f"[GET]   -> fetched {len(works)} (rep: '{headline}...')")

        for work in works:
            collected.append(as_row(work))
            if len(collected) >= topk:
                print(f"[GET]   -> reached top-{topk}; stopping early.")
                return collected[:topk]

        next_cursor = payload.get("meta", {}).get("next_cursor")
        print(f"[GET]   -> total accumulated: {len(collected)} | next_cursor={next_cursor}")
        if not next_cursor:
            print("[GET]   -> no next_cursor; finished.")
            break
        query["cursor"] = next_cursor
        page_no += 1
        time.sleep(SLEEP)
    return collected[:topk]

# ===================== (3) Run (mapping -> collection -> save) =====================
# Resolve every target name to an institution id; the mapping is also
# written to institutions_mapping.csv by build_institution_map.
inst_map = build_institution_map(TARGET_UNIVS)
print("=== Institution mapping result ===")
print(inst_map.to_string(index=False))

# One DataFrame per institution that could be fetched.
all_dfs = []
for _, row in inst_map.iterrows():
    inst_id = row["institution_id"]
    qname   = row["query_name"]
    dname   = row["display_name"]
    # Names that could not be resolved carry no id; skip them.
    if not inst_id:
        print(f"[SKIP] No institutions.id for '{qname}'")
        continue

    print(f"[RUN] Fetch top-{TOP_K} (year={YEAR}) for {qname} ({dname}) → institutions.id={inst_id}")
    works = fetch_topk_works_for_institution(inst_id, qname, dname, year=YEAR, topk=TOP_K)
    df_inst = pd.DataFrame(works)
    # Tag every row with the institution it was collected for.
    df_inst["query_name"] = qname
    df_inst["display_name"] = dname
    df_inst["institution_id"] = inst_id

    # File-name form of the query name: '/' and spaces become '_'.
    safe = qname.replace("/", "_").replace(" ", "_")
    path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_{safe}.csv")
    df_inst.to_csv(path, index=False, encoding="utf-8")
    print(f"[SAVE] {qname} -> {path} (n={len(df_inst)})")
    all_dfs.append(df_inst)
    # Short pause before moving on to the next institution.
    time.sleep(0.4)

# Combine all institutions into one table (an empty DataFrame if nothing was fetched).
df_all = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
all_path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_ALL.csv")
df_all.to_csv(all_path, index=False, encoding="utf-8")
print(f"[DONE] Total rows: {len(df_all)}")
print(f"[DONE] Saved to: {SAVE_DIR}")

# Preview
df_all.head()


[MAP] ==== start institution mapping ====
[MAP] [1/23] resolving 'Duksung Women's University'
[MAP] Try EN: Duksung Women's University
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Duksung Women's University' (relax=False)
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR'
[MAP]   -> candidate: Duksung Women's University | id=https://openalex.org/I65832422
[MAP] OK (EN): Duksung Women's University -> Duksung Women's University | institutions.id=I65832422
[MAP] [2/23] resolving 'Seoul National University'
[MAP] Try EN: Seoul National University
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Seoul National University' (relax=False)
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR'
[MAP]   -> candidate: Seoul National University | id=https://openalex.org/I139264467
[MAP] OK (EN): Seoul 

Unnamed: 0,openalex_id,title,publication_year,publication_date,type,host_venue,cited_by_count,is_oa,oa_status,language,...,abstract,authors_count,author_ids,author_names,author_positions,author_orcids,author_affiliations,query_name,display_name,institution_id
0,https://openalex.org/W2030519527,Comparison of the chemical compositions and nu...,2012,2012-01-01,article,Nutrition Research and Practice,271,True,green,en,...,Pumpkins have considerable variation in nutrie...,5,A5100755756;A5100462734;A5085103874;A501235157...,Mi Young Kim;Eun Jin Kim;Young‐Nam Kim;Changsu...,first;middle;middle;middle;last,https://orcid.org/0000-0002-1202-2245;https://...,Chung-Ang University;Chung-Ang University;Duks...,Duksung Women's University,Duksung Women's University,I65832422
1,https://openalex.org/W2039107480,Drug-Induced Nephrotoxicity and Its Biomarkers,2012,2012-05-31,review,Biomolecules & Therapeutics,266,True,bronze,en,...,Nephrotoxicity occurs when kidney-specific det...,2,A5100335616;A5083115378,Sun Young Kim;Aree Moon,first;last,https://orcid.org/0000-0002-1180-0367,Duksung Women's University,Duksung Women's University,Duksung Women's University,I65832422
2,https://openalex.org/W2150172235,In vitro evaluation of biomarkers for cisplati...,2012,2012-12-31,article,Toxicology Letters,83,False,closed,en,...,,7,A5075428572;A5100335623;A5052433739;A502603837...,So-Jung Sohn;Sun Young Kim;Hyung Sik Kim;Young...,first;middle;middle;middle;middle;middle;last,https://orcid.org/0000-0002-5994-8889;https://...,Duksung Women's University;Duksung Women's Uni...,Duksung Women's University,Duksung Women's University,I65832422
3,https://openalex.org/W2096616783,Polymeric Nanoparticles Containing Taxanes Enh...,2012,2012-07-14,article,International Journal of Radiation Oncology*Bi...,70,False,closed,en,...,,10,A5111781835;A5100323621;A5108194390;A510202223...,Joohee Jung;Sung Jin Park;Hye Kyung Chung;Hye-...,first;middle;middle;middle;middle;middle;middl...,https://orcid.org/0000-0002-7340-7835;https://...,Duksung Women's University|University of Ulsan...,Duksung Women's University,Duksung Women's University,I65832422
4,https://openalex.org/W1989962582,Acupuncture for Shoulder Pain After Stroke: A ...,2012,2012-08-27,review,The Journal of Alternative and Complementary M...,62,True,green,en,...,"Objectives: Shoulder pain, for which acupunctu...",7,A5101499115;A5049870193;A5112771029;A510280976...,Jung Ah Lee;Si‐Woon Park;Pil Woo Hwang;Sung Mi...,first;middle;middle;middle;middle;middle;last,https://orcid.org/0000-0002-5837-3008;https://...,National Rehabilitation Center;National Rehabi...,Duksung Women's University,Duksung Women's University,I65832422


In [None]:
# Colab 1-cell script (institutions.id only, year=2025, top-500 by citations)
# Adds authorship fields + abstract reconstruction + verbose prints

import os, time, requests, pandas as pd

# ===================== User settings =====================
# Output directory for every CSV written by this cell.
SAVE_DIR = "/content/drive/MyDrive/6. 덕성여자대학교/25학년도 2학기/비정형데이터분석/프로젝트/openalex_top10000_2011_with_authors"
YEAR = 2011          # publication year to collect
TOP_K = 10000        # maximum number of works kept per institution
PER_PAGE = 200       # page size per request (original note: OpenAlex's recommended maximum)
SLEEP = 0.2          # pause between requests, in seconds (to avoid HTTP 429)
RETRY = 5            # number of attempts per request
TIMEOUT = 60         # request timeout, in seconds

# Target institutions (Duksung Women's University included).
TARGET_UNIVS = [
    "Duksung Women's University",
    "Seoul National University",
    "Korea University",
    "Yonsei University",
    "Korea Advanced Institute of Science and Technology",  # KAIST
    "Sogang University",
    "Sungkyunkwan University",
    "Hanyang University",
    "Pohang University of Science and Technology",  # POSTECH
    "Gwangju Institute of Science and Technology",  # GIST
    "Daegu Gyeongbuk Institute of Science and Technology",  # DGIST
    "Ulsan National Institute of Science and Technology",  # UNIST
    "Chung-Ang University",
    "Kyung Hee University",
    "Kangwon National University",
    "Kyungpook National University",
    "Gyeongsang National University",
    "Pusan National University",
    "Chonnam National University",
    "Jeonbuk National University",
    "Jeju National University",
    "Chungbuk National University",
    "Chungnam National University",
]

# Fallback aliases tried when the English name cannot be resolved.
# Keys must match the TARGET_UNIVS entries exactly, because the lookup is
# ALT_NAMES.get(<target name>, []).
ALT_NAMES = {
    "Duksung Women's University": ["덕성여자대학교", "덕성여대"],
    "Seoul National University": ["서울대학교", "서울대", "SNU"],
    "Korea University": ["고려대학교", "고려대", "KU"],
    "Yonsei University": ["연세대학교", "연세대"],
    "Korea Advanced Institute of Science and Technology": ["한국과학기술원", "KAIST"],
    "Sogang University": ["서강대학교", "서강대"],
    "Sungkyunkwan University": ["성균관대학교", "성균관대"],
    # Key fixed: it was misspelled "Hanyang Unviersity", so these aliases were never found.
    "Hanyang University": ["한양대학교", "한양대"],
    "Pohang University of Science and Technology": ["포항공과대학교", "포항공대", "POSTECH"],
    "Gwangju Institute of Science and Technology": ["광주과학기술원", "GIST"],
    "Daegu Gyeongbuk Institute of Science and Technology": ["대구경북과학기술원", "DGIST"],
    "Ulsan National Institute of Science and Technology": ["울산과학기술원", "UNIST"],
    "Chung-Ang University": ["중앙대학교", "중앙대", "Chungang University"],
    "Kyung Hee University": ["경희대학교", "경희대", "Kyunghee University"],
    "Kangwon National University": ["강원대학교", "강원대", "KNU"],
    "Kyungpook National University": ["경북대학교", "경북대"],
    "Gyeongsang National University": ["경상국립대학교", "경상대"],
    "Pusan National University": ["부산대학교", "부산대"],
    "Chonnam National University": ["전남대학교", "전남대"],
    "Jeonbuk National University": ["전북대학교", "전북대"],
    "Jeju National University": ["제주대학교", "제주대"],
    "Chungbuk National University": ["충북대학교", "충북대"],
    "Chungnam National University": ["충남대학교", "충남대"],
}

# Create the output directory before any CSV is written; exist_ok=True means
# an already-existing directory does not raise.
os.makedirs(SAVE_DIR, exist_ok=True)

# ===================== 공통 유틸 =====================
def http_get(url, params, retry=RETRY, backoff=1.4, timeout=TIMEOUT):
    """
    GET ``url`` with ``params``, retrying on any non-200 status or request error.

    Args:
        url: Endpoint to request.
        params: Query parameters passed to ``requests.get``.
        retry: Maximum number of attempts.
        backoff: Base of the exponential wait; attempt ``i`` (0-based) is
            followed by a pause of ``backoff ** i + 0.2`` seconds.
        timeout: Per-request timeout in seconds.

    Returns:
        The ``requests.Response`` of the first attempt that answers with
        status 200, or ``None`` once every attempt has failed.
    """
    for i in range(retry):
        try:
            r = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException as e:
            print(f"[HTTP:EXC] {e} retry {i+1}/{retry} url={url}")
        else:
            if r.status_code == 200:
                return r
            print(f"[HTTP:{r.status_code}] retry {i+1}/{retry} url={url} params={params}")
        # Only wait when another attempt follows; sleeping after the last
        # failure would just delay the give-up without any benefit.
        if i < retry - 1:
            time.sleep((backoff ** i) + 0.2)
    print("[HTTP] giving up after retries.")
    return None

def reconstruct_abstract(inv_idx: dict | None) -> str | None:
    """
    OpenAlex abstract_inverted_index(토큰 -> 위치 리스트)를 실제 텍스트로 복원.
    """
    if not inv_idx:
        return None
    try:
        max_pos = max(p for positions in inv_idx.values() for p in positions)
        words = [""] * (max_pos + 1)
        for token, positions in inv_idx.items():
            for p in positions:
                words[p] = token
        text = " ".join(w for w in words if w is not None)
        return " ".join(text.split())
    except Exception as e:
        print(f"[ABSTRACT] reconstruction failed: {e}")
        return None

# ===================== (1) 기관명 → institutions.id =====================
def search_institution_top1(name: str, relax=False):
    """
    Return the first OpenAlex institution hit for ``name``, or ``None``.

    Uses the ``search`` parameter of the /institutions endpoint with
    ``per-page`` (hyphenated) set to 1.
    Default filter: country KR and type education.
    With ``relax=True`` the type filter is dropped (KR only).
    """
    flt = "country_code:KR" if relax else "country_code:KR,type:education"
    print(f"[MAP]   -> searching: '{name}' | filter='{flt}'")

    r = http_get(
        "https://api.openalex.org/institutions",
        {"search": name, "filter": flt, "per-page": 1},
    )
    if not r:
        print(f"[MAP]   -> no response for '{name}'")
        return None

    items = r.json().get("results", [])
    if not items:
        print(f"[MAP]   -> no candidates for '{name}' (relax={relax})")
        return None

    cand = items[0]
    print(f"[MAP]   -> candidate: {cand.get('display_name')} | id={cand.get('id')}")
    return cand

def resolve_institution_id(qname: str):
    """
    Resolve a target name to an OpenAlex institution id.

    The English name is tried first, then each alias listed for it in
    ``ALT_NAMES``. Every term is searched with the strict filter and, if that
    yields nothing, again with the relaxed one.

    Returns a dict with ``query_name``, ``institution_id`` (the last path
    segment of the hit's ``id``) and ``display_name``; the latter two are
    ``None`` when nothing could be resolved.
    """
    def lookup(term):
        # Strict search first; fall back to the relaxed filter on a miss.
        hit = search_institution_top1(term)
        if not hit:
            hit = search_institution_top1(term, relax=True)
        return hit

    def as_row(hit, tag):
        inst_id = hit["id"].split("/")[-1]
        print(f"[MAP] OK ({tag}): {qname} -> {hit.get('display_name')} | institutions.id={inst_id}")
        return {"query_name": qname, "institution_id": inst_id, "display_name": hit.get("display_name")}

    print(f"[MAP] Try EN: {qname}")
    res = lookup(qname)
    if res and res.get("id"):
        return as_row(res, "EN")

    for alt in ALT_NAMES.get(qname, []):
        print(f"[MAP] Try ALT: {qname} -> '{alt}'")
        res = lookup(alt)
        if res and res.get("id"):
            return as_row(res, "ALT")

    print(f"[MAP] FAIL: {qname} (could not resolve institutions.id)")
    return {"query_name": qname, "institution_id": None, "display_name": None}

def build_institution_map(names):
    """
    Resolve every name in ``names`` and persist the mapping.

    Each name is resolved with ``resolve_institution_id`` (pausing ``SLEEP``
    seconds after each one). The resulting table is written to
    ``institutions_mapping.csv`` inside ``SAVE_DIR``, printed, and returned
    as a DataFrame.
    """
    print("[MAP] ==== start institution mapping ====")
    resolved = []
    for position, target in enumerate(names, start=1):
        print(f"[MAP] [{position}/{len(names)}] resolving '{target}'")
        resolved.append(resolve_institution_id(target))
        time.sleep(SLEEP)

    mapping = pd.DataFrame(resolved)
    csv_path = os.path.join(SAVE_DIR, "institutions_mapping.csv")
    mapping.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"[MAP] saved mapping -> {csv_path}")
    print(mapping)
    return mapping

# ===================== (2) Collect up to TOP_K works for YEAR + abstracts/authors =====================
def extract_authors(authorships: list | None):
    """
    authorships 리스트에서 저자 관련 필드 평탄화.
    - author_ids: A... 꼬리만 추출
    - author_names: display_name
    - author_positions: first/middle/last
    - author_orcids: 0000-0002-...
    - author_affiliations: 저자별 기관 display_name들을 '|'로 묶고, 저자 간은 ';'로 연결
    """
    if not authorships:
        return {
            "authors_count": 0,
            "author_ids": None,
            "author_names": None,
            "author_positions": None,
            "author_orcids": None,
            "author_affiliations": None,
        }
    ids, names, poss, orcids, affs = [], [], [], [], []
    for a in authorships:
        author = a.get("author") or {}
        aid = author.get("id") or ""
        aid_tail = aid.split("/")[-1] if aid else None
        ids.append(aid_tail)
        names.append(author.get("display_name"))
        poss.append(a.get("author_position"))
        # orcid 위치가 author.orcid 또는 author.ids.orcid로 있을 수 있음
        orcid = author.get("orcid")
        if not orcid:
            orcid = (author.get("ids") or {}).get("orcid")
        orcids.append(orcid)

        insts = a.get("institutions") or []
        inst_names = [ (inst or {}).get("display_name") for inst in insts if inst ]
        affs.append("|".join([n for n in inst_names if n]))

    return {
        "authors_count": len(names),
        "author_ids": ";".join([x for x in ids if x]),
        "author_names": ";".join([x for x in names if x]),
        "author_positions": ";".join([x for x in poss if x]),
        "author_orcids": ";".join([x for x in orcids if x]),
        "author_affiliations": ";".join([x for x in affs if x]),
    }

def fetch_topk_works_for_institution(inst_id: str, qname: str, dname: str, year: int = YEAR, topk: int = TOP_K):
    """
    Collect up to ``topk`` works of one institution for ``year``.

    Pages through the OpenAlex /works endpoint with cursor paging, filtered
    by ``institutions.id`` and ``publication_year`` and sorted by
    ``cited_by_count`` descending. Each work is flattened into a dict that
    includes the reconstructed abstract and the author columns.

    Paging stops when ``topk`` rows are collected, a request fails, a page is
    empty, or no ``next_cursor`` is returned. ``qname`` is used for log lines
    only; ``dname`` is accepted but not used here.

    Returns a list of at most ``topk`` row dicts.
    """
    endpoint = "https://api.openalex.org/works"
    query = {
        "filter": f"institutions.id:{inst_id},publication_year:{year}",
        "per-page": PER_PAGE,          # hyphenated parameter name
        "cursor": "*",
        "sort": "cited_by_count:desc"
    }
    collected = []
    page_no = 1
    while True:
        print(f"[GET] {qname} | page={page_no} cursor={query.get('cursor')}")
        resp = http_get(endpoint, query)
        if not resp:
            print("[GET]   -> no response (stop).")
            break

        payload = resp.json()
        works = payload.get("results", [])
        if not works:
            print("[GET]   -> empty batch (stop).")
            break

        headline = (works[0].get("title") or "")[:80]
        print(f"[GET]   -> fetched {len(works)} (rep: '{headline}...')")

        for w in works:
            abstract_text = reconstruct_abstract(w.get("abstract_inverted_index"))
            auth = extract_authors(w.get("authorships"))

            source = (w.get("primary_location") or {}).get("source") or {}
            record = {
                "openalex_id": w.get("id"),
                "title": w.get("title"),
                "publication_year": w.get("publication_year"),
                "publication_date": w.get("publication_date"),
                "type": w.get("type"),
                "host_venue": source.get("display_name"),
                "cited_by_count": w.get("cited_by_count"),
                "is_oa": (w.get("open_access") or {}).get("is_oa"),
                "oa_status": (w.get("open_access") or {}).get("oa_status"),
                "language": w.get("language"),
                "doi": (w.get("ids") or {}).get("doi") or w.get("doi"),
                "abstract": abstract_text,
            }
            record.update(auth)
            collected.append(record)

            if len(collected) >= topk:
                print(f"[GET]   -> reached top-{topk}; stopping early.")
                return collected[:topk]

        next_cursor = payload.get("meta", {}).get("next_cursor")
        print(f"[GET]   -> total accumulated: {len(collected)} | next_cursor={next_cursor}")
        if not next_cursor:
            print("[GET]   -> no next_cursor; finished.")
            break

        # Advance to the next page and throttle before requesting it.
        query["cursor"] = next_cursor
        time.sleep(SLEEP)
        page_no += 1
    return collected[:topk]

# ===================== (3) Run: mapping -> collection -> saving =====================
inst_map = build_institution_map(TARGET_UNIVS)
print("=== Institution mapping result ===")
print(inst_map.to_string(index=False))

all_dfs = []
for _, row in inst_map.iterrows():
    inst_id, qname, dname = row["institution_id"], row["query_name"], row["display_name"]
    if not inst_id:
        # Nothing to query when the mapping step could not resolve an id.
        print(f"[SKIP] No institutions.id for '{qname}'")
        continue

    print(f"[RUN] Fetch top-{TOP_K} (year={YEAR}) for {qname} ({dname}) → institutions.id={inst_id}")
    works = fetch_topk_works_for_institution(inst_id, qname, dname, year=YEAR, topk=TOP_K)
    df_inst = pd.DataFrame(works)
    # Tag every row with the institution it was collected for.
    for column, value in (("query_name", qname), ("display_name", dname), ("institution_id", inst_id)):
        df_inst[column] = value

    # File-name-safe variant of the query name: '/' and ' ' become '_'.
    safe = qname.translate(str.maketrans("/ ", "__"))
    path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_{safe}.csv")
    df_inst.to_csv(path, index=False, encoding="utf-8")
    print(f"[SAVE] {qname} -> {path} (n={len(df_inst)})")
    all_dfs.append(df_inst)
    time.sleep(0.4)

# Combined file across all institutions (an empty frame if nothing was collected).
if all_dfs:
    df_all = pd.concat(all_dfs, ignore_index=True)
else:
    df_all = pd.DataFrame()
all_path = os.path.join(SAVE_DIR, f"works_top{TOP_K}_{YEAR}_ALL.csv")
df_all.to_csv(all_path, index=False, encoding="utf-8")
print(f"[DONE] Total rows: {len(df_all)}")
print(f"[DONE] Saved to: {SAVE_DIR}")

# Preview
df_all.head()


[MAP] ==== start institution mapping ====
[MAP] [1/23] resolving 'Duksung Women's University'
[MAP] Try EN: Duksung Women's University
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Duksung Women's University' (relax=False)
[MAP]   -> searching: 'Duksung Women's University' | filter='country_code:KR'
[MAP]   -> candidate: Duksung Women's University | id=https://openalex.org/I65832422
[MAP] OK (EN): Duksung Women's University -> Duksung Women's University | institutions.id=I65832422
[MAP] [2/23] resolving 'Seoul National University'
[MAP] Try EN: Seoul National University
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR,type:education'
[MAP]   -> no candidates for 'Seoul National University' (relax=False)
[MAP]   -> searching: 'Seoul National University' | filter='country_code:KR'
[MAP]   -> candidate: Seoul National University | id=https://openalex.org/I139264467
[MAP] OK (EN): Seoul 

Unnamed: 0,openalex_id,title,publication_year,publication_date,type,host_venue,cited_by_count,is_oa,oa_status,language,...,abstract,authors_count,author_ids,author_names,author_positions,author_orcids,author_affiliations,query_name,display_name,institution_id
0,https://openalex.org/W2141140503,Preclinical Development of the Novel Chk1 Inhi...,2011,2011-12-28,article,Molecular Cancer Therapeutics,116,True,bronze,en,...,Many anticancer agents damage DNA and arrest c...,5,A5061329302;A5071213463;A5005359924;A502813150...,Ryan Montano;Injae Chung;Kristen M. Garner;Dav...,first;middle;middle;middle;last,https://orcid.org/0000-0002-2902-4677;https://...,Duksung Women's University;Duksung Women's Uni...,Duksung Women's University,Duksung Women's University,I65832422
1,https://openalex.org/W2029113385,Sphingosine 1-phosphate regulates matrix metal...,2011,2011-06-08,article,Journal of Cell Science,94,True,green,en,...,Recent evidence suggests that inflammation is ...,6,A5074700953;A5022162975;A5089461702;A507073781...,Eun‐Sook Kim;Jong‐Sook Kim;Sang Geon Kim;Se Ji...,first;middle;middle;middle;middle;last,https://orcid.org/0009-0003-6205-1794;https://...,Duksung Women's University;Duksung Women's Uni...,Duksung Women's University,Duksung Women's University,I65832422
2,https://openalex.org/W2144126789,Unmet Needs of Breast Cancer Patients Relative...,2011,2011-12-19,article,Yonsei Medical Journal,94,True,gold,en,...,The present study aims to evaluate the prevale...,2,A5041491639;A5056460102,Byeong‐Woo Park;Sook Yeon Hwang,first;last,https://orcid.org/0000-0003-1353-2607;https://...,Yonsei University;Duksung Women's University,Duksung Women's University,Duksung Women's University,I65832422
3,https://openalex.org/W2088584602,"Genipin, a constituent of Gardenia jasminoides...",2011,2011-10-20,article,Oncology Reports,78,True,bronze,en,...,,1,A5000774534,Aree Moon,first,,Duksung Women's University,Duksung Women's University,Duksung Women's University,I65832422
4,https://openalex.org/W2145012839,Quality of Life Differences between Younger an...,2011,2011-01-01,article,Journal of Breast Cancer,71,True,gold,en,...,This study was designed to investigate quality...,5,A5041491639;A5071347304;A5052216064;A511024863...,Byeong‐Woo Park;Suyun Lee;Ah Reum Lee;Kyung-Hi...,first;middle;middle;middle;last,https://orcid.org/0000-0003-1353-2607;https://...,Yonsei University;Sejong University;Korea Labo...,Duksung Women's University,Duksung Women's University,I65832422
