In [2]:
import pandas as pd

raw_url = "https://raw.githubusercontent.com/Parkss0/251119/4f5d8d350f3c777d33b38827471292245092756d/260210_duplicate_remove.csv"

# 인코딩 자동 대응(utf-8 / utf-8-sig / cp949)
for enc in ["utf-8", "utf-8-sig", "cp949", "euc-kr"]:
    try:
        df = pd.read_csv(raw_url, encoding=enc)
        print("SUCCESS encoding =", enc, "| shape =", df.shape)
        break
    except UnicodeDecodeError as e:
        print("FAIL encoding =", enc, "|", e)

df.head()


SUCCESS encoding = utf-8 | shape = (10380, 8)


Unnamed: 0.1,Unnamed: 0,std_smiles,canonical_smiles,max_phase,oral,parenteral,prodrug,max_phase_num
0,0,Brc1c(NC2=NCCN2)ccc2nccnc12,Brc1c(NC2=NCCN2)ccc2nccnc12,4,0,0,0,4
1,1,C#CC(O)(/C=C/Cl)CC,C#CC(O)(/C=C/Cl)CC,4,1,0,0,4
2,2,C#CC1(O)CC[C@H]2[C@@H]3CC=C4CC(=O)CCC4[C@H]3CC...,C#CC1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@@H]4[C@...,4,1,0,0,4
3,3,C#CC1(OC(N)=O)CCCCC1,C#CC1(OC(N)=O)CCCCC1,4,1,0,0,4
4,4,C#CCC(Cc1cnc2nc(N)nc(N)c2n1)c1ccc(C(=O)NC(CCC(...,C#CCC(Cc1cnc2nc(N)nc(N)c2n1)c1ccc(C(=O)N[C@@H]...,4,0,1,0,4


In [3]:
import time
import csv
import pandas as pd
import numpy as np
import requests

PUG = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"

session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (colab; pubchem xlogp lookup)"
})

def fetch_xlogp_batch(smiles_batch, timeout=60, max_retries=3):
    """
    smiles_batch: list[str]
    return: dict {canonical_smiles: xlogp or None}
    """
    url = f"{PUG}/compound/smiles/property/XLogP,CanonicalSMILES/CSV"
    payload = {"smiles": "\n".join(smiles_batch)}  # ✅ 여러 SMILES를 줄바꿈으로 전달

    for attempt in range(max_retries):
        try:
            r = session.post(url, data=payload, timeout=timeout)
            if r.status_code == 200 and r.text.strip():
                reader = csv.DictReader(r.text.splitlines())
                out = {}
                for row in reader:
                    csmi = (row.get("CanonicalSMILES") or "").strip()
                    x = (row.get("XLogP") or "").strip()
                    if not csmi:
                        continue
                    try:
                        out[csmi] = float(x) if x != "" else None
                    except ValueError:
                        out[csmi] = None
                return out

            # 일시적 오류면 backoff
            if r.status_code in (429, 500, 503):
                time.sleep((attempt + 1) * 2)
                continue

            # 그 외(400 등)는 그냥 빈 dict
            return {}

        except requests.RequestException:
            time.sleep((attempt + 1) * 2)

    return {}

def fetch_xlogp_single(smiles, timeout=60, max_retries=3):
    """
    단건 fallback: {smiles: xlogp}, {canonical_smiles: xlogp} 둘 다 채워주기
    """
    url = f"{PUG}/compound/smiles/property/XLogP,CanonicalSMILES/JSON"
    for attempt in range(max_retries):
        try:
            r = session.post(url, data={"smiles": smiles}, timeout=timeout)
            if r.status_code == 200:
                js = r.json()
                props = js.get("PropertyTable", {}).get("Properties", [])
                if not props:
                    return {}
                x = props[0].get("XLogP", None)
                csmi = props[0].get("CanonicalSMILES", None)
                try:
                    x = float(x) if x is not None else None
                except ValueError:
                    x = None
                out = {smiles: x}
                if csmi:
                    out[str(csmi)] = x
                return out

            if r.status_code in (429, 500, 503):
                time.sleep((attempt + 1) * 2)
                continue
            return {}

        except Exception:
            time.sleep((attempt + 1) * 2)
    return {}

# =========================
# 실행 파트
# =========================
assert "std_smiles" in df.columns, "df에 std_smiles 컬럼이 없어!"

smiles_series = df["std_smiles"].dropna().astype(str)
unique_smiles = smiles_series.unique().tolist()
print("unique std_smiles:", len(unique_smiles))

# 1) 배치 조회
batch_size = 50          # 너무 크면 400 뜰 수 있어 20~80 사이 추천
sleep_sec = 0.25         # 초당 4회 정도(안전하게)
mapping = {}

for i in range(0, len(unique_smiles), batch_size):
    batch = unique_smiles[i:i + batch_size]
    got = fetch_xlogp_batch(batch)
    mapping.update(got)

    if (i // batch_size) % 10 == 0:
        print(f"{min(i+batch_size, len(unique_smiles))} / {len(unique_smiles)} batches processed, mapping size={len(mapping)}")

    time.sleep(sleep_sec)

# 2) 1차 매핑
df["xlogp3_pubchem"] = df["std_smiles"].astype(str).map(mapping)

# 3) 누락값 fallback(단건)
missing = df.loc[df["xlogp3_pubchem"].isna() & df["std_smiles"].notna(), "std_smiles"].astype(str).unique().tolist()
print("missing after batch:", len(missing))

for j, smi in enumerate(missing):
    mapping.update(fetch_xlogp_single(smi))
    if (j + 1) % 200 == 0:
        print(f"fallback {j+1} / {len(missing)}")
    time.sleep(sleep_sec)

# 4) 최종 매핑 + 저장
df["xlogp3_pubchem"] = df["std_smiles"].astype(str).map(mapping)

df.to_csv("df_with_xlogp3_pubchem.csv", index=False)
pd.DataFrame({"smiles_key": list(mapping.keys()), "xlogp3_pubchem": list(mapping.values())}) \
  .to_csv("xlogp3_mapping.csv", index=False)

print("Saved: df_with_xlogp3_pubchem.csv / xlogp3_mapping.csv")
print("xlogp filled rate:", df["xlogp3_pubchem"].notna().mean())


unique std_smiles: 10380
50 / 10380 batches processed, mapping size=0
550 / 10380 batches processed, mapping size=0
1050 / 10380 batches processed, mapping size=0
1550 / 10380 batches processed, mapping size=0
2050 / 10380 batches processed, mapping size=0
2550 / 10380 batches processed, mapping size=0
3050 / 10380 batches processed, mapping size=0
3550 / 10380 batches processed, mapping size=0
4050 / 10380 batches processed, mapping size=0
4550 / 10380 batches processed, mapping size=0
5050 / 10380 batches processed, mapping size=0
5550 / 10380 batches processed, mapping size=0
6050 / 10380 batches processed, mapping size=0
6550 / 10380 batches processed, mapping size=0
7050 / 10380 batches processed, mapping size=0
7550 / 10380 batches processed, mapping size=0
8050 / 10380 batches processed, mapping size=0
8550 / 10380 batches processed, mapping size=0
9050 / 10380 batches processed, mapping size=0
9550 / 10380 batches processed, mapping size=0
10050 / 10380 batches processed, mappi