In [33]:
import os
import re
import seaborn as sns
from typing import Sequence, Dict, Tuple, Optional, List, Union
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski
from rdkit.Chem import rdMolDescriptors as rdMD
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
TARGETS = ["Tg", "FFV", "Tc", "Density", "Rg"] 

def get_data_paths():
    BASE_PATH = Path(os.getenv('NEURIPS_DATA_PATH', 'kaggle/input/neurips-open-polymer-prediction-2025'))
    EXTRA_BASE = Path(os.getenv('EXTRA_DATA_BASE', 'kaggle/input/smiles-extra-data'))
    TC_BASE = Path(os.getenv('TC_DATA_BASE', 'kaggle/input/tc-smiles'))

    paths = {
        'train_csv': Path(os.getenv('TRAIN_CSV_PATH', BASE_PATH / 'train.csv')),
        'test_csv': Path(os.getenv('TEST_CSV_PATH', BASE_PATH / 'test.csv')),
        'sample_submission': Path(os.getenv('SAMPLE_SUBMISSION_PATH', BASE_PATH / 'sample_submission.csv')),

        'tc_data': Path(os.getenv('TC_DATA_PATH', TC_BASE / 'Tc_SMILES.csv')),
        'tg_jcim_data': Path(os.getenv('TG_JCIM_PATH', EXTRA_BASE / 'JCIM_sup_bigsmiles.csv')),
        'tg_excel_data': Path(os.getenv('TG_EXCEL_PATH', EXTRA_BASE / 'data_tg3.xlsx')),
        'density_data': Path(os.getenv('DENSITY_PATH', EXTRA_BASE / 'data_dnst1.xlsx')),
        'ffv_data': Path(os.getenv('FFV_DATA_PATH', BASE_PATH / 'train_supplement' / 'dataset4.csv')),
        'dataset1': Path(os.getenv('DATASET1_PATH', BASE_PATH / 'train_supplement' / 'dataset1.csv')),
        'dataset2': Path(os.getenv('DATASET2_PATH', BASE_PATH / 'train_supplement' / 'dataset2.csv')),
        'dataset3': Path(os.getenv('DATASET3_PATH', BASE_PATH / 'train_supplement' / 'dataset3.csv')),
    }
    return paths

P = get_data_paths()
for k, v in P.items():
    print(f"{k}: {v}  {'[OK]' if v.exists() else '[MISSING]'}")

# 核心官方数据
train = pd.read_csv(P['train_csv'])
test  = pd.read_csv(P['test_csv'])
sub   = pd.read_csv(P['sample_submission'])
print("Train/Test/Sub:", train.shape, test.shape, sub.shape)

train_csv: kaggle\input\neurips-open-polymer-prediction-2025\train.csv  [OK]
test_csv: kaggle\input\neurips-open-polymer-prediction-2025\test.csv  [OK]
sample_submission: kaggle\input\neurips-open-polymer-prediction-2025\sample_submission.csv  [OK]
tc_data: kaggle\input\tc-smiles\Tc_SMILES.csv  [OK]
tg_jcim_data: kaggle\input\smiles-extra-data\JCIM_sup_bigsmiles.csv  [OK]
tg_excel_data: kaggle\input\smiles-extra-data\data_tg3.xlsx  [OK]
density_data: kaggle\input\smiles-extra-data\data_dnst1.xlsx  [OK]
ffv_data: kaggle\input\neurips-open-polymer-prediction-2025\train_supplement\dataset4.csv  [OK]
dataset1: kaggle\input\neurips-open-polymer-prediction-2025\train_supplement\dataset1.csv  [OK]
dataset2: kaggle\input\neurips-open-polymer-prediction-2025\train_supplement\dataset2.csv  [OK]
dataset3: kaggle\input\neurips-open-polymer-prediction-2025\train_supplement\dataset3.csv  [OK]
Train/Test/Sub: (7973, 7) (3, 2) (3, 6)


In [2]:
def add_extra_data(df_train: pd.DataFrame, df_extra: pd.DataFrame, target: str, source_name: str = "extra") -> pd.DataFrame:
    """
    将外部数据 merge 到 train：
    - 如果 train 里有相同 SMILES 且 target 缺失，用 extra 填补
    - 如果 extra 里有新 SMILES，追加到 train
    - 对同一个 SMILES target 出现多个值时取均值
    - 打印出重复 SMILES 的数量 & 差异
    """
    print(f"[INFO] Working on {source_name} (target={target})")
    before = len(df_train)

    # 只保留 SMILES + target
    df_extra = df_extra[['SMILES', target]].dropna(subset=[target]).copy()

    # 聚合外部数据，避免重复
    df_extra = df_extra.groupby('SMILES', as_index=False)[target].mean()

    # 检查重复
    common = set(df_train['SMILES']) & set(df_extra['SMILES'])
    if common:
        merged_common = pd.merge(
            df_train[df_train['SMILES'].isin(common)][['SMILES', target]],
            df_extra[df_extra['SMILES'].isin(common)],
            on='SMILES',
            suffixes=('_train', '_extra')
        )
        diffs = (merged_common[f"{target}_train"] - merged_common[f"{target}_extra"]).abs()
        print(f"  重复 SMILES: {len(merged_common)}, 其中 {sum(diffs>1e-6)} 条数值不同 (mean diff={diffs.mean():.3f})")

    # merge
    df_train = df_train.merge(df_extra, on='SMILES', how='outer', suffixes=('', '_extra'))

    # 填补缺失
    mask = df_train[target].isna() & df_train[f"{target}_extra"].notna()
    df_train.loc[mask, target] = df_train.loc[mask, f"{target}_extra"]

    # 删除多余列
    if f"{target}_extra" in df_train.columns:
        df_train = df_train.drop(columns=[f"{target}_extra"])

    after = len(df_train)
    print(f"  [{target}] 增强: {after - before:+d} 条新样本, 填补 {mask.sum()} 条缺失")

    return df_train


In [3]:
paths = get_data_paths()

if os.path.exists(paths['tc_data']):       
    train = add_extra_data(
        train,
        pd.read_csv(paths['tc_data']).rename(columns={'TC_mean':'Tc'}),
        target='Tc',
        source_name='tc_data'
    )

if os.path.exists(paths['tg_jcim_data']):  
    train = add_extra_data(
        train,
        pd.read_csv(paths['tg_jcim_data'], usecols=['SMILES','Tg (C)']).rename(columns={'Tg (C)':'Tg'}),
        target='Tg',
        source_name='tg_jcim'
    )

if os.path.exists(paths['tg_excel_data']):
    train = add_extra_data(
        train,
        pd.read_excel(paths['tg_excel_data']).rename(columns={'Tg [K]':'Tg'}).assign(Tg=lambda df: df['Tg'] - 273.15),
        target='Tg',
        source_name='tg_excel_K_to_C'
    )

if os.path.exists(paths['dataset3']):
    train = add_extra_data(
        train,
        pd.read_csv(paths['dataset3']),  # 已是列名 SMILES, Tg
        target='Tg',
        source_name='dataset3'
    )

if os.path.exists(paths['density_data']):
    train = add_extra_data(
        train,
        pd.read_excel(paths['density_data'])
          .rename(columns={'density(g/cm3)':'Density'})
          .assign(Density=lambda df: pd.to_numeric(df['Density'], errors='coerce') - 0.118),
        target='Density',
        source_name='density_extra_minus_0p118'
    )

if os.path.exists(paths['dataset1']):
    train = add_extra_data(
        train,
        pd.read_csv(paths['dataset1']).rename(columns={'TC_mean':'Tc'}),
        target='Tc',
        source_name='dataset1'
    )

if os.path.exists(paths['ffv_data']):
    train = add_extra_data(
        train,
        pd.read_csv(paths['ffv_data']).rename(columns={'FFV':'FFV'}),
        target='FFV',
        source_name='ffv_dataset4'
    )

print(f"train: {train.shape}")
print("targets non-null:", {t: int(train[t].notna().sum()) for t in TARGETS if t in train.columns})

[INFO] Working on tc_data (target=Tc)
  重复 SMILES: 737, 其中 2 条数值不同 (mean diff=0.000)
  [Tc] 增强: +130 条新样本, 填补 130 条缺失
[INFO] Working on tg_jcim (target=Tg)
  重复 SMILES: 7, 其中 0 条数值不同 (mean diff=0.000)
  [Tg] 增强: +655 条新样本, 填补 655 条缺失
[INFO] Working on tg_excel_K_to_C (target=Tg)
  [Tg] 增强: +499 条新样本, 填补 499 条缺失
[INFO] Working on dataset3 (target=Tg)
  [Tg] 增强: +46 条新样本, 填补 46 条缺失
[INFO] Working on density_extra_minus_0p118 (target=Density)
  重复 SMILES: 4, 其中 2 条数值不同 (mean diff=0.055)
  [Density] 增强: +782 条新样本, 填补 784 条缺失
[INFO] Working on dataset1 (target=Tc)
  重复 SMILES: 867, 其中 2 条数值不同 (mean diff=0.000)
  [Tc] 增强: +0 条新样本, 填补 0 条缺失
[INFO] Working on ffv_dataset4 (target=FFV)
  重复 SMILES: 37, 其中 0 条数值不同 (mean diff=nan)
  [FFV] 增强: +825 条新样本, 填补 862 条缺失
train: (10910, 7)
targets non-null: {'Tg': 1711, 'FFV': 7892, 'Tc': 867, 'Density': 1397, 'Rg': 614}


In [4]:
# Check for missing values
print("\nMissing values in train data:")
print(train.isnull().sum())


Missing values in train data:
id          2937
SMILES         0
Tg          9199
FFV         3018
Tc         10043
Density     9513
Rg         10296
dtype: int64


In [5]:
def compute_all_smiles_features(smiles: str) -> dict:
    """提取 SMILES 字符串的字符/化学符号/占位符特征（不依赖 RDKit）"""
    if not isinstance(smiles, str):
        smiles = str(smiles)
    feats = {}

    # 基础统计
    feats['smiles_length'] = len(smiles)
    feats['capital_letters'] = sum(c.isupper() for c in smiles)
    feats['lowercase_letters'] = sum(c.islower() for c in smiles)
    feats['digits'] = sum(c.isdigit() for c in smiles)

    # 符号统计
    feats['parentheses'] = smiles.count('(') + smiles.count(')')
    feats['brackets'] = smiles.count('[') + smiles.count(']')
    feats['braces'] = smiles.count('{') + smiles.count('}')
    feats['equals'] = smiles.count('=')
    feats['hashes'] = smiles.count('#')
    feats['colons'] = smiles.count(':')
    feats['ats'] = smiles.count('@')
    feats['slashes'] = smiles.count('/') + smiles.count('\\')
    feats['plus_minus'] = smiles.count('+') + smiles.count('-')

    # 元素计数
    feats['C_count'] = smiles.count('C') + smiles.count('c')
    feats['O_count'] = smiles.count('O') + smiles.count('o')
    feats['N_count'] = smiles.count('N') + smiles.count('n')
    feats['S_count'] = smiles.count('S') + smiles.count('s')
    feats['P_count'] = smiles.count('P') + smiles.count('p')
    feats['F_count'] = smiles.count('F') + smiles.count('f')
    feats['Cl_count'] = smiles.count('Cl') + smiles.count('cl')
    feats['Br_count'] = smiles.count('Br') + smiles.count('br')
    feats['I_count'] = smiles.count('I') + smiles.count('i')

    # 结构模式
    feats['has_ring'] = int(any(d in smiles for d in '123456789'))
    feats['has_double_bond'] = int('=' in smiles)
    feats['has_triple_bond'] = int('#' in smiles)
    feats['has_aromatic'] = int(any(c in smiles for c in 'cnos'))

    # 元素比例
    feats['O_to_C_ratio'] = feats['O_count'] / (feats['C_count'] + 1e-5)
    feats['N_to_C_ratio'] = feats['N_count'] / (feats['C_count'] + 1e-5)
    feats['heteroatom_ratio'] = (
        feats['O_count'] + feats['N_count'] + feats['S_count'] + feats['P_count']
    ) / (feats['C_count'] + 1e-5)

    # 占位符特征
    feats['star_count'] = smiles.count('*')
    feats['R_placeholder_count'] = len(re.findall(r"\[R[0-9']*\]", smiles))
    feats['any_placeholder'] = int((feats['star_count'] > 0) or (feats['R_placeholder_count'] > 0))

    return feats


def add_all_features(df: pd.DataFrame, col: str = 'SMILES') -> pd.DataFrame:
    """给 DataFrame 添加所有 SMILES 特征"""
    feats = df[col].astype(str).apply(compute_all_smiles_features).apply(pd.Series)
    feats.index = df.index
    return pd.concat([df, feats], axis=1)

In [6]:
train_basic = add_all_features(train, col='SMILES')
test_basic = add_all_features(test,  col='SMILES')

print("Train with features:", train_basic.shape)
print("Test with features:", test_basic.shape)

Train with features: (10910, 39)
Test with features: (3, 34)


In [7]:
def handle_missing_values(df, exclude_cols=None):
    """
    只对特征列做缺失值填充，排除 id / SMILES / targets 等
    """
    df = df.copy()
    if exclude_cols is None:
        exclude_cols = []

    # 找出需要处理的数值列
    numeric_cols = [
        c for c in df.select_dtypes(include=[np.number]).columns
        if c not in exclude_cols
    ]

    for col in numeric_cols:
        if df[col].isnull().any():
            median_val = df[col].median()
            df[col] = df[col].fillna(median_val)

    return df
EXCLUDE_BASE = ['id', 'SMILES'] + TARGETS

In [8]:


train_basic = handle_missing_values(train_basic, exclude_cols=EXCLUDE_BASE)
test_basic  = handle_missing_values(test_basic, exclude_cols=EXCLUDE_BASE)

# Check for missing values
print("\nMissing values in train data:")
print(f'train_basic.shape:{train_basic.shape}')
print(train_basic.isnull().sum())


Missing values in train data:
train_basic.shape:(10910, 39)
id                      2937
SMILES                     0
Tg                      9199
FFV                     3018
Tc                     10043
Density                 9513
Rg                     10296
smiles_length              0
capital_letters            0
lowercase_letters          0
digits                     0
parentheses                0
brackets                   0
braces                     0
equals                     0
hashes                     0
colons                     0
ats                        0
slashes                    0
plus_minus                 0
C_count                    0
O_count                    0
N_count                    0
S_count                    0
P_count                    0
F_count                    0
Cl_count                   0
Br_count                   0
I_count                    0
has_ring                   0
has_double_bond            0
has_triple_bond            0
has_aromati

In [9]:
from rdkit import Chem, RDLogger
RDLogger.DisableLog('rdApp.error')  # 关掉 RDKit 的报错输出

def _mol_from_smiles_robust(smi: str):
    """更稳的解析：失败时尝试 sanitize=False 再手动 Sanitize。"""
    if not isinstance(smi, str) or not smi.strip():
        return None
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:
        return mol
    mol = Chem.MolFromSmiles(smi, sanitize=False)
    if mol is None:
        return None
    try:
        Chem.SanitizeMol(mol)
        return mol
    except Exception:
        return None

def is_valid_smiles(smi: str) -> bool:
    """True 表示 RDKit 能成功解析。"""
    return _mol_from_smiles_robust(smi) is not None

def drop_invalid_smiles(df, smiles_col: str = "SMILES", show_examples: int = 5):
    """
    删除 RDKit 无法解析的 SMILES 行，并打印前后差异。
    返回：clean_df（已删除无效行，索引已重置）
    """
    df = df.copy()
    n_before = len(df)

    valid_mask = df[smiles_col].apply(is_valid_smiles)
    n_invalid = int((~valid_mask).sum())
    pct_invalid = n_invalid / n_before * 100 if n_before > 0 else 0.0

    # 打印摘要
    print(f"[SMILES 校验] 总计: {n_before}  | 无效: {n_invalid} ({pct_invalid:.3f}%)")
    if n_invalid > 0:
        bad_examples = df.loc[~valid_mask, smiles_col].dropna().astype(str).unique()[:show_examples]
        print("无效样例（最多展示几条）:")
        for s in bad_examples:
            print("  -", s)

    # 过滤并返回
    clean_df = df.loc[valid_mask].reset_index(drop=True)
    print(f"删除后行数: {len(clean_df)}  | 已删除: {n_invalid}")
    return clean_df

In [17]:
# 仅对训练集删除无效 SMILES（测试集不要删）
train_clean = drop_invalid_smiles(train_basic, smiles_col="SMILES")

# 如果想看看测试集的无效比例，但不删除：
test_clean = drop_invalid_smiles(test_basic, smiles_col="SMILES", show_examples=3)


[SMILES 校验] 总计: 10910  | 无效: 6 (0.055%)
无效样例（最多展示几条）:
  - *C(F)(F)CC(F)([R])C(*)(F)F
  - *CN([R'])Cc2cc([R]c1cc(*)c(O)c(CN([R'])C*)c1)cc(*)c2O
  - *NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4
  - *OC2OC(CO[R])C(OC1OC(CO[R])C(*)C(O[R])C1O[R])C(O[R])C2O[R]
  - *O[Si](*)([R])[R]
删除后行数: 10904  | 已删除: 6
[SMILES 校验] 总计: 3  | 无效: 0 (0.000%)
删除后行数: 3  | 已删除: 0


In [11]:
def rdkit_descriptors_from_list(smiles_series: pd.Series, desc_names: list, on_fail=np.nan) -> pd.DataFrame:
    """
    只支持 rdkit.Chem.Descriptors.descList 中的名字。
    - 未知名称会直接报错（帮助你发现拼写问题）
    - 解析失败的 SMILES 行填 on_fail（默认 NaN）
    返回：与输入索引对齐的 DataFrame
    """
    # descList 是 (name, func) 的列表
    avail = dict(Descriptors.descList)
    bad = [n for n in desc_names if n not in avail]
    if bad:
        raise ValueError(f"Unknown descriptor names (not in Descriptors.descList): {bad}")

    out_rows = []
    for smi in smiles_series.astype(str).tolist():
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            out_rows.append({n: on_fail for n in desc_names})
            continue
        row = {}
        for n in desc_names:
            try:
                row[n] = float(avail[n](mol))
            except Exception:
                row[n] = on_fail
        out_rows.append(row)

    return pd.DataFrame(out_rows, index=smiles_series.index)

In [12]:
from tqdm import tqdm

In [None]:
# ===== 1) 计算 RDKit 全量官方描述符（不含手工字段） =====
def compute_all_rdkit_descriptors(smiles_series: pd.Series) -> pd.DataFrame:
    """
    计算所有RDKit描述符，显示进度条和统计信息
    """
    names, funcs = zip(*Descriptors.descList)
    rows = []
    invalid_count = 0
    error_count = 0
    
    # 使用tqdm
    pbar = tqdm(smiles_series.astype(str), 
                total=len(smiles_series),
                desc="计算RDKit描述符",
                unit="mol")
    
    for s in pbar:
        mol = Chem.MolFromSmiles(s)
        if mol is None:
            rows.append([np.nan] * len(funcs))
            invalid_count += 1
            pbar.set_postfix({'无效SMILES': invalid_count, '计算错误': error_count})
            continue
        
        vals = []
        for f in funcs:
            try:
                vals.append(float(f(mol)))
            except Exception as e:
                vals.append(np.nan)
                error_count += 1
        
        rows.append(vals)
        pbar.set_postfix({'无效SMILES': invalid_count, '计算错误': error_count})
    
    # 打印统计信息
    print(f"\n完成！统计信息:")
    print(f"总分子数: {len(smiles_series)}")
    print(f"无效SMILES: {invalid_count}")
    print(f"描述符计算错误次数: {error_count}")
    
    return pd.DataFrame(rows, columns=list(names), index=smiles_series.index)

In [34]:
def _as_feature_list(feature_spec: Union[Sequence[str], Dict[str, Sequence[str]]], target: str) -> List[str]:
    """
    兼容两种写法：
    - 直接给一个 list/tuple 的描述符名
    - 给 {target: [names]} 的字典
    """
    if isinstance(feature_spec, dict):
        if target not in feature_spec:
            raise KeyError(f"feature_spec 没有键 {target}")
        names = feature_spec[target]
    else:
        names = feature_spec
    # 去重但保序
    return list(dict.fromkeys(map(str, names)))

def _align_columns(X_tr: pd.DataFrame, X_te: pd.DataFrame, fill_value=np.nan) -> Tuple[pd.DataFrame, pd.DataFrame]:
    cols = list(dict.fromkeys(list(X_tr.columns) + list(X_te.columns)))
    return (X_tr.reindex(columns=cols, fill_value=fill_value),
            X_te.reindex(columns=cols, fill_value=fill_value))

def make_dataset_for_target(
    *,
    train_df: pd.DataFrame,      # 含 target 列（例如 'Tg'）
    test_df: pd.DataFrame,       # 只用来对齐行数/方便将来扩展
    desc_train: pd.DataFrame,    # compute_all_rdkit_descriptors(train_smiles)
    desc_test: pd.DataFrame,     # compute_all_rdkit_descriptors(test_smiles)
    target: str,                 # 'Tg' / 'FFV' / 'Tc' / 'Density' / 'Rg'
    feature_spec: Union[Sequence[str], Dict[str, Sequence[str]]],
    prefix: Optional[str] = None,
    drop_missing: bool = True,
    verbose: bool = True,
) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, Dict]:
    """
    用一行调用：按给定描述符名，直接产出 X_train, y_train, X_test。
    - feature_spec: 可是 ['MolWt','TPSA',...] 或 {'Tg': [...]} 两种形式
    - 会自动对齐 train/test 列，缺的列以 NaN 填；树模型可直接吃 NaN
    - 返回 info 内含 kept/missing/形状等信息
    """
    names = _as_feature_list(feature_spec, target)
    exist_tr = [c for c in names if c in desc_train.columns]
    exist_te = [c for c in names if c in desc_test.columns]
    missing  = [c for c in names if c not in desc_train.columns or c not in desc_test.columns]

    if missing and not drop_missing:
        raise KeyError(f"{target}: 这些描述符列缺失：{missing}")

    Xtr_raw = desc_train[exist_tr].copy()
    Xte_raw = desc_test[exist_te].copy()
    Xtr, Xte = _align_columns(Xtr_raw, Xte_raw, fill_value=np.nan)

    # 选出有标签的行
    mask = train_df[target].notna()
    y = train_df.loc[mask, target].astype(float).copy()
    Xtr = Xtr.loc[mask].copy()

    # 可选加前缀，避免后续多目标拼表时冲突
    if prefix:
        Xtr.columns = [f"{prefix}{c}" for c in Xtr.columns]
        Xte.columns = [f"{prefix}{c}" for c in Xte.columns]

    # 统一 dtype（对树模型友好）
    Xtr = Xtr.astype(np.float32)
    Xte = Xte.astype(np.float32)

    info = dict(
        target=target,
        requested=len(names),
        kept=len(Xtr.columns),
        missing=missing,
        Xtr_shape=Xtr.shape,
        Xte_shape=Xte.shape,
        y_len=len(y),
    )
    if verbose:
        print(f"[{target}] requested={len(names)}, kept={len(Xtr.columns)}, missing={len(missing)}")
        if missing:
            print(f" missing: {missing[:12]}{' ...' if len(missing)>12 else ''}")
        print(f" Xtr={Xtr.shape}, Xte={Xte.shape}, y={len(y)}")

    return Xtr, y, Xte, info

In [16]:
# 先一次性算好 RDKit 全量描述符
desc_train = compute_all_rdkit_descriptors(train_clean['SMILES'])

计算RDKit描述符:   0%|          | 0/10904 [00:00<?, ?mol/s]

计算RDKit描述符: 100%|██████████| 10904/10904 [02:04<00:00, 87.40mol/s, 无效SMILES=0, 计算错误=0] 



完成！统计信息:
总分子数: 10904
无效SMILES: 0
描述符计算错误次数: 0


In [18]:
desc_test = compute_all_rdkit_descriptors(test_clean['SMILES'])

计算RDKit描述符: 100%|██████████| 3/3 [00:00<00:00, 82.09mol/s, 无效SMILES=0, 计算错误=0]


完成！统计信息:
总分子数: 3
无效SMILES: 0
描述符计算错误次数: 0





In [27]:
# 只查看有缺失值的列
missing_stats = desc_train.isna().sum()
columns_with_missing = missing_stats[missing_stats > 0]

print(f"有缺失值的列数: {len(columns_with_missing)}")
print("有缺失值的列:")
print(columns_with_missing)

有缺失值的列数: 12
有缺失值的列:
MaxPartialCharge        9626
MinPartialCharge        9626
MaxAbsPartialCharge     9626
MinAbsPartialCharge     9626
BCUT2D_MWHI            10094
BCUT2D_MWLOW           10094
BCUT2D_CHGHI           10094
BCUT2D_CHGLO           10094
BCUT2D_LOGPHI          10094
BCUT2D_LOGPLOW         10094
BCUT2D_MRHI            10094
BCUT2D_MRLOW           10094
dtype: int64


In [26]:
# 只查看有缺失值的列
missing_stats = desc_test.isna().sum()
columns_with_missing = missing_stats[missing_stats > 0]

print(f"有缺失值的列数: {len(columns_with_missing)}")
print("有缺失值的列:")
print(columns_with_missing)

有缺失值的列数: 12
有缺失值的列:
MaxPartialCharge       3
MinPartialCharge       3
MaxAbsPartialCharge    3
MinAbsPartialCharge    3
BCUT2D_MWHI            3
BCUT2D_MWLOW           3
BCUT2D_CHGHI           3
BCUT2D_CHGLO           3
BCUT2D_LOGPHI          3
BCUT2D_LOGPLOW         3
BCUT2D_MRHI            3
BCUT2D_MRLOW           3
dtype: int64


In [35]:
feature_Tg = ['MolWt','MolLogP','TPSA','NumRotatableBonds','NumRings','FractionCSP3','NumHAcceptors','NumHDonors']

In [36]:
X_tr, y_tr, X_te, info = make_dataset_for_target(
    train_df=train_clean,
    test_df=test_clean,
    desc_train=desc_train,
    desc_test=desc_test,
    target='Tg',
    feature_spec=feature_Tg,   # 或 {'Tg': feature_Tg}
    prefix=None,               # 想要列名带前缀就填比如 'feat_Tg__'
    drop_missing=True,
    verbose=True
)

[Tg] requested=8, kept=7, missing=1
 missing: ['NumRings']
 Xtr=(1711, 7), Xte=(3, 7), y=1711
