In [1]:
import sys
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# Data locations, relative to the notebook's working directory.  The directory
# strings keep their trailing "/" because train.csv is located by plain string
# concatenation further down.
base_path = "kaggle/input/neurips-open-polymer-prediction-2025/"
supplement_path = "kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/"
tc_smiles_path = "kaggle/input/tc-smiles/"
smiles_extra_data_path = "kaggle/input/smiles-extra-data/"

# The competition folder is additionally placed on the module search path.
sys.path.append("kaggle/input/neurips-open-polymer-prediction-2025")

In [2]:
import sys
# Print the path of the Python interpreter this kernel is running on.
print(sys.executable)

/scratch/e1350261/venvs/ai39/bin/python


In [3]:
# Environment check: import the packages the notebook relies on (openpyxl is
# imported only to confirm it is installed) and print the numpy / rdkit
# versions together with the interpreter path.
import numpy
import rdkit
import openpyxl
import importlib
import sys

print("numpy =", numpy.__version__)
print("rdkit =", rdkit.__version__)
print(sys.executable)

numpy = 1.26.4
rdkit = 2023.09.6
/scratch/e1350261/venvs/ai39/bin/python


In [None]:
# FIXME: this import was left unfinished -- `from data_preparation import` with
# no names after `import` is a SyntaxError and stops "Restart & Run All" at
# this cell.  List the names that are actually needed and uncomment:
# from data_preparation import <names>

In [4]:
# List the working directory; the relative data paths used by this notebook
# start at the kaggle/ folder.
!ls

kaggle	new.ipynb  README.md


In [5]:
# Names of the property columns handled by this notebook.
PROPERTIES = [
    "Tg",
    "FFV",
    "Tc",
    "Density",
    "Rg",
]

In [6]:
# Load the training table, report its shape, and display its column labels.
train_df = pd.read_csv(os.path.join(base_path, "train.csv"))
print("train data shape:", train_df.shape, sep="")
train_df.columns

train data shape:(7973, 7)


Index(['id', 'SMILES', 'Tg', 'FFV', 'Tc', 'Density', 'Rg'], dtype='object')

In [7]:
# Test table from the competition folder.
test_df = pd.read_csv(os.path.join(base_path, "test.csv"))

# The four CSV tables in the train_supplement folder, read in file order.
dataset_1, dataset_2, dataset_3, dataset_4 = [
    pd.read_csv(os.path.join(supplement_path, csv_name))
    for csv_name in ("dataset1.csv", "dataset2.csv", "dataset3.csv", "dataset4.csv")
]

# Additional tables: two CSV files followed by two Excel workbooks.
tc_smiles = pd.read_csv(os.path.join(tc_smiles_path, "Tc_SMILES.csv"))
bigsmiles = pd.read_csv(os.path.join(smiles_extra_data_path, "JCIM_sup_bigsmiles.csv"))
dnst1, tg3 = [
    pd.read_excel(os.path.join(smiles_extra_data_path, workbook_name))
    for workbook_name in ("data_dnst1.xlsx", "data_tg3.xlsx")
]

# Collect every frame in a dict keyed by its variable name.
df_dict = dict(
    test_df=test_df,
    dataset_1=dataset_1,
    dataset_2=dataset_2,
    dataset_3=dataset_3,
    dataset_4=dataset_4,
    tc_smiles=tc_smiles,
    bigsmiles=bigsmiles,
    dnst1=dnst1,
    tg3=tg3,
)

In [8]:
# Print the name, column labels and (rows, columns) shape of every frame
# collected in df_dict.
for name, df in df_dict.items():
    print(f"name:  {name},   columns:  {df.columns},      shape:{df.shape}\n")

name:  test_df,   columns:  Index(['id', 'SMILES'], dtype='object'),      shape:(3, 2)

name:  dataset_1,   columns:  Index(['SMILES', 'TC_mean'], dtype='object'),      shape:(874, 2)

name:  dataset_2,   columns:  Index(['SMILES'], dtype='object'),      shape:(7208, 1)

name:  dataset_3,   columns:  Index(['SMILES', 'Tg'], dtype='object'),      shape:(46, 2)

name:  dataset_4,   columns:  Index(['SMILES', 'FFV'], dtype='object'),      shape:(862, 2)

name:  tc_smiles,   columns:  Index(['TC_mean', 'SMILES'], dtype='object'),      shape:(874, 2)

name:  bigsmiles,   columns:  Index(['Unnamed: 0', 'SMILES', 'BigSMILES', 'Tg (C)'], dtype='object'),      shape:(662, 4)

name:  dnst1,   columns:  Index(['SMILES', 'uSMILES', 'std_name', 'density(g/cm3)',
       'density_std_err(g/cm3)', 'abbreviations', 'synonyms', 'tradenames'],
      dtype='object'),      shape:(787, 8)

name:  tg3,   columns:  Index(['SMILES', 'Tg [K]'], dtype='object'),      shape:(501, 2)



In [9]:
from rdkit import Chem


def standardize_dataset(df, reference_df, prefix, rename_map=None):
    """
    Bring a supplementary dataset into the column layout of ``reference_df``.

    Steps, in order: canonicalise the ``SMILES`` column (if present) and drop
    the rows whose SMILES could not be canonicalised, apply ``rename_map``,
    create a prefixed ``id`` column, add every column of ``reference_df`` that
    is still missing (filled with NaN), and select the columns in
    ``reference_df`` order.

    Args:
        df (pd.DataFrame): Dataset to standardise. It is copied, not modified.
        reference_df (pd.DataFrame): Frame whose columns define the output
            layout (for example the training frame).
        prefix (str): Text placed in front of each row's index label to build
            the ``id`` value (for example ``'sup1_'``).
        rename_map (dict, optional): Column renames applied after the SMILES
            cleaning (for example ``{'TC_mean': 'Tc'}``).

    Returns:
        pd.DataFrame: An independent frame with exactly the columns of
        ``reference_df``, in the same order.
    """
    df_processed = df.copy()

    if "SMILES" in df_processed.columns:
        df_processed["SMILES"] = df_processed["SMILES"].apply(make_smile_canonical)
        # Drop rows with unusable SMILES.  The explicit copy detaches the
        # filtered rows from the unfiltered frame, so the column assignments
        # below write to an independent frame instead of raising
        # SettingWithCopyWarning whenever a row was dropped.
        df_processed = df_processed[df_processed["SMILES"].notna()].copy()

    if rename_map:
        # Non-inplace rename: returns a new frame rather than mutating a slice.
        df_processed = df_processed.rename(columns=rename_map)

    # Ids are derived from the original index labels, which survive filtering,
    # and carry the prefix so they cannot clash with the reference ids.
    df_processed["id"] = prefix + df_processed.index.astype(str)

    # Add the reference columns this dataset does not provide.
    for col in reference_df.columns:
        if col not in df_processed.columns:
            df_processed[col] = np.nan

    # Same columns, same order as the reference frame.
    return df_processed[reference_df.columns].copy()


def make_smile_canonical(smile):
    """
    Return the RDKit canonical form of a SMILES string.

    Args:
        smile: The SMILES text to clean. Anything that is not a ``str``
            (None, NaN, numbers, ...) is treated as unusable.

    Returns:
        str: The canonical SMILES written by ``Chem.MolToSmiles``.
        float: ``np.nan`` when the input is not text, cannot be parsed,
        produces an empty string, or makes RDKit raise. A one-line message
        describing the problem is printed in each of these cases.
    """
    # Non-text input cannot be parsed; report it with the same message as the
    # error branch below instead of relying on RDKit to raise.
    if not isinstance(smile, str):
        print(f"Error processing SMILES: {smile}")
        return np.nan

    try:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            print(f"Invalid SMILES: {smile}")
            return np.nan
        smi = Chem.MolToSmiles(mol, canonical=True)
        if not smi:
            print(f"Empty SMILES after processing: {smile}")
            return np.nan
        return smi
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit still stop a long-running `.apply`.
        print(f"Error processing SMILES: {smile}")
        return np.nan


def merge_with_check(data, new_data, target_col=None):
    """
    Merge ``new_data`` into a copy of ``data``, keeping one row per new SMILES.

    For every row of ``new_data``:

    * If its SMILES is unknown, the row is queued and appended at the end.
    * If its SMILES is already known (in ``data`` or among the queued rows)
      and ``target_col`` is given, the value is used to fill a missing
      ``target_col`` value of the first matching row. When that row already
      has a value, nothing is overwritten and the pair (existing, new) is
      recorded for the deviation report.
    * If its SMILES is already known and ``target_col`` is None (or absent
      from the row), the row is skipped.

    Rows of ``new_data`` without a SMILES value never match anything and are
    appended unchanged.

    Args:
        data (pd.DataFrame): Base table with a ``SMILES`` column. Not modified.
        new_data (pd.DataFrame): Rows to merge; must have a ``SMILES`` column.
        target_col (str, optional): Property column to fill and compare.

    Returns:
        pd.DataFrame: The merged table. Its index is renumbered only when rows
        were appended.

    Side effects:
        Prints either the deviation statistics (MAE, RMSE, mean difference)
        of the skipped duplicates or a short confirmation message.
    """
    data = data.copy()

    # Position of the first row for every SMILES already in `data`.  A dict
    # lookup replaces the boolean scan of the whole table for each new row.
    first_pos = {}
    for pos, smi in enumerate(data["SMILES"].tolist()):
        if pd.notna(smi):
            first_pos.setdefault(smi, pos)

    rows_to_append = []
    # SMILES -> position in rows_to_append, so duplicates *within* new_data
    # are merged into the queued row instead of being appended again.
    pending_pos = {}
    existing_vals = []
    new_vals = []

    for _, row in new_data.iterrows():
        smi = row["SMILES"]
        has_smiles = pd.notna(smi)
        in_data = has_smiles and smi in first_pos
        in_pending = has_smiles and smi in pending_pos

        if not (in_data or in_pending):
            # Unknown SMILES: queue the row (copied, because it may be filled
            # in later by a duplicate).
            if has_smiles:
                pending_pos[smi] = len(rows_to_append)
            rows_to_append.append(row.copy())
            continue

        if target_col is None or target_col not in row:
            # Known SMILES and no value to merge.
            continue

        new_val = row[target_col]
        if in_data:
            row_pos = first_pos[smi]
            col_pos = data.columns.get_loc(target_col)
            existing_val = data.iloc[row_pos, col_pos]
        else:
            queued_row = rows_to_append[pending_pos[smi]]
            existing_val = queued_row[target_col]

        if pd.isna(existing_val):
            if in_data:
                data.iloc[row_pos, col_pos] = new_val
            else:
                queued_row[target_col] = new_val
        elif pd.notna(new_val):
            # Both values are present: keep the existing one and record the
            # pair.  A missing new value is not a conflicting measurement and
            # would turn every statistic below into NaN, so it is left out.
            existing_vals.append(existing_val)
            new_vals.append(new_val)

    if rows_to_append:
        data = pd.concat([data, pd.DataFrame(rows_to_append)], ignore_index=True)

    no_add_count = len(existing_vals)
    if target_col is not None and no_add_count > 0:
        kept = np.asarray(existing_vals, dtype=float)
        offered = np.asarray(new_vals, dtype=float)
        diff = kept - offered
        mae = np.mean(np.abs(diff))
        rmse = np.sqrt(np.mean(diff ** 2))
        mean_diff = np.mean(diff)

        print(f"共发现重复SMILES且未添加{target_col}的行数: {no_add_count}")
        print("\n📊 属性偏差统计（跳过更新的行）：")
        print(f"  ➤ 平均绝对误差 (MAE): {mae:.4f}")
        print(f"  ➤ 均方根误差 (RMSE): {rmse:.4f}")
        print(f"  ➤ 平均差（已有值 - 新值）: {mean_diff:.4f}")
    else:
        if target_col:
            print("✅ 没有重复且已有值的 SMILES，无需计算偏差。")
        else:
            print("✅ 已合并，未指定属性列（target_col=None），仅根据 SMILES 去重。")

    return data

In [10]:
# Start the working table from a copy of the training frame, so later changes
# to `data` do not touch train_df; then show its columns and summary statistics.
data = train_df.copy()
print(data.columns)
data.describe()

Index(['id', 'SMILES', 'Tg', 'FFV', 'Tc', 'Density', 'Rg'], dtype='object')


Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
count,7973.0,511.0,7030.0,737.0,613.0,614.0
mean,1080050000.0,96.452314,0.367212,0.256334,0.985484,16.419787
std,621824100.0,111.228279,0.029609,0.089538,0.146189,4.60864
min,87817.0,-148.029738,0.226992,0.0465,0.748691,9.728355
25%,537664100.0,13.674509,0.349549,0.186,0.890243,12.540328
50%,1079079000.0,74.040183,0.364264,0.236,0.948193,15.052194
75%,1621708000.0,161.147595,0.38079,0.3305,1.062096,20.411067
max,2147438000.0,472.25,0.777097,0.524,1.840999,34.672906


In [11]:
# Canonicalise the training SMILES with the same helper standardize_dataset
# uses, because merge_with_check matches rows by exact SMILES string equality.
data["SMILES"] = data["SMILES"].apply(make_smile_canonical)

In [12]:
# Preparing data: merge the supplementary tables into the working table.

# dataset_1: its TC_mean column is renamed to Tc so it lines up with `data`.
dataset1_processed = standardize_dataset(
    dataset_1, data, prefix="sup1_", rename_map={"TC_mean": "Tc"}
)
# print(dataset1_processed.head(3))
data = merge_with_check(data, dataset1_processed, target_col="Tc")
print(data["SMILES"].count())
data.describe()

共发现重复SMILES且未添加Tc的行数: 744

📊 属性偏差统计（跳过更新的行）：
  ➤ 平均绝对误差 (MAE): 0.0005
  ➤ 均方根误差 (RMSE): 0.0070
  ➤ 平均差（已有值 - 新值）: 0.0000
8103


Unnamed: 0,Tg,FFV,Tc,Density,Rg
count,511.0,7030.0,867.0,613.0,614.0
mean,96.452314,0.367212,0.256539,0.985484,16.419787
std,111.228279,0.029609,0.101271,0.146189,4.60864
min,-148.029738,0.226992,0.0465,0.748691,9.728355
25%,13.674509,0.349549,0.186167,0.890243,12.540328
50%,74.040183,0.364264,0.236,0.948193,15.052194
75%,161.147595,0.38079,0.325,1.062096,20.411067
max,472.25,0.777097,1.59,1.840999,34.672906


In [13]:
# dataset_2: merged by SMILES only (no target column is passed).  It goes
# through standardize_dataset like dataset_1, which canonicalises the SMILES,
# drops rows whose SMILES cannot be canonicalised (the hand-written version
# kept them, so rows without a SMILES could be appended to `data`), builds
# the "sup2_"-prefixed ids, adds the missing columns as NaN and orders the
# columns like `data`.
dataset_2_processed = standardize_dataset(dataset_2, data, prefix="sup2_")
data = merge_with_check(data, dataset_2_processed)

✅ 已合并，未指定属性列（target_col=None），仅根据 SMILES 去重。


In [14]:
# Number of missing values in each column of the merged table.
for col, n_missing in data.isna().sum().items():
    print(f"{col}: {n_missing}")

id: 0
SMILES: 0
Tg: 9527
FFV: 3008
Tc: 9171
Density: 9425
Rg: 9424


In [15]:
# Number of rows in the merged table that have a (non-missing) SMILES value.
data["SMILES"].count()

10038

- computing data3

In [16]:
# Keep the SMILES and Tg columns and only the rows where both are present.
smiles_tg = data.loc[:, ["SMILES", "Tg"]]
data_xgboost = smiles_tg.dropna(how="any")
n_rows = data_xgboost["SMILES"].count()
print("count:", n_rows)
print(data_xgboost.describe())
data_xgboost.head(3)

count: 511
               Tg
count  511.000000
mean    96.452314
std    111.228279
min   -148.029738
25%     13.674509
50%     74.040183
75%    161.147595
max    472.250000


Unnamed: 0,SMILES,Tg
40,*NC(C)C(=O)NCC(=O)NCC(*)=O,208.639749
57,*CCCCCCSSCCCCSS*,-41.266724
63,*C=CCCCCCCCC*,-17.282022


In [17]:
from mordred import Calculator, descriptors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

In [18]:
from tqdm import tqdm


def compute_mordred_features(smiles_list):
    """
    Compute Mordred descriptors (3D descriptors ignored) for a list of SMILES.

    SMILES that RDKit cannot parse are skipped.

    Args:
        smiles_list: Iterable of SMILES strings.

    Returns:
        tuple: ``(df, valid_indices)`` where ``df`` holds one row of
        descriptors per parsed molecule and ``valid_indices`` lists the
        positions (within ``smiles_list``) of those molecules. ``df`` is
        indexed by ``valid_indices``, so each row stays tied to its input
        position even when some SMILES were skipped.

    NOTE(review): descriptors that fail to compute come back as non-numeric
    entries. The saved run (numpy 1.26.4) shows the ABC and ABCGG columns
    carrying a "module 'numpy' has no attribute 'float'" message, so the
    frame needs cleaning before it is used for modelling.
    """
    calc = Calculator(descriptors, ignore_3D=True)
    mols = [Chem.MolFromSmiles(smi) for smi in smiles_list]

    valid_indices = [i for i, mol in enumerate(mols) if mol is not None]
    valid_mols = [mols[i] for i in valid_indices]

    df = calc.pandas(valid_mols)
    # Without this the frame is labelled 0..k-1, which no longer matches the
    # input positions as soon as one SMILES is skipped.
    df.index = valid_indices
    print(
        f"Computed {len(df.columns)} Mordred descriptors for {len(valid_mols)} valid molecules."
    )
    return df, valid_indices

In [19]:
# Compute descriptors for the Tg subset.  valid_indices are positions within
# the SMILES list passed in here, not index labels of data_xgboost.
features, valid_indices = compute_mordred_features(data_xgboost["SMILES"].tolist())

  1%|▏         | 7/511 [00:01<01:43,  4.89it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 25%|██▌       | 130/511 [00:01<00:02, 127.77it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 34%|███▍      | 175/511 [00:02<00:03, 98.19it/s] 

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 40%|████      | 206/511 [00:03<00:03, 85.67it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 43%|████▎     | 218/511 [00:04<00:05, 51.77it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 59%|█████▉    | 301/511 [00:04<00:01, 106.99it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 74%|███████▍  | 380/511 [00:06<00:02, 53.02it/s] 

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 511/511 [00:07<00:00, 71.61it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
Computed 1613 Mordred descriptors for 511 valid molecules.





In [20]:
# Label the feature rows with valid_indices.  Passing index= to pd.DataFrame
# together with an existing frame *selects* rows by label (a reindex) instead
# of relabelling them, which yields NaN or misaligned rows as soon as one
# SMILES was skipped; set_axis replaces the labels positionally.
features_df = pd.DataFrame(features).set_axis(valid_indices, axis=0)

In [21]:
# Quick check: number of featurised molecules, the container type, and a
# preview of the first five rows.
print(len(valid_indices))
print(type(features_df))
features_df.head(5)

511
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,17.515413,2.155647,4.311293,17.515413,1.167694,3.542772,...,8.794673,45.016869,185.080041,7.118463,450,16,62.0,64.0,7.694444,3.611111
1,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,19.675903,1.965946,3.931892,19.675903,1.229744,3.583262,...,8.089789,44.454249,268.044785,7.445688,680,13,58.0,56.0,5.5,4.25
2,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,14.59246,1.941884,3.883767,14.59246,1.216038,3.2911,...,7.720018,38.766486,138.140851,4.604695,286,9,42.0,40.0,4.5,3.25
3,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,48.658482,2.310942,4.621884,48.658482,1.280486,4.510102,...,10.071076,73.581233,494.241687,6.864468,7075,51,174.0,192.0,12.888889,9.027778
4,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,43.768312,2.509944,4.904268,43.768312,1.287303,4.448235,...,10.346088,86.051345,466.17487,7.283982,3815,50,174.0,203.0,10.111111,7.722222


In [22]:
# Check the Python types stored in each column and report the columns that
# contain str values, together with how many.
# NOTE(review): only exact `str` instances are counted -- confirm that the
# failed descriptors seen in the preview are stored as str and not as some
# other error object, otherwise this check reports nothing for them.
for col in features_df.columns:
    unique_types = features_df[col].map(type).value_counts()
    if any(unique_types.index == str):
        print(f"{col} contains string values: {unique_types[str]}")