<a href="https://www.kaggle.com/code/raunakshrestha007/ppp-rdkit-preprocessing-neuroips?scriptVersionId=254776145" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
# Silence the two noisy warning families for the rest of the session.
for _ignored_category in (DeprecationWarning, RuntimeWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train.csv
/kaggle/input/neurips-open-polymer-prediction-2025/test.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset2.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset1.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset3.csv
/kaggle/input/tc-smiles/Tc_SMILES.csv
/kaggle/input/tg-smiles-pid-polymer-class/TgSS_enriched_cleaned.csv
/kaggle/input/smiles-extra-data/data_dnst1.xlsx
/kaggle/input/smiles-extra-data/data_tg3.xlsx
/kaggle/input/smiles-extra-data/JCIM_sup_bigsmiles.csv
/kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl


In [2]:
!pip install /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl

Processing /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
Installing collected packages: rdkit
Successfully installed rdkit-2025.3.3


# Data Loading and Preprocessing Pipeline

In [3]:
# === Imports ===
import pandas as pd
import numpy as np
from rdkit import Chem

# === Config ===
# Folder holding the competition CSV files.
BASE_PATH = '/kaggle/input/neurips-open-polymer-prediction-2025/'

# Names of the property columns used as prediction targets.
TARGETS = [
    'Tg',
    'FFV',
    'Tc',
    'Density',
    'Rg',
]

# Substrings that disqualify a SMILES string when present anywhere in it.
BAD_PATTERNS = [
    '[R]', '[R1]', '[R2]', '[R3]', '[R4]', '[R5]',
    "[R']", '[R"]',
    'R1', 'R2', 'R3', 'R4', 'R5',
    '([R])', '([R1])', '([R2])',
]

# === SMILES Cleaner ===
def clean_and_validate_smiles(smiles):
    """Return the canonical form of a SMILES string, or None if it is unusable.

    A value is rejected (None is returned) when it is not a non-empty string,
    when it contains any substring listed in BAD_PATTERNS, or when RDKit
    cannot parse it.

    Args:
        smiles: Candidate SMILES value; may be any object (e.g. NaN from pandas).

    Returns:
        The canonical SMILES produced by Chem.MolToSmiles, or None.
    """
    if not isinstance(smiles, str) or not smiles:
        return None
    if any(pattern in smiles for pattern in BAD_PATTERNS):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Best effort: any RDKit failure means "invalid". Catching Exception
        # (not a bare except) keeps Ctrl-C / kernel interrupts working.
        return None

# === Load Train/Test ===
train = pd.read_csv(f'{BASE_PATH}train.csv')
test = pd.read_csv(f'{BASE_PATH}test.csv')

# Canonicalise every SMILES; entries the cleaner rejects become missing
# values and those rows are then removed in place.
for _frame in (train, test):
    _frame['SMILES'] = _frame['SMILES'].apply(clean_and_validate_smiles)
for _frame in (train, test):
    _frame.dropna(subset=['SMILES'], inplace=True)

# === Load External Datasets (excluding dataset2) ===
# Filled with (target name, DataFrame) pairs by the loaders below.
external_datasets = []

def load_external(path, target, rename_map=None):
    """Read a CSV and register its SMILES/target pairs in external_datasets.

    Args:
        path: CSV file to read.
        target: Name of the property column expected after renaming.
        rename_map: Optional column rename mapping applied before the check.

    Nothing is returned. Any exception is reported with a printed message
    instead of being raised.
    """
    try:
        frame = pd.read_csv(path)
        if rename_map:
            frame = frame.rename(columns=rename_map)

        has_required = 'SMILES' in frame.columns and target in frame.columns
        if not has_required:
            print(f"⚠️ Skipped {path}: required columns missing")
            return

        # Keep only complete SMILES/target pairs.
        frame = frame[['SMILES', target]].dropna()
        external_datasets.append((target, frame))
        print(f"✅ Loaded {path} ({len(frame)} entries for {target})")
    except Exception as e:
        print(f"⚠️ Failed to load {path}: {e}")

# Supplement files shipped with the competition: (file, target, column renames).
for _file_name, _target_name, _renames in (
    ('dataset1.csv', 'Tc', {'TC_mean': 'Tc'}),
    ('dataset3.csv', 'Tg', None),
    ('dataset4.csv', 'FFV', None),
):
    load_external(BASE_PATH + 'train_supplement/' + _file_name, _target_name, rename_map=_renames)

# === Load Additional External Datasets ===
# Read the four additional files in sequence. The reads share one try block,
# so the first failure skips every read after it.
# NOTE(review): on failure only a message is printed; any frame not yet
# assigned stays undefined, so later code that uses it raises NameError.
try:
    extra_data_tg3 = pd.read_excel("/kaggle/input/smiles-extra-data/data_tg3.xlsx")
    extra_data_dnst1 = pd.read_excel("/kaggle/input/smiles-extra-data/data_dnst1.xlsx")
    jcim_sup_bigsmiles = pd.read_csv("/kaggle/input/smiles-extra-data/JCIM_sup_bigsmiles.csv")
    tc_smiles_df = pd.read_csv("/kaggle/input/tc-smiles/Tc_SMILES.csv")
except Exception as e:
    print(f"⚠️ Error loading extra data: {e}")

# Helper to standardize and append
def process_and_append_external(df, target, source_name):
    """Clean one extra dataset and register it in external_datasets.

    Args:
        df: Frame expected to contain 'SMILES' and the target column.
        target: Name of the property column.
        source_name: Label used in the printed status messages.

    Rows with an invalid SMILES or a non-numeric target are dropped, and the
    target is averaged per SMILES. Nothing is returned.
    """
    if not ('SMILES' in df.columns and target in df.columns):
        print(f"⚠️ Skipped {source_name}: missing columns")
        return

    subset = df[['SMILES', target]].copy()
    subset['SMILES'] = subset['SMILES'].apply(clean_and_validate_smiles)
    subset = subset.dropna(subset=['SMILES'])

    # Non-numeric target entries become NaN and are removed.
    subset[target] = pd.to_numeric(subset[target], errors='coerce')
    subset = subset.dropna(subset=[target])

    averaged = subset.groupby('SMILES', as_index=False)[target].mean()
    external_datasets.append((target, averaged))
    print(f"✅ Integrated {source_name}: {len(averaged)} entries for {target}")

# Process each extra dataset (with correct column names)
# Rename each source's value column to the target name, then integrate it.
tg3_renamed = extra_data_tg3.rename(columns={"Tg [K]": "Tg"})
process_and_append_external(tg3_renamed, "Tg", "data_tg3.xlsx")

dnst1_renamed = extra_data_dnst1.rename(columns={"density(g/cm3)": "Density"})
process_and_append_external(dnst1_renamed, "Density", "data_dnst1.xlsx")

tc_renamed = tc_smiles_df.rename(columns={"TC_mean": "Tc"})
process_and_append_external(tc_renamed, "Tc", "Tc_SMILES.csv")

# JCIM SMILES only (for future feature engineering)
jcim_smiles_only = (
    jcim_sup_bigsmiles[['SMILES']]
    .dropna()
    .assign(SMILES=lambda frame: frame['SMILES'].apply(clean_and_validate_smiles))
    .dropna()
    .drop_duplicates()
)
print(f"✅ Loaded JCIM SMILES-only dataset: {len(jcim_smiles_only)} unique SMILES (no targets)")

# === Merge External Data ===
def merge_external(train_df, ext_df, target):
    """Merge one external dataset for `target` into the training frame.

    The external SMILES are cleaned, rows lacking a SMILES or target value
    are dropped, and the target is averaged per SMILES. Then:

    * rows of `train_df` whose SMILES appears in the external data and whose
      `target` is missing receive the external value (written into
      `train_df` in place, as before);
    * external SMILES not present in `train_df` are appended as new rows
      with every other TARGETS column set to NaN.

    Args:
        train_df: Frame with a 'SMILES' column and the TARGETS columns.
        ext_df: Frame with 'SMILES' and `target` columns. It is not modified.
        target: Name of the property column being merged.

    Returns:
        A new frame: `train_df` followed by the new external rows.
    """
    # Work on a private copy so the caller's frame is left untouched.
    ext_df = ext_df.copy()
    ext_df['SMILES'] = ext_df['SMILES'].apply(clean_and_validate_smiles)
    ext_df = ext_df.dropna(subset=['SMILES', target])
    ext_df = ext_df.groupby('SMILES', as_index=False)[target].mean()

    # After the groupby each SMILES is unique, so it can serve as a lookup index.
    ext_values = ext_df.set_index('SMILES')[target]

    # Fill missing target values in existing rows with one vectorised lookup
    # instead of scanning the whole frame once per external row.
    fill_mask = train_df['SMILES'].isin(ext_values.index) & train_df[target].isna()
    train_df.loc[fill_mask, target] = train_df.loc[fill_mask, 'SMILES'].map(ext_values)

    # Add new rows for SMILES the training frame does not contain yet.
    new_rows = ext_df[~ext_df['SMILES'].isin(train_df['SMILES'])].copy()
    for col in TARGETS:
        if col not in new_rows.columns:
            new_rows[col] = np.nan
    return pd.concat([train_df, new_rows[['SMILES'] + TARGETS]], ignore_index=True)

# === Apply Merges ===
# Start from the competition rows and fold in each external source in load order.
train_extended = train[['SMILES'] + TARGETS].copy()
for target, ext in external_datasets:
    train_extended = merge_external(train_extended, ext, target)

# === Final Clean-Up ===
# Infinities become NaN, rows without any target are removed, and only the
# first row per SMILES is kept.
train_extended = (
    train_extended
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=TARGETS, how='all')
    .drop_duplicates(subset=['SMILES'])
    .reset_index(drop=True)
)

# === Summary ===
print("\n📊 Final Summary:")
print(f"Train: {len(train)} | Extended: {len(train_extended)}")
for t in TARGETS:
    base = train[t].notna().sum()
    ext = train_extended[t].notna().sum()
    gained = ext - base
    print(f"• {t:<8}: {ext} total ({gained:+} from supplements)")

print("\n✅ Data loading and preprocessing complete.")

# Snapshot of the SMILES taken before the second cleaning pass below.
smiles_list = train_extended['SMILES'].tolist()
# Clean SMILES column robustly
train_extended['SMILES'] = train_extended['SMILES'].apply(clean_and_validate_smiles)
# === Final Clean-Up ===
train_extended = (
    train_extended
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=TARGETS, how='all')
    .drop_duplicates(subset=['SMILES'])
    .reset_index(drop=True)
)

# === Drop constant columns ===
# A column counts as constant when it has exactly one distinct non-missing value.
constant_cols = [
    col for col in train_extended.columns
    if train_extended[col].nunique() == 1
]
train_extended.drop(columns=constant_cols, inplace=True)
print(f"Dropped {len(constant_cols)} constant columns from train_extended")


train_extended.shape
train_extended


✅ Loaded /kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset1.csv (874 entries for Tc)
✅ Loaded /kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset3.csv (46 entries for Tg)
✅ Loaded /kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv (862 entries for FFV)
✅ Integrated data_tg3.xlsx: 499 entries for Tg
✅ Integrated data_dnst1.xlsx: 778 entries for Density
✅ Integrated Tc_SMILES.csv: 866 entries for Tc
✅ Loaded JCIM SMILES-only dataset: 662 unique SMILES (no targets)

📊 Final Summary:
Train: 7973 | Extended: 9990
• Tg      : 1056 total (+545 from supplements)
• FFV     : 7892 total (+862 from supplements)
• Tc      : 866 total (+129 from supplements)
• Density : 1247 total (+634 from supplements)
• Rg      : 614 total (+0 from supplements)

✅ Data loading and preprocessing complete.
Dropped 0 constant columns from train_extended


Unnamed: 0,SMILES,Tg,FFV,Tc,Density,Rg
0,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,1.05,
1,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.370410,,,
2,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.378860,,,
3,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,
4,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.355470,,,
...,...,...,...,...,...,...
9985,c1ccc(-c2ccccn2)nc1,,,,1.31,
9986,c1ccc(-c2nc3cc4ncoc4cc3o2)cc1,,,,1.43,
9987,c1ccc2oc(-c3ccc4ncoc4c3)nc2c1,,,,1.43,
9988,c1ccsc1,,,,1.51,


# Preprocessing Each Property Separately

In [4]:
from rdkit.Chem import AllChem, Descriptors
from sklearn.preprocessing import MinMaxScaler
from rdkit.Chem import rdDistGeom
import random
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

# === Tg: Glass Transition Temperature Preprocessing ===
def preprocess_tg(df):
    """Prepare the Tg subset: parse, sanitize and embed each molecule.

    Steps:
      1. Drop duplicate SMILES.
      2. Parse each SMILES without sanitization and store the result in 'mol'.
      3. Sanitize each molecule and drop those that fail.
      4. Try to embed 3 conformers (ETKDGv3, seed 42) on a hydrogen-added
         copy, and drop molecules for which none could be generated.

    Args:
        df: Frame with a 'SMILES' column (rows that have a Tg value).

    Returns:
        The filtered frame with added columns 'mol' (RDKit molecule) and
        'conformers' (list of conformer ids from the embedding; the embedded
        hydrogen-added copy itself is not kept).
    """
    print(f"[Tg] Starting with {len(df)} unique SMILES")
    df = df.drop_duplicates(subset='SMILES').copy()
    df['mol'] = df['SMILES'].apply(lambda s: Chem.MolFromSmiles(s, sanitize=False))

    # Sanitize molecules but allow wildcard '*' (polymerization points)
    def is_valid_mol(mol):
        if mol is None:
            return False
        try:
            # With catchErrors=True a failure is reported through the return
            # value (the failing operation) rather than by raising, so the
            # result must be checked explicitly.
            failed_op = Chem.SanitizeMol(mol, catchErrors=True)
        except Exception:
            return False
        return failed_op == Chem.SanitizeFlags.SANITIZE_NONE

    df['is_valid'] = df['mol'].apply(is_valid_mol)
    invalid_count = (~df['is_valid']).sum()
    print(f"[Tg] Invalid mols removed (e.g. parsing/sanitization issues): {invalid_count}")
    df = df[df['is_valid']].drop(columns='is_valid')

    # Generate 3D conformers
    def generate_conformers(mol):
        try:
            mol_with_h = Chem.AddHs(mol)
            params = AllChem.ETKDGv3()
            params.randomSeed = 42
            ids = AllChem.EmbedMultipleConfs(mol_with_h, numConfs=3, params=params)
            # Store a plain Python list so the column holds ordinary objects.
            return list(ids)
        except Exception:
            # Embedding is best effort; such molecules are dropped below.
            return []

    df['conformers'] = df['mol'].apply(generate_conformers)
    n_conformers = df['conformers'].apply(len)
    no_conf = (n_conformers == 0).sum()
    print(f"[Tg] Molecules with 0 conformers: {no_conf}")
    df = df[n_conformers > 0]

    print(f"[Tg] Final Tg samples: {len(df)}")
    return df


# === FFV: Fractional Free Volume Preprocessing ===
def preprocess_ffv(df):
    """Return a copy of `df` restricted to rows whose FFV lies in [0.0, 1.0]."""
    working = df.copy()
    # Both bounds are inclusive; missing FFV values never pass the check.
    within_bounds = working['FFV'].between(0.0, 1.0)
    return working[within_bounds]

# === Tc: Thermal Conductivity Preprocessing ===
def preprocess_tc(df):
    """Filter Tc outliers, duplicate rare-range rows and add a scaled column.

    Rows outside the 1.5 * IQR fences are removed. Rows with Tc < 0.1 or
    Tc > 1.5 are then appended a second time, and 'Tc_scaled' is added as
    the min-max scaled Tc of the resulting frame.
    """
    df = df.copy()

    # IQR fences computed from the unfiltered Tc values.
    first_quartile = df['Tc'].quantile(0.25)
    third_quartile = df['Tc'].quantile(0.75)
    spread = third_quartile - first_quartile
    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread
    inside_fences = df['Tc'].between(fence_low, fence_high)
    df = df[inside_fences]

    # Oversample the rare ranges by appending those rows once more.
    rare_low = df[df['Tc'] < 0.1]
    rare_high = df[df['Tc'] > 1.5]
    df = pd.concat([df, rare_low, rare_high], ignore_index=True)

    # Min-max scaling fitted on the augmented frame.
    df['Tc_scaled'] = MinMaxScaler().fit_transform(df[['Tc']])
    return df

# === Density: Polymer Density Preprocessing ===
def preprocess_density(df, cap_per_bin=300):
    """Filter, canonicalise and bin-balance the density subset.

    Steps:
      1. Keep rows with 0.5 <= Density <= 2.0.
      2. Replace each SMILES by its canonical form and drop unparsable ones.
      3. Assign each row to one of three density bins.
      4. Randomly keep at most `cap_per_bin` rows per bin (seed 42).

    Args:
        df: Frame with 'SMILES' and 'Density' columns.
        cap_per_bin: Maximum number of rows kept per density bin.

    Returns:
        The capped frame with an added 'density_bin' column and a fresh index.
    """
    print(f"[Density] Starting with {len(df)} samples")
    df = df.copy()

    # Step 1: Filter valid physical density range
    df = df[df['Density'].between(0.5, 2.0)]
    print(f"[Density] After bounds filter (0.5–2.0): {len(df)} samples")

    # Step 2: Normalize SMILES (canonicalize & validate)
    def normalize_smiles(smi):
        try:
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                return None
            return Chem.MolToSmiles(mol, canonical=True)
        except Exception:
            # Any RDKit failure is treated as an invalid SMILES.
            return None

    df['SMILES_norm'] = df['SMILES'].apply(normalize_smiles)
    num_invalid = df['SMILES_norm'].isnull().sum()
    print(f"[Density] Invalid SMILES removed during normalization: {num_invalid}")

    df = df[df['SMILES_norm'].notnull()].copy()
    df['SMILES'] = df['SMILES_norm']
    df = df.drop(columns='SMILES_norm')

    # Step 3: Bin densities
    # include_lowest=True keeps rows with Density exactly 0.5: the bounds
    # filter above admits them, but a right-closed first bin would give them
    # a NaN bin and the groupby below would drop them silently.
    df['density_bin'] = pd.cut(
        df['Density'], bins=[0.5, 1.0, 1.5, 2.0], include_lowest=True
    )
    bin_counts = df['density_bin'].value_counts().sort_index()
    print("[Density] Bin counts before capping:")
    print(bin_counts)

    # Step 4: Cap each bin to avoid large imbalance
    def cap_bin(group):
        return group.sample(min(len(group), cap_per_bin), random_state=42)

    df = (
        df.groupby('density_bin', observed=False)
        .apply(cap_bin)
        .reset_index(drop=True)
    )

    print(f"[Density] Final capped samples: {len(df)}")
    return df


# === Rg: Radius of Gyration Preprocessing ===
def preprocess_rg(df):
    """Attach a 3D-embedded, UFF-optimised molecule to every Rg row.

    Each SMILES is parsed without sanitization, given explicit hydrogens and
    embedded with ETKDGv3 (seed 42). Rows for which any step fails are
    dropped.

    Returns:
        A copy of `df` with an added 'mol_3d' column and a fresh index.
    """
    from rdkit import Chem
    from rdkit.Chem import AllChem

    def smiles_to_3d_polymer_safe(smiles):
        try:
            # No sanitization at parse time, so wildcard '*' atoms are accepted.
            parsed = Chem.MolFromSmiles(smiles, sanitize=False)
            if parsed is None:
                return None

            # Refresh per-atom valence data non-strictly (avoids implicit Hs error).
            for atom in parsed.GetAtoms():
                atom.UpdatePropertyCache(strict=False)

            with_hydrogens = Chem.AddHs(parsed)

            etkdg = AllChem.ETKDGv3()
            etkdg.randomSeed = 42
            embed_status = AllChem.EmbedMolecule(with_hydrogens, etkdg)
            if embed_status != 0:
                return None

            AllChem.UFFOptimizeMolecule(with_hydrogens)
            return with_hydrogens
        except Exception:
            return None

    result = df.copy()
    result['mol_3d'] = result['SMILES'].apply(smiles_to_3d_polymer_safe)

    # Keep only the rows whose conversion succeeded.
    succeeded = result['mol_3d'].notnull()
    return result[succeeded].reset_index(drop=True)



# === Apply All Preprocessing Steps ===
# Each property is preprocessed on the rows that have a value for it.
tg_df = preprocess_tg(train_extended.loc[train_extended['Tg'].notna()])
ffv_df = preprocess_ffv(train_extended.loc[train_extended['FFV'].notna()])
tc_df = preprocess_tc(train_extended.loc[train_extended['Tc'].notna()])
density_df = preprocess_density(
    train_extended.loc[train_extended['Density'].notna()],
    cap_per_bin=300,
)
rg_df = preprocess_rg(train_extended.loc[train_extended['Rg'].notna()])

print("✅ All property-specific preprocessing complete.")
for _label, _frame in (
    ("Tg", tg_df),
    ("FFV", ffv_df),
    ("Tc", tc_df),
    ("Density", density_df),
    ("Rg", rg_df),
):
    print(f"{_label} samples: {len(_frame)}")


[Tg] Starting with 1056 unique SMILES
[Tg] Invalid mols removed (e.g. parsing/sanitization issues): 0
[Tg] Molecules with 0 conformers: 15
[Tg] Final Tg samples: 1041
[Density] Starting with 1247 samples
[Density] After bounds filter (0.5–2.0): 1246 samples
[Density] Invalid SMILES removed during normalization: 0
[Density] Bin counts before capping:
density_bin
(0.5, 1.0]    456
(1.0, 1.5]    673
(1.5, 2.0]    117
Name: count, dtype: int64
[Density] Final capped samples: 717
✅ All property-specific preprocessing complete.
Tg samples: 1041
FFV samples: 7892
Tc samples: 879
Density samples: 717
Rg samples: 597


# Feature Engineering Pipeline

In [5]:
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors, AllChem
import numpy as np
import pandas as pd
from tqdm import tqdm

# === Tg Feature Engineering ===
def featurize_tg(df):
    """Add a Morgan fingerprint and six scalar descriptors to a copy of `df`.

    The 'mol' column is rebuilt from 'SMILES'. Rows whose molecule is missing
    get None as fingerprint and NaN for every descriptor.
    """
    out = df.copy()
    out['mol'] = out['SMILES'].apply(Chem.MolFromSmiles)

    # Morgan fingerprints (radius=2, nBits=1024)
    def morgan_bits(mol):
        if mol:
            return AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)
        return None

    out['MorganFP'] = out['mol'].apply(morgan_bits)

    # Physico-chemical descriptors, in output column order.
    descriptor_table = (
        ('MolWt', Descriptors.MolWt),
        ('NumHDonors', Descriptors.NumHDonors),
        ('NumHAcceptors', Descriptors.NumHAcceptors),
        ('TPSA', rdMolDescriptors.CalcTPSA),
        ('MolLogP', Descriptors.MolLogP),
        ('NumRotatableBonds', Descriptors.NumRotatableBonds),
    )

    columns = {name: [] for name, _ in descriptor_table}
    for mol in tqdm(out['mol'], desc='Featurizing Tg'):
        for name, compute in descriptor_table:
            columns[name].append(compute(mol) if mol else np.nan)

    descriptor_frame = pd.DataFrame(columns, index=out.index)
    return pd.concat([out, descriptor_frame], axis=1)

# === FFV Feature Engineering ===
def featurize_ffv(df):
    """Add MolWt, TPSA, LabuteASA and PEOE_VSA1 columns to a copy of `df`.

    The 'mol' column is built from 'SMILES'. Rows whose molecule is missing
    get NaN for every descriptor.
    """
    out = df.copy()
    out['mol'] = out['SMILES'].apply(Chem.MolFromSmiles)

    feats = {name: [] for name in ('MolWt', 'TPSA', 'LabuteASA', 'PEOE_VSA1')}
    for mol in tqdm(out['mol'], desc='Featurizing FFV'):
        if not mol:
            for values in feats.values():
                values.append(np.nan)
            continue
        feats['MolWt'].append(Descriptors.MolWt(mol))
        feats['TPSA'].append(rdMolDescriptors.CalcTPSA(mol))
        feats['LabuteASA'].append(rdMolDescriptors.CalcLabuteASA(mol))
        feats['PEOE_VSA1'].append(Descriptors.PEOE_VSA1(mol))

    descriptor_frame = pd.DataFrame(feats, index=out.index)
    return pd.concat([out, descriptor_frame], axis=1)

# === Tc Feature Engineering ===
def _mean_interatomic_distance(mol, random_seed=42):
    """Return the mean pairwise atom distance of one embedded conformer, or NaN.

    Hydrogens are added and a single conformer is embedded with a fixed seed
    so repeated runs give the same value. NaN is returned when embedding
    fails or the molecule has fewer than two atoms.
    """
    try:
        mol3d = Chem.AddHs(mol)
        # EmbedMolecule signals failure through a non-zero return code.
        if AllChem.EmbedMolecule(mol3d, randomSeed=random_seed) != 0:
            return np.nan
        coords = mol3d.GetConformer().GetPositions()
    except Exception:
        # Best effort: a molecule RDKit cannot embed simply gets no value.
        return np.nan

    if len(coords) < 2:
        return np.nan
    # All unordered atom pairs (i < j), computed in one vectorised step.
    first, second = np.triu_indices(len(coords), k=1)
    distances = np.linalg.norm(coords[first] - coords[second], axis=1)
    return float(distances.mean())


def featurize_tc(df):
    """Add the Tc feature columns to a copy of `df`.

    Added columns:
      * 'MolSurfaceArea' - holds Descriptors.MolMR (used as a proxy).
      * 'MolVolume' - holds Descriptors.MolLogP (used as a proxy).
      * 'NumAtoms' - atom count of the parsed molecule.
      * 'InteratomicDistancesMean' - mean pairwise atom distance of an
        embedded 3D conformer, NaN when embedding fails.

    Rows whose SMILES cannot be parsed get NaN in every feature column.
    """
    df = df.copy()
    df['mol'] = df['SMILES'].apply(Chem.MolFromSmiles)

    feats = {
        'MolSurfaceArea': [],
        'MolVolume': [],
        'NumAtoms': [],
        'InteratomicDistancesMean': [],
    }

    for m in tqdm(df['mol'], desc='Featurizing Tc'):
        if m is None:
            for values in feats.values():
                values.append(np.nan)
            continue

        # Molecular surface area & volume using Crippen descriptors as proxy
        feats['MolSurfaceArea'].append(Descriptors.MolMR(m))
        feats['MolVolume'].append(Descriptors.MolLogP(m))  # substitute for volume, or replace with 3D volume if available
        feats['NumAtoms'].append(m.GetNumAtoms())

        # Mean pairwise interatomic distance from a freshly embedded conformer
        feats['InteratomicDistancesMean'].append(_mean_interatomic_distance(m))

    feats_df = pd.DataFrame(feats, index=df.index)
    df = pd.concat([df, feats_df], axis=1)
    return df

# === Density Feature Engineering ===
def featurize_density(df):
    """Add 3D volume and surface-area features to a copy of `df`.

    For each parsable SMILES a hydrogen-added molecule is embedded (seed 42)
    and UFF-optimised. 'MolVolume3D' comes from AllChem.ComputeMolVolume and
    'MolSurfaceArea3D' is the solvent-accessible surface area from
    rdFreeSASA, using periodic-table van der Waals radii. 'Density_MD' is a
    placeholder column that is always NaN.

    Rows that cannot be parsed, embedded or optimised get NaN features.
    """
    # NOTE(review): the previous code called AllChem.ComputeMolSurfaceArea,
    # which does not appear to exist in RDKit; the resulting AttributeError
    # was swallowed by a bare except, leaving volume and surface NaN.
    # Confirm against the RDKit API reference.
    from rdkit.Chem import rdFreeSASA

    df = df.copy()
    df['mol'] = df['SMILES'].apply(Chem.MolFromSmiles)

    feats = {
        'MolVolume3D': [],
        'MolSurfaceArea3D': [],
        'Density_MD': [],
    }
    periodic_table = Chem.GetPeriodicTable()

    for m in tqdm(df['mol'], desc='Featurizing Density'):
        vol = np.nan
        sa = np.nan
        if m is not None:
            try:
                mol3d = Chem.AddHs(m)
                # A non-zero return code means no conformer was generated.
                if AllChem.EmbedMolecule(mol3d, randomSeed=42) == 0:
                    AllChem.UFFOptimizeMolecule(mol3d)
                    vol = AllChem.ComputeMolVolume(mol3d)
                    radii = [
                        periodic_table.GetRvdw(atom.GetAtomicNum())
                        for atom in mol3d.GetAtoms()
                    ]
                    sa = rdFreeSASA.CalcSASA(mol3d, radii)
            except Exception:
                # Best effort: keep whatever was computed before the failure
                # (a surface-area error does not discard the volume).
                sa = np.nan
        feats['MolVolume3D'].append(vol)
        feats['MolSurfaceArea3D'].append(sa)
        feats['Density_MD'].append(np.nan)  # replace with MD density if available

    feats_df = pd.DataFrame(feats, index=df.index)
    df = pd.concat([df, feats_df], axis=1)
    return df

# === Rg Feature Engineering ===
def featurize_rg(df):
    """Add simple 3D shape features to a copy of `df`.

    The 'mol_3d' column is rebuilt from 'SMILES' (any existing 'mol_3d'
    column is overwritten). Each molecule then gets explicit hydrogens, is
    embedded (seed 42) and UFF-optimised on a separate copy.

    Added columns:
      * 'Radius' - largest atom distance from the coordinate mean.
      * 'MomentOfInertia' - sum of mass * squared distance from the
        coordinate mean (the unweighted mean, not the centre of mass).
      * 'ConformerCoordsMeanX/Y/Z' - the coordinate mean itself.

    Rows that cannot be parsed, embedded or optimised get NaN features.
    """
    df = df.copy()
    df['mol_3d'] = df['SMILES'].apply(lambda smi: Chem.MolFromSmiles(smi))

    feats = {
        'Radius': [],
        'MomentOfInertia': [],
        'ConformerCoordsMeanX': [],
        'ConformerCoordsMeanY': [],
        'ConformerCoordsMeanZ': [],
    }

    def compute_row(mol):
        """Return the feature values for one molecule, or None on failure."""
        mol3d = Chem.AddHs(mol)
        # A non-zero return code means no conformer was generated.
        if AllChem.EmbedMolecule(mol3d, randomSeed=42) != 0:
            return None
        AllChem.UFFOptimizeMolecule(mol3d)
        coords = mol3d.GetConformer().GetPositions()

        center = coords.mean(axis=0)
        rel_coords = coords - center
        mass = np.array([atom.GetMass() for atom in mol3d.GetAtoms()])
        return {
            'Radius': np.linalg.norm(rel_coords, axis=1).max(),
            'MomentOfInertia': np.sum(mass * np.sum(rel_coords**2, axis=1)),
            'ConformerCoordsMeanX': center[0],
            'ConformerCoordsMeanY': center[1],
            'ConformerCoordsMeanZ': center[2],
        }

    for m in tqdm(df['mol_3d'], desc='Featurizing Rg'):
        row = None
        if m is not None:
            try:
                row = compute_row(m)
            except Exception:
                # Best effort: RDKit failures yield NaN features. Catching
                # Exception (not a bare except) keeps interrupts working.
                row = None
        for name, values in feats.items():
            values.append(np.nan if row is None else row[name])

    feats_df = pd.DataFrame(feats, index=df.index)
    df = pd.concat([df, feats_df], axis=1)
    return df


# === Example usage ===
# Run every property-specific featurizer on its preprocessed frame.
tg_features = featurize_tg(tg_df)
ffv_features = featurize_ffv(ffv_df)
tc_features = featurize_tc(tc_df)
density_features = featurize_density(density_df)
rg_features = featurize_rg(rg_df)

print("Feature engineering complete.")
for _label, _features in (
    ("Tg", tg_features),
    ("FFV", ffv_features),
    ("Tc", tc_features),
    ("Density", density_features),
    ("Rg", rg_features),
):
    print(f"{_label} features shape: {_features.shape}")


Featurizing Tg: 100%|██████████| 1041/1041 [00:00<00:00, 1899.91it/s]
Featurizing FFV: 100%|██████████| 7892/7892 [00:00<00:00, 12737.28it/s]
Featurizing Tc: 100%|██████████| 879/879 [01:03<00:00, 13.90it/s]
Featurizing Density: 100%|██████████| 717/717 [02:17<00:00,  5.20it/s]
Featurizing Rg: 100%|██████████| 597/597 [01:10<00:00,  8.52it/s]

Feature engineering complete.
Tg features shape: (1041, 15)
FFV features shape: (7892, 11)
Tc features shape: (879, 12)
Density features shape: (717, 11)
Rg features shape: (597, 12)





# Full Install Command (PyTorch 2.1.0 + CPU)

In [9]:
!pip install -q torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0
!pip install -q rdkit-pypi

# Install PyTorch Geometric and dependencies
!pip install -q torch-scatter torch-sparse torch-geometric -f https://data.pyg.org/whl/torch-2.1.0+cpu.html

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m93.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m73.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7

# Required Imports

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from rdkit import Chem
from rdkit.Chem import rdmolops
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from tqdm import tqdm
import pandas as pd

# Data Preprocessing: Convert SMILES to Graph

In [None]:
# Atom-level features
def atom_features(atom):
    """Return a float tensor of five per-atom features.

    In order: atomic number, degree, implicit valence, formal charge and
    the hybridization's numeric value.
    """
    raw_values = (
        atom.GetAtomicNum(),
        atom.GetDegree(),
        atom.GetImplicitValence(),
        atom.GetFormalCharge(),
        float(atom.GetHybridization().real),
    )
    return torch.tensor(raw_values, dtype=torch.float)

# SMILES to PyG graph
def mol_to_graph(smiles, label):
    """Convert a SMILES string and its label into a PyG `Data` graph.

    Args:
        smiles: SMILES string to parse.
        label: Numeric target value stored in `y`.

    Returns:
        A `Data` object with node features `x` (one row per atom), a
        bidirectional `edge_index` of shape (2, 2 * number of bonds) and a
        one-element float tensor `y`. None is returned when the SMILES
        cannot be parsed or yields a molecule without atoms.
    """
    mol = Chem.MolFromSmiles(smiles)
    # torch.stack fails on an empty list, so atom-less molecules are skipped.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    x = torch.stack([atom_features(atom) for atom in mol.GetAtoms()])

    # Each bond contributes one edge per direction.
    edge_pairs = []
    for bond in mol.GetBonds():
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_pairs.extend(((start, end), (end, start)))

    if edge_pairs:
        edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
    else:
        # A molecule without bonds still needs a (2, 0) edge tensor; building
        # it from an empty list would give shape (0,) instead.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    y = torch.tensor([label], dtype=torch.float)

    return Data(x=x, edge_index=edge_index, y=y)

# Prepare Dataset

In [None]:
# Example: df = pd.read_csv("your_dataset.csv")
# df.head()
# NOTE(review): `df` is not assigned anywhere in the visible cells; define it
# (with 'SMILES' and the chosen target column) before running this cell.

# Build one graph per row; rows whose SMILES cannot be converted are skipped.
# Zipping the two columns avoids constructing a Series per row as iterrows does.
graph_data = []
for smiles, label in tqdm(zip(df['SMILES'], df['FFV']), total=len(df)):  # <- Change 'FFV' to your target if needed
    graph = mol_to_graph(smiles, label)
    if graph is not None:
        graph_data.append(graph)


# Train/Test Split + DataLoaders

In [None]:
# 80/20 split over graph positions with a fixed seed.
all_positions = range(len(graph_data))
train_idx, test_idx = train_test_split(all_positions, test_size=0.2, random_state=42)

train_data, test_data = (
    [graph_data[position] for position in subset]
    for subset in (train_idx, test_idx)
)

# Only the training loader shuffles.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32)

# GCN Model Definition

In [None]:
class GCN(torch.nn.Module):
    """Two GCN layers, mean pooling per graph, then a linear output layer."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.lin = torch.nn.Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index, batch):
        """Return one output row per graph in the batch."""
        # ReLU follows the first convolution only.
        hidden = F.relu(self.conv1(x, edge_index))
        hidden = self.conv2(hidden, edge_index)
        # Average node embeddings within each graph.
        pooled = global_mean_pool(hidden, batch)
        return self.lin(pooled)

# Training Loop

In [None]:
# Use the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The input width is read from the first training graph's node features.
node_feature_dim = train_data[0].x.shape[1]
model = GCN(in_channels=node_feature_dim, hidden_channels=64, out_channels=1).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.MSELoss()

# Train
for epoch in range(1, 51):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        predictions = model(batch.x, batch.edge_index, batch.batch).view(-1)
        targets = batch.y.view(-1)
        loss = loss_fn(predictions, targets)
        loss.backward()
        optimizer.step()
        # Weight the batch mean by graph count so the epoch value is a per-graph mean.
        total_loss += loss.item() * batch.num_graphs
    epoch_loss = total_loss / len(train_loader.dataset)
    print(f"Epoch {epoch:02d} | Loss: {epoch_loss:.4f}")

# Evaluation (R² Score)

In [None]:
# Collect predictions and targets over the test loader without tracking gradients.
model.eval()
pred_chunks = []
true_chunks = []

with torch.no_grad():
    for batch in test_loader:
        batch = batch.to(device)
        batch_out = model(batch.x, batch.edge_index, batch.batch)
        pred_chunks.append(batch_out.view(-1).cpu())
        true_chunks.append(batch.y.view(-1).cpu())

preds = torch.cat(pred_chunks).numpy()
trues = torch.cat(true_chunks).numpy()

print("R² Score:", r2_score(trues, preds))