In [2]:
import pandas as pd 
import numpy as np


In [3]:
import sklearn
from sklearn.model_selection import train_test_split

csv_path = 'D:/Skills/NeurIPS/data/train.csv'
train_df = pd.read_csv(csv_path)

# 1. split off 20% for dev_test
temp_df, dev_test = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,  # for reproducibility
    shuffle=True
)

In [4]:
dev_train, dev_val = train_test_split(
    temp_df,
    test_size=0.25,  # 0.25 * 0.8 = 0.2 of the original
    random_state=42,
    shuffle=True
)

# Verify sizes
print(f"Total rows:   {len(train_df)}")
print(f"Dev train:    {len(dev_train)} ({len(dev_train)/len(train_df):.2%})")
print(f"Dev valid:    {len(dev_val)} ({len(dev_val)/len(train_df):.2%})")
print(f"Dev test:     {len(dev_test)} ({len(dev_test)/len(train_df):.2%})")
print(f"Polymer example:{dev_train['SMILES'].to_list()[:3]}")
print(f"Columns:{dev_train.columns}")

Total rows:   7973
Dev train:    4783 (59.99%)
Dev valid:    1595 (20.01%)
Dev test:     1595 (20.01%)
Polymer example:['*Nc1ccc(CC(CC(C)(C)c2ccc(N*)cc2)=C(C)C)cc1', '*CC(*)(CC(=O)OC)C(=O)OC12CC3CC(C)(CC(C)(C3)C1)C2', '*OP(=O)(Oc1c(Cl)cc(Cl)cc1Cl)Oc1c(Cl)c(Cl)c(*)c(Cl)c1Cl']
Columns:Index(['id', 'SMILES', 'Tg', 'FFV', 'Tc', 'Density', 'Rg'], dtype='object')


In [None]:
## concept of radius
## small radius will mean more sensitive to local changes
## bigger radii will mean holistic view of the molecule
##


In [6]:
import rdkit
print(rdkit.__version__)

2025.03.4


### Coordinates can be generated and stored with the molecule using functionality in the rdkit.Chem.AllChem module 


## function 1

In [7]:
from rdkit import Chem
from rdkit.Chem import AllChem
#from rdkit.Chem.rdFingerprintGenerator import MorganFingerprintGenerator
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error



#AllChem.GetMorganFingerprintAsBitVect(...)


def smiles_to_fp(smiles, radius=2, nBits=1024):
    mol = Chem.MolFromSmiles(smiles)

    if mol is None:
        #raise ValueError(f"Invalid SMILES: {smiles}")
        return np.zeros((1024,), dtype=np.int8)
    
    generator = AllChem.GetMorganGenerator(radius=radius, nBits=nBits)
    #MorganFingerprintGenerator(radius=radius, nBits=nBits)
    return np.array(generator.GetSparseCountFingerprint(mol))


## function 2

In [8]:
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
import numpy as np

# Define generator once
generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)

def smiles_to_fp_2(smiles, nBits=1024):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros((nBits,), dtype=np.int32)
    
    fp_sparse = generator.GetCountFingerprint(mol)  # returns a dict-like sparse vector
    
    arr = np.zeros((nBits,), dtype=np.int32)
    for idx, count in fp_sparse.GetNonzeroElements().items():
        if idx < nBits:
            arr[idx] = count
    return arr



In [9]:
X_train = dev_train['SMILES'].to_list()
y_train = dev_train[['Tg', 'FFV', 'Tc', 'Density', 'Rg']].to_numpy()
X_val = dev_val['SMILES'].to_list()
y_val = dev_val[['Tg', 'FFV', 'Tc', 'Density', 'Rg']].to_numpy()

In [10]:
print(y_val)

[[       nan 0.37750321        nan        nan        nan]
 [       nan 0.41117013        nan        nan        nan]
 [       nan 0.39689903        nan        nan        nan]
 ...
 [       nan 0.32579382        nan        nan        nan]
 [       nan 0.37329926        nan        nan        nan]
 [       nan 0.446758          nan        nan        nan]]


In [11]:

# Convert SMILES to fingerprint features
X_train_feats = np.vstack([smiles_to_fp_2(s) for s in X_train])
X_val_feats   = np.vstack([smiles_to_fp_2(s) for s in X_val])
X_test = dev_test['SMILES'].to_list()
X_test_feats  = np.vstack([smiles_to_fp_2(s) for s in X_test])

# Combine train and validation sets
X_dev_feats = np.vstack([X_train_feats, X_val_feats])
y_dev = np.vstack([y_train, y_val])

# Test targets
y_test = dev_test[['Tg', 'FFV', 'Tc', 'Density', 'Rg']].to_numpy()



In [12]:
print(y_test[2])

[       nan 0.35557991 0.18366667        nan        nan]


In [13]:
print(X_test)

['*C=Cc1ccc2c3ccc(*)cc3n(-c3ccc(OCCCCCCCCCC)c(OCCCCCCCCCC)c3)c2c1', '*CC(=O)NCCCCCCNC(=O)Cc1ccc(O*)cc1', '*CC(*)c1ccccc1C(=O)NC', '*c1ccc2c(c1)C(=O)N(c1ccc(Oc3ccc(N4C(=O)c5ccc(-c6c(-c7ccccc7)c(-c7ccccc7)c(-c7ccc(Sc8ccc(-c9c(-c%10ccccc%10)c(-c%10ccccc%10)c(*)c(-c%10ccccc%10)c9-c9ccccc9)cc8)cc7)c(-c7ccccc7)c6-c6ccccc6)cc5C4=O)cc3)cc1)C2=O', '*CC(*)OC(=O)c1ccc(-c2ccccc2)cc1', '*c1ccc(C(=O)Nc2cccc(S(=O)(=O)c3cccc(NC(=O)c4ccc(N5C(=O)c6ccc(-c7ccc8c(c7)C(=O)N(*)C8=O)cc6C5=O)cc4)c3)c2)cc1', '*CCCCCCCC(=O)OCC1COC(*)O1', '*c1ccc(Oc2ccc(N3C(=O)c4ccc(Oc5ccc(C(=O)c6ccc(Oc7ccc8c(c7)C(=O)N(*)C8=O)cc6)cc5)cc4C3=O)cc2)cc1', '*CCCCNC(=O)c1cccc(C(=O)N*)c1', '*c1ccc(N2C(=O)c3ccc(C(=O)c4ccc5c(c4)C(=O)N(c4ccc(-c6nc7cc(S(=O)(=O)c8ccc9nc(-c%10ccccc%10)c(*)nc9c8)ccc7nc6-c6ccccc6)cc4)C5=O)cc3C2=O)cc1', '*C(=O)Nc1ccc(Cc2ccc(N3C(=O)c4ccc(*)cc4C3=O)cc2)cc1', '*c1ccc(C(=O)c2ccc(N3C(=O)c4ccc(Oc5ccc(Oc6ccc7c(c6)C(=O)N(*)C7=O)cc5)cc4C3=O)cc2)cc1', '*CC(*)(C)C(=O)OCCCCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F', '*CC(

In [15]:
print(X_val_feats)

[[0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [28]:
task_names = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
models = {}
y_pred = np.zeros_like(y_test)

# Train one random forest per task
for idx, name in enumerate(task_names):
    print('Training random forest for the task:', name)
    y_col = y_dev[:, idx]
    mask  = ~np.isnan(y_col)
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_dev_feats[mask], y_col[mask])
    models[name] = rf
    # Predict on test set
    y_pred[:, idx] = rf.predict(X_test_feats)

# Compute MSE per task, skipping NaNs
mse_per_task = {}
for i, name in enumerate(task_names):
    print('Predicting for the task:', name)
    mask = ~np.isnan(y_test[:, i])
    if mask.sum() > 0:
        mse = mean_squared_error(y_test[mask, i], y_pred[mask, i])
        mse_per_task[name] = mse
    else:
        mse_per_task[name] = np.nan

print("MSE per task:")
for name, mse in mse_per_task.items():
    print(f"  {name}: {mse:.4f}")

# Compute overall MSE across all tasks, skipping NaNs
mask_all = ~np.isnan(y_test)
y_true_flat = y_test[mask_all]
y_pred_flat = y_pred[mask_all]
mse_overall = mean_squared_error(y_true_flat, y_pred_flat)
print(f"Overall MSE: {mse_overall:.4f}")

Training random forest for the task: Tg
Training random forest for the task: FFV
Training random forest for the task: Tc
Training random forest for the task: Density
Training random forest for the task: Rg
Predicting for the task: Tg
Predicting for the task: FFV
Predicting for the task: Tc
Predicting for the task: Density
Predicting for the task: Rg
MSE per task:
  Tg: 3644.1967
  FFV: 0.0001
  Tc: 0.0018
  Density: 0.0091
  Rg: 7.1606
Overall MSE: 167.5103
