In [1]:
from IPython.core.display import HTML
import os
import rdkit
from rdkit import Chem
from rdkit.Chem import PandasTools
import pandas as pd
import matplotlib.pyplot as plt
import time
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from scikit_mol.fingerprints import MorganFingerprintTransformer
from scikit_mol.conversions import SmilesToMolTransformer
from scikit_mol.standardizer import Standardizer
from scikit_mol.descriptors import MolecularDescriptorTransformer


In [3]:
import yaml

# Path to your config file
CONFIG_PATH = r"D:\Skills\new\NeurIPS2\config.yaml"

# Load it
with open(CONFIG_PATH, 'r') as file:
    config = yaml.safe_load(file)


In [4]:
ffv_merged_csv = config['output']['ffv_merged_csv']

In [4]:
print(config['output'].keys())



dict_keys(['ffv_merged_csv', 'Tc_csv', 'Tg_csv'])


In [5]:
raw_train_csv = config['data']['raw_train_csv'] 
raw_train_df = pd.read_csv(raw_train_csv)


In [11]:
len(raw_train_df)

7973

In [19]:
raw_train_df['Density'][0]

nan

In [7]:
print(len(raw_train_df))

print(f"FFV nan {raw_train_df['FFV'].isna().sum()}")

print(raw_train_df['Tg'].isna().sum())

print(raw_train_df['Tc'].isna().sum())

print(raw_train_df['Rg'].isna().sum())

print(raw_train_df['Density'].isna().sum())

7973
FFV nan 943
7462
7236
7359
7360


## FFV Dataset

In [None]:
raw_train_df.dropna(subset = ['FFV'], inplace=True)

In [None]:
raw_train_df

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
0,87817,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,
1,106919,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.370410,,,
2,388772,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.378860,,,
3,519416,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,
4,539187,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.355470,,,
...,...,...,...,...,...,...,...
7967,2146213237,*CCOC(=O)c1ccc(C(C#N)=C(c2ccc(OC)cc2)N2CCC(*)C...,,0.385608,,,
7968,2146592435,*Oc1cc(CCCCCCCC)cc(OC(=O)c2cccc(C(*)=O)c2)c1,,0.367498,,,
7969,2146810552,*C(=O)OCCN(CCOC(=O)c1ccc2c(c1)C(=O)N(c1cccc(N3...,,0.353280,,,
7970,2147191531,*c1cc(C(=O)NCCCCCCCC)cc(N2C(=O)c3ccc(-c4ccc5c(...,,0.369411,,,


In [10]:
ffv_csv = config['data']['ffv_csv']
ffv_df = pd.read_csv(ffv_csv)


In [11]:
ffv_df

Unnamed: 0,SMILES,FFV
0,*C(=O)NNC(=O)c1ccc([Si](c2ccccc2)(c2ccccc2)c2c...,0.372725
1,*C(=O)NNC(=O)c1ccc([Si](c2ccccc2)(c2ccccc2)c2c...,0.365478
2,*C(=O)Nc1cc(NC(=O)c2ccc3[nH]c(-c4cc(-c5nc6cc(*...,0.376377
3,*C(=O)Nc1ccc(-c2cc(-c3ccccc3)cc(-c3ccc(NC(=O)c...,0.376939
4,*C(=O)Nc1ccc(-c2ccc(NC(=O)c3ccc4c(c3)C(=O)N(c3...,0.355235
...,...,...
857,*c1cccc(OCCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6c5...,0.349095
858,*c1cccc(OCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6c5C...,0.350892
859,*c1cccc(OCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6c5C(...,0.345386
860,*c1cccc(Oc2cccc(Oc3cccc(N4C(=O)c5ccc(Oc6ccc(Sc...,0.362224


In [12]:
# Step 1: Ensure SMILES are treated as strings
raw_train_df['SMILES'] = raw_train_df['SMILES'].astype(str)
ffv_df['SMILES'] = ffv_df['SMILES'].astype(str)

# Step 2: Identify common and unique SMILES
common_smiles = set(raw_train_df['SMILES']) & set(ffv_df['SMILES'])
unique_smiles = set(ffv_df['SMILES']) - set(raw_train_df['SMILES'])



In [None]:

# 4. Find common and unique SMILES
common_smiles = set(raw_train_df['SMILES']) & set(ffv_df['SMILES'])
unique_ffv_smiles = set(ffv_df['SMILES']) - set(raw_train_df['SMILES'])

# 5. Extract only unique rows from ffv_df
unique_ffv_rows = ffv_df[ffv_df['SMILES'].isin(unique_ffv_smiles)]



In [None]:
# 6. Align column format with raw_train_df (fill empty target cols)
for col in raw_train_df.columns:
    if col not in unique_ffv_rows.columns:
        unique_ffv_rows[col] = np.nan

# 7. Reorder columns to match raw_train_df
unique_ffv_rows = unique_ffv_rows[raw_train_df.columns]


In [15]:

# 8. Concatenate the unique rows
ffv_merged_df = pd.concat([raw_train_df, unique_ffv_rows], ignore_index=True)

In [23]:
ffv_merged_df.head()

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
0,87817.0,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,
1,106919.0,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.37041,,,
2,388772.0,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.37886,,,
3,519416.0,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,
4,539187.0,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.35547,,,


In [17]:
# 5. Save to your desired location
output_path = ffv_merged_csv  # ✅ customize path
ffv_merged_df.to_csv(output_path, index=False)

print(f"✅ Merged and saved successfully to:\n{output_path}")


✅ Merged and saved successfully to:
D:\Skills\new\NeurIPS-1 - Copy\outputs\csv\ffv_merged.csv


## TC dataset

In [27]:
raw_train_df = pd.read_csv(raw_train_csv)

In [28]:
TG_csv = config['data']['TG_csv']
TG_df = pd.read_csv(TG_csv)

In [38]:
# 1. Drop NaNs from FFV in raw_train_df (you already did this)
raw_train_df.dropna(subset=['Tg'], inplace=True)



# 3. Standardize SMILES columns (strip whitespace etc.)
raw_train_df['SMILES'] = raw_train_df['SMILES'].str.strip()
TG_df['SMILES'] = TG_df['SMILES'].str.strip()

# 4. Find common and unique SMILES
common_smiles = set(raw_train_df['SMILES']) & set(TG_df['SMILES'])
unique_TG_smiles = set(TG_df['SMILES']) - set(raw_train_df['SMILES'])

# 5. Extract only unique rows from ffv_df
unique_TG_rows = TG_df[TG_df['SMILES'].isin(unique_TG_smiles)]

# 6. Align column format with raw_train_df (fill empty target cols)
for col in raw_train_df.columns:
    if col not in unique_TG_rows.columns:
        unique_TG_rows[col] = np.nan

# 7. Reorder columns to match raw_train_df
unique_TG_rows = unique_TG_rows[raw_train_df.columns]

# 8. Concatenate the unique rows
merged_df = pd.concat([raw_train_df, unique_TG_rows], ignore_index=True)


In [39]:
merged_df.head()

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
0,10142210.0,*NC(C)C(=O)NCC(=O)NCC(*)=O,208.639749,,,,
1,13838538.0,*CCCCCCSSCCCCSS*,-41.266724,,0.192,,
2,16498242.0,*C=CCCCCCCCC*,-17.282022,,,,
3,30582999.0,*CCCCCCCCCCOC(=O)c1ccc(C(=O)NCCNC(=O)c2ccc(C(=...,4.250403,,,,
4,36217683.0,*c1nc2cc3sc(-c4cc(OCCCCCC)c(*)cc4OCCCCCC)nc3cc2s1,168.526313,,,,


In [43]:
Tg_cleaned = config['output']['Tg_csv']

In [44]:
# 5. Save to your desired location
output_path = Tg_cleaned  # ✅ customize path
merged_df.to_csv(output_path, index=False)

print(f"✅ Merged and saved successfully to:\n{output_path}")

✅ Merged and saved successfully to:
D:\Skills\new\NeurIPS-1 - Copy\outputs\csv\Tg_cleaned.csv


In [54]:
raw_train_df = pd.read_csv(raw_train_csv)

In [55]:
import pandas as pd
import numpy as np

def merge_unique_target_data(main_df, supplement_df, target_col="Tg"):
    """
    Merges a supplementary dataframe into the main dataframe by adding unique rows (based on SMILES)
    that contain values for a specific target column. Missing columns will be filled with NaN to align with main_df.
    
    Parameters:
        main_df (pd.DataFrame): Original dataset with standard columns like SMILES and multiple targets.
        supplement_df (pd.DataFrame): Supplementary dataset with SMILES and at least one target column.
        target_col (str): The target column to use for filtering and merging (e.g., 'Tg', 'FFV', etc.)

    Returns:
        pd.DataFrame: Updated dataframe with unique rows from supplement_df added in.
    """

    # Step 1: Drop NaNs from the target column in the main dataframe
    main_df = main_df.dropna(subset=[target_col]).copy()

    # Step 2: Strip whitespace from SMILES
    main_df['SMILES'] = main_df['SMILES'].str.strip()
    supplement_df['SMILES'] = supplement_df['SMILES'].str.strip()

    # Step 3: Identify SMILES present only in the supplement
    common_smiles = set(main_df['SMILES']) & set(supplement_df['SMILES'])
    unique_smiles = set(supplement_df['SMILES']) - set(main_df['SMILES'])

    # Step 4: Extract rows from supplement with unique SMILES
    unique_rows = supplement_df[supplement_df['SMILES'].isin(unique_smiles)].copy()

    # Step 5: Align column format with main_df
    for col in main_df.columns:
        if col not in unique_rows.columns:
            unique_rows[col] = np.nan

    # Step 6: Reorder columns to match
    unique_rows = unique_rows[main_df.columns]

    # Step 7: Concatenate
    merged_df = pd.concat([main_df, unique_rows], ignore_index=True)

    return merged_df


In [56]:
Tc_csv = config['data']['TC_csv']
Tc_df = pd.read_csv(Tc_csv)

In [57]:
Tc_merged_df = merge_unique_target_data(raw_train_df, Tc_df, 'Tc')

In [58]:
Tc_merged_df.head()

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
0,87817.0,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,
1,2986007.0,*c1ccc(-c2ccc3c(c2)C(CCCCCCC#N)(CCCCCCC#N)c2cc...,,0.402397,0.487,0.901123,28.682441
2,3013292.0,*CC(*)c1ccc(C(=O)O)c(C(=O)O)c1,,,0.171,1.184354,13.534248
3,6645418.0,*CCCCCNC(=O)CCCCC(=O)N*,,0.332741,0.327,,
4,7687820.0,*CCCCCCCCCCCCCCCCCCNC(=O)NCCCCCCNC(=O)N*,,,0.383,,


In [59]:
# 5. Save to your desired location
output_path = Tc_cleaned  # ✅ customize path
Tc_merged_df.to_csv(output_path, index=False)

print(f"✅ Merged and saved successfully to:\n{output_path}")

✅ Merged and saved successfully to:
D:\Skills\new\NeurIPS-1 - Copy\outputs\csv\Tc_cleaned.csv


In [13]:
ffv_df.isna().count()


SMILES    862
FFV       862
dtype: int64

In [14]:
TG_df.isna().count()

SMILES    46
Tg        46
dtype: int64

In [15]:
TC_df.isna().count()

SMILES     874
TC_mean    874
dtype: int64

In [19]:
def check_uniqueness(df, name="df"):
    duplicates = df.duplicated(subset=[ "SMILES"], keep=False)
    if duplicates.any():
        print(f"⚠️ Duplicate ( SMILES) pairs found in {name}:")
        print(df[duplicates])
    else:
        print(f"✅ No duplicates in {name}.")

check_uniqueness(ffv_df, "FFV")
check_uniqueness(TC_df, "Tc")
check_uniqueness(TG_df, "Tg")


✅ No duplicates in FFV.
⚠️ Duplicate ( SMILES) pairs found in Tc:
              SMILES   TC_mean
4            */C=C/*  0.526000
5            */C=C/*  0.262000
11         */C=C/CC*  0.248667
12         */C=C/CC*  0.272000
13         */C=C/CC*  0.244000
14        */C=C/CCC*  0.253500
15        */C=C/CCC*  0.279600
19   */C=C/CCCCCCCC*  0.357167
20   */C=C/CCCCCCCC*  0.380500
445     *CC/C=C(/*)C  0.256000
446     *CC/C=C(/*)C  0.258000
447     *CC/C=C(/*)C  0.235000
✅ No duplicates in Tg.
