# NeurIPS_Open-Polymer-Prediction-2025

### Goal 
Predicting polymer properties with machine learning to accelerate sustainable materials research.

### Data
In this competition, your task is to use polymer structure data (SMILES) to predict five key chemical properties derived from molecular dynamics simulation: glass transition temperature (Tg), fractional free volume (FFV), thermal conductivity (Tc), polymer density, and radius of gyration (Rg). Successfully predicting these properties is crucial for scientists to accelerate the design of novel polymers with targeted characteristics, which can be used in various applications.

In [1]:
# Imports
import numpy as np
import pandas as pd
from rdkit import Chem
import seaborn as sns
import matplotlib.pyplot as plt
from rdkit.Chem import Descriptors, rdMolDescriptors, AllChem, DataStructs

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
def generate_2d_descriptors(smiles):
    """Compute every RDKit 2D descriptor for a single SMILES string.

    Returns a dict mapping each descriptor name listed in
    ``Descriptors.descList`` to its value for the parsed molecule, or
    ``None`` when RDKit cannot parse ``smiles``.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None

    values = {}
    for name, compute in Descriptors.descList:
        values[name] = compute(molecule)
    return values

In [3]:
def generate_3d_descriptors(smiles):
    """Compute 3D shape descriptors for a single SMILES string.

    The molecule is parsed, explicit hydrogens are added, one conformer is
    embedded with ETKDG and relaxed with the UFF force field, and the shape
    descriptors are read from that conformer.

    Returns a dict with the keys ``radius_of_gyration``, ``asphericity``,
    ``spherocity_index``, ``eccentricity``, ``NPR1``, ``NPR2``, ``PMI1``,
    ``PMI2`` and ``PMI3``, or ``None`` when the SMILES cannot be parsed, no
    conformer can be embedded, or RDKit raises during optimisation or
    descriptor calculation.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    mol = Chem.AddHs(mol)
    try:
        # EmbedMolecule signals failure with a negative conformer id instead
        # of raising; bail out here rather than relying on the optimiser to
        # fail on a molecule that has no conformer.
        if AllChem.EmbedMolecule(mol, AllChem.ETKDG()) < 0:
            return None
        # NOTE(review): RDKit logs "UFFTYPER: Unrecognized atom type: *_"
        # while this runs, presumably for the "*" attachment points in the
        # polymer SMILES -- confirm the optimised geometry is still usable.
        AllChem.UFFOptimizeMolecule(mol)
        return {
            "radius_of_gyration": rdMolDescriptors.CalcRadiusOfGyration(mol),
            "asphericity": rdMolDescriptors.CalcAsphericity(mol),
            "spherocity_index": rdMolDescriptors.CalcSpherocityIndex(mol),
            "eccentricity": rdMolDescriptors.CalcEccentricity(mol),
            "NPR1": rdMolDescriptors.CalcNPR1(mol),
            "NPR2": rdMolDescriptors.CalcNPR2(mol),
            "PMI1": rdMolDescriptors.CalcPMI1(mol),
            "PMI2": rdMolDescriptors.CalcPMI2(mol),
            "PMI3": rdMolDescriptors.CalcPMI3(mol),
        }
    except Exception:
        # Best-effort: any RDKit failure for this molecule means "no 3D
        # descriptors". Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate so a long run over many
        # SMILES can still be interrupted.
        return None

In [4]:
# Load the training table; the relative path is resolved against the
# current working directory.
datafile = pd.read_csv("processed_train.csv")

In [5]:
# Pull the SMILES column out of the table as a plain Python list.
smiles_list = datafile["SMILES"].to_list()

In [None]:
# Build one combined 2D + 3D descriptor record per distinct SMILES string.
data = []
# dict.fromkeys() drops repeated SMILES while keeping first-seen order.
# Without it a repeated SMILES is embedded again on every occurrence, and
# because the 3D embedding is not seeded each repeat yields a slightly
# different row, which turns a later merge on "SMILES" into a many-to-many
# join that drop_duplicates() cannot collapse.
for smiles in dict.fromkeys(smiles_list):
    descriptors_2d = generate_2d_descriptors(smiles)
    if not descriptors_2d:
        # Both helpers parse the same SMILES, so if 2D failed the costly
        # 3D embedding would fail too -- skip it.
        continue
    descriptors_3d = generate_3d_descriptors(smiles)
    if not descriptors_3d:
        continue
    # "SMILES" is added last so it becomes the final column and can serve
    # as the join key back to the source table.
    data.append({**descriptors_2d, **descriptors_3d, "SMILES": smiles})

[23:27:02] UFFTYPER: Unrecognized atom type: *_ (0)
[23:27:02] UFFTYPER: Unrecognized atom type: *_ (3)
[23:27:02] UFFTYPER: Unrecognized atom type: *_ (0)
[23:27:02] UFFTYPER: Unrecognized atom type: *_ (3)
[23:27:02] UFFTYPER: Unrecognized atom type: *_ (0)
[23:27:02] UFFTYPER: Unrecognized atom type: *_ (28)
[23:27:03] UFFTYPER: Unrecognized atom type: *_ (0)
[23:27:03] UFFTYPER: Unrecognized atom type: *_ (28)
[23:27:03] UFFTYPER: Unrecognized atom type: *_ (0)
[23:27:03] UFFTYPER: Unrecognized atom type: *_ (51)
[23:27:03] UFFTYPER: Unrecognized atom type: *_ (0)
[23:27:03] UFFTYPER: Unrecognized atom type: *_ (51)
[23:27:03] UFFTYPER: Unrecognized atom type: *_ (0)
[23:27:03] UFFTYPER: Unrecognized atom type: *_ (25)
[23:27:03] UFFTYPER: Unrecognized atom type: *_ (0)
[23:27:03] UFFTYPER: Unrecognized atom type: *_ (25)
[23:27:03] UFFTYPER: Unrecognized atom type: *_ (0)
[23:27:03] UFFTYPER: Unrecognized atom type: *_ (40)
[23:27:04] UFFTYPER: Unrecognized atom type: *_ (0)
[23:2

In [None]:
# Turn the list of per-molecule descriptor dicts into a table: one row per
# dict, one column per key.
df = pd.DataFrame(data)

In [None]:
# Join the source table with the descriptor table on the SMILES string
# (default inner join), then drop rows that are exact duplicates.
# Note: this rebinds `data` from the list of dicts to the merged DataFrame.
data = pd.merge(datafile, df, on="SMILES").drop_duplicates()

In [None]:
# Summary statistics for the numeric columns. describe must be *called*:
# the bare attribute only displays the bound method, not the statistics.
data.describe()