In [8]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor

# Load the pre-computed feature table. 'Tm' is the regression target; every
# other column is treated as a candidate feature.
df = pd.read_csv('result/data/melting_point_features.csv')

y = df['Tm']

# Keep numeric feature columns only and turn +/-inf into NaN so the imputer
# treats them as missing. Written as a chain (no inplace=True) so re-running
# this cell always starts again from `df`.
X = (
    df.drop(columns=['Tm'])
    .select_dtypes(include=[np.number])
    .replace([np.inf, -np.inf], np.nan)
)

# Split BEFORE imputing: the medians must be learned from the training rows
# only, otherwise statistics of the held-out rows leak into the training data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# A column with no observed value in the training rows has no median.
# SimpleImputer drops such columns by default, which would make the column
# labels below disagree with the transformed array, so filter them explicitly.
usable_cols = X_train_raw.columns[X_train_raw.notna().any()]

imputer = SimpleImputer(strategy='median')

# Rebuild DataFrames with the original row index so X_* stay aligned with y_*.
X_train = pd.DataFrame(
    imputer.fit_transform(X_train_raw[usable_cols]),
    columns=usable_cols,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    imputer.transform(X_test_raw[usable_cols]),
    columns=usable_cols,
    index=X_test_raw.index,
)

# Full imputed matrix in the original row order, kept under its original name
# for any other cell that reads `X_clean`.
X_clean = pd.concat([X_train, X_test]).loc[X.index]

base_model = LGBMRegressor(random_state=42, n_jobs=-1)

In [12]:
from sklearn.feature_selection import RFE
import time

# Recursive Feature Elimination with a LightGBM regressor as the ranking model.
# Tunables are named here instead of being buried in the RFE call.
N_FEATURES_TO_SELECT = 30  # number of features RFE should keep
RFE_STEP = 0.1             # float in (0, 1): fraction of features removed per iteration

print("\n---START RFE ---")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()

# verbose=-1 silences LightGBM's per-fit logging; RFE refits the model once
# per elimination round.
model = LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1)

rfe = RFE(
    estimator=model,
    n_features_to_select=N_FEATURES_TO_SELECT,
    step=RFE_STEP,
)

rfe.fit(X_train, y_train)

# rfe.support_ is a boolean mask over the training columns; use it to recover
# the names of the retained features.
selected_rfe = X_train.columns[rfe.support_]
print(f"⏱️ Time Run: {time.perf_counter() - start:.2f} giây")
print(f"✅ RFE Choosen {len(selected_rfe)} features:")
print(list(selected_rfe))


---START RFE ---
⏱️ Time Run: 5.78 giây
✅ RFE Choosen 30 features:
['MinEStateIndex', 'qed', 'SPS', 'MolWt', 'NumValenceElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MinAbsPartialCharge', 'BCUT2D_MRHI', 'BalabanJ', 'Chi0n', 'Chi0v', 'HallKierAlpha', 'Kappa2', 'Kappa3', 'PEOE_VSA14', 'SMR_VSA10', 'SlogP_VSA2', 'TPSA', 'EState_VSA2', 'Phi', 'MolMR', 'SlogP_VSA0', 'SMR_VSA0', 'Gasteiger_q_std', 'HeteroAtomFrac', 'Flexibility_Score', 'Complexity_per_MW', 'FracSingle', 'FracDouble']
