In [17]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor

df = pd.read_csv('result/data/melting_point_features.csv')

y = df['Tm']
X = df.drop(columns=['Tm'])

X = X.select_dtypes(include=[np.number])
X.replace([np.inf, -np.inf], np.nan, inplace=True)

imputer = SimpleImputer(strategy='median')
X_clean = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

X_train, X_test, y_train, y_test = train_test_split(X_clean, y, test_size=0.2, random_state=42)

base_model = LGBMRegressor(random_state=42, n_jobs=-1)

In [18]:
from sklearn.feature_selection import RFE
import time

print("\n---START RFE ---")
start = time.time()

model = LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1)

rfe = RFE(estimator=model, n_features_to_select=30, step=0.1)

rfe.fit(X_train, y_train)

selected_rfe = X_train.columns[rfe.support_]
print(f"‚è±Ô∏è Time Run: {time.time() - start:.2f} gi√¢y")
print(f"‚úÖ RFE Choosen {len(selected_rfe)} features:")
print(list(selected_rfe))


---START RFE ---
‚è±Ô∏è Time Run: 5.56 gi√¢y
‚úÖ RFE Choosen 30 features:
['MinEStateIndex', 'qed', 'SPS', 'MolWt', 'NumValenceElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MinAbsPartialCharge', 'BCUT2D_MRHI', 'BalabanJ', 'Chi0n', 'Chi0v', 'HallKierAlpha', 'Kappa2', 'Kappa3', 'PEOE_VSA14', 'SMR_VSA10', 'SlogP_VSA2', 'TPSA', 'EState_VSA2', 'Phi', 'MolMR', 'SlogP_VSA0', 'SMR_VSA0', 'Gasteiger_q_std', 'HeteroAtomFrac', 'Flexibility_Score', 'Complexity_per_MW', 'FracSingle', 'FracDouble']


In [37]:
import warnings
warnings.filterwarnings('ignore')

print("\n--- üß¨ B·∫ÆT ƒê·∫¶U CH·∫†Y GENETIC ALGORITHM ---")
model = LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1)

ga = GAFeatureSelectionCV(
    estimator=model,
    cv=2,                     
    scoring="neg_root_mean_squared_error",
    population_size=100,         
    generations=5,         
    tournament_size=3,
    elitism=True,
    keep_top_k=2,
    crossover_probability=0.5,
    mutation_probability=0.1,
    algorithm="eaSimple",
    n_jobs=1,
    verbose=True              
)

ga.fit(X_train, y_train)

selected_ga = X_train.columns[ga.support_]
print(f"\n‚úÖ GA ƒë√£ ch·ªçn {len(selected_ga)} features:")
print(list(selected_ga))


--- üß¨ B·∫ÆT ƒê·∫¶U CH·∫†Y GENETIC ALGORITHM ---




gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	100   	-292.247	4.93319    	-282.947   	-307.208   




1  	48    	-288.387	3.82909    	-280.127   	-296.875   




2  	58    	-285.231	2.78012    	-280.127   	-292.898   




3  	47    	-283.197	2.23427    	-276.813   	-290.561   




4  	48    	-281.846	2.44973    	-276.813   	-293.15    




5  	54    	-281.059	2.62521    	-276.813   	-289.126   

‚úÖ GA ƒë√£ ch·ªçn 440 features:
['MinAbsEStateIndex', 'SPS', 'HeavyAtomMolWt', 'NumValenceElectrons', 'MinAbsPartialCharge', 'FpDensityMorgan2', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'AvgIpc', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0v', 'Chi1', 'Chi1n', 'Chi2n', 'Chi4v', 'HallKierAlpha', 'Kappa1', 'LabuteASA', 'PEOE_VSA10', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA6', 'SMR_VSA8', 'SlogP_VSA1', 'SlogP_VSA2', 'SlogP_VSA9', 'EState_VSA1', 'EState_VSA11', 'EState_VSA4', 'EState_VSA5', 'EState_VSA7', 'VSA_EState10', 'VSA_EState2', 'VSA_EState3', 'VSA_EState5', 'VSA_EState6', 'VSA_EState8', 'HeavyAtomCount', 'NumAliphaticCarbocycles', 'NumAromaticHeterocycles', 'NumHAcceptors', 'NumRotatableBonds', 'NumSpiroAtoms', 'RingCount', 'MolLogP', 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN', 'fr_Ar_COO', 'fr_Ar_OH', 'fr_COO', '

In [38]:
common_features = set(selected_rfe) & set(selected_ga)
print(f"\nüíé C√°c features quan tr·ªçng ƒë∆∞·ª£c c·∫£ 2 thu·∫≠t to√°n c√πng ch·ªçn ({len(common_features)}):")
print(common_features)


üíé C√°c features quan tr·ªçng ƒë∆∞·ª£c c·∫£ 2 thu·∫≠t to√°n c√πng ch·ªçn (15):
{'HeteroAtomFrac', 'MinAbsPartialCharge', 'SMR_VSA10', 'FracDouble', 'SlogP_VSA0', 'NumValenceElectrons', 'HallKierAlpha', 'Gasteiger_q_std', 'Flexibility_Score', 'BalabanJ', 'Complexity_per_MW', 'Chi0v', 'SPS', 'SlogP_VSA2', 'SMR_VSA0'}


In [42]:
print("\n--- üöÄ HU·∫§N LUY·ªÜN MODEL FINAL TR√äN FULL DATA ---")


best_features = selected_rfe


final_model = LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1)
final_model.fit(X_clean[best_features], y)

print("‚úÖ ƒê√£ hu·∫•n luy·ªán xong m√¥ h√¨nh cu·ªëi c√πng!")
print(f"üëâ M√¥ h√¨nh ƒëang s·ª≠ d·ª•ng {len(best_features)} features.")

import joblib


joblib.dump(final_model, 'final_melting_point_model.pkl')
joblib.dump(list(best_features), 'final_features_list.pkl')

print("üíæ ƒê√£ l∆∞u model th√†nh c√¥ng v√†o file 'final_melting_point_model.pkl'")


--- üöÄ HU·∫§N LUY·ªÜN MODEL FINAL TR√äN FULL DATA ---
‚úÖ ƒê√£ hu·∫•n luy·ªán xong m√¥ h√¨nh cu·ªëi c√πng!
üëâ M√¥ h√¨nh ƒëang s·ª≠ d·ª•ng 30 features.
üíæ ƒê√£ l∆∞u model th√†nh c√¥ng v√†o file 'final_melting_point_model.pkl'


In [43]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import gc
model = joblib.load('final_melting_point_model.pkl')
features = joblib.load('final_features_list.pkl')

df = pd.read_csv('result/data/melting_point_features.csv')

needed_cols = list(features) + ['Tm']

existing_cols = [c for c in needed_cols if c in df.columns]

df_reduced = df[existing_cols].copy()

del df
gc.collect()

y = df_reduced['Tm']
X = df_reduced.drop(columns=['Tm'])

X = X.select_dtypes(include=[np.number])
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X = X.mask(X > 1e308, np.nan)

print("‚öôÔ∏è(Imputing)...")
imputer = SimpleImputer(strategy='median')
X_clean = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

_, X_test, _, y_test = train_test_split(X_clean, y, test_size=0.2, random_state=42)

y_pred = model.predict(X_test[features])

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\n--- üèÅ RESULT ---")
print(f"RMSE: {rmse:.4f}")
print(f"R2: {r2:.4f}")

‚öôÔ∏è(Imputing)...

--- üèÅ RESULT ---
RMSE: 176.4221
R2: 0.8244
