In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [19]:
import sys
import os
sys.path.append(os.path.abspath(".."))
from utils_project import generate_csv,create_dataframe_from_xyz_files,create_X_y_from_dataframe


# Training set: an energies CSV plus a directory of atom files, merged into one
# DataFrame by the project helper.
csv_path = "../../data/energies/train.csv"
path_data = "../../data/atoms/train"
df_train = create_dataframe_from_xyz_files(path_data, csv_path, inv_only=True)

# Keep only the three raw columns and expose them as a dict of column lists.
X = df_train[['positions', 'energy', 'charges']]
qm7 = X.to_dict("list")

# Array views of the geometry and charge columns.
pos, full_charges = (np.array(qm7[key]) for key in ('positions', 'charges'))

# Leading dimension of the positions array, i.e. one entry per DataFrame row.
n_molecules = pos.shape[0]

In [20]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print(device)

cuda


In [21]:
# Number of grid points along each of the three axes (a larger alternative: 192, 128, 96).
M, N, O = 64, 64, 64
# Tag embedded in the cached-file names below. It is derived from the dimensions
# so that changing M, N or O can never leave a stale, mismatching tag behind.
grille = f"{M}-{N}-{O}"

# Stack of three integer coordinate arrays, shape (3, M, N, O); along each axis
# the coordinates are consecutive integers starting at -M//2, -N//2 and -O//2.
grid = np.mgrid[-M//2:-M//2+M, -N//2:-N//2+N, -O//2:-O//2+O]
# NOTE(review): ifftshift is called without `axes`, so it shifts every axis,
# including the leading length-3 coordinate axis (the three coordinate arrays
# are rotated by one position). Kept unchanged on purpose; confirm this matches
# how the cached scattering outputs were generated before restricting the axes.
grid = np.fft.ifftshift(grid)


In [22]:
# Cached scattering outputs for the training molecules.
# They are loaded directly onto the CPU: the tensors are converted to NumPy
# right away, so mapping them to the GPU first would only add a device
# round-trip (and require free GPU memory) for no benefit.
saved_data = torch.load(f'../models_scattering/scattering_outputs_{grille}.pt', map_location="cpu")
order_0 = saved_data['order_0'].numpy()
orders_1_and_2 = saved_data['orders_1_and_2'].numpy()

# Flatten everything after the first dimension so each array has n_molecules rows,
# then place the order-0 columns in front of the order-1/2 columns.
order_0 = order_0.reshape((n_molecules, -1))
orders_1_and_2 = orders_1_and_2.reshape((n_molecules, -1))
scattering_coef = np.concatenate([order_0, orders_1_and_2], axis=1)

# Regression target: the 'energy' column as a plain Python list.
target = qm7['energy']



In [23]:
# Columns that are not used as extra features; `errors='ignore'` makes the drop
# tolerant of any of them being absent.
meta_cols = ['id', 'energy','positions','charges']
extra_features = df_train.drop(columns=meta_cols, errors='ignore')

# Remaining DataFrame columns as a NumPy matrix.
extra_features_np = extra_features.to_numpy()
print(f"Shape extra features: {extra_features_np.shape}")

# Final feature matrix: scattering coefficients followed by the extra columns.
all_features = np.hstack((scattering_coef, extra_features_np))

Shape extra features: (6591, 24)


In [7]:
import numpy as np
from sklearn import linear_model, preprocessing, pipeline, model_selection
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
import joblib

# `all_features` and `target` must already be defined by the feature-building cells.
cross_val_folds = 5  # number of cross-validation folds

# Candidate regressors. The stochastic ones get a fixed random_state so that the
# comparison (and therefore the selected model) is reproducible across runs.
# NOTE(review): the MLP uses a very loose tol=0.1, which may stop training early
# and could explain a poor score — confirm this is intended.
models = [
    ("Ridge Regression with alpha=0.1", linear_model.Ridge(alpha=0.1)),
    ("Ridge Regression with alpha=1", linear_model.Ridge(alpha=1)),
    ("Ridge Regression with alpha=10", linear_model.Ridge(alpha=10)),
    ("Lasso Regression", linear_model.Lasso()),
    ("ElasticNet Regression", linear_model.ElasticNet()),
    ("Random Forest Regression", RandomForestRegressor(random_state=0)),
    ("Support Vector Regression", SVR()),
    ("XGBoost Regression", XGBRegressor(random_state=0)),
    ("MLP Regressor", MLPRegressor(random_state=1, max_iter=2000, tol=0.1))
]

# `target` is a plain list; convert it once so the error arithmetic below is
# explicit array math instead of relying on implicit list-to-array coercion.
y_true = np.asarray(target)

# One (name, estimator, MAE, RMSE) tuple per candidate.
results = []

for name, model in models:
    # Each candidate gets its own scaler, wrapped with it in a single pipeline.
    scaler = preprocessing.StandardScaler()
    regressor = pipeline.make_pipeline(scaler, model)

    # Cross-validated predictions for every sample. cross_val_predict works on
    # clones of the pipeline, so `model` itself is left unfitted here.
    target_prediction = model_selection.cross_val_predict(regressor, X=all_features, y=y_true, cv=cross_val_folds)

    MAE = np.mean(np.abs(target_prediction - y_true))
    RMSE = np.sqrt(np.mean((target_prediction - y_true) ** 2))

    results.append((name, model, MAE, RMSE))

    print('{}: MAE: {}, RMSE: {}'.format(name, MAE, RMSE))

# Select the candidate with the lowest RMSE.
best_result = min(results, key=lambda x: x[3])
best_model_name, best_model, best_mae, best_rmse = best_result

print(f"Le meilleur modèle est {best_model_name} avec un RMSE de {best_rmse}.")

Ridge Regression with alpha=0.1: MAE: 0.1387703819613177, RMSE: 0.2150951668779308
Ridge Regression with alpha=1: MAE: 0.18065564737887996, RMSE: 0.2624037290813566
Ridge Regression with alpha=10: MAE: 0.270762485273798, RMSE: 0.4085741765439033
Lasso Regression: MAE: 1.928145973458486, RMSE: 2.3722755070621377
ElasticNet Regression: MAE: 1.8559096229301562, RMSE: 2.3348436628999636
Random Forest Regression: MAE: 0.20192070997372716, RMSE: 0.650127411556418
Support Vector Regression: MAE: 0.9036075834655585, RMSE: 2.115671516784985
XGBoost Regression: MAE: 0.21574020057615265, RMSE: 0.53474857906012
MLP Regressor: MAE: 1.6991380544775994, RMSE: 3.9585146350444624
Le meilleur modèle est Ridge Regression with alpha=0.1 avec un RMSE de 0.2150951668779308.


In [24]:
# Refit the selected estimator, behind a fresh scaler, on the full feature matrix.
scaler = preprocessing.StandardScaler()
best_regressor = pipeline.make_pipeline(scaler, best_model).fit(all_features, target)

# Persist the fitted pipeline; joblib.dump's return value is the cell output.
joblib.dump(
    best_regressor,
    f'../models_scattering/best_model_all_features_{grille}.pkl',
)

['../models_scattering/best_model_all_features_64-64-64.pkl']

# Test: build the test-set features and predict energies

In [25]:
# Test set: no energies CSV is supplied (csv_path is None), only the atom files.
# Note that the names assigned here (csv_path, path_data, X, qm7, pos,
# full_charges, n_molecules) replace the training-set values of the same names.
csv_path = None
path_data = "../../data/atoms/test"
df_test = create_dataframe_from_xyz_files(path_data, csv_path, inv_only=True)

# Raw columns as a dict of column lists.
X = df_test[['positions', 'charges','id']]
qm7 = X.to_dict("list")

# Array views of the geometry and charge columns.
pos, full_charges = (np.array(qm7[key]) for key in ('positions', 'charges'))

# Leading dimension of the positions array, i.e. one entry per DataFrame row.
n_molecules = pos.shape[0]

In [26]:
# Cached scattering outputs for the test molecules.
# They are loaded directly onto the CPU: the tensors are converted to NumPy
# right away, so mapping them to the GPU first would only add a device
# round-trip (and require free GPU memory) for no benefit.
saved_data = torch.load(f'../models_scattering/scattering_outputs_test_{grille}.pt', map_location="cpu")
order_0 = saved_data['order_0'].numpy()
orders_1_and_2 = saved_data['orders_1_and_2'].numpy()

# Flatten everything after the first dimension so each array has n_molecules rows.
order_0 = order_0.reshape((n_molecules, -1))
orders_1_and_2 = orders_1_and_2.reshape((n_molecules, -1))

# Order-0 columns first, then the order-1/2 columns.
scattering_coef = np.concatenate([order_0, orders_1_and_2], axis=1)

# Columns that are not used as extra features; `errors='ignore'` makes the drop
# tolerant of any of them being absent.
meta_cols = ['id','positions','charges']
extra_features = df_test.drop(columns=meta_cols, errors='ignore')

# Remaining DataFrame columns as a NumPy matrix.
extra_features_np = extra_features.to_numpy()
print(f"Shape extra features: {extra_features_np.shape}")

# Final feature matrix: scattering coefficients followed by the extra columns.
all_features = np.concatenate([scattering_coef, extra_features_np], axis=1)

Shape extra features: (1647, 24)


In [27]:
import joblib

# Load the saved pipeline and predict from the current feature matrix.
model = joblib.load(f'../models_scattering/best_model_all_features_{grille}.pkl')
y_pred = model.predict(all_features)

# Hand the test ids, the predictions and an output name to the project's CSV helper.
test_ids = df_test['id']
generate_csv(test_ids, y_pred, f'best_af_{grille}')