In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [2]:
import sys
import os
sys.path.append(os.path.abspath(".."))
from utils_project import generate_csv,create_dataframe_from_xyz_files,create_X_y_from_dataframe


csv_path = "../../data/energies/train.csv"
path_data = "../../data/atoms/train"
df_train=create_dataframe_from_xyz_files(path_data,csv_path,inv_only=True)
X=df_train[['positions', 'energy', 'charges']]

qm7 = X.to_dict("list")

#qm7 = fetch_qm7(align=True)
pos = np.array(qm7['positions'])
full_charges = np.array(qm7['charges'])

n_molecules = pos.shape[0]

In [3]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)

cuda


In [10]:
M, N, O = 64, 64, 64 #192, 128, 96
grille = "64-64-64"
grille1 = "96-64-48"

grid = np.mgrid[-M//2:-M//2+M, -N//2:-N//2+N, -O//2:-O//2+O]
grid = np.fft.ifftshift(grid)


In [11]:
# saved_data = torch.load( f'../models_scattering/scattering_outputs_{grille}.pt', map_location=device)
# order_0 = saved_data['order_0']
# orders_1_and_2 = saved_data['orders_1_and_2']
# order_0 = order_0.cpu().numpy()
# orders_1_and_2 = orders_1_and_2.cpu().numpy()

# order_0 = order_0.reshape((n_molecules, -1))
# orders_1_and_2 = orders_1_and_2.reshape((n_molecules, -1))
# scattering_coef = np.concatenate([order_0, orders_1_and_2], axis=1)
# target = qm7['energy']

def load_scattering(gril, device, n_molecules):
    saved_data = torch.load(f'../models_scattering/scattering_outputs_{gril}.pt', map_location=device)
    order_0 = saved_data['order_0'].cpu().numpy().reshape((n_molecules, -1))
    orders_1_and_2 = saved_data['orders_1_and_2'].cpu().numpy().reshape((n_molecules, -1))
    return np.concatenate([order_0, orders_1_and_2], axis=1)

scattering_16 = load_scattering("16-16-16", device, n_molecules)
scattering_32= load_scattering("32-32-32", device, n_molecules)
scattering_1 = load_scattering(grille1, device, n_molecules)
scattering_64 = load_scattering(grille, device, n_molecules)

print(scattering_64.shape)
# Concaténation horizontale (par feature)
scattering_coef = np.concatenate([scattering_64,scattering_1,scattering_32], axis=1)

# Optionnel : garder la cible (énergie)
target = qm7['energy']

(6591, 300)


In [12]:
meta_cols = ['id', 'energy','positions','charges']  # Colonnes à ne pas inclure
extra_features = df_train.drop(columns=meta_cols, errors='ignore')

# 4. Convertir les features pandas en numpy
extra_features_np = extra_features.to_numpy()
print(f"Shape extra features: {extra_features_np.shape}")

# 5. Concaténation finale
all_features = np.concatenate([scattering_coef, extra_features_np], axis=1)

Shape extra features: (6591, 36)


In [13]:
n_folds = 3

P = np.random.permutation(n_molecules).reshape((n_folds, -1))

cross_val_folds = []

for i_fold in range(n_folds):
    fold = (np.concatenate(P[np.arange(n_folds) != i_fold], axis=0),
            P[i_fold])
    cross_val_folds.append(fold)

In [14]:
class ElementwiseProd(nn.Module):
    def __init__(self, input_dim, q, k, act='sigmoid'):
        super().__init__()
        self.q = q
        self.k = k
        
        # Sélection de la fonction d'activation
        if act == 'sigmoid':
            self.activation = torch.sigmoid
        elif act == 'tanh':
            self.activation = torch.tanh
        elif act == 'relu':
            self.activation = F.relu
        else:
            raise ValueError(f"Activation '{act}' non supportée.")
        
        # Création des k couches linéaires
        self.hidden_layers = nn.ModuleList([
            nn.Linear(input_dim, q) for _ in range(k)
        ])

    def forward(self, x):
        output = torch.ones(x.size(0), self.q, device=x.device)
        for layer in self.hidden_layers:
            out = self.activation(layer(x))
            output *= out  # Produit élément par élément
        return output

from sklearn.base import BaseEstimator, RegressorMixin
import torch
import torch.nn as nn
import torch.optim as optim

class ElementwiseProdRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, input_dim=1, q=10, k=3, act='sigmoid', epochs=100, lr=1e-3, verbose=False):
        self.input_dim = input_dim
        self.q = q
        self.k = k
        self.act = act
        self.epochs = epochs
        self.lr = lr
        self.verbose = verbose
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self._build_model()

    def _build_model(self):
        class FullModel(nn.Module):
            def __init__(self, input_dim, q, k, act):
                super().__init__()
                self.core = ElementwiseProd(input_dim, q, k, act)
                self.output = nn.Linear(q, 1)
            
            def forward(self, x):
                x = self.core(x)
                x = self.output(x)
                return x
        
        self.model = FullModel(self.input_dim, self.q, self.k, self.act).to(self.device)

    def fit(self, X, y):
        X = torch.tensor(X, dtype=torch.float32).to(self.device)
        y = torch.tensor(y, dtype=torch.float32).view(-1, 1).to(self.device)

        criterion = nn.MSELoss()
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        self.model.train()
        for epoch in range(self.epochs):
            optimizer.zero_grad()
            output = self.model(X)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()
            if self.verbose and epoch % 10 == 0:
                print(f"Epoch {epoch}, Loss: {loss.item():.4f}")
        return self

    def predict(self, X):
        self.model.eval()
        X = torch.tensor(X, dtype=torch.float32).to(self.device)
        with torch.no_grad():
            output = self.model(X)
        return output.cpu().numpy().flatten()


In [15]:
import numpy as np
from sklearn import linear_model, preprocessing, pipeline, model_selection
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
import joblib

# Supposons que scattering_coef et target soient déjà définis

# Liste des modèles à tester
models = [
    ("Ridge Regression with alpha=0.000001", linear_model.Ridge(alpha=0.000001)),
    ("Ridge Regression with alpha=0.00001", linear_model.Ridge(alpha=0.00001)),
    ("Ridge Regression with alpha=0.0001", linear_model.Ridge(alpha=0.0001)),
    ("Ridge Regression with alpha=0.001", linear_model.Ridge(alpha=0.001)),
    ("Ridge Regression with alpha=0.01", linear_model.Ridge(alpha=0.01)),
    ("Ridge Regression with alpha=0.1", linear_model.Ridge(alpha=0.1)),
    ("Ridge Regression with alpha=1", linear_model.Ridge(alpha=1)),
    ("Ridge Regression with alpha=10", linear_model.Ridge(alpha=10)),
    # ("Lasso Regression", linear_model.Lasso()),
    # ("ElasticNet Regression", linear_model.ElasticNet()),
    # #("Random Forest Regression", RandomForestRegressor()),
    # ('ElementwiseProdRegressor',ElementwiseProdRegressor(input_dim=all_features.shape[1], q=1000, k=3, epochs=1000, lr=2e-2)),
    # ("Support Vector Regression", SVR()),
    # ("XGBoost Regression", XGBRegressor()),
    # ("MLP Regressor", MLPRegressor(random_state=1, max_iter=2000, tol=0.1))
]

results = []

for name, model in models:
    scaler = preprocessing.StandardScaler()
    regressor = pipeline.make_pipeline(scaler, model)

    target_prediction = model_selection.cross_val_predict(regressor, X=all_features, y=target, cv=cross_val_folds)

    MAE = np.mean(np.abs(target_prediction - target))
    RMSE = np.sqrt(np.mean((target_prediction - target) ** 2))

    results.append((name, model, MAE, RMSE))

    print('{}: MAE: {}, RMSE: {}'.format(name, MAE, RMSE))

# Trouver le modèle avec le RMSE le plus bas
best_result = min(results, key=lambda x: x[3])
best_model_name, best_model, best_mae, best_rmse = best_result

print(f"Le meilleur modèle est {best_model_name} avec un RMSE de {best_rmse}.")

Ridge Regression with alpha=0.000001: MAE: 0.10811650533129939, RMSE: 0.9650277265840086
Ridge Regression with alpha=0.00001: MAE: 0.11745377096239198, RMSE: 1.2800857875842233
Ridge Regression with alpha=0.0001: MAE: 0.11797133855146712, RMSE: 1.1568754257439517
Ridge Regression with alpha=0.001: MAE: 0.11869727952004207, RMSE: 0.9127008703261875
Ridge Regression with alpha=0.01: MAE: 0.12863494047012597, RMSE: 0.7805401213436096
Ridge Regression with alpha=0.1: MAE: 0.14390499797111928, RMSE: 0.5920822968242887
Ridge Regression with alpha=1: MAE: 0.1739451321548111, RMSE: 0.3679754429735754
Ridge Regression with alpha=10: MAE: 0.256714543638282, RMSE: 0.37486298039046084
Le meilleur modèle est Ridge Regression with alpha=1 avec un RMSE de 0.3679754429735754.


In [10]:
scaler = preprocessing.StandardScaler()
best_regressor = pipeline.make_pipeline(scaler, best_model)
best_regressor.fit(all_features, target)

# Enregistrer le meilleur modèle
joblib.dump(best_regressor, f'../models_scattering/best_model_all_features_{grille}_{grille1}.pkl')

['../models_scattering/best_model_all_features_64-64-64_96-64-48.pkl']

In [11]:
csv_path = None
path_data = "../../data/atoms/test"
df_test=create_dataframe_from_xyz_files(path_data,csv_path,inv_only=True)

X=df_test[['positions', 'charges','id']]
qm7 = X.to_dict("list")

#qm7 = fetch_qm7(align=True)
pos = np.array(qm7['positions'])
full_charges = np.array(qm7['charges'])

n_molecules = pos.shape[0]

In [12]:
# def load_scattering_test(gril, device, n_molecules):
#     saved_data = torch.load(f'../models_scattering/scattering_test_outputs_{gril}.pt', map_location=device)
#     order_0 = saved_data['order_0'].cpu().numpy().reshape((n_molecules, -1))
#     orders_1_and_2 = saved_data['orders_1_and_2'].cpu().numpy().reshape((n_molecules, -1))
#     return np.concatenate([order_0, orders_1_and_2], axis=1)

# # scattering_16 = load_scattering("16-16-16", device, n_molecules)
# # scattering_32= load_scattering("32-32-32", device, n_molecules)
# scattering_1 = load_scattering(grille1, device, n_molecules)
# scattering_64 = load_scattering(grille, device, n_molecules)

# # Concaténation horizontale (par feature)
# scattering_coef = np.concatenate([scattering_64,scattering_1], axis=1)

# # Optionnel : garder la cible (énergie)
# target = qm7['energy']

In [13]:
saved_data = torch.load( f'../models_scattering/scattering_outputs_test_{grille}.pt', map_location=device)
order_0 = saved_data['order_0']
orders_1_and_2 = saved_data['orders_1_and_2']

order_0 = order_0.cpu().numpy()
orders_1_and_2 = orders_1_and_2.cpu().numpy()

order_0 = order_0.reshape((n_molecules, -1))
orders_1_and_2 = orders_1_and_2.reshape((n_molecules, -1))

scattering_coef_64 = np.concatenate([order_0, orders_1_and_2], axis=1)

saved_data = torch.load( f'../models_scattering/scattering_outputs_test_{grille1}.pt', map_location=device)
order_0 = saved_data['order_0']
orders_1_and_2 = saved_data['orders_1_and_2']

order_0 = order_0.cpu().numpy()
orders_1_and_2 = orders_1_and_2.cpu().numpy()

order_0 = order_0.reshape((n_molecules, -1))
orders_1_and_2 = orders_1_and_2.reshape((n_molecules, -1))

scattering_coef_1 = np.concatenate([order_0, orders_1_and_2], axis=1)

scattering_coef = np.concatenate([scattering_coef_64,scattering_coef_1], axis=1)

In [14]:
meta_cols = ['id', 'energy','positions','charges']  # Colonnes à ne pas inclure
extra_features = df_test.drop(columns=meta_cols, errors='ignore')

# 4. Convertir les features pandas en numpy
extra_features_np = extra_features.to_numpy()
print(f"Shape extra features: {extra_features_np.shape}")

# 5. Concaténation finale
all_features = np.concatenate([scattering_coef, extra_features_np], axis=1)

Shape extra features: (1647, 36)


In [15]:
import joblib

# Charger le modèle enregistré
model = joblib.load(f'../models_scattering/best_model_all_features_{grille}_{grille1}.pkl')
y_pred = model.predict(all_features)

generate_csv(df_test['id'],y_pred,f'best_multiscat_{grille}_{grille1}')