In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [2]:
import sys
import os
sys.path.append(os.path.abspath(".."))
from utils_project import generate_csv,create_dataframe_from_xyz_files,create_X_y_from_dataframe


csv_path = "../../data/energies/train.csv"
path_data = "../../data/atoms/train"
df_train=create_dataframe_from_xyz_files(path_data,csv_path,inv_only=True)
X=df_train[['positions', 'energy', 'charges']]

qm7 = X.to_dict("list")

#qm7 = fetch_qm7(align=True)
pos = np.array(qm7['positions'])
full_charges = np.array(qm7['charges'])

n_molecules = pos.shape[0]

In [3]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)

cuda


In [4]:
M, N, O = 80, 80, 80 #192, 128, 96
grille = "80-80-80"
grid = np.mgrid[-M//2:-M//2+M, -N//2:-N//2+N, -O//2:-O//2+O]
grid = np.fft.ifftshift(grid)


In [5]:
saved_data = torch.load( f'../models_scattering/scattering_outputs_{grille}.pt', map_location=device)
order_0 = saved_data['order_0']
orders_1_and_2 = saved_data['orders_1_and_2']
order_0 = order_0.cpu().numpy()
orders_1_and_2 = orders_1_and_2.cpu().numpy()

order_0 = order_0.reshape((n_molecules, -1))
orders_1_and_2 = orders_1_and_2.reshape((n_molecules, -1))
scattering_coef = np.concatenate([order_0, orders_1_and_2], axis=1)
target = qm7['energy']



In [6]:
meta_cols = ['id', 'energy','positions','charges']  # Colonnes à ne pas inclure
extra_features = df_train.drop(columns=meta_cols, errors='ignore')

extra_features_np = extra_features.to_numpy()
print(f"Shape extra features: {extra_features_np.shape}")

all_features = np.concatenate([scattering_coef, extra_features_np], axis=1)

Shape extra features: (6591, 36)


In [7]:
n_folds = 3

P = np.random.permutation(n_molecules).reshape((n_folds, -1))

cross_val_folds = []

for i_fold in range(n_folds):
    fold = (np.concatenate(P[np.arange(n_folds) != i_fold], axis=0),
            P[i_fold])
    cross_val_folds.append(fold)

In [8]:
class ElementwiseProd(nn.Module):
    def __init__(self, input_dim, q, k, act='sigmoid'):
        super().__init__()
        self.q = q
        self.k = k
        
        # Sélection de la fonction d'activation
        if act == 'sigmoid':
            self.activation = torch.sigmoid
        elif act == 'tanh':
            self.activation = torch.tanh
        elif act == 'relu':
            self.activation = F.relu
        else:
            raise ValueError(f"Activation '{act}' non supportée.")
        
        # Création des k couches linéaires
        self.hidden_layers = nn.ModuleList([
            nn.Linear(input_dim, q) for _ in range(k)
        ])

    def forward(self, x):
        output = torch.ones(x.size(0), self.q, device=x.device)
        for layer in self.hidden_layers:
            out = self.activation(layer(x))
            output *= out  # Produit élément par élément
        return output

from sklearn.base import BaseEstimator, RegressorMixin
import torch
import torch.nn as nn
import torch.optim as optim

class ElementwiseProdRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, input_dim=1, q=10, k=3, act='sigmoid', epochs=100, lr=1e-3, verbose=False):
        self.input_dim = input_dim
        self.q = q
        self.k = k
        self.act = act
        self.epochs = epochs
        self.lr = lr
        self.verbose = verbose
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self._build_model()

    def _build_model(self):
        class FullModel(nn.Module):
            def __init__(self, input_dim, q, k, act):
                super().__init__()
                self.core = ElementwiseProd(input_dim, q, k, act)
                self.output = nn.Linear(q, 1)
            
            def forward(self, x):
                x = self.core(x)
                x = self.output(x)
                return x
        
        self.model = FullModel(self.input_dim, self.q, self.k, self.act).to(self.device)

    def fit(self, X, y):
        X = torch.tensor(X, dtype=torch.float32).to(self.device)
        y = torch.tensor(y, dtype=torch.float32).view(-1, 1).to(self.device)

        criterion = nn.MSELoss()
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        self.model.train()
        for epoch in range(self.epochs):
            optimizer.zero_grad()
            output = self.model(X)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()
            if self.verbose and epoch % 10 == 0:
                print(f"Epoch {epoch}, Loss: {loss.item():.4f}")
        return self

    def predict(self, X):
        self.model.eval()
        X = torch.tensor(X, dtype=torch.float32).to(self.device)
        with torch.no_grad():
            output = self.model(X)
        return output.cpu().numpy().flatten()


In [None]:
import numpy as np
from sklearn import linear_model, preprocessing, pipeline, model_selection
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
import joblib

cross_val_folds = 5  

models = [
    ("Linear Regression", linear_model.LinearRegression()),
    ("Ridge Regression with alpha=0.000001", linear_model.Ridge(alpha=0.000001)),
    ("Ridge Regression with alpha=0.00001", linear_model.Ridge(alpha=0.00001)),
    ("Ridge Regression with alpha=0.0001", linear_model.Ridge(alpha=0.0001)),
    ("Ridge Regression with alpha=0.001", linear_model.Ridge(alpha=0.001)),
    ("Ridge Regression with alpha=0.01", linear_model.Ridge(alpha=0.01)),
    ("Ridge Regression with alpha=0.1", linear_model.Ridge(alpha=0.1)),
    ("Ridge Regression with alpha=1", linear_model.Ridge(alpha=1)),
    ("Ridge Regression with alpha=10", linear_model.Ridge(alpha=10)),
    # ("Lasso Regression", linear_model.Lasso()),
    # ("ElasticNet Regression", linear_model.ElasticNet()),
    # ("Random Forest Regression", RandomForestRegressor()),
    # # ('ElementwiseProdRegressor',ElementwiseProdRegressor(input_dim=all_features.shape[1], q=1000, k=3, epochs=100, lr=2e-2)),
    # ("Support Vector Regression", SVR()),
    # ("XGBoost Regression", XGBRegressor()),
    # ("MLP Regressor", MLPRegressor(random_state=1, max_iter=2000, tol=0.1))
]

results = []

for name, model in models:
    scaler = preprocessing.StandardScaler()
    regressor = pipeline.make_pipeline(scaler, model)

    target_prediction = model_selection.cross_val_predict(regressor, X=all_features, y=target, cv=cross_val_folds)

    MAE = np.mean(np.abs(target_prediction - target))
    RMSE = np.sqrt(np.mean((target_prediction - target) ** 2))

    results.append((name, model, MAE, RMSE))

    print('{}: MAE: {}, RMSE: {}'.format(name, MAE, RMSE))

best_result = min(results, key=lambda x: x[3])
best_model_name, best_model, best_mae, best_rmse = best_result

print(f"Le meilleur modèle est {best_model_name} avec un RMSE de {best_rmse}.")

Linear Regression: MAE: 0.043829389530923524, RMSE: 0.2956201975365672
Ridge Regression with alpha=0.000001: MAE: 0.045497794168110596, RMSE: 0.2084337357710272
Ridge Regression with alpha=0.00001: MAE: 0.045764474676671774, RMSE: 0.1348089892363223
Ridge Regression with alpha=0.0001: MAE: 0.04843557573210504, RMSE: 0.09720949801724661
Ridge Regression with alpha=0.001: MAE: 0.05951120595068713, RMSE: 0.2001597941612615
Ridge Regression with alpha=0.01: MAE: 0.07868835501208245, RMSE: 0.24303928868493627
Ridge Regression with alpha=0.1: MAE: 0.10589456967343623, RMSE: 0.16298092294522473
Ridge Regression with alpha=1: MAE: 0.15276649965484726, RMSE: 0.21590612013530258
Ridge Regression with alpha=10: MAE: 0.24782694260936025, RMSE: 0.37088007588129757
Le meilleur modèle est Ridge Regression with alpha=0.0001 avec un RMSE de 0.09720949801724661.


In [23]:
import numpy as np
from sklearn import linear_model, preprocessing, pipeline, model_selection
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
import joblib

cross_val_folds = 3

models = [
    ("MLP Regressor", MLPRegressor(alpha=0.0,hidden_layer_sizes=(335),learning_rate_init=1e-4,activation='identity',verbose=True,random_state=1, max_iter=1000))
]

results = []

for name, model in models:
    scaler = preprocessing.StandardScaler()
    regressor = pipeline.make_pipeline(scaler, model)

    target_prediction = model_selection.cross_val_predict(regressor, X=all_features, y=target, cv=cross_val_folds)

    MAE = np.mean(np.abs(target_prediction - target))
    RMSE = np.sqrt(np.mean((target_prediction - target) ** 2))

    results.append((name, model, MAE, RMSE))

    print('{}: MAE: {}, RMSE: {}'.format(name, MAE, RMSE))


Iteration 1, loss = 3085.21191287
Iteration 2, loss = 3049.69246071
Iteration 3, loss = 3037.65741128
Iteration 4, loss = 3029.06819390
Iteration 5, loss = 3022.72143341
Iteration 6, loss = 3016.17066376
Iteration 7, loss = 3010.00337786
Iteration 8, loss = 3005.70475097
Iteration 9, loss = 2998.34291246
Iteration 10, loss = 2991.54184934
Iteration 11, loss = 2986.23181816
Iteration 12, loss = 2979.34054004
Iteration 13, loss = 2972.84504975
Iteration 14, loss = 2966.25208829
Iteration 15, loss = 2959.55967627
Iteration 16, loss = 2952.77024961
Iteration 17, loss = 2946.20970072
Iteration 18, loss = 2939.15214395
Iteration 19, loss = 2931.18828212
Iteration 20, loss = 2923.60533442
Iteration 21, loss = 2916.83026808
Iteration 22, loss = 2908.32742825
Iteration 23, loss = 2901.13286492
Iteration 24, loss = 2892.29619265
Iteration 25, loss = 2882.52105553
Iteration 26, loss = 2874.23734482
Iteration 27, loss = 2864.62515684
Iteration 28, loss = 2857.18076990
Iteration 29, loss = 2846.298

In [10]:
scaler = preprocessing.StandardScaler()
best_regressor = pipeline.make_pipeline(scaler, best_model)
best_regressor.fit(all_features, target)

# Enregistrer le meilleur modèle
joblib.dump(best_regressor, f'../models_scattering/best_model_all_features_{grille}_v5.pkl')

['../models_scattering/best_model_all_features_80-80-80_v5.pkl']

# Test

In [11]:
csv_path = None
path_data = "../../data/atoms/test"
df_test=create_dataframe_from_xyz_files(path_data,csv_path,inv_only=True)

X=df_test[['positions', 'charges','id']]
qm7 = X.to_dict("list")

#qm7 = fetch_qm7(align=True)
pos = np.array(qm7['positions'])
full_charges = np.array(qm7['charges'])

n_molecules = pos.shape[0]

In [None]:
saved_data = torch.load( f'../models_scattering/scattering_outputs_test_{grille}.pt', map_location=device)
order_0 = saved_data['order_0']
orders_1_and_2 = saved_data['orders_1_and_2']

order_0 = order_0.cpu().numpy()
orders_1_and_2 = orders_1_and_2.cpu().numpy()

order_0 = order_0.reshape((n_molecules, -1))
orders_1_and_2 = orders_1_and_2.reshape((n_molecules, -1))

scattering_coef = np.concatenate([order_0, orders_1_and_2], axis=1)

meta_cols = ['id','positions','charges']  # Colonnes à ne pas inclure
extra_features = df_test.drop(columns=meta_cols, errors='ignore')

extra_features_np = extra_features.to_numpy()
print(f"Shape extra features: {extra_features_np.shape}")

all_features = np.concatenate([scattering_coef, extra_features_np], axis=1)

Shape extra features: (1647, 22)


In [None]:
import joblib

model = joblib.load(f'../models_scattering/best_model_all_features_{grille}_v5.pkl')
y_pred = model.predict(all_features)

generate_csv(df_test['id'],y_pred,f'best_af_v5_{grille}')