# Matrice de Coulomb

In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Parameter: maximum number of atoms allowed per molecule. It is the side
# length every Coulomb matrix is zero-padded to before being flattened.
MAX_ATOMS = 30

# Read an .xyz file
def read_xyz(file_path):
    """Parse an XYZ file into atom labels and coordinates.

    Layout read by this function: line 0 holds the atom count, line 1 is
    skipped, and each following line is ``<label> <x> <y> <z>``. Columns
    after the fourth are ignored, as are lines beyond the declared count.

    Args:
        file_path: Path of the ``.xyz`` file to read.

    Returns:
        A ``(atoms, positions)`` tuple: ``atoms`` is the list of label
        strings (first column) and ``positions`` is a float array with one
        row per atom, i.e. shape ``(num_atoms, 3)`` for well-formed lines.
    """
    with open(file_path, 'r') as handle:
        content = handle.readlines()

    declared_count = int(content[0].strip())

    # Atom records start at index 2. Each record is parsed in one step
    # (label first, then coordinates), one line after the other.
    parsed = [
        (fields[0], np.array([float(value) for value in fields[1:4]]))
        for fields in map(str.split, content[2:2 + declared_count])
    ]

    labels = [label for label, _ in parsed]
    coordinates = [xyz for _, xyz in parsed]
    return labels, np.array(coordinates)

# Compute the Coulomb matrix of a molecule
def compute_coulomb_matrix(positions, atomic_numbers=None):
    """Build the pairwise Coulomb-style descriptor matrix of a molecule.

    Without ``atomic_numbers`` the result is the plain inverse-distance
    matrix: ``M[i, j] = 1 / |R_i - R_j|`` off the diagonal, ``0`` on it.

    With ``atomic_numbers`` the result is the standard Coulomb matrix:
    ``M[i, j] = Z_i * Z_j / |R_i - R_j|`` off the diagonal and
    ``M[i, i] = 0.5 * Z_i ** 2.4`` on it.

    Args:
        positions: Array-like of atom coordinates, one row per atom. A 1-D
            input is treated as points on a line; an empty input yields a
            ``(0, 0)`` matrix.
        atomic_numbers: Optional sequence of nuclear charges, one per atom.
            ``None`` (the default) leaves the charges out.

    Returns:
        A ``(num_atoms, num_atoms)`` float array.

    Raises:
        ValueError: If two distinct atoms share the same position (the
            entry would be infinite), or if ``atomic_numbers`` does not
            have exactly one value per atom.
    """
    coords = np.asarray(positions, dtype=float)
    if coords.ndim == 1:
        coords = coords[:, np.newaxis]
    num_atoms = coords.shape[0]

    # All pairwise difference vectors at once: shape (n, n, dim).
    deltas = coords[:, np.newaxis, :] - coords[np.newaxis, :, :]
    distances = np.linalg.norm(deltas, axis=-1)

    off_diagonal = ~np.eye(num_atoms, dtype=bool)
    if np.any(distances[off_diagonal] == 0.0):
        raise ValueError(
            "Two atoms share the same position; the Coulomb matrix entry "
            "would be infinite."
        )

    # Only off-diagonal entries are divided; the diagonal keeps its zeros,
    # so no division by zero is ever evaluated.
    coulomb_matrix = np.zeros((num_atoms, num_atoms))
    np.divide(1.0, distances, out=coulomb_matrix, where=off_diagonal)

    if atomic_numbers is not None:
        charges = np.asarray(atomic_numbers, dtype=float)
        if charges.shape != (num_atoms,):
            raise ValueError(
                f"Expected {num_atoms} atomic numbers, got shape {charges.shape}."
            )
        coulomb_matrix *= np.outer(charges, charges)
        np.fill_diagonal(coulomb_matrix, 0.5 * charges ** 2.4)

    return coulomb_matrix

# Zero-pad matrices to a fixed size
def pad_coulomb_matrix(matrix, max_atoms):
    """Embed ``matrix`` in the top-left corner of a zero square matrix.

    Args:
        matrix: Square array of shape ``(size, size)``.
        max_atoms: Side length of the padded output.

    Returns:
        A ``(max_atoms, max_atoms)`` float array whose top-left
        ``size x size`` corner is ``matrix`` and whose other entries are 0.

    Raises:
        ValueError: If ``matrix`` has more rows than ``max_atoms``.
    """
    matrix = np.asarray(matrix)
    size = matrix.shape[0]
    # Fail early with an actionable message instead of NumPy's generic
    # "could not broadcast input array" error.
    if size > max_atoms:
        raise ValueError(
            f"Molecule has {size} atoms but max_atoms is {max_atoms}; "
            "increase max_atoms or filter out larger molecules."
        )
    padded = np.zeros((max_atoms, max_atoms))
    padded[:size, :size] = matrix
    return padded

# Read the molecule files and pair each one with its energy
def prepare_data_from_directory(xyz_directory, energy_file, max_atoms=None):
    """Build the feature matrix and target vector from a directory of XYZ files.

    Every ``*.xyz`` file in ``xyz_directory`` whose name looks like
    ``<prefix>_<id>.xyz`` and whose id appears in the energy table becomes
    one sample: its Coulomb matrix is zero-padded to
    ``max_atoms x max_atoms`` and flattened. Files with an unexpected name
    or without a matching energy are reported on stdout and skipped.

    Files are processed in sorted file-name order so that the row order of
    the result (and any seeded split done on it) is reproducible;
    ``os.listdir`` alone returns entries in arbitrary order.

    Args:
        xyz_directory: Directory containing the ``.xyz`` files.
        energy_file: CSV file with an ``id`` and an ``energy`` column.
        max_atoms: Padded matrix side length. Defaults to the module-level
            ``MAX_ATOMS`` when ``None``.

    Returns:
        A ``(X, y)`` tuple of arrays: ``X`` has one flattened padded matrix
        per row, ``y`` holds the matching energies.

    Raises:
        ValueError: Propagated from ``pad_coulomb_matrix`` when a molecule
            has more atoms than ``max_atoms``.
    """
    if max_atoms is None:
        max_atoms = MAX_ATOMS

    energy_df = pd.read_csv(energy_file)
    energy_dict = dict(zip(energy_df['id'], energy_df['energy']))

    X = []
    y = []

    xyz_files = sorted(f for f in os.listdir(xyz_directory) if f.endswith('.xyz'))

    for file_name in xyz_files:
        try:
            # Extract the ID from names such as id_123.xyz -> 123
            molecule_id = int(file_name.split('_')[1].split('.')[0])
        except (IndexError, ValueError):
            print(f"Nom de fichier inattendu : {file_name}")
            continue

        if molecule_id not in energy_dict:
            print(f"Aucune énergie trouvée pour {file_name} (ID {molecule_id})")
            continue

        _, positions = read_xyz(os.path.join(xyz_directory, file_name))
        coulomb_matrix = compute_coulomb_matrix(positions)
        padded_matrix = pad_coulomb_matrix(coulomb_matrix, max_atoms)
        X.append(padded_matrix.flatten())
        y.append(energy_dict[molecule_id])

    return np.array(X), np.array(y)

# === File locations ===
# NOTE(review): absolute paths tied to one machine; adjust before running elsewhere.
xyz_directory = '/home/gris/N7/Semestre_10/App_cont_phys/Projet_Molecules/data/atoms/train'
energy_file = '/home/gris/N7/Semestre_10/App_cont_phys/Projet_Molecules/data/energies/train.csv'

# Data preparation: X holds one flattened padded matrix per molecule, y the energies
X, y = prepare_data_from_directory(xyz_directory, energy_file)

# Train/test split: hold out 20% of the samples, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the regression model (random forest with 100 trees, fixed seed)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Prediction + evaluation on the held-out set
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse:.4f}')

# Plot the results: predicted vs. true energy
plt.figure(figsize=(6, 6))
plt.scatter(y_test, y_pred, alpha=0.7)
# Red dashed y = x line over the range of true energies; points on it are exact predictions
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'r--')
plt.xlabel('Énergie réelle')
plt.ylabel('Énergie prédite')
plt.title('Prédiction de l\'énergie moléculaire')
plt.grid(True)
plt.tight_layout()
plt.show()
