In [2]:
from library.functions_to_abstract_data import extract_qm9_data
from torch_geometric.datasets import QM9

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the QM9 dataset or install if not already installed
dataset = QM9(root='../data/QM9')
print("Number of molecules:", len(dataset))

Number of molecules: 130831


In [None]:
# Convert to panda dataframe
df = extract_qm9_data(dataset)
df

   index             smiles  num_atoms   mu  alpha       HOMO      LUMO  \
0      0  [H]C([H])([H])[H]          5  0.0  13.21 -10.549854  3.186453   

         gap         r2      zpve  ...  U298_atom  H298_atom  G298_atom  \
0  13.736308  35.364101  1.217682  ... -17.286823 -17.389656 -16.151918   

            A           B           C  \
0  157.711807  157.709976  157.706985   

                                               atoms  \
0  [tensor(6), tensor(1), tensor(1), tensor(1), t...   

                                                 pos  \
0  [[tensor(-0.0127), tensor(1.0858), tensor(0.00...   

                                            edge_idx  \
0  [[tensor(0), tensor(0), tensor(0), tensor(0), ...   

                                           edge_attr  
0  [[tensor(1.), tensor(0.), tensor(0.), tensor(0...  

[1 rows x 26 columns]


In [4]:
# List of all attributes of a molecule present in the QM9 database
df.columns

Index(['index', 'smiles', 'num_atoms', 'mu', 'alpha', 'HOMO', 'LUMO', 'gap',
       'r2', 'zpve', 'U0', 'U298', 'H298', 'G298', 'Cv', 'U0_atom',
       'U298_atom', 'H298_atom', 'G298_atom', 'A', 'B', 'C', 'atoms', 'pos',
       'edge_idx', 'edge_attr'],
      dtype='object')

In [5]:
# Check if any NaN values are present
print(df.isna().sum())

index        0
smiles       0
num_atoms    0
mu           0
alpha        0
HOMO         0
LUMO         0
gap          0
r2           0
zpve         0
U0           0
U298         0
H298         0
G298         0
Cv           0
U0_atom      0
U298_atom    0
H298_atom    0
G298_atom    0
A            0
B            0
C            0
atoms        0
pos          0
edge_idx     0
edge_attr    0
dtype: int64


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# --- Step 1: Choose features and target ---
features = ['num_atoms','mu','alpha','r2','zpve','U0','U298','H298','G298','Cv']
X = df[features].values
y = df['gap'].values

# --- Step 2: Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Step 3: Scale features ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Step 4: Train Linear Regression ---
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# --- Step 5: Predict and evaluate ---
y_pred = model.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Linear Regression MAE (eV): {mae:.3f}")
print(f"Linear Regression R²: {r2:.3f}")

# --- Step 6: Inspect coefficients ---
print("\nFeature coefficients:")
for feat, coef in zip(features, model.coef_):
    print(f"{feat}: {coef:.3f}")


Linear Regression MAE (eV): 0.706
Linear Regression R²: 0.539

Feature coefficients:
num_atoms: 2.287
mu: -0.067
alpha: -1.232
r2: 0.490
zpve: -1.260
U0: 60004.976
U298: -30543.389
H298: -24689.690
G298: -4771.157
Cv: 1.561
