In [1]:
from torch_geometric.datasets import QM9


dataset = QM9(root='data/QM9')
print("Number of molecules:", len(dataset))

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/molnet_publish/qm9.zip
Extracting data/QM9/raw/qm9.zip
Downloading https://ndownloader.figshare.com/files/3195404
Processing...
100%|██████████| 133885/133885 [03:28<00:00, 642.48it/s]
Done!


Number of molecules: 130831


In [12]:
from functions_to_abstract_data import extract_qm9_data
df = extract_qm9_data(dataset)
print(df)


Processed 1000/130831 molecules...
Processed 2000/130831 molecules...
Processed 3000/130831 molecules...
Processed 4000/130831 molecules...
Processed 5000/130831 molecules...
Processed 6000/130831 molecules...
Processed 7000/130831 molecules...
Processed 8000/130831 molecules...
Processed 9000/130831 molecules...
Processed 10000/130831 molecules...
Processed 11000/130831 molecules...
Processed 12000/130831 molecules...
Processed 13000/130831 molecules...
Processed 14000/130831 molecules...
Processed 15000/130831 molecules...
Processed 16000/130831 molecules...
Processed 17000/130831 molecules...
Processed 18000/130831 molecules...
Processed 19000/130831 molecules...
Processed 20000/130831 molecules...
Processed 21000/130831 molecules...
Processed 22000/130831 molecules...
Processed 23000/130831 molecules...
Processed 24000/130831 molecules...
Processed 25000/130831 molecules...
Processed 26000/130831 molecules...
Processed 27000/130831 molecules...
Processed 28000/130831 molecules...
P

In [13]:
df.columns

Index(['index', 'smiles', 'num_atoms', 'mu', 'alpha', 'HOMO', 'LUMO', 'gap',
       'r2', 'zpve', 'U0', 'U298', 'H298', 'G298', 'Cv', 'U0_atom',
       'U298_atom', 'H298_atom', 'G298_atom', 'A', 'B', 'C', 'atoms', 'pos',
       'edge_idx', 'edge_attr'],
      dtype='object')

In [17]:
print(df["atoms"][0])
print(df["pos"][0])

tensor([6, 1, 1, 1, 1])
tensor([[-1.2700e-02,  1.0858e+00,  8.0000e-03],
        [ 2.2000e-03, -6.0000e-03,  2.0000e-03],
        [ 1.0117e+00,  1.4638e+00,  3.0000e-04],
        [-5.4080e-01,  1.4475e+00, -8.7660e-01],
        [-5.2380e-01,  1.4379e+00,  9.0640e-01]])


In [14]:
print(df.isna().sum())

index        0
smiles       0
num_atoms    0
mu           0
alpha        0
HOMO         0
LUMO         0
gap          0
r2           0
zpve         0
U0           0
U298         0
H298         0
G298         0
Cv           0
U0_atom      0
U298_atom    0
H298_atom    0
G298_atom    0
A            0
B            0
C            0
atoms        0
pos          0
edge_idx     0
edge_attr    0
dtype: int64


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# --- Step 1: Choose features and target ---
features = ['num_atoms','mu','alpha','r2','zpve','U0','U298','H298','G298','Cv']
X = df[features].values
y = df['gap'].values

# --- Step 2: Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Step 3: Scale features ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Step 4: Train Linear Regression ---
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# --- Step 5: Predict and evaluate ---
y_pred = model.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Linear Regression MAE (eV): {mae:.3f}")
print(f"Linear Regression R²: {r2:.3f}")

# --- Step 6: Inspect coefficients ---
print("\nFeature coefficients:")
for feat, coef in zip(features, model.coef_):
    print(f"{feat}: {coef:.3f}")


Linear Regression MAE (eV): 0.706
Linear Regression R²: 0.539

Feature coefficients:
num_atoms: 2.287
mu: -0.067
alpha: -1.232
r2: 0.490
zpve: -1.260
U0: 60004.976
U298: -30543.389
H298: -24689.690
G298: -4771.157
Cv: 1.561
