## 📦 Download the necessary feature and model files from the Zenodo repository

### `processed_data.zip` (661.7 MB) contains:
- **`singlePPD_full_bins_features.csv`**: Generated TopoDockQ features for model training, validation, and testing in the SinglePPD dataset.
- **`singlePPD_DockQ.csv`**: Contains the full set of models in the SinglePPD dataset, including all 50 predictions per complex generated by AlphaFold2-Multimer.
- **`singlePPD_filtered_DockQ.csv`**: A curated subset in which mutually highly similar models (DockQ ≥ 0.98) have been removed from the training set.  
  Only the training set is filtered; validation and test sets remain unfiltered to reflect real-world model selection scenarios.

📂 **Installation**: Extract this zip file to the `./data/processed_data/` folder.

---

### `trained_model.zip` (66.9 MB) contains:
- **`best_model.pth`**: Optimal pre-trained model for inference.

📂 **Installation**: Extract this zip file to the `./models/` folder.


In [1]:
from src.model import *
from src.train import *

In [2]:
# Inputs extracted from processed_data.zip: the TopoDockQ feature table and
# the table of DockQ scores for every model in the SinglePPD dataset.
df1_file = './data/processed_data/singlePPD_full_bins_features.csv'
df2_file = './data/processed_data/singlePPD_DockQ.csv'

# Pre-trained checkpoint extracted from trained_model.zip.
model_path = "./models/best_model.pth"

In [3]:
# Standard libraries
import argparse

# Numerical and data handling
import numpy as np
import pandas as pd

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Scikit-learn
from sklearn.preprocessing import StandardScaler

# Visualization
import matplotlib.pyplot as plt



# Hyperparameters. Of these, the cells visible in this notebook section read
# only `batch_size` (training DataLoader) and `dropout` (model constructor).
lr = 0.0005
batch_size = 512

# Short schedule; the alternative that was left commented out in this cell
# used num_epochs=1000 and patience=1000.
num_epochs, patience = 20, 20

dropout = 0.0



# ---------------------------------------------------------------------------
# Load the feature table.
# ---------------------------------------------------------------------------
df1 = pd.read_csv(df1_file)

# ---------------------------------------------------------------------------
# Find columns that hold a single distinct value across every non-test row.
# The test partition is deliberately left out of this check.
# ---------------------------------------------------------------------------
df_train = df1[df1['data_class'] != 'test']
constant_mask = df_train.nunique() == 1
identical_columns_count = sum(constant_mask)
identical_columns = df_train.columns[constant_mask]

# Drop those columns from the full table, i.e. from every partition.
df1 = df1.drop(columns=identical_columns)

# ---------------------------------------------------------------------------
# Load the DockQ table and keep only the join keys plus af_confidence.
# ---------------------------------------------------------------------------
df2 = pd.read_csv(df2_file)
df2 = df2[['pdb_id', 'af_model_id', 'af_confidence', 'pdb2sql_DockQ', 'data_class']]

# Inner join: rows that do not match on all four keys in both tables are
# discarded; matching rows gain the af_confidence column.
df1 = pd.merge(
    df1,
    df2,
    on=['pdb_id', 'af_model_id', 'pdb2sql_DockQ', 'data_class'],
    how='inner',
)

# Alias of the merged table (same object, not a copy).
df1_for_eval = df1

# ---------------------------------------------------------------------------
# Split the merged table by its data_class label.
# ---------------------------------------------------------------------------
partition = df1['data_class']
df_train = df1[partition == 'train']
df_val = df1[partition == 'valid']
df_test = df1[partition == 'test']

################################################################################################################


# Unique pdb_id values per partition. Going through a set removes duplicates;
# the resulting list order is arbitrary.
list_train = list(set(df_train['pdb_id'].to_list()))
list_val = list(set(df_val['pdb_id'].to_list()))
list_test = list(set(df_test['pdb_id'].to_list()))
# print(len(list_train),len(list_val),len(list_test))

# Leakage check: print every pdb_id that appears in more than one partition.
# Nothing is printed when the partitions are disjoint. Membership is tested
# against sets so each lookup is constant-time instead of a linear list scan.
_ids_outside_train = set(list_val) | set(list_test)
for i in list_train:
    if i in _ids_outside_train:
        print(i)

_ids_outside_test = set(list_val) | set(list_train)
for i in list_test:
    if i in _ids_outside_test:
        print(i)

################################################################################################################
import numpy as np
# Filtration values 2.0, 2.25, ..., 10.0 rendered as they appear in the
# column names: whole numbers keep one decimal ("2.0"); everything else is
# printed with two decimals and then stripped of trailing zeros ("2.5", "2.25").
filtration_values = [
    f"{value:.1f}" if value.is_integer() else f"{value:.2f}".rstrip("0")
    for value in np.arange(2.0, 10.25, 0.25)
]

# Persistent feature names: 72 per filtration value, numbered 01..72.
# Adjust 72 if the actual per-filtration count differs.
persistent_features = [
    f"persistent_{filtration}_{index:02d}"
    for filtration in filtration_values
    for index in range(1, 73)
]

# Static feature names, numbered 01..378 (zero-padded to at least two digits).
# Adjust 378 if the actual count differs.
static_features = [f"static_{index:02d}" for index in range(1, 379)]

# Persistent names first, then static names.
combined_feature_name_list = persistent_features + static_features


# Target column first, followed by every generated feature name.
feature_name_list = ['pdb2sql_DockQ', *combined_feature_name_list]

# Keep only the names that are actually present as columns of df_train;
# generated names with no matching column are silently skipped.
valid_columns = [name for name in feature_name_list if name in df_train.columns]

# Restrict all three partitions to the same columns, in the same order.
df_train, df_val, df_test = (
    frame[valid_columns] for frame in (df_train, df_val, df_test)
)

# print(df_train.shape,df_val.shape,df_test.shape)


####


def get_features_and_target(df):
    """Split a table into a feature matrix and a target vector.

    Args:
        df: DataFrame containing a 'pdb2sql_DockQ' column; every other
            column is treated as a feature.

    Returns:
        Tuple ``(X, y)`` where ``X`` is the array of all columns except
        'pdb2sql_DockQ' and ``y`` is the array of 'pdb2sql_DockQ' values.
    """
    target_column = 'pdb2sql_DockQ'
    feature_frame = df.drop(columns=[target_column])
    return feature_frame.values, df[target_column].values




# Standardize input features before conducting inference

In [4]:
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

# Extract features and targets
# Split each partition into a feature matrix X and a target vector y.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    get_features_and_target(frame) for frame in (df_train, df_val, df_test)
)

# Fit the scaler on the training features only, then apply the same fitted
# statistics to the validation and test features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)


def _as_float_dataset(features, targets):
    """Wrap a feature array and a target array as a float32 TensorDataset."""
    return TensorDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(targets, dtype=torch.float32),
    )


train_data = _as_float_dataset(X_train, y_train)
val_data = _as_float_dataset(X_val, y_val)
test_data = _as_float_dataset(X_test, y_test)

# Training batches are shuffled; validation and test keep their row order and
# use a very large batch size (1,000,000 rows per batch).
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=1_000_000)
test_loader = DataLoader(test_data, batch_size=1_000_000)

In [5]:

# Layer sizes handed to the TopoDockQ constructor in the inference cell.
# NOTE(review): input_dim is hard-coded; presumably it has to equal the number
# of feature columns fed to the model (X_train.shape[1]) and the input size
# stored in the checkpoint. Confirm before changing the feature set.
input_dim = 2646
neurons1 = 2048
neurons2 = 2048
neurons3 = 2048
neurons4 = 2048

# `dropout` is taken as-is from the hyperparameter cell; it is not redefined here.

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Inference on the pre-trained model:
## The example model used in this step was trained and saved in tutorial_train.ipynb

In [6]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Convert X_val and y_val to tensors
# Wrap the standardized validation split in a DataLoader. The batch size is
# 1,000,000 rows, so presumably the whole split arrives as a single batch.
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=1000000)

# Rebuild the architecture and restore the saved weights.
# map_location remaps the checkpoint's tensors onto `device`, so a checkpoint
# written from GPU tensors still loads when `device` has fallen back to CPU.
best_model = TopoDockQ(input_dim, neurons1, neurons2, neurons3, neurons4, dropout).to(device)
best_model.load_state_dict(torch.load(model_path, map_location=device))
best_model.eval()

# Inference: collect per-batch outputs and targets on the CPU, with gradient
# tracking disabled.
predictions = []
true_values = []
with torch.no_grad():
    for inputs, targets in val_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        outputs = best_model(inputs)
        predictions.append(outputs.cpu())
        true_values.append(targets.cpu())

# Concatenate the per-batch tensors into single NumPy arrays.
predictions = torch.cat(predictions).numpy()
true_values = torch.cat(true_values).numpy()

# Save true vs. predicted DockQ, one row per validation sample; flatten()
# collapses each array to one dimension regardless of its original shape.
val_results_df = pd.DataFrame({
    'True_DockQ': true_values.flatten(),
    'Predicted_DockQ(p-DockQ)': predictions.flatten()
})
val_results_df.to_csv("val_inference_results.csv", index=False)

print("Validation inference completed. Results saved to 'val_inference_results.csv'.")


Validation inference completed. Results saved to 'val_inference_results.csv'.
