In [1]:
!pip install duckdb
!pip install rdkit

Collecting duckdb
  Downloading duckdb-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (762 bytes)
Downloading duckdb-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.5/18.5 MB[0m [31m72.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: duckdb
Successfully installed duckdb-1.0.0
Collecting rdkit
  Downloading rdkit-2024.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2024.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.1/35.1 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.1


In [2]:
!pip install torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.5.3


In [3]:
# Import necessary libraries
from sklearn.metrics import average_precision_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
import duckdb


In [4]:
# File paths
train_parquet_path = '/kaggle/input/leash-BELKA/train.parquet'
test_parquet_path = '/kaggle/input/leash-BELKA/test.parquet'
train_csv_path = '/kaggle/input/leash-BELKA/train.csv'
test_csv_path = '/kaggle/input/leash-BELKA/test.csv'

# Connect to DuckDB
con = duckdb.connect()

# Sample 30,000 rows each for binds = 0 and binds = 1 from train.parquet
query_train = f"""
    (SELECT *
    FROM parquet_scan('{train_parquet_path}')
    WHERE binds = 0
    ORDER BY random()
    LIMIT 30000)
    UNION ALL
    (SELECT *
    FROM parquet_scan('{train_parquet_path}')
    WHERE binds = 1
    ORDER BY random()
    LIMIT 30000)
"""

# Load data into DataFrame
train_df = con.from_query(query_train).df()

# Load test data without sampling
query_test = f"""
    SELECT *
    FROM parquet_scan('{test_parquet_path}')
"""

test_df = con.from_query(query_test).df()

# Close DuckDB connection
con.close()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [5]:
# Perform feature engineering
def calculate_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None, None
    num_atoms = mol.GetNumAtoms()
    mol_weight = Descriptors.MolWt(mol)
    return num_atoms, mol_weight

# Calculate descriptors for train and test data
train_df['num_atoms'], train_df['mol_weight'] = zip(*train_df['molecule_smiles'].apply(calculate_descriptors))
test_df['num_atoms'], test_df['mol_weight'] = zip(*test_df['molecule_smiles'].apply(calculate_descriptors))


In [6]:
# Define features and target
features = ['num_atoms', 'mol_weight']
target = 'binds'

X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]


In [7]:
# Define the GNN model
class GNNModel(nn.Module):
    def __init__(self, num_features, hidden_dim, num_classes):
        super(GNNModel, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, num_classes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        return torch.sigmoid(x)

In [8]:
# Initialize a basic GNN model
num_features = len(features)
hidden_dim = 64
num_classes = 1  # Binary classification

model = GNNModel(num_features, hidden_dim, num_classes)

# Dummy edge index (assumed fully connected graph for simplicity)
edge_index = torch.tensor([[0, 1, 2], [1, 0, 2]], dtype=torch.long)

# Define loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert data to tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values.reshape(-1, 1), dtype=torch.float32)


In [9]:
# Train the model
model.train()
for epoch in range(10):  # Adjust epochs as needed
    optimizer.zero_grad()
    output = model(X_train_tensor, edge_index)
    loss = criterion(output, y_train_tensor)
    loss.backward()
    optimizer.step()

# Set model to evaluation mode
model.eval()

GNNModel(
  (conv1): GCNConv(2, 64)
  (conv2): GCNConv(64, 1)
)

In [10]:
# Convert test data to tensor
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)


In [11]:
# Make predictions on the test set
with torch.no_grad():
    y_pred = model(X_test_tensor, edge_index).numpy().flatten()  # Flatten y_pred



In [12]:
# Create submission DataFrame
submission_df = pd.DataFrame({'id': test_df['id'], 'binds': y_pred})

# Save submission file
submission_df.to_csv('submission_gnn.csv', index=False)
submission_df.head()

Unnamed: 0,id,binds
0,295246830,1.0
1,295246831,1.0
2,295246832,1.0
3,295246833,1.0
4,295246834,1.0
