In [2]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (36.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.2/36.2 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.1


In [4]:
!pip -q install pyarrow duckdb

In [5]:
!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [6]:
# List the files published for the leash-BELKA competition (name, size,
# creation date) to see what is available before downloading anything.
!kaggle competitions files -c leash-BELKA

name                          size  creationDate                
---------------------  -----------  --------------------------  
test.parquet              30197000  2024-03-20 23:16:46.779000  
train.parquet           3757779095  2024-03-20 23:16:46.779000  
sample_submission.csv     23448553  2024-03-20 23:16:46.779000  
test.csv                 311601330  2024-03-20 23:16:46.779000  
train.csv              53885931568  2024-03-20 23:16:46.779000  


In [7]:
!kaggle competitions download -c leash-BELKA -p /content/data -f train.parquet --force
!kaggle competitions download -c leash-BELKA -p /content/data -f test.parquet --force
!kaggle competitions download -c leash-BELKA -p /content/data -f sample_submission.csv --force

Downloading train.parquet to /content/data
 99% 1.60G/1.62G [00:29<00:00, 28.6MB/s]
100% 1.62G/1.62G [00:29<00:00, 59.3MB/s]
Downloading test.parquet to /content/data
  0% 0.00/18.2M [00:00<?, ?B/s]
100% 18.2M/18.2M [00:00<00:00, 662MB/s]
Downloading sample_submission.csv to /content/data
  0% 0.00/3.76M [00:00<?, ?B/s]
100% 3.76M/3.76M [00:00<00:00, 856MB/s]


In [8]:
!ls -lh /content/data
!file -b /content/data/train.parquet || true
!head -c 200 /content/data/train.parquet | cat

total 1.7G
-rw-r--r-- 1 root root 3.8M Mar 21  2024 sample_submission.csv
-rw-r--r-- 1 root root  19M Mar 21  2024 test.parquet
-rw-r--r-- 1 root root 1.7G Mar 21  2024 train.parquet
Zip archive data, at least v4.5 to extract, compression method=deflate
xT��6���=_�d�!@Bd @BY�ADBو�QT0����fFQ�X˶jY;Tk-���,�ֱ֊��m�c�c�c�X�������\�{�繞��O�=��5�E������O&I댶q�r]ם�<M���

In [9]:
# Locations of the competition tables inside the /content/data download folder.
test_path = "/content/data/test.parquet"
train_path = "/content/data/train.parquet"

In [10]:
!mv /content/data/train.parquet /content/data/train.parquet.zip

!mkdir -p /content/data/extracted
!unzip -o /content/data/train.parquet.zip -d /content/data/extracted

!ls -lh /content/data/extracted
!mv -f /content/data/extracted/train.parquet /content/data/train.parquet

!file -b /content/data/train.parquet
!python - <<'PY'
import pyarrow.parquet as pq
pf = pq.ParquetFile("/content/data/train.parquet")
print("OK. row_groups =", pf.num_row_groups)


Archive:  /content/data/train.parquet.zip
  inflating: /content/data/extracted/train.parquet  
total 3.5G
-rw-r--r-- 1 root root 3.5G Mar 21  2024 train.parquet
Apache Parquet
OK. row_groups = 282


In [11]:
import duckdb

# Ask DuckDB for the distinct protein names directly from the parquet file,
# then close the connection.
con = duckdb.connect()
protein_query = f"SELECT DISTINCT protein_name FROM parquet_scan('{train_path}')"
protein_frame = con.execute(protein_query).df()
proteins = protein_frame["protein_name"].tolist()
con.close()

# Alphabetical order gives each protein a stable integer index.
ordered_proteins = sorted(proteins)
protein_to_idx = dict(zip(ordered_proteins, range(len(ordered_proteins))))
num_proteins = len(protein_to_idx)
num_proteins

3

In [12]:
import numpy as np
import pyarrow.parquet as pq
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

class LeashIterable:
    """Stream feature batches from a parquet file, one row group at a time.

    Each row is turned into a float32 vector of length
    ``nbits + len(protein_to_idx)``: a Morgan bit fingerprint of
    ``molecule_smiles`` followed by a one-hot encoding of ``protein_name``.

    Iterating yields ``(X, y)`` tuples when ``need_target`` is true and bare
    ``X`` arrays otherwise, where ``X`` has shape ``(n, nbits + n_proteins)``
    and ``y`` is a float32 vector taken from the ``binds`` column. Every batch
    holds ``batch_size`` rows except possibly the last one.

    Notes:
        * Rows whose SMILES cannot be parsed by RDKit are skipped, so the
          number of emitted rows can be smaller than the number of rows in
          the file. NOTE(review): with ``need_target=False`` this means
          outputs no longer line up 1:1 with the input rows -- confirm that
          downstream code accounts for it.
        * A protein name missing from ``protein_to_idx`` produces an all-zero
          one-hot segment rather than an error.

    Args:
        parquet_path: Path of the parquet file to read.
        protein_to_idx: Mapping from protein name to one-hot position.
        nbits: Length of the Morgan fingerprint bit vector.
        radius: Morgan fingerprint radius.
        need_target: Whether to read ``binds`` and yield it alongside ``X``.
        batch_size: Number of rows per yielded batch.
    """

    def __init__(self, parquet_path, protein_to_idx, nbits=1024, radius=2,
                 need_target=True, batch_size=4096):
        self.path = parquet_path
        self.protein_to_idx = protein_to_idx
        self.nbits = nbits
        self.radius = radius
        self.need_target = need_target
        self.batch_size = batch_size

    def _pack(self, rows, targets):
        """Stack buffered rows into the object handed to the consumer."""
        features = np.stack(rows, 0)
        if self.need_target:
            return features, np.array(targets, dtype=np.float32)
        return features

    def __iter__(self):
        # Imported here so the class works with the imports the notebook
        # already has. The generator API replaces the deprecated
        # AllChem.GetMorganFingerprintAsBitVect helper, which emits a
        # deprecation warning on every call in current RDKit releases.
        from rdkit.Chem import rdFingerprintGenerator

        pf = pq.ParquetFile(self.path)
        cols = ["molecule_smiles", "protein_name"]
        if self.need_target:
            cols.append("binds")

        # Loop-invariant setup: resolve it once, not once per row group.
        nbits = self.nbits
        p2i = self.protein_to_idx
        n_features = nbits + len(p2i)
        # One generator is reused for every molecule. Its default options are
        # meant to mirror the legacy call's defaults (no chirality, no counts).
        fp_gen = rdFingerprintGenerator.GetMorganGenerator(
            radius=self.radius, fpSize=nbits
        )

        X_batch = []
        y_batch = []

        for rg in range(pf.num_row_groups):
            tbl = pf.read_row_group(rg, columns=cols)
            smiles_arr = tbl.column("molecule_smiles").to_pylist()
            protein_arr = tbl.column("protein_name").to_pylist()
            target_arr = None
            if self.need_target:
                target_arr = tbl.column("binds").to_numpy(zero_copy_only=False)

            for i, (sm, prot) in enumerate(zip(smiles_arr, protein_arr)):
                m = Chem.MolFromSmiles(sm)
                if m is None:
                    # Unparseable SMILES: drop the row (and its target).
                    continue

                # Fill fingerprint bits and the protein one-hot directly into
                # a single float32 row instead of building and concatenating
                # temporary arrays.
                x = np.zeros(n_features, dtype=np.float32)
                x[:nbits] = fp_gen.GetFingerprintAsNumPy(m)
                idx = p2i.get(prot)
                if idx is not None:
                    x[nbits + idx] = 1.0
                X_batch.append(x)

                if self.need_target:
                    # `i` indexes the row group, so targets stay aligned even
                    # when earlier rows were skipped.
                    y_batch.append(np.float32(target_arr[i]))

                if len(X_batch) == self.batch_size:
                    # _pack copies the rows, so the buffers can be reused.
                    yield self._pack(X_batch, y_batch)
                    X_batch.clear()
                    y_batch.clear()

        # Flush the final, possibly short, batch.
        if X_batch:
            yield self._pack(X_batch, y_batch)


In [None]:
from sklearn.linear_model import SGDClassifier

# Out-of-core training: stream fingerprint batches from the training parquet
# file and update a logistic-regression SGD model one batch at a time.
bits = 1024
n_features = bits + num_proteins
classes = np.array([0,1], dtype=np.int64)
SEED = 42  # fixes the classifier's internal shuffling so runs are repeatable

train_stream = LeashIterable(train_path, protein_to_idx, nbits=bits, radius=2,
                             need_target=True, batch_size=4096)

clf = SGDClassifier(loss="log_loss", penalty="l2", alpha=1e-4,
                    learning_rate="optimal", random_state=SEED)

for Xb, yb in train_stream:
    # Guard against a mismatch between the stream's features and the model.
    assert Xb.shape[1] == n_features, (
        f"expected {n_features} features per row, got {Xb.shape[1]}"
    )
    # `classes` is required on the first partial_fit call and is accepted on
    # later calls too, so no first-iteration special case is needed.
    clf.partial_fit(Xb, yb.astype(int), classes=classes)


[1;30;43mПоказано результат, скорочений до останніх рядків (5000).[0m
