In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import joblib
import os
import polars as pl
from concurrent.futures import as_completed, ProcessPoolExecutor


In [3]:
!pip install schrodinger openbabel rdkit joblib

Collecting schrodinger
  Downloading schrodinger-0.1.tar.gz (3.8 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting openbabel
  Downloading openbabel-3.1.1.1.tar.gz (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.8/82.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting botocore<1.30.0,>=1.29.100 (from boto3>=1.4.4->schrodinger)
  Downloading botocore-1.29.165-py3-none-any.whl.metadata (5.9 kB)
Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading botocore-1.29.165-py3-none-any.whl (11.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11

In [None]:
# Conda-based install of joblib, openbabel, rdkit and pymol-bundle, searching the
# conda-forge and schrodinger channels; -y skips the confirmation prompt.
# NOTE(review): overlaps with the pip install cell above - presumably only one of
# the two is needed; confirm before running both.
!conda install -c conda-forge -c schrodinger joblib openbabel rdkit pymol-bundle -y

In [None]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from catboost import CatBoostClassifier
from rdkit.Chem import rdMolDescriptors
import joblib

In [None]:
# Location of the competition test set (a CSV file).
test_file = '/kaggle/input/leash-BELKA/test.csv'

In [None]:
# Deserialize the object stored in score_model_list.joblib; it is indexed in a
# later cell to obtain the model used for inference.
# NOTE(security): joblib.load unpickles its input and can therefore execute
# arbitrary code - only load artifacts that come from a trusted source.
with open('/kaggle/input/list-of-ojects/score_model_list.joblib', 'rb') as f:
    models = joblib.load(f)

In [None]:
# Select the model used for inference from the loaded object.
# NOTE(review): the layout of `models` is not visible here - presumably a sequence
# of (score, model) pairs, which would make [1][1] the model of the second entry;
# confirm against the notebook that produced the file.
best_model = models[1][1]

In [None]:
def add_attributes(smiles, brd4, hsa, seh, ii):
    """Build the feature row for one (molecule, protein) pair.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.
    brd4, hsa, seh : int
        Protein indicator flags; returned unchanged under the keys
        'BRD4', 'HSA' and 'sEH'.
    ii : hashable
        Row identifier; returned unchanged under the key 'id'.

    Returns
    -------
    dict
        Keys 'id', 'atoms', 'bonds', 'positive_charges', 'negative_charges',
        'molecular_weight' (rounded to 2 decimals), 'HBA', 'HBD', 'TPSA',
        'Rotatable Bonds', 'BRD4', 'HSA' and 'sEH'.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail with a
        # message that identifies the row instead of an AttributeError below.
        raise ValueError(f"RDKit could not parse SMILES for id {ii!r}: {smiles!r}")

    # Size of the molecular graph (atoms and bonds present in the parsed mol).
    atom_count = mol.GetNumAtoms()
    bond_count = mol.GetNumBonds()

    # Exact molecular weight, rounded to two decimals.
    molecular_weight = round(Descriptors.ExactMolWt(mol), 2)

    hb_acceptor = rdMolDescriptors.CalcNumHBA(mol)
    hb_donor = rdMolDescriptors.CalcNumHBD(mol)
    tpsa = rdMolDescriptors.CalcTPSA(mol)
    rotatable_bond = rdMolDescriptors.CalcNumRotatableBonds(mol)

    # Count charged atoms from their formal charge. Atoms parsed from SMILES
    # carry no '_Charge' property (GetProp raises KeyError, and returns strings
    # when a property does exist), so a property lookup reports zero charges
    # for every molecule.
    # NOTE(review): if the model was fitted on always-zero charge columns, a
    # tree model should not have split on them - confirm these two features
    # carry no weight, or refit with this definition.
    formal_charges = [atom.GetFormalCharge() for atom in mol.GetAtoms()]
    pos_charges = sum(1 for charge in formal_charges if charge > 0)
    neg_charges = sum(1 for charge in formal_charges if charge < 0)

    return {
        'id': ii,
        'atoms': atom_count,
        'bonds': bond_count,
        'positive_charges': pos_charges,
        'negative_charges': neg_charges,
        'molecular_weight': molecular_weight,
        'HBA': hb_acceptor,
        'HBD': hb_donor,
        'TPSA': tpsa,
        'Rotatable Bonds': rotatable_bond,
        'BRD4': brd4,
        'HSA': hsa,
        'sEH': seh,
    }

In [None]:
test_file = '/kaggle/input/leash-BELKA/test.csv'
output_file = 'submission.csv'  # Path and filename of the submission written below

PROTEINS = ('HSA', 'BRD4', 'sEH')  # values of 'protein_name' turned into indicator columns
ROWS_PER_CHUNK = 100000            # rows of the test CSV processed per iteration
N_WORKERS = 8                      # worker processes computing molecule features
TASKS_PER_BATCH = 1000             # rows handed to a worker per round-trip

# The first chunk (re)creates the output file with a header and later chunks
# append to it, so re-running the cell never stacks rows onto a stale file.
first_chunk = True

# One pool serves every chunk; workers are not re-created per chunk.
with ProcessPoolExecutor(max_workers=N_WORKERS) as pool:
    # Stream the test CSV in chunks to bound memory use.
    for df_test in pd.read_csv(test_file, chunksize=ROWS_PER_CHUNK):
        # One indicator column per protein: 1 where the row targets it, else 0.
        for protein in PROTEINS:
            df_test[protein] = (df_test['protein_name'] == protein).astype(int)

        # pool.map returns results in input order, so the submission rows
        # follow the order of the test file on every run.
        feats = list(pool.map(
            add_attributes,
            df_test['molecule_smiles'],
            df_test['BRD4'],
            df_test['HSA'],
            df_test['sEH'],
            df_test['id'],
            chunksize=TASKS_PER_BATCH,
        ))
        df_X = pd.DataFrame(feats)

        # Probability of the positive class, using the columns the model expects.
        probabilities = best_model.predict_proba(df_X[best_model.feature_names_])[:, 1]

        # Two-column submission frame: row id and predicted binding probability.
        output_df = pd.DataFrame({'id': df_X['id'], 'binds': probabilities})

        output_df.to_csv(
            output_file,
            index=False,
            mode='w' if first_chunk else 'a',
            header=first_chunk,
        )
        first_chunk = False