In [1]:
import os
%pwd

'/mnt/cb03386d-9344-47b1-82f9-868fbb64b4ae/python_projects/HIV_inhibitors_classification_and_generation/research'

In [2]:
os.chdir("../")
%pwd

'/mnt/cb03386d-9344-47b1-82f9-868fbb64b4ae/python_projects/HIV_inhibitors_classification_and_generation'

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import random
import numpy as np
from rdkit import Chem
import deepchem as dc

In [4]:
# Load the raw HIV table written under the data-ingestion artifacts folder.
hiv = pd.read_csv(filepath_or_buffer='artifacts/data_ingestion/HIV.csv')

In [5]:
# Count how many rows carry each HIV_active label.
hiv.HIV_active.value_counts()

HIV_active
0    39684
1     1443
Name: count, dtype: int64

In [None]:
def t_t_split_balance(df, split_size, random_state=42):
    """Split ``df`` into an oversampled, class-balanced train set and a balanced test set.

    Rows are partitioned on ``HIV_active`` (``== 1`` vs ``== 0``; rows with any other
    value are dropped). The less frequent class sends ``split_size`` of its rows to the
    test set and the more frequent class sends about the same number of rows, so the
    test set is roughly 50/50. In the train set, every remaining minority row is kept
    and extra minority rows are drawn with replacement until both classes have the
    same number of rows.

    Args:
        df: DataFrame with an ``HIV_active`` column holding 0/1 labels.
        split_size: Fraction of the minority class moved to the test set.
        random_state: Integer seed (or ``None`` for a non-reproducible run) used for
            both splits, the oversampling draw and the final shuffles.

    Returns:
        tuple: ``(train_df, test_df)``, both shuffled, with ``df``'s columns and
        dtypes. Index labels are row positions in the pre-shuffle arrays, not the
        index labels of ``df``.

    Raises:
        ValueError: If either class has no rows.
    """
    # Separate positive and negative cases
    positives = df[df.HIV_active == 1].to_numpy()
    negatives = df[df.HIV_active == 0].to_numpy()
    if len(positives) == 0 or len(negatives) == 0:
        raise ValueError("t_t_split_balance needs at least one row of each HIV_active class")

    # The smaller class drives the test size and is the one that gets oversampled
    if len(positives) >= len(negatives):
        big, small = positives, negatives
    else:
        big, small = negatives, positives

    # Take the same number of rows from each class for the test set
    small_train, small_test = train_test_split(
        small, test_size=split_size, random_state=random_state
    )
    big_train, big_test = train_test_split(
        big, test_size=split_size * len(small) / len(big), random_state=random_state
    )

    test = np.concatenate([small_test, big_test])

    # Oversample the minority class. The original minority rows are kept and only the
    # shortfall is drawn with replacement; previously the originals were left out, so
    # the minority class ended up len(small_train) rows short of the majority class.
    # Indexing with an (possibly empty) integer array keeps the 2-D shape, so the
    # concatenation also works when no extra rows are needed.
    n_extra = max(len(big_train) - len(small_train), 0)
    rng = np.random.default_rng(random_state)
    extra_rows = small_train[rng.integers(0, len(small_train), size=n_extra)]
    train = np.concatenate([big_train, small_train, extra_rows])

    # Convert back to DataFrame, restoring the dtypes lost in the to_numpy() round trip
    dtypes = df.dtypes.to_dict()
    train_df = pd.DataFrame(train, columns=df.columns).astype(dtypes)
    test_df = pd.DataFrame(test, columns=df.columns).astype(dtypes)

    return (
        train_df.sample(frac=1, random_state=random_state),
        test_df.sample(frac=1, random_state=random_state),
    )

In [None]:
# Produce the balanced train/test frames, sending 20% of the minority class to test.
train_df, test_df = t_t_split_balance(df=hiv, split_size=0.2)

In [None]:
# Preview the first five rows of the train frame.
train_df.head(n=5)

In [None]:
# Flatten the frame's rows into a plain list of lists.
# NOTE(review): `p_df` is not assigned in any cell visible in this notebook, so this
# cell would raise NameError on a fresh kernel — presumably it was the
# HIV_active == 1 subset of `hiv`; confirm and define it explicitly.
p_list = p_df.values.tolist()

In [None]:
# Draw, with replacement, a sample one fifth the length of p_list and display it.
n_draws = int(0.2 * len(p_list))
test = random.choices(p_list, k=n_draws)
test

In [None]:
# Wrap the sampled rows in a DataFrame (the name `test` is rebound from list to frame).
test = pd.DataFrame(data=test)

In [None]:
# For each of the first five rows, print True when some row of `hiv` equals it
# element-wise in every column.
for row_label, record in hiv.head().iterrows():
    matches_every_column = hiv.eq(record).all(axis=1)
    if matches_every_column.any():
        print(True)

In [None]:
# Schema check: print True when the test frame has exactly the expected columns, in order.
# The first entry used to read 'smils', which can never match the `smiles` column
# the rest of the notebook works with.
if list(test_df.columns) == ['smiles', 'activity', 'HIV_active']:
    print(True)

In [None]:
# Blank out the label of the row whose index label is 0 (presumably to exercise the
# null check in the next cell). A single .loc call writes into test_df itself; the
# chained form test_df['HIV_active'][0] = ... assigns through an intermediate Series
# and is not guaranteed to reach the frame (SettingWithCopy / Copy-on-Write).
test_df.loc[0, 'HIV_active'] = None

In [None]:
# True when at least one HIV_active value in the test frame is null.
bool(test_df['HIV_active'].isnull().any())

In [12]:
# Reload the transformed training split from disk.
train_pth = 'artifacts/data_transformation/train.csv'
train = pd.read_csv(train_pth)

# Only the last expression of a cell is rendered, so the class counts and the row
# count are displayed explicitly instead of being silently discarded.
display(train.HIV_active.value_counts())
display(len(train))
train.tail()

Unnamed: 0,name,smiles,activity,HIV_active
77631,train_77631,Cn1c2ccccc2c2ccc3c(c21)C(=O)C=CC3=O,CI,0
77632,train_77632,Cn1cc(NC(=O)Nc2cc(C(=O)Nc3cc(C(=O)Nc4cc(C(=O)N...,CA,1
77633,train_77633,O=C(Nc1ccc(CCC2=NCCCN2)cc1)c1ccc(C(=O)Nc2ccc(C...,CM,1
77634,train_77634,CN1C(=O)C(=NN(c2ccccc2)c2ccccc2)c2ccccc21,CI,0
77635,train_77635,CCOC(=O)C(Cc1c(C=O)[nH]c2ccccc12)(NC(=O)c1cccc...,CI,0


In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation, stratified on the label.
# NOTE(review): the validation fold shown below holds 7649 positive rows while the raw
# table reports 1443 actives in total, so train.csv appears to contain repeated minority
# rows. If so, a row-level split puts copies of the same molecule in both folds and
# inflates validation scores — confirm, and consider splitting before oversampling.
train_df, val_df = train_test_split(
    train,
    test_size=0.2,
    stratify=train.HIV_active,
    random_state=42,
)

In [22]:
# Class distribution of the validation fold.
val_df['HIV_active'].value_counts()

HIV_active
0    7879
1    7649
Name: count, dtype: int64

In [28]:
# Inspect the Python type of the first index label after converting the index to a list.
index_labels = val_df.index.tolist()
type(index_labels[0])

int

In [None]:
# Build one identifier per row from the index labels and attach it as the first column.
name = [f'test_{i+1}' for i in train.index]
# DataFrame.insert raises ValueError when the column already exists, which happens on
# a second run of this cell or when the loaded CSV already carries a `name` column.
if 'name' not in train.columns:
    train.insert(0, 'name', name)
train.head()

In [None]:
# Parse the SMILES string stored under index label 1 into an RDKit molecule object.
example_smiles = train.smiles[1]
mol_obj = Chem.MolFromSmiles(example_smiles)

In [None]:
# Print each atom's element symbol next to its chiral tag.
for atom in mol_obj.GetAtoms():
    symbol, chiral_tag = atom.GetSymbol(), atom.GetChiralTag()
    print(symbol, chiral_tag)

In [None]:
# For every bond print, one value per line: the atomic number of its begin atom,
# its conjugation flag, the atomic number of its end atom, then a separator.
bonds = mol_obj.GetBonds()

for bond in bonds:
    begin_atomic_num = bond.GetBeginAtom().GetAtomicNum()
    is_conjugated = bond.GetIsConjugated()
    end_atomic_num = bond.GetEndAtom().GetAtomicNum()
    print(begin_atomic_num, is_conjugated, end_atomic_num, '___', sep='\n')

In [None]:
# Featurize the example molecule with DeepChem's MolGraphConvFeaturizer, asking for
# edge features as well (use_edges=True).
featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
# NOTE(review): `_featurize` is underscore-prefixed, i.e. presumably a private
# per-molecule hook; the public `featurize` entry point may be the intended API — confirm.
f = featurizer._featurize(mol_obj)
# data = f.to_pyg_graph()
# Show the node_features attribute of the featurized result.
f.node_features

In [None]:
# Read the transformed train and test splits into a dict keyed by split name.
dataset_paths = {
    "train": 'artifacts/data_transformation/train.csv',
    "test": 'artifacts/data_transformation/test.csv',
}
datasets = {split: pd.read_csv(path) for split, path in dataset_paths.items()}

In [None]:
# Display the dict of loaded frames (keys "train" and "test").
datasets

In [4]:
from hivclass.utils.molecule_dataset import MoleculeDataset

In [5]:
# Instantiate the project's MoleculeDataset from four positional arguments.
# NOTE(review): presumably (raw-data directory, processed-data directory, raw file name,
# list of split names and split CSV names) — confirm against the MoleculeDataset signature.
train_dataset = MoleculeDataset(
    'artifacts/data_ingestion',
    'artifacts/data_transformation',
    'HIV.csv',
    ['test', 'train', 'test.csv', 'train.csv']
)

In [6]:
# Fetch the item at index 0 through the dataset's get() accessor.
sample = train_dataset.get(0)

In [10]:
# Ask the dataset for its item count via its len() method.
train_dataset.len()

578

In [None]:
# File name of one stored item (the ".pt" suffix suggests a torch-serialized file — TODO confirm).
mol = 'train_0.pt'

# NOTE(review): this binds the built-in `int` type itself, not an integer value; the
# cell looks unfinished — presumably the index was meant to be parsed out of `mol`.
idx = int