In [2]:
import os
%pwd

'/mnt/cb03386d-9344-47b1-82f9-868fbb64b4ae/python_projects/HIV_inhibitors_classification_and_generation/research'

In [3]:
os.chdir("../")
%pwd

'/mnt/cb03386d-9344-47b1-82f9-868fbb64b4ae/python_projects/HIV_inhibitors_classification_and_generation'

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
import random
import numpy as np

In [5]:
hiv = pd.read_csv('artifacts/data_ingestion/HIV.csv')

In [6]:
hiv.HIV_active.value_counts()

HIV_active
0    39684
1     1443
Name: count, dtype: int64

In [7]:
def t_t_split_balance(df, split_size):
    # Separate positive and negative cases
    p_val = df[df.HIV_active == 1].to_numpy()
    n_val = df[df.HIV_active == 0].to_numpy()

    # Ensure class balance by selecting the smaller group as the target
    if len(p_val) >= len(n_val):
        big, small = p_val, n_val
    else:
        big, small = n_val, p_val

    # Stratified test split
    small_train, small_test = train_test_split(small, test_size=split_size, random_state=42)
    big_train, big_test = train_test_split(big, test_size=(split_size * len(small) / len(big)), random_state=42)

    test = np.concatenate([small_test, big_test])
    
    # Ensure the train set remains balanced by oversampling the smaller class
    train = np.concatenate([big_train, random.choices(small_train, k=len(big_train) - len(small_train))])

    # Convert back to DataFrame
    train_df = pd.DataFrame(train, columns=df.columns)
    test_df = pd.DataFrame(test, columns=df.columns)

    return train_df.sample(frac=1, random_state=42), test_df.sample(frac=1, random_state=42)

In [8]:
train_df, test_df = t_t_split_balance(hiv, 0.2)

In [12]:
train_df['HIV_active'].value_counts()

HIV_active
0    39395
1    38241
Name: count, dtype: int64

In [15]:
p_list = p_df.values.tolist()

In [74]:
test = random.choices(p_list, k=int(0.2*len(p_list)))
test

[['CC(C)OC1N(C2CC(N=[N+]=[N-])C(CO)O2)C(=O)NC(=O)C1(C)Br', 'CA', 1],
 ['COc1cc2ccc1OCc1cccc(n1)COc1ccc(cc1OC)C=NCCNCCN=C2', 'CM', 1],
 ['CCOC(=O)CCC(NC(=O)c1ccc(Nc2nc3ccccc3nc2C(=O)OCC)cc1)C(=O)OCC', 'CM', 1],
 ['COC(=O)c1cc(C(=CCC2OCCO2)c2cc(Cl)c(OC)c(C(=O)OC)c2)cc(Cl)c1OC', 'CM', 1],
 ['COc1ccc(C2OC(N)=C(C#N)C3=C2C(C)CCC3)cc1OC', 'CM', 1],
 ['CC(C)(C)c1cc(C(=O)C=Cc2ccc(C(=O)O)cc2)cc(C(C)(C)C)c1', 'CM', 1],
 ['C[n+]1c(-c2ccc(C=NNC(=O)NN=Cc3ccc(-c4cn5ccccc5[n+]4C)cc3)cc2)cn2ccccc21.Cc1ccc(S(=O)(=O)O)cc1',
  'CA',
  1],
 ['Cc1cc(S(=O)(=O)Nc2nc3cccnc3[nH]2)c(S)cc1Cl', 'CM', 1],
 ['Cc1cc(NC(=O)c2ccc(N=Nc3c(S(=O)(=O)O)cc4cc(NC(=O)c5ccc(N)cc5)ccc4c3O)cc2)ccc1N=Nc1ccc2cc(S(=O)(=O)O)cc(S(=O)(=O)O)c2c1',
  'CA',
  1],
 ['Cl.Nc1cc([As]=O)ccc1O', 'CA', 1],
 ['C[n+]1c(-c2ccc(C=NNC(=O)NN=Cc3ccc(-c4cn5ccccc5[n+]4C)cc3)cc2)cn2ccccc21.Cc1ccc(S(=O)(=O)O)cc1',
  'CA',
  1],
 ['O=C(Nc1ccc2c(O)c(N=Nc3ccc4c(O)cc(S(=O)(=O)O)cc4c3)c(S(=O)(=O)O)cc2c1)c1ccccc1',
  'CM',
  1],
 ['O=C1CN=C(c2ccc[nH]2)c2cc(Cl)cc

In [24]:
test = pd.DataFrame(test)

In [65]:
for row in hiv.head().iterrows():
    if hiv.eq(row[1]).all(axis=1).any():
        print(True)

True
True
True
True
True


In [14]:
if list(test_df.columns) == ['smils', 'activity', 'HIV_active']:
    print(True)

In [25]:
test_df['HIV_active'][0] = None

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  test_df['HIV_active'][0] = None


In [27]:
bool(test_df['HIV_active'].isnull().sum())

True

In [26]:
test_df

Unnamed: 0,smiles,activity,HIV_active
257,OCC1(CO)CSSC1,CM,
574,CC=C(C)C(=O)OCC12C(OC(C)=O)CC(C)C(C)(CCC3=CC(=...,CI,0
485,COC(=O)CCCc1ccc2c(c1)CC1(Cc3ccccc3C1)C2,CI,0
101,Cn1cc(NC(=O)c2cc([N+](=O)[O-])cn2C)cc1C(=O)Nc1...,CM,1
566,COC(C(=O)NCCCCNC(=O)C(OC)c1ccccc1)c1ccccc1,CI,0
...,...,...,...
71,CCOC(=O)C(=Cc1c(C)[nH]c2ccccc12)P(=O)(OCC)OCC,CM,1
106,O=S(=O)(c1ccccc1)c1cccc2c1NCCC2,CM,1
270,COC(=O)c1cc(CC2Cc3cc4c(cc3C2=O)CCC4)cc2c1CCC2,CM,1
435,CC(C)(O)C=CP(C)(=O)c1ccccc1,CI,0
