In [1]:
import pandas as pd

In [2]:
# Location of the raw permeability export, relative to the notebook's working directory.
data_path = '../data/permeability/permeability_raw_mod.csv'

# Read the whole CSV into memory using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
# Columns discarded before any further processing.
# NOTE(review): presumably database identifiers with no modelling value — confirm.
columns_to_remove = ['PUBCHEM_SID', 'PUBCHEM_CID', 'ID']

# Rebind df to a frame without those columns (raises KeyError if any is absent,
# so this cell cannot be re-run on an already-trimmed frame).
df = df.drop(labels=columns_to_remove, axis='columns')

In [4]:
# Show the two SMILES columns side by side for the first ten rows.
print("Sample of rows showing SMILES_ISO and SMILES:")
smiles_preview = df.loc[:, ['SMILES_ISO', 'SMILES']].head(10)
print(smiles_preview)

Sample of rows showing SMILES_ISO and SMILES:
                                          SMILES_ISO SMILES
0                         CCCCOC1=CC=C(C=C1)CC(=O)NO    NaN
1            COC1=C(C=C(C=C1)CCN2C(=CC(=O)NC2=S)N)OC    NaN
2  COC1=C(C=C(C=C1)Cl)C(=O)NC2=CC=C(C=C2)NC(=O)C3...    NaN
3      CC(C)(C)N1C2=NC=NC(=C2C(=N1)C3=CC=C(C=C3)Cl)N    NaN
4  CN1C2=CC=CC=C2C(=O)C3=C1N=C(N(C3=O)C4=CC=CC=C4...    NaN
5  CC1=C(C=C(C=C1)NS(=O)(=O)C2=CC=CC(=C2)C(=O)O)S...    NaN
6  COC1=CC(=CC(=C1O)OC)C2=NC(=C(N2)C3=CC=CS3)C4=C...    NaN
7                  CN(C)C1=C2C(=C(SC2=NC=C1)C(=O)N)N    NaN
8  CC(C)(C)C1CCC2=C(C1)SC(=C2C(=O)N)NC(=O)C3=CC=N...    NaN
9    C1=CC=C2C(=C1)C(=N)N(N=N2)C3=CC=C(C=C3)C(F)(F)F    NaN


In [5]:
# Prefer SMILES_ISO; fall back to SMILES only on rows where SMILES_ISO is missing.
iso_smiles = df['SMILES_ISO']
df['SMILES_combined'] = iso_smiles.where(iso_smiles.notna(), df['SMILES'])

In [6]:
# Count missing entries in the merged column and in each of its two sources.
missing_combined = df['SMILES_combined'].isna().sum()
missing_plain = df['SMILES'].isna().sum()
missing_iso = df['SMILES_ISO'].isna().sum()

print(f"Number of missing values in SMILES_combined: {missing_combined}")
print(f"Number of missing values in SMILES: {missing_plain}")
print(f"Number of missing values in SMILES_ISO: {missing_iso}")


Number of missing values in SMILES_combined: 0
Number of missing values in SMILES: 5631
Number of missing values in SMILES_ISO: 786


In [7]:
# The two source columns are no longer needed once SMILES_combined exists.
df = df.drop(columns=['SMILES_ISO', 'SMILES'])

In [8]:
# Give the merged column the plain 'SMILES' name.
df = df.rename({'SMILES_combined': 'SMILES'}, axis='columns')

In [9]:
# Move 'SMILES' to the front while keeping every other column in its current order.
columns = list(df.columns)
columns.insert(0, columns.pop(columns.index('SMILES')))  # ValueError if 'SMILES' is absent
df = df[columns]

In [10]:
# The first message reports the frame's total row count before filtering
# (despite its wording); the next two describe the frame after rows with a
# missing CLASSES value have been removed.
rows_before = len(df)
print(f"Number of rows in CLASSES: {rows_before}")

df = df.dropna(axis='index', subset=['CLASSES'])

rows_after = len(df)
print(f"Number of rows after dropping NA in CLASSES: {rows_after}")

remaining_na = df['CLASSES'].isna().sum()
print(f"Number of NA values check: {remaining_na}")


Number of rows in CLASSES: 6417
Number of rows after dropping NA in CLASSES: 5922
Number of NA values check: 0


In [11]:
# Flag every row whose SMILES string already appeared in an earlier row.
duplicate_mask = df.duplicated(subset=['SMILES'])
print(f"Number of duplicate SMILES: {duplicate_mask.sum()}")

# Keep only the first row for each SMILES string; other columns are not compared.
# NOTE(review): if repeated SMILES carry different CLASSES values, the first one
# silently wins — confirm that is intended.
shape_before = df.shape
df_no_duplicates = df.drop_duplicates(subset=['SMILES'], keep='first')

print(f"\nShape before removing duplicates: {shape_before}")
print(f"Shape after removing duplicates: {df_no_duplicates.shape}")

# Continue the pipeline with the de-duplicated frame.
df = df_no_duplicates

# Sanity check: no SMILES string should occur twice any more.
remaining_duplicates = df.duplicated(subset=['SMILES']).sum()
print(f"\nVerifying - Number of remaining duplicate SMILES: {remaining_duplicates}")


Number of duplicate SMILES: 2855

Shape before removing duplicates: (5922, 16)
Shape after removing duplicates: (3067, 16)

Verifying - Number of remaining duplicate SMILES: 0


In [12]:
# Label balance of the CLASSES column (shown as the cell's output).
class_counts = df['CLASSES'].value_counts()
class_counts

CLASSES
1.0    2196
0.0     871
Name: count, dtype: int64

In [13]:
# CLASSES_2 equals CLASSES except that rows whose phenotype is 'Moderate' are set to 0.
phenotype_col = 'Phenotype (0-10 = Low Permeability; 10-100 = Moderate Permeability; >100 = High Permeability)'
is_moderate = df[phenotype_col] == 'Moderate'  # rows with a missing phenotype compare as False
df['CLASSES_2'] = df['CLASSES'].mask(is_moderate, 0)

In [14]:
# Label balance of the derived CLASSES_2 column (shown as the cell's output).
classes_2_counts = df['CLASSES_2'].value_counts()
classes_2_counts

CLASSES_2
1.0    1772
0.0    1295
Name: count, dtype: int64

In [15]:
# Write the cleaned frame next to the raw file: no index column, and missing
# values spelled out as the literal text 'N/A'.
output_path = '../data/permeability/permeability_cleaned.csv'
df.to_csv(path_or_buf=output_path, na_rep='N/A', index=False)

