In [12]:
df = pd.read_csv('FABP4_03_bioactivity_data_curated.csv')

# Clean SMILES: an entry may hold several '.'-separated fragments; keep only
# the longest fragment (by string length) of each entry.
df_no_smiles = df.drop(columns='canonical_smiles')
smiles = pd.Series(
    [max(str(entry).split('.'), key=len) for entry in df['canonical_smiles']],
    name='canonical_smiles',
)
# Re-attach the cleaned column as the last column of the frame.
df_clean_smiles = pd.concat([df_no_smiles, smiles], axis=1)

# Calculate Lipinski descriptors
def lipinski(smiles, verbose=False):
    """Compute four Lipinski-style descriptors for each SMILES string.

    Args:
        smiles: Iterable of SMILES strings, one per molecule.
        verbose: Unused; kept so existing callers keep working.

    Returns:
        A DataFrame with one row per input entry, in input order, and the
        float columns ``MW``, ``LogP``, ``NumHDonors`` and ``NumHAcceptors``.
        An entry that RDKit cannot parse (``Chem.MolFromSmiles`` returns
        ``None``) yields a row of NaN, so the rows stay positionally aligned
        with the input. Empty input yields an empty frame with the four
        columns.
    """
    columnNames = ["MW", "LogP", "NumHDonors", "NumHAcceptors"]

    rows = []
    for elem in smiles:
        mol = Chem.MolFromSmiles(elem)
        if mol is None:
            # Keep a placeholder row instead of crashing the whole batch;
            # callers concatenate this frame column-wise with their data.
            rows.append([np.nan] * len(columnNames))
            continue
        rows.append([
            Descriptors.MolWt(mol),
            Descriptors.MolLogP(mol),
            Lipinski.NumHDonors(mol),
            Lipinski.NumHAcceptors(mol),
        ])

    # Build the matrix once (no repeated vstack) and force a 2-D shape so
    # that zero or one molecule still produces a well-formed frame.
    baseData = np.array(rows, dtype=float).reshape(-1, len(columnNames))
    descriptors = pd.DataFrame(data=baseData, columns=columnNames)

    return descriptors

# Compute the descriptor table from the cleaned SMILES column and attach it
# column-wise to the bioactivity table.
# NOTE(review): the descriptors come from df_clean_smiles, but they are joined
# onto `df` (which still carries the uncleaned SMILES) — confirm this is intended.
df_lipinski = lipinski(df_clean_smiles['canonical_smiles'])
df_combined = pd.concat((df, df_lipinski), axis='columns')

# Normalize standard values and convert to pIC50
def norm_value(input):
    """Cap ``standard_value`` at 100,000,000 and rename it.

    With the nM -> M conversion applied later by ``pIC50``, the cap keeps
    the resulting pIC50 from dropping below 1.

    Args:
        input: DataFrame containing a numeric ``standard_value`` column.
            It is not modified.

    Returns:
        A new DataFrame without ``standard_value`` and with the capped
        values in ``standard_value_norm``. Missing values stay missing.
    """
    capped = input['standard_value'].clip(upper=100000000)
    # Build a new frame rather than writing the column into the caller's
    # DataFrame, so the argument is left untouched.
    return input.drop(columns=['standard_value']).assign(standard_value_norm=capped)

def pIC50(input):
    """Convert ``standard_value_norm`` (treated as nM) to pIC50.

    pIC50 is computed as ``-log10(value * 10**-9)``.

    Args:
        input: DataFrame containing a numeric ``standard_value_norm``
            column. It is not modified.

    Returns:
        A new DataFrame without ``standard_value_norm`` and with the
        converted values in ``pIC50``.
    """
    molar = input['standard_value_norm'] * (10**-9)  # Converts nM to M
    # Build a new frame rather than writing the column into the caller's
    # DataFrame, so the argument is left untouched.
    return input.drop(columns=['standard_value_norm']).assign(pIC50=-np.log10(molar))

# Cap the standard values, then convert them to pIC50.
df_norm = norm_value(df_combined)
df_final = pIC50(df_norm)

# Write the full table (all bioactivity classes) to disk.
df_final.to_csv('FABP4_04_bioactivity_data_3class_pIC50.csv')

# Keep every row whose class is not 'intermediate' and write that subset too.
df_2class = df_final[~(df_final['class'] == 'intermediate')]
df_2class.to_csv('FABP4_05_bioactivity_data_2class_pIC50.csv')

# Statistical analysis function
def mannwhitney(descriptor, verbose=False):
    """Compare a descriptor between active and inactive compounds.

    Runs a two-sided Mann-Whitney U test on the ``descriptor`` column of the
    module-level ``df_2class`` frame, split by its ``class`` column, and
    writes the one-row result table to
    ``FABP4_mannwhitneyu_<descriptor>.csv``.

    Args:
        descriptor: Name of the ``df_2class`` column to test.
        verbose: Unused; kept so existing callers keep working.

    Returns:
        A one-row DataFrame with the columns ``Descriptor``, ``Statistics``,
        ``p``, ``alpha`` and ``Interpretation``.
    """
    # NOTE(review): the test below does not appear to draw random numbers,
    # so this seed is probably unnecessary; kept to preserve the original
    # side effect on the global RNG state.
    seed(1)

    groups = df_2class[[descriptor, 'class']]
    active = groups[groups['class'] == 'active'][descriptor]
    inactive = groups[groups['class'] == 'inactive'][descriptor]

    # State the alternative explicitly: the interpretation below is a
    # two-sided statement, and SciPy's default for `alternative` has
    # differed between releases.
    stat, p = mannwhitneyu(active, inactive, alternative='two-sided')

    alpha = 0.05
    if p <= alpha:
        interpretation = 'Different distribution (reject H0)'
    else:
        interpretation = 'Same distribution (fail to reject H0)'

    results = pd.DataFrame({
        'Descriptor': descriptor,
        'Statistics': stat,
        'p': p,
        'alpha': alpha,
        'Interpretation': interpretation
    }, index=[0])

    filename = f'FABP4_mannwhitneyu_{descriptor}.csv'
    results.to_csv(filename)
    return results

# Plotting functions
def plot_bioactivity_class():
    """Save a bar chart of the number of compounds per bioactivity class.

    Reads the module-level ``df_2class`` frame and writes
    ``FABP4_plot_bioactivity_class.pdf``.
    """
    axis_label_style = {'fontsize': 14, 'fontweight': 'bold'}

    plt.figure(figsize=(5.5, 5.5))
    sns.countplot(data=df_2class, x='class', edgecolor='black')
    plt.xlabel('Bioactivity class', **axis_label_style)
    plt.ylabel('Frequency', **axis_label_style)
    plt.savefig('FABP4_plot_bioactivity_class.pdf')
    # Close the figure so repeated plotting calls do not accumulate figures.
    plt.close()

def plot_mw_vs_logp():
    """Save a scatter plot of MW against LogP for the two-class data set.

    Reads the module-level ``df_2class`` frame; points are coloured by
    ``class`` and sized by ``pIC50``. Writes ``FABP4_plot_MW_vs_LogP.pdf``.
    """
    plt.figure(figsize=(5.5, 5.5))
    sns.scatterplot(x='MW', y='LogP', data=df_2class, hue='class',
                    size='pIC50', edgecolor='black', alpha=0.7)
    plt.xlabel('MW', fontsize=14, fontweight='bold')
    plt.ylabel('LogP', fontsize=14, fontweight='bold')
    # The legend is anchored outside the axes, to the right of the plot.
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
    # bbox_inches='tight' grows the saved area to include that outside
    # legend; without it the legend is cut off in the PDF.
    plt.savefig('FABP4_plot_MW_vs_LogP.pdf', bbox_inches='tight')
    plt.close()

def plot_boxplot(descriptor):
    """Save a per-class boxplot of one descriptor.

    Reads the module-level ``df_2class`` frame and writes
    ``FABP4_plot_<descriptor>.pdf``.

    Args:
        descriptor: Name of the ``df_2class`` column to plot on the y axis.
    """
    axis_label_style = {'fontsize': 14, 'fontweight': 'bold'}
    output_path = f'FABP4_plot_{descriptor}.pdf'

    plt.figure(figsize=(5.5, 5.5))
    sns.boxplot(data=df_2class, x='class', y=descriptor)
    plt.xlabel('Bioactivity class', **axis_label_style)
    plt.ylabel(descriptor, **axis_label_style)
    plt.savefig(output_path)
    # Close the figure so repeated plotting calls do not accumulate figures.
    plt.close()

# Render the class-count bar chart and the MW-vs-LogP scatter plot.
plot_bioactivity_class()
plot_mw_vs_logp()

# For each descriptor, save its per-class boxplot and then run the
# Mann-Whitney U test (which writes its own result CSV).
descriptors = [
    'pIC50',
    'MW',
    'LogP',
    'NumHDonors',
    'NumHAcceptors',
]
for desc in descriptors:
    plot_boxplot(desc)
    mannwhitney(desc)
