In [1]:
import pandas as pd
import numpy as np

### Removing Duplicates

In [2]:
df = pd.read_csv('descriptors_output_with_pIC50.csv')

In [3]:
df.shape

(1237, 883)

In [4]:
np.where(df.isna())

(array([], dtype=int64), array([], dtype=int64))

In [5]:
df.duplicated().sum()

11

In [6]:
np.where(df.duplicated())

(array([1050, 1054, 1055, 1062, 1063, 1144, 1179, 1187, 1198, 1202, 1212],
       dtype=int64),)

In [7]:
# Keep the first occurrence of every fully duplicated row and discard the rest.
df = df.drop_duplicates(keep='first')

In [8]:
df.duplicated().sum()

0

In [9]:
# Set aside the object-dtype (non-numeric) columns of df before they are dropped.
# NOTE(review): `names` is not referenced by any later cell shown here.
names = df.select_dtypes(include='object')

In [10]:
# Remove the 'Name' column so it is not carried into the feature matrix.
# Reassigning with errors='ignore' (rather than an in-place drop) keeps this
# cell safe to re-run once 'Name' has already been removed.
df = df.drop(columns='Name', errors='ignore')

In [11]:
df.shape

(1226, 882)

### Splitting the dataset into features (x) and target (y)

In [12]:
# The target is the pIC50 column; every remaining column is a feature.
y = df['pIC50']
x = df.drop(columns=['pIC50'])

### Removing features with little variance

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Per-column variance of the feature table; keep the labels of the columns
# whose variance is strictly greater than 0.1.
variances = x.var()
selected_features = variances.loc[variances.gt(0.1)].index

In [None]:
# Restrict the feature table to the columns that passed the variance filter.
x = x.loc[:, selected_features]

In [None]:
x

In [None]:
x.shape

### Removing highly correlated features

In [None]:
# Absolute pairwise correlations between all feature columns.
x_df = pd.DataFrame(x)
corr_matrix = x_df.corr().abs()

# Keep only the strictly-upper triangle so every feature pair is inspected once
# and the diagonal (self-correlation) is ignored.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A column is dropped when its correlation with any earlier column exceeds 0.95.
to_drop = upper.columns[(upper > 0.95).any(axis=0)].tolist()
x_df = x_df.drop(columns=to_drop)

# Downstream cells receive a plain array (column labels are discarded here).
x = x_df.to_numpy()

In [None]:
x.shape

### PCA

In [24]:
from sklearn.decomposition import PCA
# Project the feature matrix onto its first two principal components.
# NOTE(review): no scaling step precedes this in the visible cells, and x_pca
# is not referenced by any later cell shown here.
pca = PCA(n_components=2)
x_pca = pca.fit_transform(x)
x_pca.shape

(1226, 2)

##### Alternative: Kernel PCA (RBF kernel)

In [None]:
from sklearn.decomposition import KernelPCA
# Two-component kernel PCA with an RBF kernel (gamma=0.1) on the same x.
# NOTE(review): x_kpca is not referenced by any later cell shown here.
kpca = KernelPCA(n_components=2, kernel='rbf', gamma=0.1)
x_kpca = kpca.fit_transform(x)
x_kpca.shape

### Creating the model

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 20% of the rows as the test split; random_state is fixed at 0.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

### Transforming X with PolynomialFeatures

In [42]:
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 polynomial expansion: fitted on the training split only, then the
# same fitted transformer is applied to the test split.
# NOTE(review): x_train_pr / x_test_pr are not used by any later cell shown here.
pr = PolynomialFeatures(degree=2)
x_train_pr = pr.fit_transform(x_train)
x_test_pr = pr.transform(x_test)

### Using LazyPredict to get the r2 scores of various models at a glance

In [None]:
from lazypredict.Supervised import LazyRegressor
# LazyRegressor constructed with its default settings.
lzr = LazyRegressor()

In [None]:
# Fit on the train split and evaluate on the test split; the two returned
# objects are bound as `models` and `predictions`.
models, predictions = lzr.fit(x_train, x_test, y_train, y_test)

In [None]:
# Rank the LazyPredict results by R-Squared (best first) and keep the top five rows.
top5 = models.sort_values('R-Squared', ascending=False).iloc[:5]

In [None]:
top5

### Model Initialization

#### Random Forest Regressor

In [15]:
from sklearn.ensemble import RandomForestRegressor

In [17]:
# Seed the forest so the fit, the reported R^2 and the model pickled later are
# repeatable across kernel restarts (same seed value as the train/test split).
# Re-run this cell to refresh the recorded score.
rfr = RandomForestRegressor(random_state=0)
rfr.fit(x_train, y_train)
rfr.score(x_test, y_test)

0.4520739420292207

In [18]:
y_pred = rfr.predict(x_test)

In [19]:
rfr.score(x_test, y_test)

0.4520739420292207

#### RFE

In [None]:
from sklearn.feature_selection  import RFE

In [None]:
# Recursive feature elimination driven by the random forest, keeping 10 features.
# NOTE(review): with the default elimination step this presumably refits the
# forest once per removed feature — confirm the runtime is acceptable.
rfe = RFE(rfr, n_features_to_select=10)
rfe.fit(x_train, y_train)

In [None]:
# Reduce both splits to the columns kept by the fitted RFE object. The fitted
# selector is bound to `rfe`; no `selector` variable exists in this notebook.
x_train_rfe = rfe.transform(x_train)
x_test_rfe = rfe.transform(x_test)

In [None]:
from sklearn.base import clone

# Fit a fresh copy of the forest on the RFE-reduced features. Refitting `rfr`
# itself would leave it expecting only the selected columns, while later cells
# pickle `rfr` and feed it the full descriptor set.
rfr_rfe = clone(rfr)
rfr_rfe.fit(x_train_rfe, y_train)
rfr_rfe.score(x_test_rfe, y_test)

#### LGBM

In [None]:
import lightgbm as lgb

In [None]:
# Wrap both splits as LightGBM Dataset objects; the test Dataset is created
# with the training Dataset as its reference.
train_data = lgb.Dataset(x_train, label=y_train)
test_data = lgb.Dataset(x_test, label=y_test, reference=train_data)

In [None]:
# LightGBM training configuration: gradient-boosted trees on a regression
# objective, evaluated with RMSE.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    num_leaves=50,
    learning_rate=0.1,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=10,
    verbose=0,
)

In [None]:
# Train for 1000 boosting rounds, evaluating on test_data each round.
# NOTE(review): the held-out test split doubles as the validation set and no
# early-stopping callback is passed — confirm this is intended.
gbm = lgb.train(
    params, 
    train_data,
    valid_sets = [test_data],
    num_boost_round = 1000
)

In [None]:
# Predict on the test split; this rebinds y_pred, replacing the random-forest
# predictions assigned earlier.
# NOTE(review): the lgb.train call configures no early stopping, so
# best_iteration is presumably unset and every round is used — confirm.
y_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration)

In [None]:
from sklearn.metrics import r2_score

In [None]:
r2_score(y_test, y_pred)

### Model performance visualisation

In [None]:
import seaborn as sns

In [None]:
# Overlay the density of the true and predicted pIC50 values on one set of
# axes. Both curves are labelled and drawn on `ax` explicitly so the figure is
# readable on its own.
ax = sns.kdeplot(y_test, label='Actual pIC50')
sns.kdeplot(y_pred, color='orange', label='Predicted pIC50', ax=ax)
ax.set_xlabel('pIC50')
ax.legend();

### Cross validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 4-fold cross-validated R^2 of the forest on the training split. The test
# split is left out so it remains an untouched hold-out set (and the folds are
# not built from only 20% of the data).
cross_val_score(rfr, x_train, y_train, cv=4, scoring='r2').mean()

### Testing Streamlit

In [None]:
!pip install --upgrade streamlit altair

In [26]:
import joblib

In [27]:
joblib.dump(rfr, 'rfr_model.pkl')

['rfr_model.pkl']

In [28]:
model = joblib.load('rfr_model.pkl')

In [29]:
model

In [None]:
!pip uninstall altair
!pip install altair==4.0.0

In [34]:
import streamlit as st
import pandas as pd
import joblib
import subprocess
import os

# Load the model
model = joblib.load('rfr_model.pkl')

# Function to calculate descriptors using PaDEL
def calculate_descriptors(smiles):
    """Compute PaDEL 2D descriptors for a SMILES string.

    The SMILES text is written to a private temporary directory, the
    PaDEL-Descriptor jar is run on that directory, and the resulting CSV is
    read back. Using a dedicated directory means PaDEL is pointed only at the
    molecule file (not at everything in the working directory), concurrent
    calls do not share file names, and the scratch files are removed even when
    PaDEL or the CSV read fails.

    Args:
        smiles: SMILES notation of the molecule.

    Returns:
        pandas.DataFrame read from PaDEL's CSV output.

    Raises:
        ValueError: if ``smiles`` is empty or only whitespace.
        subprocess.CalledProcessError: if the PaDEL process exits non-zero.
    """
    import tempfile

    smiles = (smiles or '').strip()
    if not smiles:
        raise ValueError("SMILES string is empty")

    with tempfile.TemporaryDirectory() as workdir:
        # Input lives in its own sub-directory so the output CSV is never
        # inside the directory PaDEL scans for molecules.
        input_dir = os.path.join(workdir, 'input')
        os.mkdir(input_dir)
        with open(os.path.join(input_dir, 'molecule.smi'), 'w') as f:
            # Terminate the record with a newline.
            f.write(smiles + '\n')

        output_file = os.path.join(workdir, 'descriptors_output.csv')

        # Run PaDEL-Descriptor; check=True raises on a non-zero exit code.
        subprocess.run([
            'java', '-jar', 'Drug Design/padel/PaDEL-Descriptor/PaDEL-Descriptor.jar',
            '-descriptortypes', 'Drug Design/padel/PaDEL-Descriptor/descriptors.xml',
            '-dir', input_dir, '-file', output_file,
            '-2d', '-removesalt', '-standardizenitro'
        ], check=True)

        # Load the result before the temporary directory is deleted.
        descriptors = pd.read_csv(output_file)

    return descriptors

ModuleNotFoundError: No module named 'altair.vegalite.v4'

In [None]:
def main():
    """Render the Streamlit page and predict from a user-supplied SMILES string.

    The sidebar collects the SMILES text. On "Predict" the descriptors are
    computed, displayed, reduced to the columns the model was trained on, and
    passed to the loaded model. Any failure is reported with ``st.error``.
    """
    st.title("Molecular Bioactivity Prediction App")

    st.write("""
    This app predicts the **bioactivity** of molecules!
    """)

    # Sidebar for user input
    st.sidebar.header("User Input")
    smiles_input = st.sidebar.text_area("Enter SMILES notation of the molecule", "")

    if st.sidebar.button("Predict"):
        if smiles_input:
            try:
                input_df = calculate_descriptors(smiles_input)

                # Display the input features
                st.subheader('Molecule Features')
                st.write(input_df)

                # The training frame had its 'Name' column dropped before
                # fitting, so drop it here as well before predicting.
                features = input_df.drop(columns=['Name'], errors='ignore')

                # When the model recorded its training column names, feed it
                # exactly those columns in the same order.
                expected = getattr(model, 'feature_names_in_', None)
                if expected is not None:
                    missing = [c for c in expected if c not in features.columns]
                    if missing:
                        raise ValueError(
                            f"Descriptors missing from PaDEL output: {missing[:5]}"
                        )
                    features = features[list(expected)]

                # Make prediction
                prediction = model.predict(features)
                st.subheader('Prediction')
                st.write('Bioactivity: ', prediction[0])
            except Exception as e:
                st.error(f"Error processing molecule: {e}")
        else:
            st.error("Please enter the SMILES notation of a molecule")

if __name__ == "__main__":
    main()