# Nano AI-QSAR model project


Author: Kun Mi

Reference: Wei-Chun Chou (J Control Release. 2023 Sep;361:53-63.)  
Date created: 2024/01/10  
Final version date: 2024/05/07

Description: This notebook is used to assess the NP formulations with high DE_tumor.


# Install and import required Python packages

In [None]:
# Install python pacakges
#!pip install pycaret==2.3.10 markupsafe==2.0.1 pyyaml==5.4.1 -qq
!pip install -q lightgbm # install the lightgbm package
!pip install -q scikit-optimize # install the package used for Bayesian optimization
# !pip install -q scikeras[tensorflow]
!pip install -q tensorflow==2.15.0
!pip install -q keras-tuner

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.7/107.7 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Import basic Python packages
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_log_error
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from time import time
from tensorflow import keras

# Link to Google Drive and set up your project folder

In [None]:
# Mount Google Drive in the Colab runtime
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Set your working directory to a specific folder in your Google Drive.
# The base Google Drive directory
root_dir = "/content/drive/My Drive/Colab Notebooks/Nano-AI-QSAR"

# Choose where you want your project files to be saved; replace the name below
# with your preferred folder name. Keep the leading and trailing "/": paths in
# this notebook are built by plain string concatenation with root_dir.
project_folder = "/Model application/"

project_path = root_dir + project_folder

# Make sure the folder exists; if not, create it together with any missing
# parent folders (os.mkdir would fail when root_dir itself is absent).
if not os.path.isdir(project_path):
    os.makedirs(project_path, exist_ok=True)
    print(project_path + ' did not exist but was created.')

# Use the project folder as the working directory so that relative file names
# resolve inside it.
os.chdir(project_path)

Mounted at /content/drive


In [None]:
# Load the formulation table used for model application.
# The path is built by concatenation because project_folder starts with "/",
# which would make os.path.join discard root_dir.
data_path = root_dir + project_folder + "Data-Model-application.csv"

# 'utf-8-sig' strips the byte-order mark at the start of the file; decoding it
# as latin1 leaves the mark glued to the first column name ("ï»¿No.").
Data1 = pd.read_csv(data_path, encoding='utf-8-sig')

# read_csv already returns a DataFrame, so no re-wrapping is needed.
df1 = Data1
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7534 entries, 0 to 7533
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ï»¿No.          7534 non-null   int64  
 1   Type            7534 non-null   object 
 2   MAT             7534 non-null   object 
 3   TS              7534 non-null   object 
 4   CT              7534 non-null   object 
 5   TM              7534 non-null   object 
 6   Shape           7534 non-null   object 
 7   Size            7493 non-null   float64
 8   Zeta Potential  7461 non-null   float64
 9   Admin           7519 non-null   float64
 10  DE_tumor        534 non-null    float64
 11  DE_heart        339 non-null    float64
 12  DE_liver        456 non-null    float64
 13  DE_spleen       413 non-null    float64
 14  DE_lung         367 non-null    float64
 15  DE_kidney       396 non-null    float64
dtypes: float64(9), int64(1), object(6)
memory usage: 941.9+ KB


# Prediction

In [None]:
## Model prediction to derive the NP formulation with high DE tumor
# Imports used only by this cell.
from keras.models import load_model
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder

# ---- Configuration ---------------------------------------------------------
cols_num = ['Size', 'Zeta Potential', 'Admin']
cols_label = ['Type', 'MAT', 'TS', 'CT', 'TM', 'Shape']
target_cols = ['DE_tumor', 'DE_heart', 'DE_liver', 'DE_spleen', 'DE_lung', 'DE_kidney']
# One saved model per target column, in the same order as target_cols.
model_files = ['DL_tumor_best_model.h5',
               'DL_heart_best_model.h5',
               'DL_liver_best_model.h5',
               'DL_spleen_best_model.h5',
               'DL_lung_best_model.h5',
               'DL_kidney_best_model.h5']
# Positional rows that are passed to the models.
# NOTE(review): presumably rows 0-533 are the formulations with measured DE
# values and the remaining rows are the candidates to score - confirm.
candidate_rows = slice(534, 7534)

# ---- Split features / targets ----------------------------------------------
# .copy() gives this cell its own frame, so the assignments below do not raise
# SettingWithCopyWarning and never touch df1.
df_X1 = df1[cols_label + cols_num].copy()
df_y1, df_y2, df_y3, df_y4, df_y5, df_y6 = (df1[[col]] for col in target_cols)

# Raw (un-imputed, un-encoded) candidate rows; they are joined to the
# predictions at the end so the output table shows the original values.
df_X2 = df_X1.iloc[candidate_rows].copy()

# ---- Impute missing values ---------------------------------------------------
# Numeric columns get the column mean, categorical columns the most frequent value.
# NOTE(review): the imputers, the MinMaxScaler and the OneHotEncoder below are
# all fitted on the rows of this file. The loaded models presumably expect the
# preprocessing that was fitted at training time - confirm both agree.
imputer_mean = SimpleImputer(strategy='mean', missing_values=np.nan).fit(df_X1[cols_num])
imputer_freq = SimpleImputer(strategy='most_frequent', missing_values=np.nan).fit(df_X1[cols_label])
df_X1[cols_num] = imputer_mean.transform(df_X1[cols_num])
df_X1[cols_label] = imputer_freq.transform(df_X1[cols_label])

# ---- Encode and scale ---------------------------------------------------------
# Work on a separate copy so df_X1 keeps the imputed but still readable labels.
test_X1 = df_X1.copy()

# Integer-encode every categorical column (the encoder is refitted per column).
test_X1[cols_label] = test_X1[cols_label].apply(LabelEncoder().fit_transform)

# Min-max scale the numeric columns.
mscaler = MinMaxScaler()
Data_num_tr1 = pd.DataFrame(mscaler.fit_transform(test_X1[cols_num]), columns=cols_num)

# One-hot encode the integer-coded categorical columns.
cat_encoder = OneHotEncoder(sparse_output=False)
data_cat_1hot1 = pd.DataFrame(cat_encoder.fit_transform(test_X1[cols_label]),
                              columns=cat_encoder.get_feature_names_out(cols_label))

# Model input layout: scaled numeric columns first, then the one-hot columns.
preData1 = pd.concat([Data_num_tr1, data_cat_1hot1], axis=1)

# ---- Model inputs / recorded targets for the candidate rows -----------------
X_re = preData1.iloc[candidate_rows].to_numpy()
y_re1, y_re2, y_re3, y_re4, y_re5, y_re6 = (
    df1[col].iloc[candidate_rows].to_numpy() for col in target_cols
)

# Summarize shapes (one line per target).
for _y in (y_re1, y_re2, y_re3, y_re4, y_re5, y_re6):
    print(X_re.shape, _y.shape)

# ---- Load the saved models and predict ---------------------------------------
# File names are relative to the current working directory.
(best_model_v1, best_model_v2, best_model_v3,
 best_model_v4, best_model_v5, best_model_v6) = (load_model(path) for path in model_files)
_models = (best_model_v1, best_model_v2, best_model_v3,
           best_model_v4, best_model_v5, best_model_v6)

(all_preds_v1, all_preds_v2, all_preds_v3,
 all_preds_v4, all_preds_v5, all_preds_v6) = (model.predict(X_re) for model in _models)
_preds = (all_preds_v1, all_preds_v2, all_preds_v3,
          all_preds_v4, all_preds_v5, all_preds_v6)

# One single-column frame per target, named after the target column.
(Predict_data_1, Predict_data_2, Predict_data_3,
 Predict_data_4, Predict_data_5, Predict_data_6) = (
    pd.DataFrame(pred, columns=[col]) for pred, col in zip(_preds, target_cols)
)

# ---- Assemble the result table -------------------------------------------------
# reset_index() moves the original row position into a leading "index" column
# and gives the candidates a 0..n-1 index that lines up with the prediction frames.
MCS_data = df_X2.reset_index()
Prediction_all = pd.concat([MCS_data,
                            Predict_data_1,
                            Predict_data_2,
                            Predict_data_3,
                            Predict_data_4,
                            Predict_data_5,
                            Predict_data_6],
                           axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_X1[cols_num] = imputer_mean.transform(df_X1[cols_num])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_X1[cols_label] = imputer_freq.transform(df_X1[cols_label])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_X1[cols_label] = test_X1[cols_label].apply(LabelEncoder().fit_transform)


(7000, 39) (7000,)
(7000, 39) (7000,)
(7000, 39) (7000,)
(7000, 39) (7000,)
(7000, 39) (7000,)
(7000, 39) (7000,)






In [None]:
# Open interval that the predicted DE_tumor value must fall into.
Low_limit = 3
high_limit = 4

# Strict lower bounds applied to the predicted value of each organ column.
organ_lower_bounds = {
    "DE_heart": 0.0031,
    "DE_liver": 0.1653,
    "DE_spleen": 0.1008,
    "DE_lung": 0.0067,
    "DE_kidney": 0.0288,
}

# Build one boolean mask: tumor value inside the window AND every organ value
# above its bound.
tumor_pred = Prediction_all["DE_tumor"]
keep_rows = tumor_pred.gt(Low_limit) & tumor_pred.lt(high_limit)
for organ_col, lower_bound in organ_lower_bounds.items():
    keep_rows = keep_rows & Prediction_all[organ_col].gt(lower_bound)

Result = Prediction_all[keep_rows]

Result

Unnamed: 0,index,Type,MAT,TS,CT,TM,Shape,Size,Zeta Potential,Admin,DE_tumor,DE_heart,DE_liver,DE_spleen,DE_lung,DE_kidney
28,562,INM,Gold,Active,Cervix,AO,Spherical,2.31,-40.27,58.9,3.982876,0.756568,13.528610,3.920245,0.873364,2.682317
32,566,INM,Gold,Active,Sarcoma,AO,Others,2.09,-12.93,68.9,3.897069,1.011821,8.170308,4.437104,0.729194,1.504288
45,579,INM,Gold,Active,Glioma,XH,Rod,1.46,-31.46,85.3,3.700063,0.018127,51.333855,0.212477,0.308866,0.535257
54,588,INM,Gold,Passive,Cervix,XH,Spherical,2.18,-33.38,41.6,3.204739,0.889815,21.328217,4.366202,0.352523,3.966029
79,613,INM,Gold,Passive,Sarcoma,XH,Plate,1.75,-11.10,9.9,3.038027,0.254463,10.858767,0.970787,0.743597,3.769625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6938,7472,ONM,Polymeric,Active,Brain,XH,Spherical,1.57,23.72,125.5,3.237869,0.075628,4.594361,0.326459,0.021585,0.543459
6940,7474,ONM,Polymeric,Passive,Sarcoma,AH,Others,1.74,-37.71,109.5,3.835128,0.011253,24.793369,0.214972,0.132662,0.560341
6956,7490,ONM,Polymeric,Passive,Prostate,AO,Others,2.31,-33.06,137.6,3.032626,0.021304,8.943024,1.004224,0.075711,0.527888
6961,7495,ONM,Polymeric,Passive,Skin,AO,Plate,0.88,3.45,3.2,3.196256,0.023800,1.030722,0.212349,0.185990,6.757629


# Save

In [None]:
# Export the filtered formulations.
# Only the leading "index" column (the row position carried over from the
# loaded table) needs a new label; renaming by name returns a new frame, leaves
# Result untouched and cannot mislabel columns if their order ever changes.
# NOTE(review): "No." here is the 0-based row position, not necessarily the
# numbering column of the input file - confirm this is intended.
MCS_PRED = Result.rename(columns={"index": "No."})

# Written relative to the current working directory; 'utf-8-sig' adds a
# byte-order mark to the output file.
MCS_PRED.to_csv('Model application.csv', encoding='utf-8-sig', index=False)