### Goal:
Use scikit-learn's logistic regression to develop a predictive model for blood-brain-barrier penetration

In [13]:
import importlib
import subprocess
import sys


def pip(args):
    """Run pip with the given arguments inside the kernel's own interpreter.

    pip does not support being imported and driven as a library, so the
    command is launched as ``<python> -m pip ...`` in a child process instead
    of calling ``pip._internal.main``.

    Parameters
    ----------
    args : list of str
        pip command-line arguments, e.g. ``['install', 'pandas']``.

    Returns
    -------
    int
        Exit status of the pip process (0 on success).
    """
    status = subprocess.call([sys.executable, '-m', 'pip', *args])
    # Make packages installed a moment ago visible to the import system.
    importlib.invalidate_caches()
    return status


try:
    import pandas as pd
except ImportError:
    pip(['install', 'pandas'])
    import pandas as pd

try:
    import sklearn
except ImportError:
    # The distribution on PyPI is called "scikit-learn"; "sklearn" is only the
    # name of the importable package.
    pip(['install', 'scikit-learn'])
    import sklearn
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics
from sklearn.metrics import confusion_matrix

try:
    import matplotlib.pyplot as plt
except ImportError:
    pip(['install', 'matplotlib'])
    import matplotlib.pyplot as plt

In [14]:
# Load the compound/descriptor table; no column is promoted to the index.
df = pd.read_csv(filepath_or_buffer='compounds_descriptors.csv', index_col=None)

In [15]:
# Drop constant columns (they cannot be min-max scaled): keep a column only
# if at least one of its rows differs from the value in the first row.
df = df.loc[:, df.ne(df.iloc[0]).any(axis=0)]

#### Split the main dataframe into X and Y

In [16]:
# Features: every column except the class label and the SMILES string.
X = df.drop(columns=['True', 'SMILES'])
# Target: the 'True' column (kept as a one-column frame) with the class names
# replaced by 0 / 1.
Y = df[['True']].replace(to_replace={'non-penetrating': 0, 'penetrating': 1})

#### Scaling between 0 and 1

In [17]:
# Min-max scaling per column: subtract the column minimum, then divide by the
# column range (maximum minus minimum).
X_scaled = X.sub(X.min()).div(X.max().sub(X.min()))

#### Select only the 20 most important features

In [None]:
# Estimator that the recursive feature elimination refits while it prunes
# descriptors.
model = LogisticRegression(solver='newton-cg', multi_class='multinomial', max_iter=100)

# Reduce the scaled descriptors to 20 columns.  The target count is given by
# keyword: current scikit-learn releases reject it as a positional argument.
rfe = RFE(model, n_features_to_select=20, verbose=1)
fit = rfe.fit(X_scaled, Y['True'])

# Keep only the columns the fitted selector marks as selected.
X_rfe = X_scaled.loc[:, fit.support_.tolist()]

#### Cross-validation of logistic regression model

In [19]:
# L1-penalised logistic regression with 5-fold cross-validation, fitted on
# the RFE-selected columns.
model = LogisticRegressionCV(
    Cs=100,
    cv=5,
    penalty='l1',
    solver='liblinear',
    scoring='accuracy',
    max_iter=10000,
    tol=0.0001,
    multi_class='ovr',
    class_weight=None,
    dual=False,
    fit_intercept=True,
    intercept_scaling=1.0,
    refit=True,
    random_state=None,
    n_jobs=1,
    verbose=0,
)
model = model.fit(X_rfe, Y['True'])

# Class predictions are written back onto the main frame.
# NOTE: from here on `df` carries a 'Predicted' column; re-running the X/Y
# split cell afterwards would include that column in X.
df['Predicted'] = model.predict(X_rfe)

# Per-class probabilities, one column per class.
class_probabilities = model.predict_proba(X_rfe)
dfProb = pd.DataFrame(class_probabilities, columns=['Probability_N', 'Probability_P'])

# Export table: SMILES, true and predicted class names, and the probabilities.
label_names = {0: 'non-penetrating', 1: 'penetrating'}
dfJaqpot = df[['SMILES', 'True', 'Predicted']].replace(label_names)
dfJaqpot = pd.concat([dfJaqpot, dfProb], sort=False, axis=1)

dfJaqpot.to_csv('predictions_Jaqpot.csv', index=False)

#### Evaluate model performance
- confusion matrix
- PPV, NPV
- ROC, AUC

In [20]:
# Confusion table: true classes as rows, predicted classes as columns.
conf = pd.crosstab(index=dfJaqpot['True'], columns=dfJaqpot['Predicted'])
conf

Predicted,non-penetrating,penetrating
True,Unnamed: 1_level_1,Unnamed: 2_level_1
non-penetrating,90,38
penetrating,25,235


In [23]:
# Unpack the 2x2 confusion matrix row by row: the first row holds the counts
# for the first class, the second row those for the second class.
(tn, fp), (fn, tp) = confusion_matrix(dfJaqpot['True'], dfJaqpot['Predicted'])

# Positive / negative predictive value.
ppv = tp / (tp + fp)
npv = tn / (tn + fn)
(ppv, npv)

(0.8608058608058609, 0.782608695652174)

In [None]:
# ROC plot
# Work on a copy of the predictions with incomplete and duplicated rows removed.
dfPlot = dfJaqpot.copy().dropna(axis=0, how='any').drop_duplicates()

# Ranking score: probability of the penetrating class minus that of the
# non-penetrating class.
dfPlot['Probability_diff'] = dfPlot['Probability_P'] - dfPlot['Probability_N']
dfPlot['True'] = dfPlot['True'].replace({'penetrating': 1, 'non-penetrating': 0})

# ROC curve and the area under it
fpr, tpr, thresholds = metrics.roc_curve(dfPlot['True'], dfPlot['Probability_diff'], pos_label=1)
auc = metrics.auc(fpr, tpr)

# Draw the curve together with the dashed diagonal as a reference line.
fig, ax = plt.subplots(1, 1)

roc_label = 'ROC curve (AUC = %0.2f)' % auc
ax.plot(fpr, tpr, label=roc_label)
ax.plot([0, 1], [0, 1], label='', color='black', linestyle='--')
ax.legend(loc="lower right")

ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')

# Square axes, both limited to the unit interval.
ax.set_aspect('equal')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])

fig.tight_layout()

plt.show()

#### Deploy the model to the Jaqpot service

In [26]:
# Pin the client to the release recorded in this cell's saved output
# (jaqpotpy 0.0.2) so a re-run installs the same API the cells below call.
pip(['install', 'jaqpotpy==0.0.2'])

Collecting jaqpotpy
  Downloading https://files.pythonhosted.org/packages/9f/66/e1615da250298a3326b814cbc36b832aa5c89d22ab2b1638db625bc6a75d/jaqpotpy-0.0.2.tar.gz
Building wheels for collected packages: jaqpotpy
  Building wheel for jaqpotpy (setup.py): started
  Building wheel for jaqpotpy (setup.py): finished with status 'done'
  Stored in directory: /home/jovyan/.cache/pip/wheels/ae/52/06/feb21d07f563e84b977b03eab36033d7ad51b24b47808e786d
Successfully built jaqpotpy
Installing collected packages: jaqpotpy
Successfully installed jaqpotpy-0.0.2


0

In [40]:
import getpass
import os

from jaqpotpy import Jaqpot

jaqpot = Jaqpot("http://api.jaqpot.org/jaqpot/services/")

# The API key is a bearer token and must never be stored in the notebook.
# It is read from the JAQPOT_API_KEY environment variable; when that is not
# set, the user is prompted for it without echoing the input.
jaqpot.set_api_key(
    os.environ.get("JAQPOT_API_KEY") or getpass.getpass("Jaqpot API key: ")
)

[1m [32m 2019-10-07 11:20:12,214 - INFO - api key is set[0m


In [41]:
import getpass
import os

# Credentials are supplied at run time and never stored in the notebook:
# taken from JAQPOT_USERNAME / JAQPOT_PASSWORD when set, otherwise prompted
# for (the password prompt does not echo).
jaqpot.request_key(
    os.environ.get("JAQPOT_USERNAME") or input("Jaqpot username: "),
    os.environ.get("JAQPOT_PASSWORD") or getpass.getpass("Jaqpot password: "),
)

ERROR: Error: 'authToken'


In [42]:
# Deploy the cross-validated model together with its training features and
# labels.  The description names the dataset this notebook actually models
# (blood-brain-barrier penetration).
jaqpot.deploy_linear_model(
    model,
    X_rfe,
    Y[['True']],
    title="Sklearn 2",
    description="Logistic regression model from python for blood-brain-barrier penetration",
    algorithm="logistic regression",
)

ERROR: Some error occured: It seems your token is not valid


In [29]:
# Deploy the fitted model with its RFE-selected features and labels; the
# value returned by the service call is kept in `url` and echoed as the
# cell output.
deploy_args = (
    model,
    X_rfe,
    Y['True'],
    "ORN consensus RFE 6",
    "Logistic Regression+RFE",
    "linearmodel",
)
url = jaqpot.deploy_pipeline(*deploy_args)
url

TypeError: can only concatenate str (not "NoneType") to str

In [None]:
# Request predictions from the deployed model.  The input is X_rfe, the
# RFE-selected feature table the model was fitted on; the name used here
# before (XRFE3) is not defined anywhere in this notebook.
dfJQ_RFE, predicts_RFE = jaqpot.predict(X_rfe, modelId=url)