# Lithium Response predictor


In [1]:
import pandas as pd
import numpy as np


from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.model_selection import learning_curve

In [2]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

### Data Prepare

In [3]:
if False : 
    # import samples table
    samples_df = pd.read_csv(r'../data/dataset_joined/samples_joined.csv', index_col=0)
    samples_df.drop(columns=['patient', 'sex','age'], inplace=True)
    # import count matrix
    countMatrix_df = pd.read_csv('../data/dataset_joined/countMatrix_include_gene_name.csv', index_col=0).T
    df_full =  countMatrix_df.merge(samples_df, left_index=True, right_index= True)
    
    #import list of genes from deseq2 analysis
    genes = pd.read_csv('vsd_results_FC1.csv', index_col=0).T.columns

    # keep only relevant dataset, i.e. remove CTRL samples
    df = df_full[df_full['diagnosis']!= 'ctrl']

    ##  filter only relevant genes from deseq2 analysis ##
    df = df.loc[:,list(genes) + ['batch','condition']]
    df['condition']=df['condition'].apply(lambda x : 1 if x =='LR' else 0)
    
    df.to_csv('LiResp_dataset.csv')

if False : 
    # import samples table
    samples_df = pd.read_csv(r'samples_joined.csv', index_col=0)
    samples_df.drop(columns=['patient', 'sex','age'], inplace=True)
    # import count matrix
    countMatrix_df = pd.read_csv('countMatrix_include_gene_name.csv', index_col=0).T
    df_full =  countMatrix_df.merge(samples_df, left_index=True, right_index= True)
    
    #import list of genes from deseq2 analysis
    genes = pd.read_csv('original_dataset_CTRL_vs_BD_counts_results_FC1.csv', index_col=0).T.columns

    # keep only relevant dataset, i.e. drop batch #4 becuase it has no control
    df = df_full[df_full['batch']!= 4]

    ##  filter only relevant genes from deseq2 analysis ##
    df = df.loc[:,list(genes) + ['diagnosis']]
    df['diagnosis'] = df['diagnosis'].apply(lambda x : 1 if x =='BD' else 0)
    df.to_csv('BP_vsCtrl_dataset.csv')

    

In [4]:
df = pd.read_csv('LiResp_dataset.csv', index_col=0)


## Hyperparameter Tuning using GridSearchCV

In [5]:
classifiers=[
    {'name': "SVM", 
     'model': SVC(max_iter=1000, probability=True), 
     'cvGridParams' :[
                      {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                      {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
                     ]
    },
                       
    {'name' : "Neural Net", 
     'model': MLPClassifier(max_iter = 5000,  verbose=True,), 
    'cvGridParams' : {
                      'alpha' : 10. ** np.arange(-3, 2), 
                      'activation' : ['tanh', 'relu'],
                      'hidden_layer_sizes' : [(x,) for x in [50,150,300,500,700]], 
                      'learning_rate' : ['constant', 'adaptive']
                     }
    
    }, 
    {'name' : "Naive Bayes", 
     'model':GaussianNB(),
     'cvGridParams' :  {'var_smoothing': np.logspace(0,-9, num=100)}}, 
    
    {'name' : "Random Forest", 
     'model': RandomForestClassifier(), 
     'cvGridParams' :  {'n_estimators':[int(x) for x in np.linspace(10,1000,10)],
                        'criterion': ["gini", "entropy"]}
    },
]

In [6]:
X = np.array(df.drop(columns=['batch','condition']))
y = np.array(df['condition'].apply(lambda x : 1 if x =='LR' else 0))
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)
# cv = ShuffleSplit(n_splits=20, test_size=0.2, random_state=0)

In [7]:
for clf in classifiers: 
    print(clf['name'])
    clf['gridSearch'] = GridSearchCV(clf['model'], clf['cvGridParams'], n_jobs=2, cv=5)
   
    clf['gridSearch'].fit(X_train, y_train)
    clf['best_params_'] = clf['gridSearch'].best_params_
    
    clf['score'] =  clf['gridSearch'].score(X_test, y_test)
    print(clf['best_params_'])
    

SVM
{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
Neural Net
Iteration 1, loss = 0.68839473
Iteration 2, loss = 0.43935347
Iteration 3, loss = 0.28036670
Iteration 4, loss = 0.18311312
Iteration 5, loss = 0.12291276
Iteration 6, loss = 0.08463639
Iteration 7, loss = 0.05989003
Iteration 8, loss = 0.04369883
Iteration 9, loss = 0.03291890
Iteration 10, loss = 0.02556443
Iteration 11, loss = 0.02040692
Iteration 12, loss = 0.01669114
Iteration 13, loss = 0.01394732
Iteration 14, loss = 0.01187656
Iteration 15, loss = 0.01028354
Iteration 16, loss = 0.00903716
Iteration 17, loss = 0.00804727
Iteration 18, loss = 0.00725044
Iteration 19, loss = 0.00660121
Iteration 20, loss = 0.00606637
Iteration 21, loss = 0.00562129
Iteration 22, loss = 0.00524748
Iteration 23, loss = 0.00493083
Iteration 24, loss = 0.00466048
Iteration 25, loss = 0.00442796
Iteration 26, loss = 0.00422662
Iteration 27, loss = 0.00405117
Iteration 28, loss = 0.00389738
Iteration 29, loss = 0.00376183
Iteration 30, loss = 0

In [9]:
import pickle
pickle.dump(classifiers, open( "classifiers_gridSearch.p", "wb" ) )

In [10]:
classifiers = pickle.load( open( "classifiers_gridSearch.p", "rb" ) )