# Prepare the data

In [1]:
import sys,os
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [None]:
# Complete gene-pathway annotation table; the file has no header row, so the
# two columns are named right after loading.
df = pd.read_csv(
    'Sly_pathway_annotation_20180827_with_expression_5_members.txt',
    sep='\t', header=None, index_col=None,
)
df.columns = ['Pathway', 'Gene']
# Annotation limited to non-overlapping genes in 85 pathways; only its first
# column (the pathway IDs) is used here.
df_uni = pd.read_csv(
    'Sly_pathway_annotation_20190117_with_expression_5_members_nonoverlapping.txt',
    sep='\t', header=None, index_col=None,
)
pathway = df_uni[0].unique()
# Go back to the full table so that genes annotated to several of those 85
# pathways are recovered.
df_85 = df[df['Pathway'].isin(pathway)]
gene = df_85['Gene'].unique()
# Index by gene ID, leaving 'Pathway' as the only column.
df_85 = df_85.set_index('Gene')
df_85.columns = ['Pathway']

### make the multiple labels, using OneHotEncoder

In [None]:
# One-hot encode the single 'Pathway' column. The encoder output is converted
# with .toarray() below (alternatively, build the encoder with dense output
# and pass its result to pd.DataFrame directly).
cat_encoder = OneHotEncoder()
df_85_1hot = cat_encoder.fit_transform(df_85)
labels = pd.DataFrame(
    df_85_1hot.toarray(),
    index=df_85.index,
    columns=cat_encoder.categories_[0],
)
# A gene annotated to several pathways occupies several rows; summing the rows
# that share a gene ID gives one multi-label row per gene.
gp = labels.groupby(level=0).sum()

### Parse the expression matrix (fold changes from all combinations are used here)

In [None]:
exp = pd.read_csv(
    'Results_Fold_changes_all_combination_Sly_20180124.txt',
    sep='\t', header=0, index_col=None,
)
# Keep only the genes annotated in the 85 pathways and index them by gene ID.
exp_85 = exp[exp['gene'].isin(gene)].set_index('gene')
# Put labels and expression values side by side, aligned on gene ID.
res = pd.concat([gp, exp_85], axis=1)
# Save the multilabel matrix.
res.to_csv('Multilabel_85_pathways_matrix_all_FC.txt', sep="\t", index=True, header=True)

In [67]:
# NOTE(review): this loads the *_all_FPKM.txt matrix, while the only matrix
# written earlier in this notebook is *_all_FC.txt -- confirm where the FPKM
# file is produced.
df = pd.read_csv(
    'Multilabel_85_pathways_matrix_all_FPKM.txt',
    sep='\t', header=0, index_col=0,
)
# The first 85 columns are taken as pathway labels, the rest as features.
y = df.iloc[:, :85]
X = df.iloc[:, 85:]
# Number of pathways each gene is annotated to.
freq = y.astype(bool).sum(axis=1)
# Genes annotated to more than one pathway ...
freq2 = freq[freq > 1]
# ... and their label rows.
y2 = y[y.index.isin(freq2.index)]

In [69]:
# Number of multi-pathway genes annotated to each pathway.
freq3 = y2.astype(bool).sum(axis=0)
# Exploratory look-ups: neither is the last expression of the cell, so the
# notebook does not display them.
sorted(freq3, reverse=True)
freq3[freq3 > 10]  # pathways with more than 10 genes
# Labels of all genes, restricted to five hard-coded pathway IDs.
y3 = y[['PWY-321', 'PWY-361', 'PWY-5690', 'PWY-6443', 'PWY-6733']]

In [79]:
# Drop genes that carry none of the 5 selected pathway labels.
freq4 = y3.astype(bool).sum(axis=1)
freq5 = freq4[freq4 > 0]
y4 = y3[y3.index.isin(freq5.index)]
y4.head()

Unnamed: 0,PWY-321,PWY-361,PWY-5690,PWY-6443,PWY-6733
NP_001234001.1,0.0,0.0,1.0,0.0,0.0
NP_001234005.1,0.0,0.0,1.0,0.0,0.0
NP_001234277.1,0.0,0.0,1.0,0.0,0.0
NP_001234574.1,0.0,0.0,1.0,0.0,0.0
NP_001234767.1,0.0,0.0,0.0,1.0,0.0


In [88]:
# Keep only the feature rows of genes retained in y4, then store labels and
# features side by side (labels first).
X = X[X.index.isin(y4.index)]
y = y4
df = pd.concat([y, X], axis=1)
df.to_csv('Multilabel_5_pathways_matrix_all_FPKM.txt', sep="\t", index=True, header=True)

# Split the data to training and test

In [90]:
df = pd.read_csv(
    'Multilabel_5_pathways_matrix_all_FPKM.txt',
    sep='\t', header=0, index_col=0,
)
# First 5 columns are the pathway labels, the remaining columns the features.
y = df.iloc[:, :5]
X = df.iloc[:, 5:]

#### StratifiedKFold can't be used in multi-label cases

In [None]:
#from sklearn.model_selection import StratifiedKFold
#skfolds = StratifiedKFold(n_splits=5, random_state=42,shuffle=True)
#train_index, test_index = skfolds.split(X, y)
# ValueError: Supported target types are: ('binary', 'multiclass'). Got 'multilabel-indicator' instead.

#### Note: IterativeStratification cannot fix the random state, so every run produces a different split

In [None]:
import skmultilearn
from skmultilearn.model_selection import IterativeStratification
import numpy as np
#k_fold = IterativeStratification(n_splits=5, order=1)
#Split = pd.DataFrame(k_fold.split(X, y))
#X_train = X.iloc[Split.iloc[0,0],:]
#y_train = y.iloc[Split.iloc[0,0],:]
#X_test = X.iloc[Split.iloc[0,1],:]
#y_test = y.iloc[Split.iloc[0,1],:]
#ID_train = y_train.index.tolist()
#pd.DataFrame(ID_train).to_csv('Multilable_training_ID.txt',index=False, header=False,sep="\t")
#ID_test = y_test.index.tolist()
#pd.DataFrame(ID_test).to_csv('Multilable_test_ID.txt',index=False, header=False,sep="\t")
# count non zeros for each column (pathway annotation)
#y_train.astype(bool).sum(axis=0)
#y_test.astype(bool).sum(axis=0)

# reload the saved training and test gene IDs

In [115]:
df = pd.read_csv(
    'Multilabel_5_pathways_matrix_all_FPKM.txt',
    sep='\t', header=0, index_col=0,
)
# Gene IDs of the saved split: one ID per line, no header.
train_id = pd.read_csv('Multilable_training_ID.txt', sep='\t', header=None, index_col=None)
test_id = pd.read_csv('Multilable_test_ID.txt', sep='\t', header=None, index_col=None)
# Select rows by gene ID, then split the columns: the first 5 are labels (y),
# the rest are features (X).
X_train = df.loc[df.index.isin(train_id[0])].iloc[:, 5:]
y_train = df.loc[df.index.isin(train_id[0])].iloc[:, :5]
X_test = df.loc[df.index.isin(test_id[0])].iloc[:, 5:]
y_test = df.loc[df.index.isin(test_id[0])].iloc[:, :5]

(33, 5)

# Impute the missing data

In [116]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import KNNImputer
class KNNImputer_Ks(BaseEstimator, TransformerMixin):
    """Impute missing values as the mean of several KNN imputations.

    One ``KNNImputer`` is fitted per neighbour count ``k``; ``transform``
    imputes the input with every fitted imputer and returns the element-wise
    average of the results.

    Parameters
    ----------
    *Ks : int or str
        Neighbour counts, either as separate integers
        (``KNNImputer_Ks(3, 5, 7)``) or as one comma-separated string
        (``KNNImputer_Ks("3,5,7")``). When omitted, ``DEFAULT_KS`` is used.

    Attributes
    ----------
    imputers_ : dict
        Maps each ``k`` to its fitted ``KNNImputer``; set by ``fit``.

    Notes
    -----
    The fitted imputers are stored on the instance, so ``transform`` no longer
    depends on a module-level variable named ``D_imputer``.

    NOTE(review): the variadic ``__init__`` is kept for backward
    compatibility; scikit-learn's parameter introspection (``get_params``,
    ``clone``) expects explicit keyword parameters -- confirm before using
    this class inside ``GridSearchCV`` or ``Pipeline``.
    """

    # Neighbour counts used when none are supplied (the previously hard-coded set).
    DEFAULT_KS = (3, 4, 5, 6, 7)

    def __init__(self, *Ks):
        self.Ks = Ks

    @staticmethod
    def _parse_ks(Ks):
        """Normalise ``Ks`` to a tuple of ints; return ``()`` when nothing is given."""
        if Ks is None:
            return ()
        # A lone string or scalar is treated as a one-element collection.
        if isinstance(Ks, str) or not hasattr(Ks, '__iter__'):
            Ks = (Ks,)
        parsed = []
        for item in Ks:
            if isinstance(item, str):
                # Accept comma-separated specifications such as "3,4,5".
                parsed.extend(int(tok) for tok in item.split(',') if tok.strip())
            else:
                parsed.append(int(item))
        return tuple(parsed)

    def fit(self, X, Ks=None):
        """Fit one ``KNNImputer`` per neighbour count on ``X``.

        Parameters
        ----------
        X : array-like
            Training data, passed unchanged to ``KNNImputer.fit``.
        Ks : str, int or iterable of int, optional
            Neighbour counts for this fit (e.g. ``"3,4,5,6,7"``). Falls back
            to the constructor arguments, then to ``DEFAULT_KS``.

        Returns
        -------
        dict
            ``{k: fitted KNNImputer}``. The dict (rather than ``self``) is
            returned so that existing callers keep working; the same dict is
            stored as ``self.imputers_``.
        """
        ks = self._parse_ks(Ks) or self._parse_ks(self.Ks) or self.DEFAULT_KS
        self.imputers_ = {k: KNNImputer(n_neighbors=k).fit(X) for k in ks}
        return self.imputers_

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``X`` and return its averaged imputation.

        Overrides the mixin implementation, which chains ``fit(...).transform``
        and therefore needs ``fit`` to return the estimator itself.
        """
        self.fit(X, **fit_params)
        return self.transform(X)

    def transform(self, X):
        """Return the mean of the per-``k`` imputations of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Data to impute; its index and columns are reused for the result.

        Returns
        -------
        pandas.DataFrame
            Average over all fitted imputers.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called on this instance.
        """
        imputers = getattr(self, 'imputers_', None)
        if not imputers:
            raise RuntimeError(
                "This KNNImputer_Ks instance is not fitted yet; "
                "call fit() before transform()."
            )
        total = None
        for imputer in imputers.values():
            imputed = pd.DataFrame(imputer.transform(X), index=X.index, columns=X.columns)
            total = imputed if total is None else total.add(imputed, fill_value=0)
        # Divide by the number of imputers actually fitted, not a fixed 5.
        return total / len(imputers)

In [117]:
# Fit the multi-k KNN imputer on the training features only; fit returns the
# dict of fitted per-k imputers.
# NOTE(review): keep the name `D_imputer` unless KNNImputer_Ks.transform is
# confirmed not to look the fitted imputers up under that module-level name.
imputer_knn = KNNImputer_Ks()
D_imputer = imputer_knn.fit(X_train, Ks="3,4,5,6,7")
# Both sets are imputed with the imputers fitted on the training data.
X_train_KNN = imputer_knn.transform(X_train)
X_test_KNN = imputer_knn.transform(X_test)

### Scale the data

In [118]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the imputed training features only, then
# apply the same transformation to both sets.
scaler = StandardScaler().fit(X_train_KNN)
X_train_KNN = scaler.transform(X_train_KNN)
X_test_KNN = scaler.transform(X_test_KNN)

# KNeighborsClassifier

### Grid Search

In [138]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
knn_clf = KNeighborsClassifier()
# Hyper-parameter grid for the KNN classifier: 7 x 2 x 3 = 42 combinations.
# Note: 'minkowski' with the classifier's default p=2 is the same distance as
# 'euclidean'.
param_grid = {
    'n_neighbors': [3, 4, 5, 10, 15, 20, 25],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
}
# add randomize search
# cut down the feature size to 30
# top 5 pathways


In [139]:
# Exhaustive 5-fold search over the KNN grid, scored by weighted F1 and run
# on 5 parallel workers.
gs = GridSearchCV(
    estimator=knn_clf,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
    n_jobs=5,
    verbose=2,
)
gs.fit(X_train_KNN, y_train)

Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:    2.2s
[Parallel(n_jobs=5)]: Done 210 out of 210 | elapsed:    2.8s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=5,
             param_grid={'metric': ['euclidean', 'manhattan', 'minkowski'],
                         'n_neighbors': [3, 4, 5, 10, 15, 20, 25],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_weighted', verbose=2)

In [140]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
# Rebuild the classifier from the best grid-search parameters.
parameter2use = gs.best_params_
KNC_clf = KNeighborsClassifier(
    n_neighbors=parameter2use['n_neighbors'],
    weights=parameter2use['weights'],
    metric=parameter2use['metric'],
)
# Out-of-fold predictions and per-fold weighted F1 on the training set.
cv_pred = cross_val_predict(KNC_clf, X_train_KNN, y_train, cv=5)
cv_score = cross_val_score(KNC_clf, X_train_KNN, y_train, cv=5, scoring='f1_weighted')
# using three F1 values: macro, average, weighted
cv_score

array([0.46151055, 0.20147293, 0.35882353, 0.40320513, 0.27708333])

In [141]:
# Number of non-zero cross-validated predictions in each label column.
pd.DataFrame(cv_pred).astype(bool).sum(axis=0)

0    23
1    21
2    22
3    33
4    26
dtype: int64

In [142]:
from sklearn.metrics import f1_score
# Weighted F1 over all labels for the cross-validated predictions ...
F1_weighted_cv = f1_score(y_true=y_train, y_pred=cv_pred, average='weighted')
# ... and the individual per-label F1 scores (displayed).
f1_score(y_true=y_train, y_pred=cv_pred, average=None)

array([0.24489796, 0.30769231, 0.3902439 , 0.39506173, 0.36363636])

In [144]:
# Refit on the whole training set, predict the held-out test set and report
# the weighted F1.
test_pred = KNC_clf.fit(X_train_KNN, y_train).predict(X_test_KNN)
F1_weighted_test = f1_score(y_true=y_test, y_pred=test_pred, average='weighted')
F1_weighted_test

0.4915069057926201

In [146]:
# Macro F1 on the test set: the unweighted mean of the per-label F1 scores.
F1_macro_test = f1_score(y_test,test_pred,average = 'macro')
F1_macro_test

0.48718614718614717

In [152]:
# average=None returns one F1 score per label instead of a single aggregate.
F1_test = f1_score(y_test,test_pred,average = None)
F1_test

array([0.42857143, 0.42857143, 0.5       , 0.54545455, 0.53333333])

In [148]:
# 'samples' averaging: compute F1 for each instance and average over instances
# (only meaningful for multilabel classification, where it differs from
# accuracy_score).
F1_sample_test = f1_score(y_test,test_pred,average = 'samples')
F1_sample_test

0.4444444444444444

### f1 score for each label

In [None]:
# Per-label F1 on the test set (same call as the one stored in F1_test).
f1_score(y_test,test_pred,average=None)

# RandomForestClassifier

### Grid Search

In [128]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Seed the forest so that repeated grid searches select the same parameters;
# 42 matches the seed used when the final RandomForest model is rebuilt.
RF_clf = RandomForestClassifier(random_state=42)
# Hyper-parameter grid for the random forest: 3 x 5 x 4 = 60 combinations.
param_grid = {
    'max_depth': [3, 5, 10],
    'max_features': [0.1, 0.5, 'sqrt', 'log2', None],
    'n_estimators': [10, 100, 500, 1000],
}

In [129]:
# Exhaustive 5-fold search over the random-forest grid, scored by weighted F1
# and run on 5 parallel workers. Note that this rebinds `gs`.
gs = GridSearchCV(
    estimator=RF_clf,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
    n_jobs=5,
    verbose=2,
)
gs.fit(X_train_KNN, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:   12.9s
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:  1.1min
[Parallel(n_jobs=5)]: Done 300 out of 300 | elapsed:  2.5min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [130]:
# Best parameter combination found by the most recently fitted `gs`.
gs.best_params_

{'max_depth': 5, 'max_features': 0.5, 'n_estimators': 10}

In [147]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
# `gs` is rebound by every grid-search cell in this notebook. If the KNN
# search ran last, its best_params_ carry no random-forest keys and this cell
# used to fail with a bare KeyError: 'max_depth'. Check up front and say what
# to do about it.
parameter2use = gs.best_params_
#parameter2use = {'max_depth':3,'max_features': 0.1,'n_estimators':100}
_missing_rf_params = [
    name for name in ('max_depth', 'max_features', 'n_estimators')
    if name not in parameter2use
]
if _missing_rf_params:
    raise KeyError(
        "gs.best_params_ lacks %s: `gs` currently holds a different grid "
        "search; re-run the RandomForest grid-search cell before this one."
        % _missing_rf_params
    )
RF_clf = RandomForestClassifier(
    max_depth=parameter2use['max_depth'],
    max_features=parameter2use['max_features'],
    n_estimators=parameter2use['n_estimators'],
    random_state=42,
)
# Per-fold weighted F1 and out-of-fold predictions on the training set.
cv_score = cross_val_score(estimator=RF_clf, X=X_train_KNN, y=y_train,
                           cv=5, scoring='f1_weighted')
cv_pred = cross_val_predict(estimator=RF_clf, X=X_train_KNN, y=y_train,
                            cv=5)
cv_score

KeyError: 'max_depth'

In [133]:
# Number of non-zero cross-validated predictions in each label column.
pd.DataFrame(cv_pred).astype(bool).sum(axis=0)

0     6
1    21
2    11
3    38
4    11
dtype: int64

In [134]:
from sklearn.metrics import f1_score
# Weighted F1 over all labels for the cross-validated predictions ...
F1_weighted_cv = f1_score(y_true=y_train, y_pred=cv_pred, average='weighted')
# ... and the individual per-label F1 scores (displayed).
f1_score(y_true=y_train, y_pred=cv_pred, average=None)

array([0.0625    , 0.15384615, 0.6       , 0.25581395, 0.1       ])

In [135]:
# Refit on the whole training set, predict the held-out test set and report
# the weighted F1.
test_pred = RF_clf.fit(X_train_KNN, y_train).predict(X_test_KNN)
F1_weighted_test = f1_score(y_true=y_test, y_pred=test_pred, average='weighted')
F1_weighted_test

In [136]:
# Macro F1 on the test set: the unweighted mean of the per-label F1 scores.
F1_macro_test = f1_score(y_test,test_pred,average = 'macro')
F1_macro_test

0.3278688524590164