In [32]:
# set the directory to where the data is
import os

os.chdir(r"D:\Gene_Project")

In [33]:
# pandas for dealing with the data
import pandas as pd
# setting for seeing the entire string
pd.options.display.max_colwidth = None
pd.set_option('display.max_rows', 500)

In [34]:
# load the data - new data that was provided
data = pd.read_csv(r"pul_seq_low_high_substr_year_corrected.csv")

In [35]:
data.head()

Unnamed: 0,PULid,sig_gene_seq,low_level_substr,high_level_substr,Pub_year
0,PUL0001,"GH1,8.A.49,CE2,GH130,GH130,3.A.1,3.A.1,SBP_bac_1,LacI,GH36,GH113",beta-mannan,beta-mannan,2019
1,PUL0002,GH16,lichenan,beta-glucan,1996
2,PUL0003,"GH30_8,GH43_16|CBM6",xylan,xylan,2016
3,PUL0004,"4.A.1,GH1","glucose,cellobiose,maltose",multiple_substrates,2016
4,PUL0005,"GH94,GH3","beta-glucan,sophorose,laminaribiose",multiple_substrates,2016


In [36]:
from sklearn.pipeline import Pipeline

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
vectorizer_word = CountVectorizer(tokenizer=lambda x: str(x).replace("|", ",").split(','), lowercase = False, 
                                 ngram_range=(1,2))

In [39]:
from sklearn.model_selection import GridSearchCV

In [40]:
data["high_level_substr"].value_counts()

multiple_substrates           139
mono/di/trisaccharide          78
capsule polysaccharide         60
algal glycans                  40
pectin                         38
xylan                          38
O-antigen                      37
galactan                       36
alpha-glucan                   23
beta-mannan                    19
cellulose                      18
chitin                         15
glycosaminoglycan              14
beta-glucan                    13
N-glycan                       12
exopolysaccharide              10
-                               9
fructan                         8
alpha-mannan                    8
host glycan                     7
xyloglucan                      7
plant polysaccharide            6
glycoprotein                    6
human milk oligosaccharide      5
hemicellulose                   5
O-glycan                        3
Name: high_level_substr, dtype: int64

In [41]:
# selected_classes_high_level = ['capsule polysaccharide', 'algal glycans', 'xylan', 'pectin']

In [42]:
selected_classes_high_level = ['algal glycans']

In [43]:
# class_proportions = data["high_level_substr"].value_counts()/data["high_level_substr"].value_counts().sum()

In [44]:
data_model = data.copy()

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
# selected_classes_low_level = ["O-glycan,N-glycan"]

In [47]:
from sklearn.svm import OneClassSVM

In [48]:
from sklearn.preprocessing import FunctionTransformer

In [49]:
# from metric_learn import NCA

In [50]:
from sklearn.ensemble import IsolationForest

In [51]:
from sklearn.neighbors import LocalOutlierFactor

In [52]:
from sklearn.covariance import EllipticEnvelope

In [53]:
from imblearn.ensemble import BalancedRandomForestClassifier

In [54]:
from sklearn.metrics import make_scorer, f1_score, classification_report, confusion_matrix

In [55]:
import numpy as np

In [56]:
f1 = make_scorer(f1_score, pos_label = -1)

In [57]:
def run_one_class(method_type, param_grid, selected_classes_high_level, inlier_outlier):
    data_model = data.copy()
    clf = Pipeline([('vectorizer',CountVectorizer(tokenizer=lambda x: str(x).replace("|", ",").split(','), lowercase = False, 
                                             ngram_range=(1,2))),
                 ('ft', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)), 
#                 ('nca',NCA()), 
                ('brf',method_type)
               ])
    if inlier_outlier == -1:
        data_model["inlier_outlier"] = [-1 if classes in selected_classes_high_level else 1 for classes in data_model["high_level_substr"]]
        f1 = make_scorer(f1_score, pos_label = -1)
    else:
        data_model["inlier_outlier"] = [1 if classes in selected_classes_high_level else -1 for classes in data_model["high_level_substr"]]
        f1 = make_scorer(f1_score, pos_label = 1)
#     data_model["inlier_outlier"] = [-1 if classes in selected_classes_high_level else 1 for classes in data_model["high_level_substr"]]
    X_train, X_test, y_train, y_test = train_test_split(data_model["sig_gene_seq"],
                                                    data_model["inlier_outlier"], test_size=0.3,
                                                    stratify=data_model["inlier_outlier"], 
                                                       random_state = 42)
    train_data = pd.concat([X_train, y_train],1)
    test_data = pd.concat([X_test, y_test],1)
    gsearch = GridSearchCV(clf, param_grid, cv = 5, n_jobs = 6, scoring = f1, verbose =2)
    gsearch.fit(train_data["sig_gene_seq"].values, train_data["inlier_outlier"].values)
    print("best score")
    print(gsearch.best_score_)
    y_test_pred = gsearch.predict(test_data["sig_gene_seq"].values)
    print("classification report")
    print(classification_report(y_test, y_test_pred))
    print("confusion matrix for test")
    print(confusion_matrix(y_test, y_test_pred, normalize  = 'true'))

In [58]:
run_one_class(OneClassSVM(), {"brf__nu": np.arange(0.1,1.04,0.05)}, "capsule polysaccharide", 1)

  train_data = pd.concat([X_train, y_train],1)
  test_data = pd.concat([X_test, y_test],1)


Fitting 5 folds for each of 19 candidates, totalling 95 fits
best score
0.175448129204935
classification report
              precision    recall  f1-score   support

          -1       0.82      0.10      0.18       179
           1       0.08      0.78      0.15        18

    accuracy                           0.16       197
   macro avg       0.45      0.44      0.16       197
weighted avg       0.75      0.16      0.18       197

confusion matrix for test
[[0.10055866 0.89944134]
 [0.22222222 0.77777778]]


5 fits failed out of a total of 95.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\vedpi\anaconda3\envs\newone\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\vedpi\anaconda3\envs\newone\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\vedpi\anaconda3\envs\newone\lib\site-packages\sklearn\svm\_classes.py", line 1631, in fit
    super().fit(X, np.ones(_num_samples(X)), sample_weight=sample_weight)
  File "C:\Users\vedpi\anaconda3\envs\newone\lib\site-packages

In [59]:
run_one_class(IsolationForest(), {"brf__n_estimators": [100,300,500]}, "capsule polysaccharide", 1)

  train_data = pd.concat([X_train, y_train],1)
  test_data = pd.concat([X_test, y_test],1)


Fitting 5 folds for each of 3 candidates, totalling 15 fits
best score
0.16755920271667415
classification report
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       179
           1       0.09      1.00      0.17        18

    accuracy                           0.09       197
   macro avg       0.05      0.50      0.08       197
weighted avg       0.01      0.09      0.02       197

confusion matrix for test
[[0. 1.]
 [0. 1.]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [60]:
run_one_class(LocalOutlierFactor(novelty = True), {"brf__n_neighbors": [2,5,10,20]}, "capsule polysaccharide", 1)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  train_data = pd.concat([X_train, y_train],1)
  test_data = pd.concat([X_test, y_test],1)


best score
0.22554243203424526
classification report
              precision    recall  f1-score   support

          -1       0.90      0.26      0.40       179
           1       0.09      0.72      0.16        18

    accuracy                           0.30       197
   macro avg       0.50      0.49      0.28       197
weighted avg       0.83      0.30      0.38       197

confusion matrix for test
[[0.25698324 0.74301676]
 [0.27777778 0.72222222]]


In [61]:
run_one_class(EllipticEnvelope(), {"brf__assume_centered": [False]}, "capsule polysaccharide",1)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


  train_data = pd.concat([X_train, y_train],1)
  test_data = pd.concat([X_test, y_test],1)


best score
0.031895253682487724
classification report
              precision    recall  f1-score   support

          -1       0.85      0.47      0.60       179
           1       0.03      0.17      0.05        18

    accuracy                           0.44       197
   macro avg       0.44      0.32      0.33       197
weighted avg       0.77      0.44      0.55       197

confusion matrix for test
[[0.46927374 0.53072626]
 [0.83333333 0.16666667]]


In [62]:
run_one_class(BalancedRandomForestClassifier(), 
              {"brf__n_estimators": [100,300,500]}, "capsule polysaccharide",1)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


  train_data = pd.concat([X_train, y_train],1)
  test_data = pd.concat([X_test, y_test],1)


best score
0.7577415761626287
classification report
              precision    recall  f1-score   support

          -1       1.00      0.91      0.95       179
           1       0.51      1.00      0.68        18

    accuracy                           0.91       197
   macro avg       0.76      0.95      0.81       197
weighted avg       0.96      0.91      0.93       197

confusion matrix for test
[[0.90502793 0.09497207]
 [0.         1.        ]]


In [63]:
run_one_class(BalancedRandomForestClassifier(), 
              {"brf__n_estimators": [100,300,500]}, "xylan",1)

  train_data = pd.concat([X_train, y_train],1)
  test_data = pd.concat([X_test, y_test],1)


Fitting 5 folds for each of 3 candidates, totalling 15 fits
best score
0.4934046345811051
classification report
              precision    recall  f1-score   support

          -1       0.98      0.98      0.98       186
           1       0.67      0.73      0.70        11

    accuracy                           0.96       197
   macro avg       0.83      0.85      0.84       197
weighted avg       0.97      0.96      0.97       197

confusion matrix for test
[[0.97849462 0.02150538]
 [0.27272727 0.72727273]]


In [64]:
run_one_class(BalancedRandomForestClassifier(), 
              {"brf__n_estimators": [100,300,500]}, "pectin",1)

  train_data = pd.concat([X_train, y_train],1)
  test_data = pd.concat([X_test, y_test],1)


Fitting 5 folds for each of 3 candidates, totalling 15 fits
best score
0.38259876997907094
classification report
              precision    recall  f1-score   support

          -1       0.99      0.91      0.95       186
           1       0.38      0.91      0.54        11

    accuracy                           0.91       197
   macro avg       0.69      0.91      0.75       197
weighted avg       0.96      0.91      0.93       197

confusion matrix for test
[[0.91397849 0.08602151]
 [0.09090909 0.90909091]]


In [65]:
run_one_class(BalancedRandomForestClassifier(), 
              {"brf__n_estimators": [100,300,500]}, "O-antigen",1)

  train_data = pd.concat([X_train, y_train],1)
  test_data = pd.concat([X_test, y_test],1)


Fitting 5 folds for each of 3 candidates, totalling 15 fits
best score
0.32147186147186146
classification report
              precision    recall  f1-score   support

          -1       1.00      0.75      0.86       183
           1       0.23      1.00      0.38        14

    accuracy                           0.77       197
   macro avg       0.62      0.87      0.62       197
weighted avg       0.95      0.77      0.82       197

confusion matrix for test
[[0.74863388 0.25136612]
 [0.         1.        ]]
