In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bf-model-dev-copy/config.json
/kaggle/input/bf-model-dev-copy/tokenizer.json
/kaggle/input/bf-model-dev-copy/tf_model.h5
/kaggle/input/bf-model-dev-copy/tokenizer_config.json
/kaggle/input/bf-model-dev-copy/tf_model.preproc
/kaggle/input/bf-model-dev-copy/special_tokens_map.json
/kaggle/input/bf-model-dev-copy/vocab.txt
/kaggle/input/test-val/test_data.tsv
/kaggle/input/test-val/val_data.tsv


<h1> Using PubMed Article Classifier - Tutorial </h1>

In [None]:
!pip install -q ktrain
!pip install -q cached-path

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
yellowbrick 1.4 requires scikit-learn>=1.0.0, but you have scikit-learn 0.24.2 which is incompatible.
pdpbox 0.2.1 requires matplotlib==3.1.1, but you have matplotlib 3.5.1 which is incompatible.
imbalanced-learn 0.9.0 requires scikit-learn>=1.0.1, but you have scikit-learn 0.24.2 which is incompatible.
featuretools 1.6.0 requires numpy>=1.21.0, but you have numpy 1.20.3 which is incompatible.[0m


<h3> Below we import the necessary libraries, turn on automatic mixed precision (AMP; optional), and show two ways to load our model </h3>

In [None]:
import tensorflow as tf
import ktrain 
from cached_path import cached_path


# Activate AMP (automatic mixed precision) - this is optional and may improve performance.
# The stable `tf.keras.mixed_precision` API is used here: the `experimental` namespace and
# its `set_policy` were deprecated in TensorFlow 2.4 and are no longer available in newer
# TensorFlow releases, where the stable replacement is `set_global_policy`.
from tensorflow.keras import mixed_precision
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

# Load model (method 1 - from local directory)
#predictor=ktrain.load_predictor('../input/bf-model-dev-copy')

# Load model (method 2 - from github release)
# The release archive is fetched (and cached) by cached_path; extract_archive=True asks it
# to unpack the zip, and the resulting local path is handed to ktrain.
model_path = cached_path("https://github.com/PathwayCommons/pathway-abstract-classifier/releases/download/pretrained-models/title_abstract_model.zip", extract_archive=True)
predictor = ktrain.load_predictor(model_path)

downloading:   0%|          | 0.00/389M [00:00<?, ?iB/s]

<h3> Here we read in our validation data, then pre-process the input data and make predictions (showing two different ways to do this). Finally, we pre-process the existing labels so that we can calculate performance metrics </h3>

In [None]:
# Read in validation data
df=pd.read_csv('../input/test-val/val_data.tsv', delimiter="\t")

#pre-process input data and make predictions (method 1 - using lists)
titles=df['title'].tolist()
abstracts=df['abstract'].tolist()
sep_token = predictor.preproc.get_tokenizer().sep_token
texts = [" ".join([title, sep_token, abstract]) for title, abstract in zip(titles, abstracts)] 
# Make predictions, checking how long they take to make (note- there are 1042 examples here this is using a (likely NVIDIA P100) GPU)
%time predictions=predictor.predict((texts))

#pre-process input data and make predictions (method 2 - using dataframes, hard coded SEP token)
#texts=(df['title'].str.strip() + ' [SEP] ' + df['abstract'].str.strip()).to_numpy()
#predictions2=predictor.predict((texts))

# pre process existing labels 
df['class']=df['class'].astype('bool')
y_val=df['class'].to_numpy()

<h3> Checking performance now </h3>

In [None]:
# Score the default-threshold predictions against the validation labels
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    matthews_corrcoef,
)

# Printed in order: confusion matrix, per-class precision/recall/F1 report,
# and the Matthews correlation coefficient
for metric in (confusion_matrix, classification_report, matthews_corrcoef):
    print(metric(y_val, predictions))

[[797  45]
 [ 47 153]]
              precision    recall  f1-score   support

       False       0.94      0.95      0.95       842
        True       0.77      0.77      0.77       200

    accuracy                           0.91      1042
   macro avg       0.86      0.86      0.86      1042
weighted avg       0.91      0.91      0.91      1042

0.7142926808037249


<h3> Let's try altering the confidence threshold now (raising it will make our classifier more conservative) </h3>

In [None]:
# The default decision threshold is 0.5. In practice the threshold has to go
# above 0.9 before it starts to have a real effect on the predictions.
threshold = 0.994

# Column 1 of predict_proba is compared against the threshold to obtain boolean predictions
positive_proba = predictor.predict_proba(texts)[:, 1]
conf_predictions = (positive_proba >= threshold).astype(bool)

# Same three metrics as before, now for the thresholded predictions
for metric in (confusion_matrix, classification_report, matthews_corrcoef):
    print(metric(y_val, conf_predictions))

[[838   4]
 [121  79]]
              precision    recall  f1-score   support

       False       0.87      1.00      0.93       842
        True       0.95      0.40      0.56       200

    accuracy                           0.88      1042
   macro avg       0.91      0.70      0.74      1042
weighted avg       0.89      0.88      0.86      1042

0.5676294827683775


<h3> Now let's try a single example (with an explanation) </h3>

In [None]:
!pip install -q https://github.com/amaiya/eli5/archive/refs/heads/tfkeras_0_10_1.zip

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# A single article (title + abstract) to classify
title = "YTHDC1-mediated augmentation of miR-30d in repressing pancreatic tumorigenesis via attenuation of RUNX1-induced transcriptional activation of Warburg effect"
abstract = "Pancreatic ductal adenocarcinoma (PDAC) is one of the most lethal human cancers. It thrives in a malnourished environment; however, little is known about the mechanisms by which PDAC cells actively promote aerobic glycolysis to maintain their metabolic needs. Gene Expression Omnibus (GEO) was used to identify differentially expressed miRNAs. The expression pattern of miR-30d in normal and PDAC tissues was studied by in situ hybridization. The role of miR-30d/RUNX1 in vitro and in vivo was evaluated by CCK8 assay and clonogenic formation as well as transwell experiment, subcutaneous xenograft model and liver metastasis model, respectively. Glucose uptake, ATP and lactate production were tested to study the regulatory effect of miR-30d/RUNX1 on aerobic glycolysis in PDAC cells. Quantitative real-time PCR, western blot, Chip assay, promoter luciferase activity, RIP, MeRIP, and RNA stability assay were used to explore the molecular mechanism of YTHDC1/miR-30d/RUNX1 in PDAC. Here, we discover that miR-30d expression was remarkably decreased in PDAC tissues and associated with good prognosis, contributed to the suppression of tumor growth and metastasis, and attenuation of Warburg effect. Mechanistically, the m6A reader YTHDC1 facilitated the biogenesis of mature miR-30d via m6A-mediated regulation of mRNA stability. Then, miR-30d inhibited aerobic glycolysis through regulating SLC2A1 and HK1 expression by directly targeting the transcription factor RUNX1, which bound to the promoters of the SLC2A1 and HK1 genes. Moreover, miR-30d was clinically inversely correlated with RUNX1, SLC2A1 and HK1, which function as adverse prognosis factors for overall survival in PDAC tissues. Overall, we demonstrated that miR-30d is a functional and clinical tumor-suppressive gene in PDAC. Our findings further uncover that miR-30d is a novel target for YTHDC1 through m6A modification, and miR-30d represses pancreatic tumorigenesis via suppressing aerobic glycolysis."

# Join title and abstract with a hard coded " [SEP] " separator, then print the predicted label
single_text = f"{title} [SEP] {abstract}"
print(predictor.predict(single_text))

1


In [None]:
# Explain the prediction for the same "<title> [SEP] <abstract>" input.
# Kept as the cell's final bare expression so the notebook renders the returned explanation.
predictor.explain(f"{title} [SEP] {abstract}")

Contribution?,Feature
19.582,Highlighted in text (sum)
-0.981,<BIAS>


<h3> https://github.com/amaiya/ktrain/issues/234 suggests that the explanation above may be finicky in Kaggle notebooks (sometimes the output may not display). It should work more consistently in other environments, though </h3>