In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/all-labelled/all_labelled.tsv
/kaggle/input/test-val/test_data.tsv
/kaggle/input/test-val/val_data.tsv
/kaggle/input/top-losses-cl/top_losses_cl.tsv


In [2]:
# Read the three tab-separated inputs: the labelled training set, the validation
# split, and the rows Cleanlab flagged as being mislabelled.
train, validation, cl = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "../input/all-labelled/all_labelled.tsv",
        "../input/test-val/val_data.tsv",
        "../input/top-losses-cl/top_losses_cl.tsv",
    )
)

In [3]:
# Cast the `class` label column of each frame to bool so that later
# preprocessing (e.g. negating the flagged labels with `~`) works on one dtype.
for labelled_frame in (train, validation, cl):
    labelled_frame['class'] = labelled_frame['class'].astype('bool')

In [4]:
# Show the training rows whose uid also appears in the Cleanlab-flagged set,
# i.e. the rows whose labels are inverted in the next step.
flagged_rows = train[train['uid'].isin(cl['uid'])]
print(flagged_rows)

           uid                                           abstract  \
7     31969690  Neonatal mammalian heart maintains a transient...   
10    31959915  The identification of viability-associated lon...   
20    32127658  Autophagy is a cellular catabolic process that...   
33    32447347  Senescence is accompanied with histones level ...   
50    32376875  We have previously reported that Monoglyceride...   
...        ...                                                ...   
8491  33555257  The Hippo (Hpo) pathway regulates tissue growt...   
8613  33459596  Germline mutations in the Folliculin (FLCN) tu...   
8643  33438577  The microphthalmia-associated transcription fa...   
8648  33432928  Skeletal muscle possesses an outstanding capac...   
8669  33416496  The oncoprotein transcription factor MYC is a ...   

                                                  title  \
7     Targeting LncDACH1 promotes cardiac repair and...   
10    LINC00662 promotes hepatocellular carcinoma pr.

In [5]:
# Invert the label of every training row that Cleanlab flagged as incorrect.
# NOTE: the negation is applied in place, so running this cell a second time
# restores the original labels.
is_flagged = train['uid'].isin(cl['uid'])
train.loc[is_flagged, 'class'] = ~train.loc[is_flagged, 'class']

In [6]:
# Re-display the flagged training rows so the inverted labels can be checked.
flagged_after_flip = train[train['uid'].isin(cl['uid'])]
print(flagged_after_flip)

           uid                                           abstract  \
7     31969690  Neonatal mammalian heart maintains a transient...   
10    31959915  The identification of viability-associated lon...   
20    32127658  Autophagy is a cellular catabolic process that...   
33    32447347  Senescence is accompanied with histones level ...   
50    32376875  We have previously reported that Monoglyceride...   
...        ...                                                ...   
8491  33555257  The Hippo (Hpo) pathway regulates tissue growt...   
8613  33459596  Germline mutations in the Folliculin (FLCN) tu...   
8643  33438577  The microphthalmia-associated transcription fa...   
8648  33432928  Skeletal muscle possesses an outstanding capac...   
8669  33416496  The oncoprotein transcription factor MYC is a ...   

                                                  title  \
7     Targeting LncDACH1 promotes cardiac repair and...   
10    LINC00662 promotes hepatocellular carcinoma pr.

In [7]:
# Clean the training frame: discard rows containing any missing value, then
# remove exact duplicate rows.
train = train.dropna().drop_duplicates()

In [8]:
!pip install -q ktrain 

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
yellowbrick 1.4 requires scikit-learn>=1.0.0, but you have scikit-learn 0.24.2 which is incompatible.
pdpbox 0.2.1 requires matplotlib==3.1.1, but you have matplotlib 3.5.1 which is incompatible.
imbalanced-learn 0.9.0 requires scikit-learn>=1.0.1, but you have scikit-learn 0.24.2 which is incompatible.[0m[31m
[0m

In [9]:
import tensorflow as tf 
import ktrain 
from ktrain import text 
from sklearn.model_selection import train_test_split
# Enable AMP (automatic mixed precision) for all Keras layers created afterwards.
# This uses the stable `tf.keras.mixed_precision` API (available since TF 2.4)
# rather than the deprecated `experimental` module, whose `set_policy` was
# renamed to `set_global_policy`.
from tensorflow.keras import mixed_precision
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

In [10]:
# Set random seed for reproducibility 
import os
import random
def reset_random_seeds(seed=3):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and TensorFlow's
    global generator with the same value, and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed shared by all generators. Defaults to 3.

    Note:
        Python reads ``PYTHONHASHSEED`` at interpreter start-up, so exporting it
        here is not expected to change string hashing in the already-running
        kernel; it is only visible to processes started afterwards.
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


reset_random_seeds()

In [11]:
# Build the training inputs by joining each whitespace-stripped title and
# abstract with a literal ' [SEP] ' marker, and extract the matching labels.
train_titles = train['title'].str.strip()
train_abstracts = train['abstract'].str.strip()
X_train = (train_titles + ' [SEP] ' + train_abstracts).to_numpy()
y_train = train['class'].to_numpy()
print(X_train)

['Mutant GTF2I induces cell transformation and metabolic alterations in thymic epithelial cells. [SEP] The pathogenesis of thymic epithelial tumors (TETs) is poorly understood. Recently we reported the frequent occurrence of a missense mutation in the GTF2I gene in TETs and hypothesized that GTF2I mutation might contribute to thymic tumorigenesis. Expression of mutant TFII-I altered the transcriptome of normal thymic epithelial cells and upregulated several oncogenic genes. Gtf2i L424H knockin cells exhibited cell transformation, aneuploidy, and increase tumor growth and survival under glucose deprivation or DNA damage. Gtf2i mutation also increased the expression of several glycolytic enzymes, cyclooxygenase-2, and caused modifications of lipid metabolism. Elevated cyclooxygenase-2 expression by Gtf2i mutation was required for survival under metabolic stress and cellular transformation of thymic epithelial cells. Our findings identify GTF2I mutation as a new oncogenic driver that is r

In [12]:
# Build the validation inputs the same way as the training inputs: stripped
# title, a literal ' [SEP] ' marker, then the stripped abstract.
val_titles = validation['title'].str.strip()
val_abstracts = validation['abstract'].str.strip()
X_val = (val_titles + ' [SEP] ' + val_abstracts).to_numpy()
y_val = validation['class'].to_numpy()

In [13]:
# Create the ktrain Transformer wrapper for the PubMedBERT checkpoint, with
# maxlen=500 and the two class names 0 and 1.
t_mod = text.Transformer(
    'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext',
    maxlen=500,
    class_names=[0, 1],
)

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

In [14]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef

In [15]:
# Preprocess both splits with the Transformer wrapper, create the classifier,
# and bundle model and data into a ktrain learner that trains with batch size 16.
trn = t_mod.preprocess_train(X_train, y_train)
val = t_mod.preprocess_test(X_val, y_val)
model = t_mod.get_classifier()
learner = ktrain.get_learner(
    model,
    train_data=trn,
    val_data=val,
    batch_size=16,
)

preprocessing train...
language: en
train sequence lengths:
	mean : 184
	95percentile : 264
	99percentile : 285


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 181
	95percentile : 266
	99percentile : 277


404 Client Error: Not Found for url: https://huggingface.co/microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext/resolve/main/tf_model.h5


Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [16]:
# Fine-tune the transformer on the training data with the one-cycle policy.
# The values below are the final hyperparameters chosen through grid search.
epochs = 3
learning_rate = 2e-5
learner.fit_onecycle(learning_rate, epochs)



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f49fc6d45d0>

In [17]:
# Evaluate on the validation split: wrap the trained model in a predictor,
# predict every validation text, then print the per-class classification
# report followed by the Matthews correlation coefficient.
predictor = ktrain.get_predictor(learner.model, preproc=t_mod)
predictions = predictor.predict(X_val)
report = classification_report(y_val, predictions)
print(report)
mcc = matthews_corrcoef(y_val, predictions)
print(mcc)

              precision    recall  f1-score   support

       False       0.94      0.95      0.94       842
        True       0.77      0.76      0.76       200

    accuracy                           0.91      1042
   macro avg       0.86      0.85      0.85      1042
weighted avg       0.91      0.91      0.91      1042

0.7080812327590811


In [18]:
# Save the predictor (model plus its preprocessor) into the current directory.
predictor.save('./')