In [1]:
import pandas as pd
import numpy as np
import keras
import keras.backend as K
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
#keras fast text method
df = pd.read_csv('10k_bi.csv')
df = df.iloc[:,0:2]

In [3]:
df.dropna()

Unnamed: 0,sentence,class
0,['the thyroid gland is unremarkable.'],0
1,['the thyroid gland and esophagus are unremark...,0
2,['the thyroid gland and esophagus are unremark...,0
3,['the thyroid gland is unremarkable.'],0
4,['findings: thyroid gland is normal.'],0
...,...,...
6573,['findings: the partially visualized thyroid i...,0
6574,['findings: the partially visualized thyroid i...,0
6575,['findings: mild asymmetric atrophy of the lef...,0
6576,['the thyroid is partially visualized and norm...,0


In [4]:
text2 = df.sentence.str.replace("\[", "")

text3 = text2.str.replace("\]", "")
text4 = text3.str.replace("'", "")
text4.head()

0                   the thyroid gland is unremarkable.
1    the thyroid gland and esophagus are unremarkable.
2    the thyroid gland and esophagus are unremarkable.
3                   the thyroid gland is unremarkable.
4                   findings: thyroid gland is normal.
Name: sentence, dtype: object

In [5]:
df.sentence = text4

In [6]:
#STEMMING WORDS
import nltk.stem as stm
import re
stemmer = stm.SnowballStemmer("english")
df["sentence"] = df.sentence.apply(lambda x: (" ").join([stemmer.stem(z) for z in re.sub("[^a-zA-Z0-9]"," ", x).split(" ")]))
df.head(1)

Unnamed: 0,sentence,class
0,the thyroid gland is unremark,0


In [7]:
#PROCESS TEXT: RAW
#filters = '!"#$%&*+/:;<=>?@[\]^_`{|}~\t\n' if use char level true
from keras.preprocessing.text import Tokenizer

tok_raw = Tokenizer(char_level = False)
tok_raw.fit_on_texts(df.sentence.str.lower())
tok_stem = Tokenizer(char_level = False)
tok_stem.fit_on_texts(df.sentence)
df['toks'] = tok_stem.texts_to_sequences(df.sentence)
df.head()

Unnamed: 0,sentence,class,toks
0,the thyroid gland is unremark,0,"[3, 1, 8, 4, 6]"
1,the thyroid gland and esophagus are unremark,0,"[3, 1, 8, 12, 49, 14, 6]"
2,the thyroid gland and esophagus are unremark,0,"[3, 1, 8, 12, 49, 14, 6]"
3,the thyroid gland is unremark,0,"[3, 1, 8, 4, 6]"
4,find thyroid gland is normal,0,"[2, 1, 8, 4, 5]"


In [8]:
X_train, X_test, y_train, y_test = train_test_split(df.toks,
                                                    pd.get_dummies(df['class'], drop_first=False),
                                                    test_size=0.2,
                                                    stratify= df['class'],
                                                    random_state=42)

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                    y_train,
                                                    test_size=0.25,
                                                    stratify= y_train,
                                                    random_state=42)

In [10]:
y_train = y_train.values
y_test = y_test.values
y_val = y_val.values

In [11]:
long = []
for row in X_val:
    long.append(np.max(row))
np.max(long)

1261

In [12]:
long = []
for row in X_test:
    long.append(np.max(row))
np.max(long)

1268

In [13]:
long = []
for row in X_train:
    long.append(np.max(row))
np.max(long)

1273

In [14]:
n_stem_seq = 1274

In [15]:
long = []
for row in X_val:
    long.append(len(row))
np.max(long)

152

In [16]:
long = []
for row in X_train:
    long.append(len(row))
np.max(long)

151

In [17]:
long = []
for row in X_test:
    long.append(len(row))
np.max(long)

118

In [18]:
maxlen = 152
from keras.preprocessing.sequence import pad_sequences


X_train = pad_sequences(X_train, maxlen, truncating = 'post')
X_val = pad_sequences (X_val, maxlen, truncating = 'post')
X_test = pad_sequences(X_test, maxlen, truncating = 'post')

In [19]:
#KERAS MODEL DEFINITION
import tensorflow as tf
from keras.layers import Dense, Dropout, Embedding
from keras.layers import Input, GlobalAveragePooling1D
from keras.models import Model
from keras.optimizers import Adam 
from keras.regularizers import l1_l2

def get_model():
    embed_dim = 200
       
    input_text = Input(shape=[maxlen], name="stem_input")
    
    emb = (Embedding(n_stem_seq, embed_dim,input_length = maxlen))(input_text)
    
    pool = GlobalAveragePooling1D()(emb)
    
    output = Dense(2, activation="softmax")(pool)

    model = Model([input_text], output)

    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=['accuracy'])
    return model

model = get_model()
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
stem_input (InputLayer)      (None, 152)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 152, 200)          254800    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 402       
Total params: 255,202
Trainable params: 255,202
Non-trainable params: 0
_________________________________________________________________


In [21]:
from keras.callbacks import ModelCheckpoint
class_weight = {0:0.5 , 1:50 }
filepath = 'fasttext10kbinaryweighted2.19.20.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose = 1, save_best_only = True)
epochs = 300
hist = model.fit(X_train, y_train,
                 batch_size=64,
                 validation_data=(X_val, y_val),
                 epochs=epochs,
                 class_weight=class_weight,
                 callbacks=[EarlyStopping(patience=10, monitor='val_loss'), checkpoint])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 3946 samples, validate on 1316 samples
Epoch 1/300

Epoch 00001: val_loss improved from inf to 1.28036, saving model to fasttext10kbinaryweighted2.19.20.h5
Epoch 2/300

Epoch 00002: val_loss improved from 1.28036 to 1.13856, saving model to fasttext10kbinaryweighted2.19.20.h5
Epoch 3/300

Epoch 00003: val_loss improved from 1.13856 to 0.99086, saving model to fasttext10kbinaryweighted2.19.20.h5
Epoch 4/300

Epoch 00004: val_loss improved from 0.99086 to 0.81092, saving model to fasttext10kbinaryweighted2.19.20.h5
Epoch 5/300

Epoch 00005: val_loss improved from 0.81092 to 0.52111, saving model to fasttext10kbinaryweighted2.19.20.h5
Epoch 6/300

Epoch 00006: val_loss improved from 0.52111 to 0.47116, saving model to fasttext10kbinaryweighted2.19.20.h5
Epoch 7/300

Epoch 00007: val_loss improved from 0.47116 to 0.45139, saving model to fasttext10kbinaryweighted2.19.20.h5
Epoch 8/300

Epoch 00008: val_loss improved from 0.45139 to 0.41649, saving model to fasttext10kbinaryweighte

In [22]:
from keras.models import load_model
model = load_model(filepath)
test_pred = model.predict(X_test, batch_size=64)

In [23]:
test_pred2 = test_pred.argmax(1)
y_test2 = y_test.argmax(1)

In [24]:
from sklearn.metrics import precision_score
precision_score(y_test2, test_pred2)

0.6415094339622641

In [25]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_test2, test_pred2, average = 'macro')

(0.8191711857063894, 0.9399349312247756, 0.8691005963733236, None)

In [26]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test2, test_pred2)

0.9825227963525835

In [27]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test2, test_pred2, average= 'macro')

0.9399349312247757

In [30]:
from sklearn import metrics
tn, fp, fn, tp = metrics.confusion_matrix(y_test2, test_pred2).ravel()


In [31]:
print('True Negative: ', tn)
print('False Positive: ', fp)
print('False Negative: ', fn)
print('True Positive: ', tp)

print('Accuracy: ', (tp+tn)/(tp+fp+fn+tn))
print('Precision/PPV: ', tp/(tp+fp))
print('Sensitivity/Recall: ', tp/(tp+fn))

True Negative:  1259
False Positive:  19
False Negative:  4
True Positive:  34
Accuracy:  0.9825227963525835
Precision/PPV:  0.6415094339622641
Sensitivity/Recall:  0.8947368421052632
