In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Dense, GlobalAveragePooling1D, Embedding
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split

In [32]:
df = pd.read_csv('2018_thyroid_sents_no_duplicates.csv')
df2 = pd.read_csv('2017_thyroid_sents.csv')
df3 = pd.concat([df, df2])

In [39]:
df3.head()

Unnamed: 0,key,text,truth
0,1,['there is heterogeneous enlargement of the r...,1
1,5,['findings: the imaged thyroid gland is norma...,0
2,8,['findings: thyroid gland is within normal lim...,0
3,9,['findings: thyroid gland is within normal li...,0
4,10,"['diffusely enlarged thyroid, similar to prior.']",0


In [40]:
text2 = df3.text.str.replace("\[", "")
text3 = text2.str.replace("\]", "")
text4 = text3.str.replace("'", "")
text4.head()

0    there is heterogeneous enlargement of the  rig...
1       findings:  the imaged thyroid gland is normal.
2     findings: thyroid gland is within normal limits.
3    findings:  thyroid gland is within normal limits.
4        diffusely enlarged thyroid, similar to prior.
Name: text, dtype: object

In [41]:
df3.text = text4

In [44]:
tok_raw = Tokenizer(char_level=False)
tok_raw.fit_on_texts(df3.text.str.lower())
tok_stem = Tokenizer(char_level=False)
tok_stem.fit_on_texts(df3.text)
df3['toks'] = tok_stem.texts_to_sequences(df3.text)
df3.head()

Unnamed: 0,key,text,truth,toks
0,1,there is heterogeneous enlargement of the rig...,1,"[32, 4, 60, 58, 6, 2, 14, 13, 6, 2, 1, 9, 19, ..."
1,5,findings: the imaged thyroid gland is normal.,0,"[3, 2, 88, 1, 9, 4, 5]"
2,8,findings: thyroid gland is within normal limits.,0,"[3, 1, 9, 4, 27, 5, 37]"
3,9,findings: thyroid gland is within normal limits.,0,"[3, 1, 9, 4, 27, 5, 37]"
4,10,"diffusely enlarged thyroid, similar to prior.",0,"[383, 46, 1, 75, 21, 35]"


In [45]:
long = []
for row in df3.toks:
    long.append(np.max(row))
np.max(long)

2182

In [46]:
n_stem_seq = 2183

In [52]:
long = []
for row in df3.toks:
    long.append(len(row))
np.max(long)

402

In [54]:
maxlen = 402

In [47]:
df['toks'] = tok_stem.texts_to_sequences(df.text)
df2['toks'] = tok_stem.texts_to_sequences(df2.text)

In [48]:
X_train, X_test, y_train, y_test = train_test_split(df.toks,
                                                   pd.get_dummies(df['truth'], drop_first=False),
                                                   test_size = 0.2,
                                                   stratify= df['truth'],
                                                   random_state=42)

In [55]:
X_train = pad_sequences(X_train, maxlen, truncating = 'post')
X_test = pad_sequences(X_test, maxlen, truncating = 'post')

In [56]:
y_train = y_train.values
y_test = y_test.values

In [57]:
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2

In [112]:
def get_model():
    embed_dim = 300
    
    input_text = Input(shape=[maxlen], name ="Input_Text")
    
    emb = (Embedding(n_stem_seq, embed_dim, input_length = maxlen))(input_text)
    
    pool = GlobalAveragePooling1D()(emb)
    
    output = Dense(2, activation='softmax')(pool)
    
    model = Model([input_text], output)
    
    optimizer = Adam(lr=0.001, beta_1 = 0.9, beta_2 = 0.999, epsilon=1e-08, decay = 0.0)
    
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    return model

model = get_model()
model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input_Text (InputLayer)      [(None, 402)]             0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 402, 300)          654900    
_________________________________________________________________
global_average_pooling1d_5 ( (None, 300)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 602       
Total params: 655,502
Trainable params: 655,502
Non-trainable params: 0
_________________________________________________________________


In [113]:
from tensorflow.keras.callbacks import ModelCheckpoint
class_weight = {0:0.5 , 1:10}
filepath = '9.4.20_training_w_weights.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose = 1, save_best_only=True)
epochs = 300
hist = model.fit(X_train, y_train,
                batch_size = 64, 
                validation_data = (X_test, y_test),
                epochs = epochs,
                class_weight = class_weight,
                callbacks=[EarlyStopping(patience=10, monitor='val_loss'), checkpoint])

  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train on 1860 samples, validate on 465 samples
Epoch 1/300
Epoch 00001: val_loss improved from inf to 0.93516, saving model to 9.4.20_training_w_weights.h5
Epoch 2/300
Epoch 00002: val_loss improved from 0.93516 to 0.92439, saving model to 9.4.20_training_w_weights.h5
Epoch 3/300
Epoch 00003: val_loss improved from 0.92439 to 0.91168, saving model to 9.4.20_training_w_weights.h5
Epoch 4/300
Epoch 00004: val_loss improved from 0.91168 to 0.89462, saving model to 9.4.20_training_w_weights.h5
Epoch 5/300
Epoch 00005: val_loss improved from 0.89462 to 0.87011, saving model to 9.4.20_training_w_weights.h5
Epoch 6/300
Epoch 00006: val_loss improved from 0.87011 to 0.83571, saving model to 9.4.20_training_w_weights.h5
Epoch 7/300
Epoch 00007: val_loss improved from 0.83571 to 0.79351, saving model to 9.4.20_training_w_weights.h5
Epoch 8/300
Epoch 00008: val_loss improved from 0.79351 to 0.74623, saving model to 9.4.20_training_w_weights.h5
Epo

Epoch 25/300
Epoch 00025: val_loss did not improve from 0.35685
Epoch 26/300
Epoch 00026: val_loss improved from 0.35685 to 0.34563, saving model to 9.4.20_training_w_weights.h5
Epoch 27/300
Epoch 00027: val_loss improved from 0.34563 to 0.33777, saving model to 9.4.20_training_w_weights.h5
Epoch 28/300
Epoch 00028: val_loss did not improve from 0.33777
Epoch 29/300
Epoch 00029: val_loss did not improve from 0.33777
Epoch 30/300
Epoch 00030: val_loss improved from 0.33777 to 0.33324, saving model to 9.4.20_training_w_weights.h5
Epoch 31/300
Epoch 00031: val_loss did not improve from 0.33324
Epoch 32/300
Epoch 00032: val_loss did not improve from 0.33324
Epoch 33/300
Epoch 00033: val_loss improved from 0.33324 to 0.32103, saving model to 9.4.20_training_w_weights.h5
Epoch 34/300
Epoch 00034: val_loss did not improve from 0.32103
Epoch 35/300
Epoch 00035: val_loss improved from 0.32103 to 0.31808, saving model to 9.4.20_training_w_weights.h5
Epoch 36/300
Epoch 00036: val_loss did not imp

In [81]:
from tensorflow.keras.models import load_model
model = load_model(filepath)
test_pred = model.predict(X_test, batch_size=64)


In [82]:
test_pred2 = test_pred.argmax(1)
y_test2 = y_test.argmax(1)

In [83]:
from sklearn import metrics
tn, fp, fn, tp = metrics.confusion_matrix(y_test2, test_pred2).ravel()

In [84]:
print('True Negative: ', tn)
print('False Positive: ', fp)
print('False Negative: ', fn)
print('True Positive: ', tp)

print('Accuracy: ', (tp+tn)/(tp+fp+fn+tn))
print('Precision/PPV: ', (tp)/(tp+fp))
print('Sensetivity/Recall: ', tp/(tp+fn))


True Negative:  364
False Positive:  49
False Negative:  4
True Positive:  48
Accuracy:  0.886021505376344
Precision/PPV:  0.4948453608247423
Sensetivity/Recall:  0.9230769230769231


In [77]:
2064/2325

0.8877419354838709

In [78]:
6387/6576

0.9712591240875912

In [114]:
model = load_model(filepath)
test_pred = model.predict(df2_fortest, batch_size=64)

In [87]:
df2_fortest = pad_sequences(df2.toks, maxlen, truncating = 'post')

In [115]:
test_pred2 = test_pred.argmax(1)


In [116]:
tn, fp, fn, tp = metrics.confusion_matrix(df2.truth, test_pred2).ravel()

In [117]:
print('True Negative: ', tn)
print('False Positive: ', fp)
print('False Negative: ', fn)
print('True Positive: ', tp)

print('Accuracy: ', (tp+tn)/(tp+fp+fn+tn))
print('Precision/PPV: ', (tp)/(tp+fp))
print('Sensetivity/Recall: ', tp/(tp+fn))


True Negative:  6135
False Positive:  252
False Negative:  14
True Positive:  175
Accuracy:  0.9595498783454988
Precision/PPV:  0.4098360655737705
Sensetivity/Recall:  0.9259259259259259


In [118]:
from scipy.stats import bayes_mvs

In [None]:
bayes_mvs( , alpha=0.95)

In [119]:
total_tp = 175+175+172
total_fp = 185+252+213
total_tn = 6202+6135+6174
total_fn = 14+14+17

In [120]:
trues = total_tp + total_tn
falses = total_fp + total_fn

In [121]:
trues/(trues+falses)

0.9647708840227088

In [122]:
true_ones = np.ones(trues)

In [123]:
true_ones.shape

(19033,)

In [124]:
false_zeros = np.zeros(falses)

In [125]:
false_zeros.shape

(695,)

In [132]:
acc = np.concatenate([false_zeros, true_ones])

In [133]:
acc.shape

(19728,)

In [135]:
bayes_mvs(acc, alpha=0.95)

(Mean(statistic=0.9647708840227088, minmax=(0.9621983002360095, 0.9673434678094082)),
 Variance(statistic=0.03398802536474973, minmax=(0.03331729573342786, 0.0346587549960716)),
 Std_dev(statistic=0.1843584154975024, minmax=(0.18253932405675677, 0.18617750693824803)))

In [138]:
sens = np.concatenate([np.ones(total_tp), np.zeros(total_fn)])

In [139]:
bayes_mvs(sens, alpha=0.95)

(Mean(statistic=0.9206349206349206, minmax=(0.8983183219787403, 0.9429515192911009)),
 Variance(statistic=0.0734549138804458, minmax=(0.06536363836428487, 0.08253182578217887)),
 Std_dev(statistic=0.2709055613430478, minmax=(0.25566313454286843, 0.2872835285605126)))

In [140]:
spec = np.concatenate([np.ones(total_tn), np.zeros(total_fp)])

In [141]:
bayes_mvs(spec, alpha=0.95)

(Mean(statistic=0.966076927091488, minmax=(0.9635136683127694, 0.9686401858702065)),
 Variance(statistic=0.032772298032955866, minmax=(0.03211606075197317, 0.03342853531393856)),
 Std_dev(statistic=0.18103120734546257, minmax=(0.1792187096810947, 0.18284370500983044)))

In [142]:
ppv = np.concatenate([np.ones(total_tp), np.zeros(total_fp)])

In [143]:
bayes_mvs(ppv, alpha=0.95)

(Mean(statistic=0.4453924914675768, minmax=(0.4169381457842952, 0.4738468371508584)),
 Variance(statistic=0.24701802001188128, minmax=(0.22701811563211313, 0.2670179243916494)),
 Std_dev(statistic=0.4970090743757917, minmax=(0.47688881358891716, 0.5171293351626663)))

In [145]:
npv = np.concatenate([np.ones(total_tn), np.zeros(total_fn)])

In [146]:
bayes_mvs(npv, alpha=0.95)

(Mean(statistic=0.9975749083854278, minmax=(0.9968672189008666, 0.9982825978699891)),
 Variance(statistic=0.0024192105452330386, minmax=(0.0023699845422334716, 0.0024684365482326056)),
 Std_dev(statistic=0.04918547087538187, minmax=(0.04868505884187418, 0.04968588290888957)))