In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import datetime
from sklearn.metrics import accuracy_score,f1_score
%load_ext tensorboard

In [None]:
# Remove the ./logs/ directory left by earlier runs; the models below write their TensorBoard runs under logs/fit/.
!rm -rf ./logs/

In [None]:
from google.colab import drive
# Mount Google Drive and read the CSV through the same absolute mount point,
# so the load does not depend on the kernel's current working directory.
DRIVE_MOUNT='/content/gdrive'
drive.mount(DRIVE_MOUNT)

import pandas as pd
data=pd.read_csv(DRIVE_MOUNT+'/My Drive/cleaned_data.csv')
# Drop the first column by position.
# NOTE(review): presumably an index column written out when the CSV was saved — confirm.
data=data.drop(columns=data.columns[0])

In [None]:
data

In [None]:
data['medical_specialty'].value_counts()

In [None]:
# Replace missing values in the two free-text columns with the placeholder string "none".
text_columns=['description','keywords']
data[text_columns]=data[text_columns].fillna("none")

Using only transcription as a feature for classification

In [None]:
data_transcript=data[['transcription','medical_specialty']]

In [None]:
data_transcript

In [None]:
import sklearn
from sklearn.model_selection import train_test_split

In [None]:
# data_transcript was built with exactly these two columns, in this order:
# the transcription text is the feature, the medical specialty is the target.
X=data_transcript['transcription']
Y=data_transcript['medical_specialty']

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so the partition, and every
# score computed from it further down, is the same after a kernel restart.
X_train,X_test,y_train,y_test=train_test_split(X,Y,shuffle=True,test_size=0.2,random_state=42)

In [None]:
# Convert each pandas Series of the split into a plain Python list.
training_sentences,testing_sentences=X_train.to_list(),X_test.to_list()
training_labels,testing_labels=y_train.to_list(),y_test.to_list()

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Fit on the complete label column rather than on the training split alone, so that
#  - the one-hot width and column order do not depend on which classes the random
#    split happened to put in the training set, and
#  - transforming the test labels cannot raise on a category unseen during fit.
encoder = OneHotEncoder(sparse_output=False, categories='auto')
encoder.fit(np.array(Y).reshape(-1,1))
training_labels_final=encoder.transform(np.array(training_labels).reshape(-1,1))
testing_labels_final=encoder.transform(np.array(testing_labels).reshape(-1,1))

In [None]:
# Longest training document measured in whitespace-separated words (0 if there are none);
# used below as the padded sequence length.
max_length=max((len(sentence.split()) for sentence in training_sentences),default=0)

In [None]:
max_length

In [None]:
embedding_dim=100
trunc_type='post'
oov_token='<oov>'

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from the training sentences only; the tokenizer is created with
# an explicit out-of-vocabulary token for words it has not seen.
tokenizer=Tokenizer(oov_token=oov_token)
tokenizer.fit_on_texts(training_sentences)

sequences=tokenizer.texts_to_sequences(training_sentences)
padded=pad_sequences(sequences,maxlen=max_length,truncating=trunc_type)

# Truncate the test split exactly like the training split. Without the argument
# pad_sequences falls back to truncating='pre', which would keep the opposite end
# of any over-long test document compared with what the models were trained on.
testing_sequences=tokenizer.texts_to_sequences(testing_sentences)
testing_padded=pad_sequences(testing_sequences,maxlen=max_length,truncating=trunc_type)

In [None]:
# Embedding input dimension: one slot per known word plus one extra,
# since the Keras Tokenizer numbers words from 1 and index 0 is used for padding.
vocab_size=len(tokenizer.word_index)+1

In [None]:
vocab_size

In [None]:
padded

In [None]:
training_labels_final

In [None]:
!pip install imblearn

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Seeded so the synthetic minority samples are identical on every run.
smote = SMOTE(sampling_strategy='auto',k_neighbors=2,random_state=42)

In [None]:
vocab_size

In [None]:
# Oversample the training split (padded sequences + one-hot labels) with SMOTE.
# NOTE(review): SMOTE synthesises rows by interpolating between feature vectors, and the
# features here are padded token-ID sequences, so synthetic rows are blends of word
# indices rather than real sentences — confirm this is intended.
X_resampled, y_resampled = smote.fit_resample(padded, training_labels_final)

In [None]:
# Count how many resampled rows share each distinct one-hot label and print one count per line.
unique_rows, counts = np.unique(y_resampled, axis=0, return_counts=True)

print(*counts, sep="\n")

ANN

In [None]:
# Feed-forward baseline: average the word embeddings over the sequence, then a small MLP
# ending in a 40-way softmax.
ann_layers=[
    tf.keras.layers.Embedding(vocab_size,embedding_dim,input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),  # collapse the sequence axis by averaging
    tf.keras.layers.Dense(128,activation="relu"),
    tf.keras.layers.Dense(64,activation="relu"),
    tf.keras.layers.Dense(40,activation="softmax"),
]
model_ANN=tf.keras.Sequential(ann_layers,name="ANN_model")
model_ANN.compile(optimizer="adam",loss=tf.keras.losses.CategoricalCrossentropy())

# TensorBoard run directory: model name plus the current timestamp.
run_stamp=datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + model_ANN.name + "-" + run_stamp
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
model_ANN.fit(padded,training_labels_final,epochs=5,callbacks=[tensorboard_callback],validation_data=(testing_padded,testing_labels_final))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7d595e2607f0>

In [None]:
def EvalModel(model=None,train_seq=None,test_seq=None,training_labels=None,testing_labels=None):
  """Score a fitted classifier on the training and test sets with micro-averaged F1.

  Args:
    model: object exposing ``predict(x, verbose=...)`` that returns one row of
      class scores per sample.
    train_seq: model inputs for the training split.
    test_seq: model inputs for the test split.
    training_labels: true training labels, either class indices (1-D) or
      one-hot rows (2-D); one-hot rows are reduced to their argmax.
    testing_labels: true test labels, in the same formats as ``training_labels``.

  Returns:
    dict with keys ``'train'`` and ``'test'`` holding the two micro-F1 scores.

  Note:
    Predictions are the argmax column of the model output, so 1-D labels must
    already be expressed as those column indices for the scores to be meaningful.
  """
  def _as_class_ids(labels):
    # One-hot (2-D) targets are collapsed to class indices; 1-D labels pass through.
    labels=np.asarray(labels)
    return labels.argmax(axis=1) if labels.ndim==2 else labels

  def _micro_f1(seq,labels):
    # A single vectorised argmax over the class axis replaces the per-row Python loop.
    predicted=np.argmax(model.predict(seq,verbose=False),axis=1)
    return f1_score(_as_class_ids(labels),predicted,average='micro')

  # Dict entries are evaluated left to right: training split first, then test split.
  return {'train':_micro_f1(train_seq,training_labels),'test':_micro_f1(test_seq,testing_labels)}

In [None]:
EvalModel(model_ANN,padded,testing_padded,training_labels,testing_labels)

{'train': 0.34833708427106774, 'test': 0.334}

CNN

In [None]:
# 1-D convolutional classifier: embed -> convolve -> max-pool -> flatten -> dense head.
cnn_layers=[
    tf.keras.layers.Embedding(vocab_size,embedding_dim,input_length=max_length),
    tf.keras.layers.Conv1D(filters=32,kernel_size=3),  # 32 filters, window of 3 tokens, no activation set
    tf.keras.layers.MaxPooling1D(pool_size=2,padding='valid'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64,activation="relu"),
    tf.keras.layers.Dense(40,activation="softmax"),
]
model_CNN=tf.keras.Sequential(cnn_layers,name="CNN_model")
model_CNN.compile(optimizer="adam",loss=tf.keras.losses.CategoricalCrossentropy())

# TensorBoard run directory: model name plus the current timestamp.
run_stamp=datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + model_CNN.name + '_' + run_stamp
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
model_CNN.fit(padded,training_labels_final,epochs=5,callbacks=[tensorboard_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7d59529333a0>

In [None]:
EvalModel(model_CNN,padded,testing_padded,training_labels,testing_labels)

{'train': 0.5408852213053263, 'test': 0.103}

LSTM


In [None]:
# Single-layer LSTM classifier, trained on the SMOTE-resampled training set.
# The activations and the model name are passed by keyword: Sequential's second
# positional parameter is not `name` in every Keras release, so a positional "lstm"
# can be bound to the wrong argument and leave model_lstm.name (used in log_dir) wrong.
model_lstm=tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size,embedding_dim,input_length=max_length),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(32,activation="relu"),
    tf.keras.layers.Dense(40,activation="softmax")
],name="lstm")
model_lstm.compile(loss=tf.keras.losses.CategoricalCrossentropy(),optimizer="adam")
# TensorBoard run directory: model name plus the current timestamp.
log_dir = "logs/fit/" + model_lstm.name+'_'+datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
model_lstm.fit(X_resampled,y_resampled,epochs=5,callbacks=[tensorboard_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7d58f8136b30>

In [None]:
EvalModel(model_lstm,padded,testing_padded,training_labels,testing_labels)

{'train': 0.36184046011502874, 'test': 0.162}

ELMO (BILSTM)

In [None]:
# Bidirectional LSTM classifier.
# The recurrent layer returns only its final output (no return_sequences=True), so the
# network emits one 40-way softmax per document and matches the (batch, 40) one-hot
# labels. Returning the whole sequence produced a per-timestep output whose shape
# cannot match the labels, which is why fit() previously stopped with a ValueError.
model_BiLSTM=tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size,embedding_dim,input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
    tf.keras.layers.Dense(32,activation="relu"),
    tf.keras.layers.Dense(40,activation="softmax")
],name="BiLSTM")
model_BiLSTM.compile(loss=tf.keras.losses.CategoricalCrossentropy(),optimizer="adam")
# Log under this model's own name so the run is not filed in TensorBoard as "lstm".
log_dir = "logs/fit/" + model_BiLSTM.name+'_'+datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
model_BiLSTM.fit(padded,training_labels_final,epochs=5,callbacks=[tensorboard_callback])

Epoch 1/5


ValueError: ignored

In [None]:
EvalModel(model_BiLSTM,padded,testing_padded,training_labels,testing_labels)

In [None]:
# Six stacked bidirectional LSTM layers. Every layer except the last returns the full
# sequence so the next recurrent layer has a sequence to consume; the last one returns
# only its final output, which feeds the dense classification head.
model_BiLSTM_deep=tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size,embedding_dim,input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128,return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128,return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128,return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128,return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128,return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
    tf.keras.layers.Dense(64,activation="relu"),
    tf.keras.layers.Dense(40,activation="softmax")
],name="BiLSTM_deep")
model_BiLSTM_deep.compile(loss=tf.keras.losses.CategoricalCrossentropy(),optimizer="adam")
# Log under this model's own name; the previous expression reused model_BiLSTM.name,
# which filed the deep model's run under the shallow model in TensorBoard.
log_dir = "logs/fit/" + model_BiLSTM_deep.name+'_'+datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
model_BiLSTM_deep.fit(padded,training_labels_final,epochs=5,callbacks=[tensorboard_callback])

In [None]:
EvalModel(model_BiLSTM_deep,padded,testing_padded,training_labels,testing_labels)

In [None]:
!pip install tensorflow_text

In [None]:
import tensorflow_hub as hub
import tensorflow_text as text

In [None]:
# Load the LEALLA-base model from TF Hub as a Keras layer.
# NOTE(review): this rebinds `encoder`, which earlier in the notebook holds the fitted
# OneHotEncoder; any later encoder.transform / inverse_transform call would reach the
# hub layer instead — consider a distinct name.
encoder = hub.KerasLayer("https://tfhub.dev/google/LEALLA/LEALLA-base/1")

In [None]:
%tensorboard --logdir logs/fit