## Bert

In [108]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import transformers
import torch
import logging
logging.basicConfig(level=logging.INFO)
from transformers import BertTokenizer, BertForSequenceClassification, BertModel

In [109]:
!nvidia-smi

Tue May 23 22:52:45 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 531.14                 Driver Version: 531.14       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                      TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce GTX 1650       WDDM | 00000000:01:00.0  On |                  N/A |
| N/A   50C    P5                6W /  N/A|    774MiB /  4096MiB |     40%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [110]:
# =============================================================================
# ## check GPU
# =============================================================================
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1650


# Import data

In [111]:
url = r'mtsamples.csv'

In [112]:
df = pd.read_csv(url)
df.sample(5)

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
2284,Right carpal tunnel release and right index a...,Orthopedic,Carpal Tunnel Release - 8,"PREOPERATIVE DIAGNOSES:,1. Right carpal tunne...","orthopedic, compressed median nerve, stenosing..."
4445,Patient reports a rotational sensation upon a...,Consult - History and Phy.,Dizziness - Recurrent,"CHIEF COMPLAINT:, Recurrent dizziness x1 mont...",
3002,Left partial nephrectomy due to left renal mass.,Nephrology,Nephrectomy - Partial,"PREOPERATIVE DIAGNOSIS:, Left renal mass, 5 c...","nephrology, renal mass, bovie, finochietto ret..."
3117,The patient is a 67-year-old white female wit...,Hematology - Oncology,Uterine Papillary Serous Carcinoma,"HISTORY OF PRESENT ILLNESS:, The patient is a...","hematology - oncology, chemotherapy, uterine p..."
1980,She is a 14-year-old Hispanic female with his...,Pain Management,Knee Injection - 2,"INDICATIONS FOR PROCEDURE: , The patient was h...","pain management, arthralgias, aristospan, pauc..."


In [113]:
# use only two column transcription, medical_specialty
df = df[['transcription', 'medical_specialty']] 
# clean medical_specialty column contain spacebar in front of text
df['medical_specialty'].str.strip()
# creat LabelEncoder
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
df['label'] = labelencoder.fit_transform(df['medical_specialty'])
df

Unnamed: 0,transcription,medical_specialty,label
0,"SUBJECTIVE:, This 23-year-old white female pr...",Allergy / Immunology,0
1,"PAST MEDICAL HISTORY:, He has difficulty climb...",Bariatrics,2
2,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...",Bariatrics,2
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary,3
4,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary,3
...,...,...,...
4994,"HISTORY:, I had the pleasure of meeting and e...",Allergy / Immunology,0
4995,"ADMITTING DIAGNOSIS: , Kawasaki disease.,DISCH...",Allergy / Immunology,0
4996,"SUBJECTIVE: , This is a 42-year-old white fema...",Allergy / Immunology,0
4997,"CHIEF COMPLAINT: , This 5-year-old male presen...",Allergy / Immunology,0


In [114]:
df.loc[df['label'] == 0].sample(5)[['transcription','label']]

Unnamed: 0,transcription,label
0,"SUBJECTIVE:, This 23-year-old white female pr...",0
4995,"ADMITTING DIAGNOSIS: , Kawasaki disease.,DISCH...",0
4996,"SUBJECTIVE: , This is a 42-year-old white fema...",0
4998,"HISTORY: , A 34-year-old male presents today s...",0
4994,"HISTORY:, I had the pleasure of meeting and e...",0


In [115]:
df.dropna(inplace=True)

In [116]:
sentences = df.transcription.values
labels = df.label.values
sentences, labels

(array(['SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without aden

In [117]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [118]:
# Print the original sentence.
print(' Original: ', sentences[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences[0]))
# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))

 Original:  SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without a

# Convert text file into word embedding array

In [119]:
max_length = 0
for sentence in sentences:
    max_length = max(max_length, len(tokenizer.encode(sentence, add_special_tokens=True)))
    if max_length >= 512:
        max_length = 512
        break

print('Max sentence length: ', max_length)


Token indices sequence length is longer than the specified maximum sequence length for this model (588 > 512). Running this sequence through the model will result in indexing errors


Max sentence length:  512


## BERT extract word embedding

In [120]:
# Load pre-trained model (weights)
model_wordembedding = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model_wordembedding.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [121]:
input_ids = []
attention_masks = []
word_embedding = []
for sentence in sentences[:3]:
    print(len(sentence))
    encoded_dict = tokenizer.encode_plus(sentence, 
                                         truncation=True, # cut word length to maximum 512
                                         add_special_tokens = True, 
                                         max_length = max_length,
                                         padding='max_length',
                                         return_tensors = 'pt')
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    # Run the text through BERT, and collect all of the hidden states produced
    with torch.no_grad():
       outputs = model_wordembedding(**encoded_dict)

    last_hidden_states = outputs.last_hidden_state
    word_embedding.append(last_hidden_states[0])


input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.LongTensor(labels)


# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])
print('Word Embedding:', word_embedding[0])


1331
2431
4422
Original:  SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Su

In [122]:
print('Word Embedding:', word_embedding[0])

Word Embedding: tensor([[-0.5568, -0.2536,  0.0660,  ...,  0.0279,  0.1602,  0.4339],
        [ 0.2378, -0.1247, -0.4652,  ...,  0.2637,  0.8827, -0.3923],
        [ 0.0422, -0.5339, -0.4318,  ..., -0.1158,  0.4419,  0.5342],
        ...,
        [-0.3046,  0.0290, -0.1525,  ...,  0.1469,  0.0332, -0.0914],
        [-0.4048,  0.1284,  0.2849,  ...,  0.1745, -0.2280, -0.0453],
        [-0.3535, -0.1386,  0.1534,  ...,  0.0301, -0.1704, -0.1329]])


In [123]:
# import tqdm for show progression
from tqdm import tqdm
tqdm.pandas()

In [127]:
# function for convert image file into 3d integer array and store in list after that return its numpy
def getWordEmbedding(sentences):
  input_ids = []
  attention_masks = []
  word_embedding = []

  for index,sentence in tqdm(enumerate(sentences[:5])):
            
      encoded_dict = tokenizer.encode_plus(sentence, 
                                          truncation=True, # cut word length to maximum 512
                                          add_special_tokens = True, 
                                          max_length = max_length,
                                          padding='max_length',
                                          return_tensors = 'pt')
      input_ids.append(encoded_dict['input_ids'])
      attention_masks.append(encoded_dict['attention_mask'])

      # Run the text through BERT, and collect all of the hidden states produced
      with torch.no_grad():
        outputs = model_wordembedding(**encoded_dict)
      last_hidden_states = outputs.last_hidden_state
      word_embedding.append(last_hidden_states[0])

  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  
  # convert list into numpy array then return data and label
  return input_ids, attention_masks, word_embedding

In [128]:
input_ids = []
attention_masks = []
word_embedding = []
labels = torch.LongTensor(labels)

input_ids, attention_masks, word_embedding  = getWordEmbedding(sentences)

input_ids[:5], labels[:5]

5it [00:06,  1.33s/it]


(tensor([[  101, 20714,  1024,  ...,     0,     0,     0],
         [  101,  2627,  2966,  ...,  1010, 26572,   102],
         [  101,  2381,  1997,  ...,  4645,  2545,   102],
         [  101,  1016,  1011,  ...,     0,     0,     0],
         [  101,  1015,  1012,  ...,     0,     0,     0]]),
 tensor([0, 2, 2, 3, 3]))

In [129]:
# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0].shape, input_ids[0])

Original:  SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without ad

In [130]:
print('Word Embedding:', word_embedding[0].shape, word_embedding[0])

Word Embedding: torch.Size([512, 768]) tensor([[-0.5568, -0.2536,  0.0660,  ...,  0.0279,  0.1602,  0.4339],
        [ 0.2378, -0.1247, -0.4652,  ...,  0.2637,  0.8827, -0.3923],
        [ 0.0422, -0.5339, -0.4318,  ..., -0.1158,  0.4419,  0.5342],
        ...,
        [-0.3046,  0.0290, -0.1525,  ...,  0.1469,  0.0332, -0.0914],
        [-0.4048,  0.1284,  0.2849,  ...,  0.1745, -0.2280, -0.0453],
        [-0.3535, -0.1386,  0.1534,  ...,  0.0301, -0.1704, -0.1329]])


## Save preprocess file to h5

In [132]:
import h5py

In [134]:
url =''

In [135]:
hf = h5py.File(url+'mtsample_wordEmbedding.h5', 'w')
hf.create_dataset('input_ids', data=input_ids)
hf.create_dataset('attention_masks', data=attention_masks)
hf.create_dataset('word_embedding', data=word_embedding)
hf.create_dataset('labels', data=labels)
hf.close()

  data = np.asarray(data, order="C", dtype=as_dtype)


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (5,) + inhomogeneous part.

## Read preprocess h5 file

In [136]:
import h5py

In [137]:
hf = h5py.File(url+'NeuralnetTrain.h5', 'r')
list(hf.keys())

FileNotFoundError: [Errno 2] Unable to open file (unable to open file: name = 'NeuralnetTrain.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [None]:
print("Loading training data ...", end="")
hf = h5py.File(url + "NeuralnetTrain.h5", "r")
X_train = np.array(hf['X_train'])
y_train = np.array(hf['y_train'])
hf.close()
print(X_train.shape,y_train.shape," Done.")

print("Loading validating data ...", end="")
hf = h5py.File(url + "NeuralnetVal.h5", "r")
X_val = np.array(hf.get("X_val"))
y_val = np.array(hf.get("y_val"))
hf.close()
print(X_val.shape,y_val.shape," Done.")


print("Loading testing data ...", end="")
hf = h5py.File(url + "NeuralnetTest.h5", "r")
X_test = np.array(hf.get("X_test"))
y_test = np.array(hf.get("y_test"))
hf.close()
print(X_test.shape,y_test.shape," Done.")


Loading training data ...(5187, 50, 50, 3) (5187,)  Done.
Loading validating data ...(1297, 50, 50, 3) (1297,)  Done.
Loading testing data ...(1622, 50, 50, 3) (1622,)  Done.


In [None]:
def plotLoss(modelHistory):
  plt.plot(modelHistory.history['loss'])
  plt.plot(modelHistory.history['val_loss'])
  plt.title('model loss')
  plt.ylabel('loss')
  plt.xlabel('epoch')
  plt.legend(['train', 'val'], loc='upper left')
  plt.show()

In [None]:
def plotAccuracy(modelHistory):
  plt.plot(modelHistory.history['accuracy'])
  plt.plot(modelHistory.history['val_accuracy'])
  plt.title('model accuracy')
  plt.ylabel('accuracy')
  plt.xlabel('epoch')
  plt.legend(['train', 'val'], loc='upper left')
  plt.show()