In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('transcription_fresh.csv')

In [3]:
df.head()

Unnamed: 0,medical_specialty,transcription
0,Orthopedic,"TITLE OF OPERATION: , Youngswick osteotomy wit..."
1,Orthopedic,"PREOPERATIVE DIAGNOSES:,1. Hallux rigidus, le..."
2,Orthopedic,"CHIEF COMPLAINT:, Left wrist pain.,HISTORY OF..."
3,Orthopedic,"PREOPERATIVE DIAGNOSIS: , Wrist ganglion.,POST..."
4,Orthopedic,"PREOPERATIVE DIAGNOSIS:, T11 compression frac..."


In [4]:
df = df.dropna(axis=0, subset=['transcription'])

In [5]:
######### balancing dataset
balance_point = 316
def balance_dataset(df):
    df1 = df[df.medical_specialty == ' Consult - History and Phy.'].iloc[:balance_point]
    df2 = df[df.medical_specialty == ' Cardiovascular / Pulmonary'].iloc[:balance_point]
    df3 = df[df.medical_specialty == ' Orthopedic'].iloc[:balance_point]
    df4 = df[df.medical_specialty == ' Surgery'].iloc[:balance_point]
    
    frames = [df1, df2, df3, df4]
    return pd.concat(frames)

df = balance_dataset(df)

In [6]:
### shuffling the dataset
from sklearn.utils import shuffle
df = shuffle(df)

In [7]:
df = df.reset_index(drop=True)

In [8]:
df

Unnamed: 0,medical_specialty,transcription
0,Surgery,"PREOPERATIVE DIAGNOSIS: , Adenocarcinoma of th..."
1,Surgery,"PREOPERATIVE DIAGNOSIS:, Rhabdomyosarcoma of ..."
2,Cardiovascular / Pulmonary,"REASON FOR VISIT:, Six-month follow-up visit ..."
3,Surgery,"PREOPERATIVE DIAGNOSIS: , Bilateral progressiv..."
4,Surgery,"PREOPERATIVE DIAGNOSIS:, Varicose veins.,POST..."
...,...,...
1259,Orthopedic,"PREOPERATIVE DIAGNOSIS: , Closed type-III supr..."
1260,Surgery,"PREOPERATIVE DIAGNOSIS: , Respiratory failure...."
1261,Surgery,"PREOPERATIVE DIAGNOSIS:, Headaches, question ..."
1262,Cardiovascular / Pulmonary,"DISCHARGE DIAGNOSIS:,1. Respiratory failure i..."


In [9]:
df['medical_specialty'] = df['medical_specialty'].astype('category')

In [10]:
df["medical_specialty_cat"] = df["medical_specialty"].cat.codes

In [11]:
messages = df.transcription.tolist()

In [12]:
#Data cleaning and preprocessing
import re
import nltk
# nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
corpus = []

for i in range(0, len(messages)):
    message = re.sub('[^a-zA-Z]', ' ', str(messages[i]))
    message = message.lower()
    message = message.split()
    
    message = [lemmatizer.lemmatize(word) for word in message if not word in stopwords.words('english')]
    message = ' '.join(message)
    corpus.append(message)

In [13]:
len(corpus)

1264

In [14]:
target = df.medical_specialty_cat.tolist()

In [15]:
df

Unnamed: 0,medical_specialty,transcription,medical_specialty_cat
0,Surgery,"PREOPERATIVE DIAGNOSIS: , Adenocarcinoma of th...",3
1,Surgery,"PREOPERATIVE DIAGNOSIS:, Rhabdomyosarcoma of ...",3
2,Cardiovascular / Pulmonary,"REASON FOR VISIT:, Six-month follow-up visit ...",0
3,Surgery,"PREOPERATIVE DIAGNOSIS: , Bilateral progressiv...",3
4,Surgery,"PREOPERATIVE DIAGNOSIS:, Varicose veins.,POST...",3
...,...,...,...
1259,Orthopedic,"PREOPERATIVE DIAGNOSIS: , Closed type-III supr...",2
1260,Surgery,"PREOPERATIVE DIAGNOSIS: , Respiratory failure....",3
1261,Surgery,"PREOPERATIVE DIAGNOSIS:, Headaches, question ...",3
1262,Cardiovascular / Pulmonary,"DISCHARGE DIAGNOSIS:,1. Respiratory failure i...",0


In [16]:
data = []
for i in range(len(corpus)):
    single_data = []
    single_data.append(corpus[i])
    single_data.append(target[i])
    data.append(single_data)
    

In [17]:
data[220]

['history pleasure meeting evaluating patient today referred evaluation tracheostomy tube placement treatment recommendation well aware pleasant year old gentleman unfortunately suffering end stage copd required tracheostomy tube placement three month ago treated acute exacerbation copd difficulty coming ventilatory support resides extended care facility capped tracheostomy tube unfortunately state use tracheostomy tube since discharge admission extended care facility requires constant oxygen administration problem shortness breath worsening requiring opening tracheostomy tube site state tenderness associated tracheostomy tube difficulty swallowing wish removed apparently history airway issue sleeping need uncapping tube essentially tube remained present month capped neck history previous tracheostomy tube insertion past medical history copd history hypercarbic hypoxemia history coronary artery disease history previous myocardial infarction history liver cirrhosis secondary alcohol use

In [18]:
train_data = data[:1136]
eval_data = data[1136:]

In [19]:
len(train_data)

1136

In [20]:
eval_data

[['preoperative diagnosis left obstructed renal ureteropelvic junction obstruction status post pyeloplasty percutaneous procedure pyeloureteroscopy x status post pseudomonas pyelonephritis x renal insufficiency solitary kidney postoperative diagnosis left obstructed renal ureteropelvic junction obstruction status post pyeloplasty percutaneous procedure pyeloureteroscopy x status post pseudomonas pyelonephritis x renal insufficiency solitary kidney procedure cystoscopy anesthesia retrograde antegrade pyeloureteroscopy left ureteropelvic junction obstruction difficult open renal biopsy anesthesia general endotracheal anesthetic caudal block x fluid received ml crystalloid estimated blood loss le ml specimen tissue sent pathology renal biopsy abnormal finding stenotic scarred ureteropelvic junction dilated ureter dilated renal pelvis tube drain french silicone foley catheter ml balloon french ureteral double j stent multilength indication operation patient year old boy solitary left kidne

In [21]:
from simpletransformers.classification import ClassificationModel
import pandas as pd
import logging



In [22]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [23]:
# Train and Evaluation data needs to be in a Pandas Dataframe containing at least two columns. If the Dataframe has a header, it should contain a 'text' and a 'labels' column. If no header is present, the Dataframe should contain at least two columns, with the first column is the text with type str, and the second column in the label with type int.
#train_data = [['Example sentence belonging to class 1', 1], ['Example sentence belonging to class 0', 0], ['Example eval senntence belonging to class 2', 2]]
train_df = pd.DataFrame(train_data)

#eval_data = [['Example eval sentence belonging to class 1', 1], ['Example eval sentence belonging to class 0', 0], ['Example eval senntence belonging to class 2', 2]]
eval_df = pd.DataFrame(eval_data)

In [24]:
train_df[1].value_counts()

2    289
1    287
3    281
0    279
Name: 1, dtype: int64

In [25]:
# Create a ClassificationModel
model = ClassificationModel('bert', 'bert-base-cased', use_cuda=False,num_labels=4, args={"num_train_epochs": 5,"train_batch_size":3,'reprocess_input_data': True, 'overwrite_output_dir': True})
# You can set class weights by using the optional weight argument

- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Train the model
model.train_model(train_df)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=1136.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=379.0, style=ProgressStyle(des…








HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=379.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=379.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=379.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=379.0, style=ProgressStyle(des…





INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to outputs/.


In [27]:
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

#predictions, raw_outputs = model.predict(["Some arbitary sentence"])

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=128.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=16.0, style=ProgressStyle(descri…

INFO:simpletransformers.classification.classification_model:{'mcc': 0.8548032786103316, 'eval_loss': 0.6032861072671949}





In [28]:
model_outputs

array([[-0.8782302 , -3.3733299 , -2.841085  ,  6.3340583 ],
       [ 0.45935148, -3.6723046 , -3.4804733 ,  5.7595377 ],
       [-2.2105386 ,  6.8054566 , -1.9185472 , -2.5219917 ],
       [ 6.2371106 , -2.4440532 , -3.0597064 , -0.81286025],
       [-1.2382469 , -2.9393215 ,  6.0899057 , -1.6744866 ],
       [ 5.603412  , -2.1611698 , -1.4583402 , -1.7644577 ],
       [-2.2125144 ,  6.9076257 , -1.7920537 , -2.599936  ],
       [-1.9660954 , -2.7458715 ,  6.706381  , -1.3363246 ],
       [ 6.3454623 , -2.3474164 , -2.6882646 , -1.4088447 ],
       [-1.8588797 , -2.4312    ,  6.7040105 , -1.908662  ],
       [ 6.242384  , -2.510218  , -2.8963635 , -0.90730166],
       [ 6.292773  , -2.6196537 , -2.9749095 , -0.72057176],
       [-2.6641088 ,  6.8883057 , -1.770063  , -2.20336   ],
       [-1.8798505 , -2.557366  ,  6.694404  , -1.7487676 ],
       [ 5.343629  ,  0.43931347, -3.0328784 , -2.5460386 ],
       [ 6.202529  , -2.5488691 , -2.9043748 , -0.86364365],
       [-2.6400363 ,  6.

In [29]:
import numpy as np
preds = np.argmax(model_outputs, axis = -1)

In [30]:
from sklearn.metrics import accuracy_score
print(accuracy_score(eval_df[1].tolist(), preds))

0.890625


In [31]:
model = ClassificationModel('bert', 'outputs/', use_cuda = False)

In [32]:
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=128.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=16.0, style=ProgressStyle(descri…

INFO:simpletransformers.classification.classification_model:{'mcc': 0.8548032786103316, 'eval_loss': 0.6032861072671949}





In [33]:
predictions, raw_outputs = model.predict(['hard', 'soft', 'dull','beautifull','bad'])

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [34]:
raw_outputs

array([[ 0.8051026 , -0.895271  ,  2.1208284 , -1.5648825 ],
       [ 1.2558453 , -1.0878137 , -0.02491438, -0.25917736],
       [ 1.9624728 , -0.822741  ,  0.682927  , -1.5536625 ],
       [ 1.2537329 , -1.1005013 ,  1.143033  , -1.0593405 ],
       [ 1.3489013 , -1.1127791 ,  1.3244436 , -1.1962984 ]],
      dtype=float32)

In [35]:
import numpy as np
preds = np.argmax(raw_outputs, axis = -1)

In [36]:
preds

array([2, 0, 0, 0, 0])

In [37]:
from sklearn.metrics import accuracy_score
print(accuracy_score(eval_df[1].tolist(), preds))

ValueError: Found input variables with inconsistent numbers of samples: [128, 5]