In [1]:
import numpy as np
import pandas as pd
import string
import ast
import torch
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.multiclass import OneVsRestClassifier

In [3]:
df = pd.read_csv("mtsamples.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [4]:
df = df.loc[:,["transcription", "keywords"]]

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   transcription  4966 non-null   object
 1   keywords       3931 non-null   object
dtypes: object(2)
memory usage: 78.2+ KB


In [7]:
df.describe()

Unnamed: 0,transcription,keywords
count,4966,3931.0
unique,2357,3849.0
top,"PREOPERATIVE DIAGNOSIS: , Low back pain.,POSTO...",
freq,5,81.0


In [8]:
df.shape

(4999, 2)

In [9]:
df.isnull().sum()

transcription      33
keywords         1068
dtype: int64

Converting transcription to lowercase.

In [10]:
df['transcription_lower']=df['transcription'].apply(lambda x: ' '.join([i for i in str(x).lower().split(' ')]))
df['transcription_lower']=df['transcription_lower'].astype(str)

 Removing rows which have atleast one NULL value.

In [20]:
df=df[["transcription","keywords","transcription_lower"]].dropna()

Removing duplicate rows.

In [21]:
df=df.drop_duplicates(['transcription', 'keywords',"transcription_lower"])

In [22]:
df.isnull().sum()

transcription          0
keywords               0
transcription_lower    0
dtype: int64

In [23]:
df.head()

Unnamed: 0,transcription,keywords,transcription_lower
0,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller...","subjective:, this 23-year-old white female pr..."
1,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh...","past medical history:, he has difficulty climb..."
2,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart...","history of present illness: , i have seen abc ..."
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple...","2-d m-mode: , ,1. left atrial enlargement wit..."
4,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo...",1. the left ventricular cavity size and wall ...


In [24]:
def replace(input_string):
    return input_string.replace('\'', '\\\'')

In [25]:
df['keywords_list'] = df['keywords'].astype(str).apply(replace)

In [26]:
def add_single_quotes(input_string):
    """Add single quotes to every word in a comma-separated string."""
    words = input_string.split(', ')
    quoted_words = [f"'{word.strip()}'" for word in words]
    return ', '.join(quoted_words)

In [27]:
df['keywords_list'] = df['keywords_list'].apply(add_single_quotes)

In [28]:
def add_square_brackets(input_string):
  input_string = '['+input_string+']'
  return input_string

In [29]:
df['keywords_list'] = df['keywords_list'].apply(add_square_brackets)

In [30]:
ast.literal_eval(df['keywords_list'].iloc[0])

['allergy / immunology',
 'allergic rhinitis',
 'allergies',
 'asthma',
 'nasal sprays',
 'rhinitis',
 'nasal',
 'erythematous',
 'allegra',
 'sprays',
 'allergic,']

In [31]:
df['keywords_list'] = df['keywords_list'].apply(lambda x: ast.literal_eval(x))

In [32]:
type(df['keywords_list'].iloc[0])

list

In [33]:
data=df[["transcription_lower","keywords_list"]]

In [34]:
data.head()

Unnamed: 0,transcription_lower,keywords_list
0,"subjective:, this 23-year-old white female pr...","[allergy / immunology, allergic rhinitis, alle..."
1,"past medical history:, he has difficulty climb...","[bariatrics, laparoscopic gastric bypass, weig..."
2,"history of present illness: , i have seen abc ...","[bariatrics, laparoscopic gastric bypass, hear..."
3,"2-d m-mode: , ,1. left atrial enlargement wit...","[cardiovascular / pulmonary, 2-d m-mode, doppl..."
4,1. the left ventricular cavity size and wall ...,"[cardiovascular / pulmonary, 2-d, doppler, ech..."


In [None]:
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(data['keywords_list'])

In [None]:
# pd.DataFrame(y, columns=multilabel.classes_)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data['transcription_lower'], y, test_size = 0.2, random_state = 0)

In [None]:
tfidf = TfidfVectorizer(min_df=0.00009, smooth_idf=True, tokenizer = lambda x: x.split(), sublinear_tf=False, ngram_range=(1,3), max_features=50000,stop_words='english')
X_train = tfidf.fit_transform(X_train)
X_test =tfidf.transform(X_test)



In [None]:
sgd = SGDClassifier(loss='log', alpha=1e-5, penalty='l1')
# lr = LogisticRegression(solver='lbfgs')
# svc = LinearSVC()

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import MultiLabelBinarizer

clf = OneVsRestClassifier(sgd)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
y_test_bin = multilabel.inverse_transform(y_test)
predictions_bin = multilabel.inverse_transform(y_pred)
jaccard_index = jaccard_score(y_test, y_pred, average='micro')

print(f'Accuracy using Jaccard Index: {jaccard_index:.4f}')

Accuracy using Jaccard Index: 0.6842


Used LinearSVC, SGDClassifier and Logistic Regression.Among All SGDClassifier gave the best jaccard score.That's why I used SGDClassifier as final model.

In [None]:
print(y_test_bin[0])
print(predictions_bin[0])

('', 'active bleeding', 'atraumatic', 'bleeding', 'body', 'cut on foot', 'emergency room reports', 'foot', 'foot pain', 'foreign body', 'injuries', 'laceration', 'piece of glass')
('', 'active bleeding', 'atraumatic', 'bleeding', 'cut on foot', 'foot', 'foot pain', 'foreign body', 'general medicine', 'injuries', 'laceration')


Using Pretrained Model Like BART

In [35]:
data=df[["transcription_lower","keywords"]]

In [None]:
from transformers import BartForConditionalGeneration
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base", padding_side="left",truncation_side='right')
if torch.cuda.is_available():
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-base").to("cuda")


In [37]:
class MedicalDataset(Dataset):
    def __init__(self, df, transcript_lower, keywords, tokenizer, input_length, output_length):
        self.df = df
        self.transcript_lower = transcript_lower
        self.keywords = keywords
        self.tokenizer = tokenizer
        self.input_length = input_length
        self.output_length = output_length

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        transcript_tokens = self.tokenizer(self.df[self.transcript_lower].iloc[idx],padding='max_length',truncation=True, max_length=self.input_length,return_tensors='pt')['input_ids']
        keyword_tokens = self.tokenizer(self.df[self.keywords].iloc[idx],padding="max_length", truncation=True,max_length=self.output_length,return_tensors='pt')['input_ids']

        if torch.cuda.is_available():
            transcript_tokens = transcript_tokens.to("cuda")
            keyword_tokens = keyword_tokens.to("cuda")
        return transcript_tokens[0,:], keyword_tokens[0,:]

In [38]:
data_train, data_test = train_test_split(data, train_size=0.8, random_state=0)
train_data = MedicalDataset(data_train,'transcription_lower','keywords',tokenizer,750, 100)
test_data = MedicalDataset(data_test,'transcription_lower','keywords',tokenizer,750, 100)

batch_size = 6

dl_train = DataLoader(train_data, batch_size=batch_size, shuffle=True)
dl_test = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [39]:
optimizer = Adam(model.parameters(),lr=2e-4)
epochs = 3
def num_batches(total, batch_size):
    if total % batch_size == 0:
        return total // batch_size
    else:
        return total // batch_size + 1

In [40]:
train_batches = num_batches(data_train.shape[0],batch_size)
test_batches = num_batches(data_test.shape[0],batch_size)

In [41]:
### Function to train model
def training(data,num_batches, model, optimizer):
    model.train()
    model_loss = 0
    model_acc = 0
    i = 0
    for tr, kw in data:
        optimizer.zero_grad()
        out = model(tr, labels=kw)

        ### Loss computation
        r_loss = out.loss
        model_loss += r_loss.item()

        ### Accuracy Computation
        logits = out.logits
        preds = torch.softmax(logits,dim=2)
        preds = torch.argmax(preds,dim=2)
        acc = torch.sum(kw == preds).item()/(kw.shape[0]*kw.shape[1])
        model_acc += acc

        r_loss.backward()
        optimizer.step()

        i+=1
        print(f"loss={model_loss/i} accuracy={model_acc/i}",end="\r")

    print(f"loss={model_loss/i} accuracy={model_acc/i}",end="\n")

In [42]:
def testing(data,num_batches, model):
    model.eval()
    model_loss = 0
    model_acc = 0
    i = 0
    for tr, kw in data:
        out = model(tr, labels=kw)

        r_loss = out.loss
        model_loss += r_loss.item()

        logits = out.logits
        preds = torch.softmax(logits,dim=2)
        preds = torch.argmax(preds,dim=2)
        acc = torch.sum(kw == preds).item()/(kw.shape[0]*kw.shape[1])
        model_acc += acc

        i+=1
        print(f"loss={model_loss/i} accuracy={model_acc/i}",end="\r")

    print(f"loss={model_loss/i} accuracy={model_acc/i}",end="\n")

In [43]:
for e in range(epochs):
    training(dl_train,train_batches,model, optimizer)
    testing(dl_test, test_batches,model)

loss=1.33675569237902 accuracy=0.780758754863813
loss=0.8575120438901029 accuracy=0.8302325581395353
loss=0.7801822902859417 accuracy=0.8398378728923481
loss=0.756462503311246 accuracy=0.8418087855297164
loss=0.6538055993703553 accuracy=0.8543547341115446
loss=0.6739316739777262 accuracy=0.8529844961240307


In [44]:
def generate_result(df,transcription, model, tokenizer):
    df['Predicted'] = df[transcription].apply(lambda x: tokenizer(x, max_length=750,padding='max_length', truncation=True, return_tensors='pt')['input_ids'])
    if torch.cuda.is_available():
        df['Predicted'] = df['Predicted'].apply(lambda x: x.to("cuda"))

    df['Predicted'] = df['Predicted'].apply(lambda x: model.generate(x,min_length=20,max_length=100 ))
    df['Predicted'] = df['Predicted'].apply(lambda x: tokenizer.batch_decode(x,skip_special_tokens=True))
    return df

In [45]:
df_res = generate_result(data_test,'transcription_lower',model,tokenizer)

In [46]:
model.generate(tokenizer(data_test['transcription_lower'].iloc[0],max_length=750,padding="max_length", truncation=True,return_tensors='pt')['input_ids'].to('cuda'),max_length=100)

tensor([[    2,     0,     1,     1,     1,     0, 22494, 15911,   636,     6,
           847,    15,  2767,     6,   741,  4668, 10100,     6,  1145,  7889,
         19898, 36063,     6, 30249,   354,  1264,   738,     6, 13206,   636,
          3876, 22090,     6, 34867,  1571,     6, 24295, 15010, 30404,     6,
          2849, 27643,  1258,     6,  2849,  8267, 33101,     6,  2849,  5901,
          6204,     6, 34867,  2192,     6,  2767,     6,   784, 11937,  1258,
         46599,     6,    35,    20,  8738, 28584, 38131,  1131, 37118,  7728,
           690,     8,  7721,    32,  1286,    30,  1337,  1434,     8,  1322,
            13,  5135,  3508,   129,     4,   256,  3732,   523,   642, 18997,
           473,    45, 33446,  8611,     8,  1318,     9,  7728,   690,     2]],
       device='cuda:0')

In [47]:
tokenizer(data_test['transcription_lower'].iloc[0],max_length=750,padding="max_length", truncation=True)

{'input_ids': [0, 15276, 3674, 46686, 1437, 847, 15, 2767, 482, 37283, 9, 1455, 5467, 46686, 1437, 42, 16, 10, 2107, 12, 180, 12, 279, 2943, 54, 56, 10, 2125, 9, 4049, 1136, 15, 7, 39, 235, 2767, 452, 4, 1437, 5, 3186, 21, 2273, 142, 9, 5, 1280, 9, 13162, 14, 2756, 19, 24, 4, 1437, 5, 13162, 34, 57, 2294, 8, 5, 3186, 473, 45, 33, 143, 2400, 4, 1437, 5, 3186, 34, 2340, 304, 9, 39, 2767, 6, 89, 16, 117, 31086, 1825, 50, 8269, 6, 5, 3186, 16, 441, 7, 13569, 10246, 157, 396, 143, 19535, 4, 1437, 5, 3186, 9118, 143, 1746, 7, 143, 97, 4745, 9, 39, 809, 4, 1437, 37, 34, 45, 56, 143, 485, 5467, 4, 1437, 5, 3186, 34, 117, 97, 1272, 50, 4496, 482, 33456, 1131, 750, 46686, 1437, 18881, 482, 28311, 8456, 35, 2156, 1076, 4297, 254, 1168, 482, 1250, 11249, 918, 46686, 1437, 117, 684, 1262, 26331, 482, 19027, 750, 35, 2156, 5, 3186, 16, 10, 40345, 482, 41359, 9027, 46686, 1437, 4874, 2434, 35, 1437, 5181, 8757, 4, 398, 11651, 6, 1925, 1164, 19376, 73, 5334, 6, 22293, 16, 5553, 6, 8, 44266, 1635, 545,

In [48]:
df_res['Predicted'] = df_res['Predicted'].apply(lambda x: x[0])
print(df_res['keywords'].iloc[0])
print(df_res['Predicted'].iloc[0])

emergency room reports, foot pain, cut on foot, piece of glass, foreign body, active bleeding, foot, injuries, atraumatic, laceration, bleeding, body, 
orthopedic, cut on foot, bursitis, metzenbaum scissors, cortisone shot, radicose vein, extremity, femoral artery, subluxation, subcutaneous, subcarinal, extremities, foot, lacerationNOTE,: Thesetranscribed medical transcription sample reports and examples are provided by various users andare for reference purpose only. MTHelpLine does not certify accuracy and quality of sample reports


In [49]:
df_res.head()

Unnamed: 0,transcription_lower,keywords,Predicted
3863,"chief complaint:, cut on foot.,history of pre...","emergency room reports, foot pain, cut on foot...","orthopedic, cut on foot, bursitis, metzenbaum ..."
4703,"preoperative diagnosis:, large recurrent righ...","cardiovascular / pulmonary, chest tube talc pl...","cardiovascular / pulmonary, chest tube talc pl..."
1918,"title of operation:, mediastinal exploration ...","pediatrics - neonatal, mediastinal exploration...","cardiovascular / pulmonary, sano modification,..."
3696,"preoperative diagnoses:,1. ventilator-depende...","ent - otolaryngology, ventilator-dependent res...","ent - otolaryngology, ventilator dependent res..."
1140,"preoperative diagnosis: , breast mass, left.,p...","surgery, breast mass excision, freely mobile, ...","obstetrics / gynecology, breast mass, excision..."


In [50]:
for i in range(10):

    print(f"Row no {i+1}")
    print("Transcription:")
    print(df_res['transcription_lower'].iloc[i])
    print("\n")
    print("Keywords:")
    print(df_res['keywords'].iloc[i])
    print("\n")
    print("Predicted Keywords:")
    print(df_res['Predicted'].iloc[i])
    print("\n")

Row no 1
Transcription:
chief complaint:,  cut on foot.,history of present illness:,  this is a 32-year-old male who had a piece of glass fall on to his right foot today.  the patient was concerned because of the amount of bleeding that occurred with it.  the bleeding has been stopped and the patient does not have any pain.  the patient has normal use of his foot, there is no numbness or weakness, the patient is able to ambulate well without any discomfort.  the patient denies any injuries to any other portion of his body.  he has not had any recent illness.  the patient has no other problems or complaints.,past medical history:,  asthma.,current medication: , albuterol.,allergies:,  no known drug allergies.,social history: , the patient is a smoker.,physical examination:,  vital signs:  temperature 98.8 oral, blood pressure 132/86, pulse is 76, and respirations 16.  oxygen saturation is 100% on room air and interpreted as normal.  constitutional:  the patient is well-nourished, well-d