In [None]:
!pip install accelerate -U

In [None]:
!pip install datasets

In [None]:
!pip install transformers

In [2]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

import torch
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


Found GPU at: /device:GPU:0
There are 1 GPU(s) available.
We will use the GPU: NVIDIA L4


In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

data = pd.read_csv('overview-of-recordings.csv')
data1 = data[['phrase','prompt']]
data1.sample(5)

Unnamed: 0,phrase,prompt
72,My son nicked his neck with an old razor and t...,Infected wound
5669,i feel head ache,Head ache
6101,I feel a burning sensation in my shoulder muscle.,Muscle pain
3317,I feel pain inside and I can not identify it,Internal pain
4507,When I stand up too quickly I start to feel di...,Feeling dizzy


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

df = data1.copy()
df.isna().sum()
df['prompt'].value_counts()
print('Total number of intents: %d'%(len(df['prompt'].value_counts().index)))

label_encoder = LabelEncoder()# Encode the labels
df['encoded_prompt'] = label_encoder.fit_transform(df['prompt'])
X, sentence_test, y, intent_test = train_test_split(df.phrase, df.encoded_prompt, stratify = df.encoded_prompt, test_size=0.2, random_state=4612)
sentence_train, sentence_val, intent_train, intent_val = train_test_split(X, y, stratify = y, test_size=0.125, random_state=4612)
print(f"#examples in training set:{ sentence_train.shape[0]}\n#examples in validation set:{ sentence_val.shape[0]}\n#examples in test set:{ sentence_test.shape[0]}")


Total number of intents: 25
#examples in training set:4662
#examples in validation set:666
#examples in test set:1333


In [5]:
TRAIN_BATCH_SIZE = 32 #hyperparameter
VALID_BATCH_SIZE = 64
EPSILON = 1e-08
EPOCHS = 20
LEARNING_RATE = 2e-5
SEED = 1215

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences

model = AutoModelForSequenceClassification.from_pretrained("medicalai/ClinicalBERT", num_labels=len(df['prompt'].value_counts().index))
tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")
model.cuda()

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at medicalai/ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [7]:
max_len = 0
input_ids = []
lengths = []

for sent in sentence_train:
    encoded_sent = tokenizer.encode(sent, add_special_tokens=True)
    input_ids.append(encoded_sent)
    lengths.append(len(encoded_sent))
    max_len = max(max_len, len(encoded_sent))

mean_len = sum(lengths) / len(lengths)
print(f'Max sentence length: {max_len} \nMean sentence length: {mean_len}')
input_ids = pad_sequences(input_ids, maxlen=max_len, dtype="long", value=0, truncating="post", padding="post")# Pad sequences

attention_masks = []
for sent in input_ids:
    att_mask = [int(token_id > 0) for token_id in sent]
    attention_masks.append(att_mask)


Max sentence length: 42 
Mean sentence length: 15.351780351780352


In [8]:
train_inputs = torch.tensor(input_ids) # Convert to tensors
train_labels = torch.tensor(intent_train.values, dtype=torch.long)
train_masks = torch.tensor(attention_masks)

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)

In [9]:
val_input_ids = tokenizer(sentence_val.tolist(), padding=True, truncation=True, return_tensors="pt", max_length=max_len)["input_ids"] #val_dataset
val_attention_masks = tokenizer(sentence_val.tolist(), padding=True, truncation=True, return_tensors="pt", max_length=max_len)["attention_mask"]

val_inputs = torch.tensor(val_input_ids)
val_labels = torch.tensor(intent_val.values, dtype=torch.long)
val_masks = torch.tensor(val_attention_masks)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=VALID_BATCH_SIZE)

  val_inputs = torch.tensor(val_input_ids)
  val_masks = torch.tensor(val_attention_masks)


In [10]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=EPSILON)# Optimizer, scheduler
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [11]:
def train(epochs):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for step, batch in enumerate(train_dataloader):
            batch_input_ids, batch_input_mask, batch_labels = batch
            batch_input_ids = batch_input_ids.to(device)
            batch_input_mask = batch_input_mask.to(device)
            batch_labels = batch_labels.to(device)

            model.zero_grad()
            outputs = model(batch_input_ids, attention_mask=batch_input_mask, labels=batch_labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f'Epoch {epoch + 1}, Training loss: {avg_train_loss:.2f}')

train(EPOCHS)

Epoch 1, Training loss: 2.38
Epoch 2, Training loss: 0.71
Epoch 3, Training loss: 0.18
Epoch 4, Training loss: 0.06
Epoch 5, Training loss: 0.03
Epoch 6, Training loss: 0.03
Epoch 7, Training loss: 0.02
Epoch 8, Training loss: 0.01
Epoch 9, Training loss: 0.01
Epoch 10, Training loss: 0.01
Epoch 11, Training loss: 0.01
Epoch 12, Training loss: 0.01
Epoch 13, Training loss: 0.01
Epoch 14, Training loss: 0.01
Epoch 15, Training loss: 0.00
Epoch 16, Training loss: 0.00
Epoch 17, Training loss: 0.00
Epoch 18, Training loss: 0.00
Epoch 19, Training loss: 0.00
Epoch 20, Training loss: 0.00


In [12]:
import os
output_dir = './Intent_detect_ClinicalBERT/' #save the model

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Model saved to %s" % output_dir)
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Model saved to ./Intent_detect_ClinicalBERT/


('./Intent_detect_ClinicalBERT/tokenizer_config.json',
 './Intent_detect_ClinicalBERT/special_tokens_map.json',
 './Intent_detect_ClinicalBERT/vocab.txt',
 './Intent_detect_ClinicalBERT/added_tokens.json',
 './Intent_detect_ClinicalBERT/tokenizer.json')

#### Inference on test data

In [13]:
from sklearn.metrics import classification_report

test_input_ids = tokenizer(sentence_test.tolist(), padding=True, truncation=True, return_tensors="pt", max_length=max_len)["input_ids"]# Preprocessing test data
test_attention_masks = tokenizer(sentence_test.tolist(), padding=True, truncation=True, return_tensors="pt", max_length=max_len)["attention_mask"]

test_inputs = torch.tensor(test_input_ids)
test_labels = torch.tensor(intent_test.values, dtype=torch.long)
test_masks = torch.tensor(test_attention_masks)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=VALID_BATCH_SIZE)

# inference
def inference(model, dataloader):
    model.eval()
    predictions = []
    true_labels = []
    for batch in dataloader:
        batch_input_ids, batch_input_mask, batch_labels = batch
        batch_input_ids = batch_input_ids.to(device)
        batch_input_mask = batch_input_mask.to(device)
        batch_labels = batch_labels.to(device)

        with torch.no_grad():
            outputs = model(batch_input_ids, attention_mask=batch_input_mask)
            logits = outputs.logits

        preds = torch.argmax(logits, dim=1).flatten()
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(batch_labels.cpu().numpy())

    return predictions, true_labels

predictions, true_labels = inference(model, test_dataloader)
accuracy = (np.array(predictions) == np.array(true_labels)).mean()
print(f'Test set accuracy: {accuracy:.2f}')

labels = label_encoder.inverse_transform(range(len(df['prompt'].value_counts().index)))
print(classification_report(true_labels, predictions, target_names=labels))


  test_inputs = torch.tensor(test_input_ids)
  test_masks = torch.tensor(test_attention_masks)


Test set accuracy: 1.00
                    precision    recall  f1-score   support

              Acne       1.00      1.00      1.00        66
         Back pain       1.00      1.00      1.00        52
     Blurry vision       1.00      1.00      1.00        49
   Body feels weak       1.00      1.00      1.00        48
             Cough       1.00      1.00      1.00        59
          Ear ache       1.00      1.00      1.00        54
    Emotional pain       1.00      1.00      1.00        46
      Feeling cold       0.98      1.00      0.99        53
     Feeling dizzy       1.00      1.00      1.00        57
         Foot ache       1.00      1.00      1.00        44
  Hair falling out       1.00      1.00      1.00        53
    Hard to breath       1.00      1.00      1.00        47
         Head ache       1.00      1.00      1.00        53
       Heart hurts       1.00      1.00      1.00        55
    Infected wound       1.00      1.00      1.00        61
Injury from spo

In [14]:
test_results = pd.DataFrame({
    'phrase': sentence_test.tolist(),
    'true_prompt': label_encoder.inverse_transform(true_labels),
    'predicted_prompt': label_encoder.inverse_transform(predictions)
})

test_results.to_csv('test_predictions.csv', index=False)

In [15]:
# label encoder
import pickle
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

###### # Load the trained model and tokenizer


In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pickle

model_path = './Intent_detect_ClinicalBERT'
loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device)

with open('label_encoder.pkl', 'rb') as f:
    loaded_label_encoder = pickle.load(f)

In [17]:
def medical_symptom_detector(intent, max_length=128):
    pt_batch = loaded_tokenizer(
        intent,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    pt_batch = {k: v.to(device) for k, v in pt_batch.items()}
    loaded_model.eval()
    with torch.no_grad():
        pt_outputs = loaded_model(**pt_batch)

    #  predicted label
    __, id = torch.max(pt_outputs.logits, dim=1)
    prediction = loaded_label_encoder.inverse_transform([id.item()])[0]

    print(f'You may have a medical condition: {prediction}. Would you like me to transfer your call to your doctor?')
    return prediction

intent = "I have a headache and feel nauseous."
medical_symptom_detector(intent)

You may have a medical condition: Head ache. Would you like me to transfer your call to your doctor?


'Head ache'

In [18]:
intents = [
    "I have a headache and feel nauseous.",
    "My throat hurts and I have a fever.",
    "I'm experiencing chest pain and shortness of breath."
]

for intent in intents:
    medical_symptom_detector(intent)

You may have a medical condition: Head ache. Would you like me to transfer your call to your doctor?
You may have a medical condition: Feeling cold. Would you like me to transfer your call to your doctor?
You may have a medical condition: Hard to breath. Would you like me to transfer your call to your doctor?


#########Perform inference on the first N samples


In [19]:
import pandas as pd
df = pd.read_csv('/content/overview-of-recordings.csv')
N = 5
for i in range(N):
    intent = df['phrase'].iloc[i]
    print(f"\nSample {i+1}: {intent}")
    medical_symptom_detector(intent)


Sample 1: When I remember her I feel down
You may have a medical condition: Emotional pain. Would you like me to transfer your call to your doctor?

Sample 2: When I carry heavy things I feel like breaking my back
You may have a medical condition: Back pain. Would you like me to transfer your call to your doctor?

Sample 3: there is too much pain when i move my arm
You may have a medical condition: Heart hurts. Would you like me to transfer your call to your doctor?

Sample 4: My son had his lip pierced and it is swollen and the skin inside on his lip is grey and looks infected.
You may have a medical condition: Infected wound. Would you like me to transfer your call to your doctor?

Sample 5: My muscles in my lower back are aching
You may have a medical condition: Infected wound. Would you like me to transfer your call to your doctor?


In [20]:
df.head(5)

Unnamed: 0,audio_clipping,audio_clipping:confidence,background_noise_audible,background_noise_audible:confidence,overall_quality_of_the_audio,quiet_speaker,quiet_speaker:confidence,speaker_id,file_download,file_name,phrase,prompt,writer_id
0,no_clipping,1.0,light_noise,1.0,3.33,audible_speaker,1.0,43453425,https://ml.sandbox.cf3.us/cgi-bin/index.cgi?do...,1249120_43453425_58166571.wav,When I remember her I feel down,Emotional pain,21665495
1,light_clipping,0.6803,no_noise,0.6803,3.33,audible_speaker,1.0,43719934,https://ml.sandbox.cf3.us/cgi-bin/index.cgi?do...,1249120_43719934_43347848.wav,When I carry heavy things I feel like breaking...,Hair falling out,44088126
2,no_clipping,1.0,no_noise,0.6655,3.33,audible_speaker,1.0,43719934,https://ml.sandbox.cf3.us/cgi-bin/index.cgi?do...,1249120_43719934_53187202.wav,there is too much pain when i move my arm,Heart hurts,44292353
3,no_clipping,1.0,light_noise,1.0,3.33,audible_speaker,1.0,31349958,https://ml.sandbox.cf3.us/cgi-bin/index.cgi?do...,1249120_31349958_55816195.wav,My son had his lip pierced and it is swollen a...,Infected wound,43755034
4,no_clipping,1.0,no_noise,1.0,4.67,audible_speaker,1.0,43719934,https://ml.sandbox.cf3.us/cgi-bin/index.cgi?do...,1249120_43719934_82524191.wav,My muscles in my lower back are aching,Infected wound,21665495


In [None]:
!huggingface-cli login

In [None]:
new_model="Shobhank-iiitdwd/NLP-Medical-Intent-Detector"
model.push_to_hub(new_model, use_auth_token = True)
tokenizer.push_to_hub(new_model, use_auth_token = True)

### Load model from Hugging Face Hub

In [22]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "Shobhank-iiitdwd/NLP-Medical-Intent-Detector"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [23]:
import torch

data = pd.read_csv('overview-of-recordings.csv')
data1 = data[['phrase', 'prompt']]

In [24]:
N = 5
sample_data = data1.head(N)
inputs = tokenizer(sample_data['phrase'].tolist(), padding=True, truncation=True, return_tensors="pt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [25]:
# inference
model.eval()
with torch.no_grad():
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits

predictions = torch.argmax(logits, dim=1).cpu().numpy()#predictions
sample_data['predicted_intent'] = predictions
print(sample_data)

                                              phrase            prompt  \
0                    When I remember her I feel down    Emotional pain   
1  When I carry heavy things I feel like breaking...  Hair falling out   
2          there is too much pain when i move my arm       Heart hurts   
3  My son had his lip pierced and it is swollen a...    Infected wound   
4             My muscles in my lower back are aching    Infected wound   

   predicted_intent  
0                 6  
1                 1  
2                13  
3                14  
4                14  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_data['predicted_intent'] = predictions


In [31]:
import joblib
joblib.dump(label_encoder, 'label_encoder.pkl')


['label_encoder.pkl']

In [33]:
intents = [
    "I have a headache and feel nauseous.",
    "My throat hurts and I have a fever.",
    "I'm experiencing chest pain and shortness of breath."
]

for intent in intents:
    medical_symptom_detector(intent)

You may have a medical condition: Head ache. Would you like me to transfer your call to your doctor?
You may have a medical condition: Feeling cold. Would you like me to transfer your call to your doctor?
You may have a medical condition: Hard to breath. Would you like me to transfer your call to your doctor?


###  inference on a sample