In [11]:
! pip install transformers torch



In [12]:
import pandas as pd

# Load dataset
df = pd.read_csv("DB/Sentence1.csv")
sentences = df["Sentence"].values
labels = df["Symptom"].values

In [13]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize sentences
inputs = tokenizer(list(sentences), padding=True, truncation=True, return_tensors="pt", max_length=128)


In [14]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

In [15]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

# Split data
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    inputs['input_ids'], labels_encoded, test_size=0.2, random_state=42)

train_masks, val_masks = train_test_split(inputs['attention_mask'], test_size=0.2, random_state=42)

# Convert to PyTorch tensors
import torch

train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)

val_inputs = torch.tensor(val_inputs)
val_labels = torch.tensor(val_labels)
val_masks = torch.tensor(val_masks)

# Create DataLoader
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=batch_size)


  train_inputs = torch.tensor(train_inputs)
  train_masks = torch.tensor(train_masks)
  val_inputs = torch.tensor(val_inputs)
  val_masks = torch.tensor(val_masks)


In [16]:
from transformers import BertForSequenceClassification

# Load BERT model
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels=len(label_encoder.classes_)
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
from transformers import AdamW
from torch.optim.lr_scheduler import StepLR

optimizer = AdamW(model.parameters(), lr=2e-5)

scheduler = StepLR(optimizer, step_size=1, gamma=0.9)




In [21]:
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 3
loss_fn = CrossEntropyLoss()

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader):
        batch_inputs, batch_masks, batch_labels = [b.to(device) for b in batch]
        
        optimizer.zero_grad()
        outputs = model(batch_inputs, attention_mask=batch_masks)
        batch_labels = batch_labels.to(torch.long)
        loss = loss_fn(outputs.logits, batch_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1}: Loss = {total_loss / len(train_dataloader)}")
    scheduler.step()


100%|██████████| 19/19 [00:28<00:00,  1.49s/it]


Epoch 1: Loss = 2.8860598237890946


100%|██████████| 19/19 [00:27<00:00,  1.44s/it]


Epoch 2: Loss = 2.649165304083573


100%|██████████| 19/19 [00:34<00:00,  1.80s/it]

Epoch 3: Loss = 2.365257351022018





In [22]:
from sklearn.metrics import classification_report

model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in val_dataloader:
        batch_inputs, batch_masks, batch_labels = [b.to(device) for b in batch]
        
        outputs = model(batch_inputs, attention_mask=batch_masks)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.extend(batch_labels.cpu().numpy())

print(classification_report(true_labels, predictions, target_names=label_encoder.classes_))


                      precision    recall  f1-score   support

        Blocked Nose       0.00      0.00      0.00         2
 Burning Micturition       1.00      0.50      0.67         8
              Chills       0.25      1.00      0.40         1
 Continuous Sneezing       0.00      0.00      0.00         4
               Cough       1.00      0.50      0.67         2
            Diarrhea       1.00      0.33      0.50         3
       Feeling Tired       1.00      0.50      0.67         4
    High Temperature       0.67      0.67      0.67         3
        Hoarse Voice       1.00      0.67      0.80         6
       Loss of Smell       0.50      1.00      0.67         2
       Loss of Taste       0.00      0.00      0.00         6
         Muscle Pain       0.00      0.00      0.00         5
Nodal Skin Eruptions       1.00      0.40      0.57         5
          Runny Nose       0.08      1.00      0.15         1
           Shivering       0.31      0.67      0.42         6
       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
model.save_pretrained("bert_symptom_classifier")
tokenizer.save_pretrained("bert_symptom_classifier")


('bert_symptom_classifier\\tokenizer_config.json',
 'bert_symptom_classifier\\special_tokens_map.json',
 'bert_symptom_classifier\\vocab.txt',
 'bert_symptom_classifier\\added_tokens.json')

In [24]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Load the trained model and tokenizer
model_path = "bert_symptom_classifier"
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [60]:
# Example sentence
example_sentence = "I have a rash on my skin"

# Tokenize the input sentence
inputs = tokenizer(
    example_sentence, 
    padding=True, 
    truncation=True, 
    return_tensors="pt", 
    max_length=128
)

# Move tensors to device
input_ids = inputs['input_ids'].to(device)
attention_mask = inputs['attention_mask'].to(device)

In [61]:
# Get model predictions
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits

# Convert logits to predicted label
predicted_label_id = torch.argmax(logits, dim=1).item()

In [62]:
predicted_label = label_encoder.inverse_transform([predicted_label_id])[0]
print(f"Predicted Symptom: {predicted_label}")

Predicted Symptom: Spotting Urination


In [28]:
# Initialize lists to store predictions and true labels
predictions, true_labels = [], []

# Set model to evaluation mode
model.eval()

# Iterate through the validation DataLoader
with torch.no_grad():
    for batch in val_dataloader:
        # Unpack the batch and move to the device
        batch_inputs, batch_masks, batch_labels = [b.to(device) for b in batch]
        
        # Get model outputs
        outputs = model(batch_inputs, attention_mask=batch_masks)
        logits = outputs.logits

        # Store predictions and true labels
        predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.extend(batch_labels.cpu().numpy())


In [29]:
from sklearn.metrics import accuracy_score

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 52.63%


In [30]:
from sklearn.metrics import classification_report

# Print classification report
report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)
print(report)

                      precision    recall  f1-score   support

        Blocked Nose       0.00      0.00      0.00         2
 Burning Micturition       1.00      0.50      0.67         8
              Chills       0.25      1.00      0.40         1
 Continuous Sneezing       0.00      0.00      0.00         4
               Cough       1.00      0.50      0.67         2
            Diarrhea       1.00      0.33      0.50         3
       Feeling Tired       1.00      0.50      0.67         4
    High Temperature       0.67      0.67      0.67         3
        Hoarse Voice       1.00      0.67      0.80         6
       Loss of Smell       0.50      1.00      0.67         2
       Loss of Taste       0.00      0.00      0.00         6
         Muscle Pain       0.00      0.00      0.00         5
Nodal Skin Eruptions       1.00      0.40      0.57         5
          Runny Nose       0.08      1.00      0.15         1
           Shivering       0.31      0.67      0.42         6
       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
