In [94]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd



In [95]:
#load data
data = pd.read_csv('Symptom2Disease.csv')

#split data into train and test
train_data = data.sample(frac=0.8, random_state=0)
test_data = data.drop(train_data.index)

print(len(train_data))
print(f"{len(test_data)}, {test_data.shape})")

for i,x in enumerate(test_data):
    if(i == 10):
        break
    print(x)
    


960
240, (240, 3))
Unnamed: 0
label
text


In [96]:
#connect to GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [97]:
#train data
train_labels = train_data['label'].values
train_text = train_data['text'].values

#test data
test_labels = test_data['label'].values
test_text = test_data['text'].values

In [98]:
#tokenize data
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

#tokenize and convert each sentence into tensors
input_ids = []
for sent in train_text:

    encoded_sent = tokenizer.encode(sent,add_special_tokens = True)
    input_ids.append(torch.tensor(encoded_sent))
    

test_input_ids = []
for sent in test_text:
        encoded_sent = tokenizer.encode(sent,add_special_tokens = True)
        test_input_ids.append(torch.tensor(encoded_sent))

#tokenize and convert each sentence into tensors
label_ids = []
for label in train_labels:
    print(label)
    encoded = tokenizer.encode(label,add_special_tokens = True)
    label_ids.append(torch.tensor(encoded))

test_label_ids = []
for label in test_labels:
    encoded = tokenizer.encode(label,add_special_tokens = True)
    test_label_ids.append(torch.tensor(encoded))


print('Original: ', train_text[0])
print('Token IDs:', input_ids[0])


Chicken pox
Fungal infection
Psoriasis
Dengue
Bronchial Asthma
Arthritis
Chicken pox
allergy
peptic ulcer disease
drug reaction
Fungal infection
drug reaction
Fungal infection
Common Cold
Common Cold
Pneumonia
allergy
Impetigo
Varicose Veins
drug reaction
allergy
allergy
drug reaction
drug reaction
allergy
Psoriasis
Migraine
Impetigo
Malaria
peptic ulcer disease
Psoriasis
Psoriasis
Jaundice
Bronchial Asthma
Acne
peptic ulcer disease
Impetigo
urinary tract infection
Jaundice
Pneumonia
diabetes
Chicken pox
Arthritis
Typhoid
Hypertension
Arthritis
Fungal infection
gastroesophageal reflux disease
drug reaction
Hypertension
Hypertension
gastroesophageal reflux disease
peptic ulcer disease
Jaundice
gastroesophageal reflux disease
peptic ulcer disease
Psoriasis
Dengue
Acne
allergy
Psoriasis
Pneumonia
Fungal infection
Bronchial Asthma
drug reaction
Acne
Acne
Hypertension
Cervical spondylosis
gastroesophageal reflux disease
Dimorphic Hemorrhoids
Malaria
Migraine
Dimorphic Hemorrhoids
Cervical s

In [99]:
from torch.nn.utils.rnn import pad_sequence

#pad the tensors to make them of equal length
padded_input_ids = pad_sequence(input_ids, batch_first=True) #78 size
testing = pad_sequence(test_input_ids, batch_first=True)  #68 size

#pad them so they are the same size
padded_test_input_ids = torch.nn.functional.pad(testing, (0, 78 - testing.size(1)), value=0)

print(padded_input_ids.shape)
print(padded_test_input_ids.shape)


#pad the tensors to make them of equal length
padded_labels = pad_sequence(label_ids, batch_first=True)
padded_test_labels = pad_sequence(test_label_ids, batch_first=True, padding_value = 0)





torch.Size([960, 78])
torch.Size([240, 78])


In [107]:
#define hyperparameters
learning_rate = 0.001
epochs = 1000


input_size = padded_input_ids.shape[1]
hidden_size = 50
output_size = padded_labels.shape[1]

input_size1 = padded_test_input_ids.shape[1]
hidden_size = 50
output_size1 = padded_test_labels.shape[1]

print(f"input_size: {input_size}, output_size: {output_size}, hidden_size: {hidden_size}")
print(f"input_size1: {input_size1}, output_size1: {output_size1}, hidden_size: {hidden_size}")

input_size: 78, output_size: 12, hidden_size: 50
input_size1: 78, output_size1: 12, hidden_size: 50


In [101]:
#create a simple fnn model that takes in the input size
class FNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(FNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=0)
        
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.softmax(out)
        return out
    




In [102]:
#define optimizer and loss function
net = FNN(input_size, hidden_size, output_size)
net.to(device)

criterion = nn.BCELoss()
optimizer = optim.SGD(net.parameters(), lr=learning_rate)

In [103]:

#maybe implement scedulers
for epoch in range(epochs):  # loop over the dataset multiple times

    running_loss = 0.0
    for i in range(len(input_ids)):
        # get the inputs; data is a list of [inputs, labels]
        inputs = padded_input_ids[i].to(device)
        labels = padded_labels[i].to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs.float())
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 100 == 99:    # print every 100 mini-batches
            print(f'Epoch: {epoch + 1}, Batch: {i + 1}, Loss: {running_loss / 100}')
            running_loss = 0.0

print('Finished Training')

Epoch: 1, Batch: 100, Loss: 307059.03984375
Epoch: 1, Batch: 200, Loss: 347080.4239453125
Epoch: 1, Batch: 300, Loss: 334171.090703125
Epoch: 1, Batch: 400, Loss: 349051.2576171875
Epoch: 1, Batch: 500, Loss: 332556.5062109375
Epoch: 1, Batch: 600, Loss: 325230.422265625
Epoch: 1, Batch: 700, Loss: 329001.340859375
Epoch: 1, Batch: 800, Loss: 328117.340390625
Epoch: 1, Batch: 900, Loss: 337897.1733203125
Epoch: 2, Batch: 100, Loss: 307016.17375
Epoch: 2, Batch: 200, Loss: 347080.4239453125
Epoch: 2, Batch: 300, Loss: 334171.090703125
Epoch: 2, Batch: 400, Loss: 349051.2576171875
Epoch: 2, Batch: 500, Loss: 332556.5062109375
Epoch: 2, Batch: 600, Loss: 325230.422265625
Epoch: 2, Batch: 700, Loss: 329001.340859375
Epoch: 2, Batch: 800, Loss: 328117.340390625
Epoch: 2, Batch: 900, Loss: 337897.1733203125
Epoch: 3, Batch: 100, Loss: 307016.17375
Epoch: 3, Batch: 200, Loss: 347080.4239453125
Epoch: 3, Batch: 300, Loss: 334171.090703125
Epoch: 3, Batch: 400, Loss: 349051.2576171875
Epoch: 3,

In [108]:
# Test model for symptom predictor
correct = 0
total = 0
with torch.no_grad():
    for i in range(len(test_input_ids)):
        inputs = padded_test_input_ids[i].to(device)
        labels = padded_test_labels[i].to(device)

        outputs = net(inputs.float())
        predicted = torch.round(outputs.data)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the symptom predictor on the {len(test_input_ids)} test inputs: {100 * correct / total}%')

Accuracy of the symptom predictor on the 240 test inputs: 58.50694444444444%
