In [68]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd



In [69]:
#load data
data = pd.read_csv('Symptom2Disease.csv')

#seperate data into the text and label
symptoms = data['text'].values
diseases = data['label'].values

print(len(symptoms))
print(len(diseases))
    


1200
1200


In [70]:
#connect to GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [71]:
#tokenize data and pad them to equal sequences

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

#turning it into a set suppositly makes it faster
stopwords_set = set(stopwords.words())


#Preprocess function will remove stopwords, punctuation, lowercase the text
#Might add stemming and lemmatization later on
def preprocess(text):

    #lowercase
    text = word_tokenize(text)

    #remove stopwords and punctuation
    processed_text = [word.lower() for word in text if not word.lower() in stopwords_set and word.isalpha()]
    

    return processed_text

print(preprocess("I am going to SCHOOL. Where is my bag, lunch, and books?"))
print(f"\nBefore: {symptoms[0]}")

#preprcoess all data
symptoms = [preprocess(text) for text in symptoms]

print(f"After: {symptoms[0]}")


['school', 'bag', 'lunch', 'books']

Before: I have been experiencing a skin rash on my arms, legs, and torso for the past few weeks. It is red, itchy, and covered in dry, scaly patches.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bobth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bobth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


After: ['experiencing', 'skin', 'rash', 'arms', 'legs', 'torso', 'past', 'weeks', 'red', 'itchy', 'covered', 'dry', 'scaly', 'patches']


In [72]:
#imports
from nltk.probability import FreqDist

#Basically encoding the text

#vocab creation
word_frequency = FreqDist([word.lower() for text in symptoms for word in text])

# Create the vocabulary by assigning a unique index to each word
vocab = {word: idx+1 for idx, (word, _) in enumerate(word_frequency.items())}

# encode text
def encode(text, vocab):
    encoded = []
    for word in text:
        encoded.append(vocab.get(word,0))
    return encoded

#encode all data
print(symptoms[0])

symptoms = [encode(text, vocab) for text in symptoms]


print(f"Encoded: {symptoms[0]}")



['experiencing', 'skin', 'rash', 'arms', 'legs', 'torso', 'past', 'weeks', 'red', 'itchy', 'covered', 'dry', 'scaly', 'patches']
Encoded: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]


In [73]:
#pad sequences
from torch.nn.utils.rnn import pad_sequence

#pad sequences
symptoms = pad_sequence([torch.tensor(text) for text in symptoms], batch_first=True)



In [74]:
#encoding labels

label_mapping = {label: i for i, label in enumerate(np.unique(diseases))}
diseases = np.array([label_mapping[label] for label in diseases])



In [75]:
#split data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(symptoms, diseases, test_size=0.2)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


torch.Size([960, 25])
torch.Size([240, 25])
(960,)
(240,)


In [76]:
# #convert to tensors
# print(X_train.dtype)
# print(y_train.dtype)

X_train = torch.tensor(X_train)
y_train = torch.tensor(y_train)

X_test = torch.tensor(X_test)
y_test = torch.tensor(y_test)
 
print(f"training text: {X_train.shape}")
print(f"training labels: {y_train.shape}")

print(f"testing text: {X_test.shape}")
print(f"testing labels: {y_test.shape}")

print(X_train[0])

batch_size = 32

# Reshape training data
num_train_batches = X_train.shape[0] // batch_size
X_train = X_train[:num_train_batches * batch_size]  # Trim the data to have a multiple of batch_size
y_train = y_train[:num_train_batches * batch_size]  # Trim the labels accordingly
X_train = X_train.view(num_train_batches, batch_size, -1)  # Reshape to (num_batches, batch_size, input_size)
y_train = y_train.view(num_train_batches, batch_size, -1)  # Reshape labels similarly

# Reshape testing data
num_test_batches = X_test.shape[0] // batch_size
X_test = X_test[:num_test_batches * batch_size]  # Trim the data to have a multiple of batch_size
y_test = y_test[:num_test_batches * batch_size]  # Trim the labels accordingly
X_test = X_test.view(num_test_batches, batch_size, -1)  # Reshape to (num_batches, batch_size, input_size)
y_test = y_test.view(num_test_batches, batch_size, -1)  # Reshape labels similarly

print(f"training text: {X_train.shape}")
print(f"training labels: {y_train.shape}")

print(f"testing text: {X_test.shape}")
print(f"testing labels: {y_test.shape}")




training text: torch.Size([960, 25])
training labels: torch.Size([960])
testing text: torch.Size([240, 25])
testing labels: torch.Size([240])
tensor([  46,  466, 1151,  556,   46,  575, 1054, 1036,  598, 1262, 1253, 1263,
        1264,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0])
training text: torch.Size([30, 32, 25])
training labels: torch.Size([30, 32, 1])
testing text: torch.Size([7, 32, 25])
testing labels: torch.Size([7, 32, 1])


  X_train = torch.tensor(X_train)
  X_test = torch.tensor(X_test)


In [77]:
#create a simple fnn model that takes in the input size
class FNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(FNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=0)
        
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.softmax(out)
        return out
    
#rnn model
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=0)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        out = self.rnn(x)[0]
        out = self.linear2(out)
        out = self.relu(out)
        out = self.linear(out)
        out = self.softmax(out)
        return out
    
#LSTM model
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=0)
        
    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        out = self.softmax(out)
        return out
    def predict(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        out = self.softmax(out)
        return out
    
    




#### 24 represents the 24 unique diseases in the data set and 25 represents to length of each encoded vector


In [78]:
#define hyperparameters
learning_rate = 0.001
epochs = 1500

input_size = X_train.shape[2]
hidden_size = 25
output_size = len(label_mapping)

print(f"Input size: {input_size}")
print(f"out size: {output_size}")




Input size: 25
out size: 24


In [79]:
model_num = 1
model = None
if(model_num == 0):
    model = FNN(input_size, hidden_size, output_size)

elif(model_num == 1):
    #Reshape for RNN
    model = RNN(input_size, hidden_size, output_size)


model.to(device)

RNN(
  (rnn): RNN(25, 25, batch_first=True)
  (linear2): Linear(in_features=25, out_features=25, bias=True)
  (linear): Linear(in_features=25, out_features=24, bias=True)
  (softmax): Softmax(dim=0)
  (relu): ReLU()
)

In [80]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [81]:
model.train()
#train the model
for epoch in range(epochs):
    
    for batch_idx in range(len(X_train)):

        data = X_train[batch_idx].float().to(device)
        target = y_train[batch_idx].long().to(device)

        #Foward passs
        output = model(data)
        loss = criterion(output, target.squeeze())

        #backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if(batch_idx % 100 == 0):
            print(f"Epoch: {epoch} Batch: {batch_idx} Loss: {loss.item()}")






    

        
    


Epoch: 0 Batch: 0 Loss: 3.1762733459472656
Epoch: 1 Batch: 0 Loss: 3.175520658493042


Epoch: 2 Batch: 0 Loss: 3.175175189971924
Epoch: 3 Batch: 0 Loss: 3.1735124588012695
Epoch: 4 Batch: 0 Loss: 3.171276569366455
Epoch: 5 Batch: 0 Loss: 3.167186975479126
Epoch: 6 Batch: 0 Loss: 3.1599295139312744
Epoch: 7 Batch: 0 Loss: 3.1542069911956787
Epoch: 8 Batch: 0 Loss: 3.1485114097595215
Epoch: 9 Batch: 0 Loss: 3.157748222351074
Epoch: 10 Batch: 0 Loss: 3.147198438644409
Epoch: 11 Batch: 0 Loss: 3.1470580101013184
Epoch: 12 Batch: 0 Loss: 3.1448676586151123
Epoch: 13 Batch: 0 Loss: 3.1492087841033936
Epoch: 14 Batch: 0 Loss: 3.1472620964050293
Epoch: 15 Batch: 0 Loss: 3.140993595123291
Epoch: 16 Batch: 0 Loss: 3.1412429809570312
Epoch: 17 Batch: 0 Loss: 3.1373183727264404
Epoch: 18 Batch: 0 Loss: 3.1395294666290283
Epoch: 19 Batch: 0 Loss: 3.1291797161102295
Epoch: 20 Batch: 0 Loss: 3.1297154426574707
Epoch: 21 Batch: 0 Loss: 3.132261276245117
Epoch: 22 Batch: 0 Loss: 3.1425044536590576
Epoch: 23 Batch: 0 Loss: 3.1355419158935547
Epoch: 24 Batch: 0 Loss: 3.121699810028076
Epo

In [82]:
model.eval()
#test the model
with torch.no_grad():
    correct = 0
    total = 0
    for batch_idx in range(len(X_test)):
        data = X_test[batch_idx].float().to(device)
        target = y_test[batch_idx].long().to(device)
        
        output = model(data)
        _, predicted = torch.max(output.data, 1)
        total += target.size(0)
        correct += (predicted == target.squeeze()).sum().item()
        
    print(f"Accuracy: {100*correct/total}")


Accuracy: 10.267857142857142


In [83]:
#f1score
from sklearn.metrics import f1_score
y_true = []
y_pred = []
with torch.no_grad():
    for batch_idx in range(len(X_test)):
        data = X_test[batch_idx].float().to(device)
        target = y_test[batch_idx].long().to(device)
        
        output = model(data)
        _, predicted = torch.max(output.data, 1)
        y_true.extend(target.squeeze().tolist())
        y_pred.extend(predicted.tolist())

print(f"F1 score: {f1_score(y_true, y_pred, average='macro')}")
print(f"F1 score: {f1_score(y_true, y_pred, average='micro')}")
print(f"F1 score: {f1_score(y_true, y_pred, average='weighted')}")
print(f"F1 score: {f1_score(y_true, y_pred, average=None)}")




F1 score: 0.09779085592101588
F1 score: 0.10267857142857142
F1 score: 0.10669521969831566
F1 score: [0.10526316 0.         0.         0.         0.         0.11111111
 0.         0.1        0.         0.11764706 0.         0.52631579
 0.37037037 0.         0.52631579 0.11111111 0.         0.125
 0.1        0.         0.         0.         0.15384615 0.        ]
