In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd



In [23]:
#load data
data = pd.read_csv('Symptom2Disease.csv')

#seperate data into the text and label
symptoms = data['text'].values
diseases = data['label'].values

print(len(symptoms))
print(len(diseases))
    


1200
1200


In [24]:
#connect to GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [25]:
#tokenize data and pad them to equal sequences

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

#turning it into a set suppositly makes it faster
stopwords_set = set(stopwords.words())


#Preprocess function will remove stopwords, punctuation, lowercase the text
#Might add stemming and lemmatization later on
def preprocess(text):

    #lowercase
    text = word_tokenize(text)

    #remove stopwords and punctuation
    processed_text = [word.lower() for word in text if not word.lower() in stopwords_set and word.isalpha()]
    

    return processed_text

print(preprocess("I am going to SCHOOL. Where is my bag, lunch, and books?"))
print(f"\nBefore: {symptoms[0]}")

#preprcoess all data
symptoms = [preprocess(text) for text in symptoms]

print(f"After: {symptoms[0]}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bobth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bobth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['school', 'bag', 'lunch', 'books']

Before: I have been experiencing a skin rash on my arms, legs, and torso for the past few weeks. It is red, itchy, and covered in dry, scaly patches.
After: ['experiencing', 'skin', 'rash', 'arms', 'legs', 'torso', 'past', 'weeks', 'red', 'itchy', 'covered', 'dry', 'scaly', 'patches']


In [26]:
#imports
from nltk.probability import FreqDist

#Basically encoding the text

#vocab creation
word_frequency = FreqDist([word.lower() for text in symptoms for word in text])

# Create the vocabulary by assigning a unique index to each word
vocab = {word: idx+1 for idx, (word, _) in enumerate(word_frequency.items())}

# encode text
def encode(text, vocab):
    encoded = []
    for word in text:
        encoded.append(vocab.get(word,0))
    return encoded

#encode all data
print(symptoms[0])

symptoms = [encode(text, vocab) for text in symptoms]


print(f"Encoded: {symptoms[0]}")



['experiencing', 'skin', 'rash', 'arms', 'legs', 'torso', 'past', 'weeks', 'red', 'itchy', 'covered', 'dry', 'scaly', 'patches']
Encoded: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]


In [27]:
#pad sequences
from torch.nn.utils.rnn import pad_sequence

#pad sequences
symptoms = pad_sequence([torch.tensor(text) for text in symptoms], batch_first=True)



In [28]:
#encoding labels

label_mapping = {label: i for i, label in enumerate(np.unique(diseases))}
diseases = np.array([label_mapping[label] for label in diseases])



In [29]:
#split data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(symptoms, diseases, test_size=0.2)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


torch.Size([960, 25])
torch.Size([240, 25])
(960,)
(240,)


In [30]:
#convert to tensors
print(X_train.dtype)
print(y_train.dtype)

X_train = torch.tensor(X_train)
y_train = torch.tensor(y_train)

X_test = torch.tensor(X_test)
y_test = torch.tensor(y_test)
 
print(f"training text: {X_train.shape}")
print(f"training labels: {y_train.shape}")

print(f"testing text: {X_test.shape}")
print(f"testing labels: {y_test.shape}")

print(X_train[0])



torch.int64
int32
training text: torch.Size([960, 25])
training labels: torch.Size([960])
testing text: torch.Size([240, 25])
testing labels: torch.Size([240])
tensor([  46,  272, 1055,  562,    2,  132,   46,   12,  507, 1056,  239,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0])


  X_train = torch.tensor(X_train)
  X_test = torch.tensor(X_test)


In [44]:
#create a simple fnn model that takes in the input size
class FNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(FNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=0)
        
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.softmax(out)
        return out
    
#rnn model
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.softmax = nn.Softmax(dim=0)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        out = self.rnn(x)[0]
        out = self.linear2(out)
        out = self.relu(out)
        out = self.linear(out)
        out = self.softmax(out)
        return out
    
#LSTM model
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=0)
        
    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        out = self.softmax(out)
        return out
    def predict(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        out = self.softmax(out)
        return out
    
    




#### 24 represents the 24 unique diseases in the data set and 25 represents to length of each encoded vector


In [45]:
#define hyperparameters
learning_rate = 0.001
epochs = 10

input_size = X_train.shape[1]
hidden_size = 20
output_size = len(label_mapping)

print(f"Input size: {input_size}")
print(f"out size: {output_size}")




Input size: 25
out size: 24


In [46]:
model_num = 1
model = None
if(model_num == 0):
    model = FNN(input_size, hidden_size, output_size)

elif(model_num == 1):
    #Reshape for RNN
    model = RNN(input_size, hidden_size, output_size)


model.to(device)

RNN(
  (rnn): RNN(25, 20, batch_first=True)
  (linear): Linear(in_features=20, out_features=24, bias=True)
  (linear2): Linear(in_features=20, out_features=20, bias=True)
  (softmax): Softmax(dim=0)
  (relu): ReLU()
)

In [47]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [48]:
model.train()
#train the model
for epoch in range(epochs):
    
    for batch_idx in range(len(X_train)):
        #get data

        #Adds one extra meaningless dimension to the tensor, so that it can be passed into the model
        # dim = 0 makes it so it adds it to the front, -1 will add dimension to back
        data = X_train[batch_idx].float().unsqueeze(dim = 0).to(device)
        target = y_train[batch_idx].long().unsqueeze(dim = 0).to(device)
        
        #forward
        output = model(data)
        loss = criterion(output, target)
        
        #backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if batch_idx % 100 == 0:
            print(f"Epoch: {epoch} Loss: {loss.item()}")
        
    

        
    


Epoch: 0 Loss: 3.178053855895996
Epoch: 0 Loss: 3.178053855895996
Epoch: 0 Loss: 3.178053855895996
Epoch: 0 Loss: 3.178053855895996
Epoch: 0 Loss: 3.178053855895996
Epoch: 0 Loss: 3.178053855895996
Epoch: 0 Loss: 3.178053855895996
Epoch: 0 Loss: 3.178053855895996
Epoch: 0 Loss: 3.178053855895996
Epoch: 0 Loss: 3.178053855895996
Epoch: 1 Loss: 3.178053855895996
Epoch: 1 Loss: 3.178053855895996
Epoch: 1 Loss: 3.178053855895996
Epoch: 1 Loss: 3.178053855895996
Epoch: 1 Loss: 3.178053855895996
Epoch: 1 Loss: 3.178053855895996
Epoch: 1 Loss: 3.178053855895996
Epoch: 1 Loss: 3.178053855895996
Epoch: 1 Loss: 3.178053855895996
Epoch: 1 Loss: 3.178053855895996
Epoch: 2 Loss: 3.178053855895996
Epoch: 2 Loss: 3.178053855895996
Epoch: 2 Loss: 3.178053855895996
Epoch: 2 Loss: 3.178053855895996
Epoch: 2 Loss: 3.178053855895996
Epoch: 2 Loss: 3.178053855895996
Epoch: 2 Loss: 3.178053855895996
Epoch: 2 Loss: 3.178053855895996
Epoch: 2 Loss: 3.178053855895996
Epoch: 2 Loss: 3.178053855895996
Epoch: 3 L

In [49]:
model.eval()
#test the model
with torch.no_grad():
    correct = 0
    total = 0
    for batch_idx in range(len(X_test)):
        data = X_test[batch_idx].float().unsqueeze(dim = 0).to(device)
        target = y_test[batch_idx].long().unsqueeze(dim = 0).to(device)
        
        
        output = model(data)
        _, predicted = torch.max(output.data, 1)
        
        total += target.size(0)
        correct += (predicted == target).sum().item()
        
    print(f"Accuracy: {100*correct/total}")


Accuracy: 3.75


In [38]:
#f1score
from sklearn.metrics import f1_score


model.eval()

y_true = []
y_pred = []

with torch.no_grad():
    for batch_idx in range(len(X_test)):
        data = X_test[batch_idx].float().unsqueeze(dim = 0).to(device)
        target = y_test[batch_idx].long().unsqueeze(dim = 0).to(device)
        
        
        output = model(data)
        _, predicted = torch.max(output.data, 1)
        
        y_true.append(target.cpu().numpy())
        y_pred.append(predicted.cpu().numpy())

y_true = np.concatenate(y_true)
y_pred = np.concatenate(y_pred)

print(f"F1 score: {f1_score(y_true, y_pred, average='weighted')}")
print(f"Accuracy: {100*correct/total}")



F1 score: 0.002710843373493976


ZeroDivisionError: division by zero