In [115]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd



In [116]:
#load data
data = pd.read_csv('Symptom2Disease.csv')

#split data into train and test
train_data = data.sample(frac=0.8, random_state=0)
test_data = data.drop(train_data.index)

print(len(train_data))
print(f"{len(test_data)}, {test_data.shape})")

for i,x in enumerate(test_data):
    if(i == 10):
        break
    print(x)
    


960
240, (240, 3))
Unnamed: 0
label
text


In [117]:
#connect to GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [118]:
#train data
X_train = train_data['text'].values
y_train = train_data['label'].values

#test data
X_test = test_data['text'].values
y_test = test_data['label'].values

In [119]:
#tokenize data and pad them to equal sequences

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

#turning it into a set suppositly makes it faster
stopwords_set = set(stopwords.words())


#Preprocess function will remove stopwords, punctuation, lowercase the text
#Might add stemming and lemmatization later on
def preprocess(text):

    #lowercase
    text = word_tokenize(text)

    #remove stopwords and punctuation
    processed_text = [word.lower() for word in text if not word.lower() in stopwords_set and word.isalpha()]
    

    return processed_text

print(preprocess("I am going to SCHOOL. Where is my bag, lunch, and books?"))
print(f"\nBefore: {X_train[0]}")

#preprcoess all data
X_train = [preprocess(text) for text in X_train]
X_test = [preprocess(text) for text in X_test]

print(f"Processed: {X_train[0]}")

['school', 'bag', 'lunch', 'books']

Before: I have seen rashes on my arms and neck and it itches if I scratch them. I've also had a high fever for a few days. I have no idea what is causing it. The itching is causing me a lot of discomforts.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bobth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bobth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Processed: ['rashes', 'arms', 'neck', 'itches', 'scratch', 'high', 'fever', 'days', 'idea', 'causing', 'itching', 'causing', 'lot', 'discomforts']


In [120]:
#imports
from nltk.probability import FreqDist

#Basically encoding the text

#vocab creation
word_frequency = FreqDist([word.lower() for text in X_train for word in text])

# Create the vocabulary by assigning a unique index to each word
vocab = {word: idx+1 for idx, (word, _) in enumerate(word_frequency.items())}

# encode text
def encode(text, vocab):
    encoded = []
    for word in text:
        encoded.append(vocab.get(word,0))
    return encoded

#encode all data
print(X_train[0])
print(X_train[4])
X_train = [encode(text, vocab) for text in X_train]
X_test = [encode(text, vocab) for text in X_test]

print(f"Encoded: {X_train[0]}")
print(f"Encoded:{X_train[4]}")


['rashes', 'arms', 'neck', 'itches', 'scratch', 'high', 'fever', 'days', 'idea', 'causing', 'itching', 'causing', 'lot', 'discomforts']
['breathing', 'issues', 'persistent', 'cough', 'exhaustion', 'coughing', 'lot', 'thick', 'mucoid', 'sputum', 'high', 'fever', 'feeling', 'exhausted', 'tired', 'cope']
Encoded: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 12, 13]
Encoded:[52, 53, 54, 55, 56, 57, 12, 58, 59, 60, 6, 7, 61, 62, 63, 64]


In [121]:
#pad sequences
from torch.nn.utils.rnn import pad_sequence

#pad sequences
X_train = pad_sequence([torch.tensor(text) for text in X_train], batch_first=True)
X_test = pad_sequence([torch.tensor(text) for text in X_test], batch_first=True)


In [122]:
#encoding labels

label_mapping = {label: i for i, label in enumerate(np.unique(y_train))}
y_train = np.array([label_mapping[label] for label in y_train])
y_test = np.array([label_mapping[label] for label in y_test])


In [123]:
#convert to tensors
print(X_train.dtype)
print(y_train.dtype)

X_train = torch.tensor(X_train)
y_train = torch.tensor(y_train)

X_test = torch.tensor(X_test)
y_test = torch.tensor(y_test)

torch.int64
int32


  X_train = torch.tensor(X_train)
  X_test = torch.tensor(X_test)


In [124]:
#create a simple fnn model that takes in the input size
class FNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(FNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=0)
        
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.softmax(out)
        return out
    
#rnn model
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=0)
        
    def forward(self, x):
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])
        out = self.softmax(out)
        return out
    
#LSTM model
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=0)
        
    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        out = self.softmax(out)
        return out
    def predict(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        out = self.softmax(out)
        return out
    
    




In [125]:
#define hyperparameters
learning_rate = 0.001
epochs = 10
batch_size = 1



In [126]:
model_num = 1
model = None
if(model_num == 0):
    model = FNN(input_size, hidden_size, output_size)

elif(model_num == 1):
    #Reshape for RNN
    padded_input_ids = padded_input_ids.reshape(1, seq_len_train, input_size)
    padded_test_input_ids = padded_test_input_ids.reshape(1, seq_len_test, input_size1)
    padded_labels = padded_labels.reshape(1, seq_len_train1, output_size)
    padded_test_labels = padded_test_labels.reshape(1, seq_len_test1, output_size1)
    
    print(padded_input_ids.shape)
    print(padded_test_input_ids.shape)
    print(padded_labels.shape)
    print(padded_test_labels.shape)
    
    model = RNN(input_size, hidden_size, output_size)


model.to(device)

NameError: name 'padded_input_ids' is not defined

In [None]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
model.train()
#train the model
for epoch in range(epochs):
    for i in range(0, seq_len_train):
        inputs = padded_input_ids.to(device)
        labels = padded_labels.to(device)
        if(i == 0):
            print(inputs[0])
            print(labels[0])
        
        outputs = model(inputs.float())
        loss = criterion(outputs, labels.float())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        
        if (i+1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Step [{i+1}/{seq_len_train}], Loss: {loss.item():.4f}')

print('Finished Training')

tensor([[  101,  1045,  2031,  ...,     0,     0,     0],
        [  101,  1045,  1005,  ...,     0,     0,     0],
        [  101,  2026, 10063,  ...,     0,     0,     0],
        ...,
        [  101,  2067,  3255,  ...,     0,     0,     0],
        [  101,  1045,  1005,  ...,     0,     0,     0],
        [  101,  1045,  2031,  ...,     0,     0,     0]], device='cuda:0')
tensor([[  101,  7975, 13433,  ...,     0,     0,     0],
        [  101, 28079,  8985,  ...,     0,     0,     0],
        [  101,  8827, 11069,  ...,     0,     0,     0],
        ...,
        [  101, 28711, 11867,  ...,     0,     0,     0],
        [  101,  8827, 11069,  ...,     0,     0,     0],
        [  101, 28711, 11867,  ...,     0,     0,     0]], device='cuda:0')


  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/10], Step [10/960], Loss: 61041380.0000
Epoch [1/10], Step [20/960], Loss: 61041380.0000
Epoch [1/10], Step [30/960], Loss: 61041380.0000
Epoch [1/10], Step [40/960], Loss: 61041380.0000
Epoch [1/10], Step [50/960], Loss: 61041380.0000
Epoch [1/10], Step [60/960], Loss: 61041380.0000
Epoch [1/10], Step [70/960], Loss: 61041380.0000
Epoch [1/10], Step [80/960], Loss: 61041380.0000
Epoch [1/10], Step [90/960], Loss: 61041380.0000
Epoch [1/10], Step [100/960], Loss: 61041380.0000
Epoch [1/10], Step [110/960], Loss: 61041380.0000
Epoch [1/10], Step [120/960], Loss: 61041380.0000
Epoch [1/10], Step [130/960], Loss: 61041380.0000
Epoch [1/10], Step [140/960], Loss: 61041380.0000
Epoch [1/10], Step [150/960], Loss: 61041380.0000
Epoch [1/10], Step [160/960], Loss: 61041380.0000
Epoch [1/10], Step [170/960], Loss: 61041380.0000
Epoch [1/10], Step [180/960], Loss: 61041380.0000
Epoch [1/10], Step [190/960], Loss: 61041380.0000
Epoch [1/10], Step [200/960], Loss: 61041380.0000
Epoch [1/

In [None]:
model.eval()
#test the model
with torch.no_grad():
    correct = 0
    total = 0
    for i in range(0, seq_len_test):
        inputs = padded_test_input_ids.to(device)
        labels = padded_test_labels.to(device)
        
        outputs = model(inputs.float())
        _, predicted = torch.max(outputs.data, 1)
        _, actual = torch.max(labels.data, 1)
        total += labels.size(0)
        correct += (predicted == actual).sum().item()
        
    print(f'Accuracy of the network on the {total} test images: {100 * correct / total} %')



Accuracy of the network on the 240 test images: 100.0 %


In [None]:
#f1score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

with torch.no_grad():
    correct = 0
    total = 0
    y_pred = []
    y_true = []
    for i in range(0, seq_len_test):
        inputs = padded_test_input_ids.to(device)
        labels = padded_test_labels.to(device)
        
        outputs = model(inputs.float())
        _, predicted = torch.max(outputs.data, 1)
        _, actual = torch.max(labels.data, 1)
        total += labels.size(0)
        correct += (predicted == actual).sum().item()
        y_pred.append(predicted)
        y_true.append(actual)
    
    #[y_pred, y_true] = [torch.cat(y_pred, dim=0).cpu().numpy(), torch.cat(y_true, dim=0).cpu().numpy()]

    print(y_pred)
    print(f'Accuracy of the network on the {total} test images: {100 * correct / total} %')
    print(f'f1 score: {f1_score(y_true, y_pred, average="macro")}')
    print(f'precision score: {precision_score(y_true, y_pred, average="macro")}')
    print(f'recall score: {recall_score(y_true, y_pred, average="macro")}')

[tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

: 