In [None]:
import pandas  as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence #unique words

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,SimpleRNN, Dropout, Embedding

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
df=pd.read_csv('/content/Symptom2Disease.csv')
df

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."
...,...,...,...
1195,295,diabetes,I'm shaking and trembling all over. I've lost ...
1196,296,diabetes,"Particularly in the crevices of my skin, I hav..."
1197,297,diabetes,I regularly experience these intense urges and...
1198,298,diabetes,"I have trouble breathing, especially outside. ..."


In [None]:
df.drop('Unnamed: 0',inplace=True,axis=1)

In [None]:
df.head()

Unnamed: 0,label,text
0,Psoriasis,I have been experiencing a skin rash on my arm...
1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,Psoriasis,I have been experiencing joint pain in my fing...
3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,Psoriasis,"My nails have small dents or pits in them, and..."


In [None]:
print(df['label'].shape)

(1200,)


In [None]:
df['label'].value_counts()

Psoriasis                          50
Varicose Veins                     50
peptic ulcer disease               50
drug reaction                      50
gastroesophageal reflux disease    50
allergy                            50
urinary tract infection            50
Malaria                            50
Jaundice                           50
Cervical spondylosis               50
Migraine                           50
Hypertension                       50
Bronchial Asthma                   50
Acne                               50
Arthritis                          50
Dimorphic Hemorrhoids              50
Pneumonia                          50
Common Cold                        50
Fungal infection                   50
Dengue                             50
Impetigo                           50
Chicken pox                        50
Typhoid                            50
diabetes                           50
Name: label, dtype: int64

In [None]:
def cleantext(text):
  tokens = word_tokenize(text.lower())
  ftoken = [t for t in tokens if(t.isalpha())]

  stop = stopwords.words("english")
  ctoken = [t for t in ftoken if(t not in stop)]

  lemma = WordNetLemmatizer()
  ltoken = [lemma.lemmatize(t) for t in ctoken]

  return " ".join(ltoken)

In [None]:
df["clean_text"]=df["text"].apply(cleantext)
df

Unnamed: 0,label,text,clean_text
0,Psoriasis,I have been experiencing a skin rash on my arm...,experiencing skin rash arm leg torso past week...
1,Psoriasis,"My skin has been peeling, especially on my kne...",skin peeling especially knee elbow scalp peeli...
2,Psoriasis,I have been experiencing joint pain in my fing...,experiencing joint pain finger wrist knee pain...
3,Psoriasis,"There is a silver like dusting on my skin, esp...",silver like dusting skin especially lower back...
4,Psoriasis,"My nails have small dents or pits in them, and...",nail small dent pit often feel inflammatory te...
...,...,...,...
1195,diabetes,I'm shaking and trembling all over. I've lost ...,shaking trembling lost sense taste smell exhau...
1196,diabetes,"Particularly in the crevices of my skin, I hav...",particularly crevice skin skin rash irritation...
1197,diabetes,I regularly experience these intense urges and...,regularly experience intense urge want urinate...
1198,diabetes,"I have trouble breathing, especially outside. ...",trouble breathing especially outside start fee...


In [None]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df['label']=le.fit_transform(df['label'])

In [None]:
df

Unnamed: 0,label,text,clean_text
0,15,I have been experiencing a skin rash on my arm...,experiencing skin rash arm leg torso past week...
1,15,"My skin has been peeling, especially on my kne...",skin peeling especially knee elbow scalp peeli...
2,15,I have been experiencing joint pain in my fing...,experiencing joint pain finger wrist knee pain...
3,15,"There is a silver like dusting on my skin, esp...",silver like dusting skin especially lower back...
4,15,"My nails have small dents or pits in them, and...",nail small dent pit often feel inflammatory te...
...,...,...,...
1195,19,I'm shaking and trembling all over. I've lost ...,shaking trembling lost sense taste smell exhau...
1196,19,"Particularly in the crevices of my skin, I hav...",particularly crevice skin skin rash irritation...
1197,19,I regularly experience these intense urges and...,regularly experience intense urge want urinate...
1198,19,"I have trouble breathing, especially outside. ...",trouble breathing especially outside start fee...


In [None]:
x=df['clean_text']
y=df['label']

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=1)

In [None]:
sentlen = []

for sent in df["clean_text"]:
  sentlen.append(len(word_tokenize(sent)))

df["SentLen"] = sentlen
df.head()

Unnamed: 0,label,text,clean_text,SentLen
0,15,I have been experiencing a skin rash on my arm...,experiencing skin rash arm leg torso past week...,14
1,15,"My skin has been peeling, especially on my kne...",skin peeling especially knee elbow scalp peeli...,12
2,15,I have been experiencing joint pain in my fing...,experiencing joint pain finger wrist knee pain...,14
3,15,"There is a silver like dusting on my skin, esp...",silver like dusting skin especially lower back...,15
4,15,"My nails have small dents or pits in them, and...",nail small dent pit often feel inflammatory te...,13


In [None]:
max(sentlen)

27

In [None]:
max_len = np.quantile(sentlen, 0.95)
max_len

21.0

In [None]:
tok = Tokenizer(char_level=False, split=" ")


tok.fit_on_texts(xtrain)
tok.index_word

{1: 'also',
 2: 'lot',
 3: 'skin',
 4: 'pain',
 5: 'really',
 6: 'feel',
 7: 'fever',
 8: 'feeling',
 9: 'experiencing',
 10: 'headache',
 11: 'high',
 12: 'neck',
 13: 'rash',
 14: 'get',
 15: 'cough',
 16: 'hurt',
 17: 'chest',
 18: 'muscle',
 19: 'quite',
 20: 'throat',
 21: 'severe',
 22: 'sore',
 23: 'weak',
 24: 'body',
 25: 'discomfort',
 26: 'chill',
 27: 'red',
 28: 'back',
 29: 'time',
 30: 'itching',
 31: 'frequently',
 32: 'ache',
 33: 'difficult',
 34: 'joint',
 35: 'coughing',
 36: 'painful',
 37: 'stomach',
 38: 'uncomfortable',
 39: 'nausea',
 40: 'like',
 41: 'day',
 42: 'recently',
 43: 'temperature',
 44: 'stiff',
 45: 'lost',
 46: 'trouble',
 47: 'appetite',
 48: 'breathing',
 49: 'vomiting',
 50: 'swollen',
 51: 'nose',
 52: 'well',
 53: 'go',
 54: 'leg',
 55: 'occasionally',
 56: 'ca',
 57: 'additionally',
 58: 'arm',
 59: 'exhausted',
 60: 'extremely',
 61: 'night',
 62: 'dizziness',
 63: 'indigestion',
 64: 'addition',
 65: 'anus',
 66: 'pimple',
 67: 'along',
 

In [None]:
vocab_len = len(tok.index_word)
vocab_len

1223

In [None]:
seqtrain = tok.texts_to_sequences(xtrain) #step1
seqtrain

[[8, 5, 172, 138, 139, 64, 45, 68, 43, 91, 134, 3, 210, 74, 57, 14, 37, 4],
 [551, 71, 15, 48, 69, 52, 11, 7, 2, 173, 423, 6, 314, 23, 86],
 [183,
  315,
  671,
  2,
  97,
  424,
  144,
  33,
  144,
  16,
  53,
  65,
  316,
  5,
  38,
  2,
  25,
  16,
  60,
  145],
 [5, 76, 79, 290, 72, 3, 1, 122, 232, 130, 477, 160, 3, 197, 337, 80],
 [64, 233, 28, 4, 1, 35, 98, 8, 23, 8, 184, 552, 185, 12, 16, 52],
 [13, 3, 478, 234, 24, 198, 17, 186, 76, 38, 73, 103, 61, 1, 425, 3, 270],
 [2, 46, 144, 222, 5, 36, 92, 135, 4, 371, 65, 5, 76, 235],
 [372,
  186,
  32,
  6,
  40,
  553,
  844,
  146,
  78,
  479,
  9,
  672,
  120,
  24,
  43,
  845,
  236,
  23],
 [123, 116, 20, 31, 6, 846, 55, 14, 480, 199, 317, 163, 6, 111],
 [49, 9, 104, 37, 151, 62, 45, 481, 6, 23, 223, 4, 237, 70],
 [9, 183, 318, 291, 847, 73, 238, 224, 239],
 [554, 93, 271, 338, 29, 373, 2, 4, 54, 14, 151, 848, 482, 292, 293, 426, 54],
 [236,
  427,
  135,
  91,
  211,
  673,
  41,
  87,
  81,
  152,
  153,
  16,
  40,
  849,
  

In [None]:
seqmattrain = sequence.pad_sequences(seqtrain, maxlen= int(max_len)) #step2
seqmattrain

array([[  0,   0,   0, ...,  14,  37,   4],
       [  0,   0,   0, ..., 314,  23,  86],
       [  0, 183, 315, ...,  16,  60, 145],
       ...,
       [  0,   0,   0, ...,  69, 838, 776],
       [  0,   0,   0, ..., 287, 108,   7],
       [  0,   0,   0, ...,  34, 327,   4]], dtype=int32)

In [None]:
seqtest = tok.texts_to_sequences(xtest)
seqmattest = sequence.pad_sequences(seqtest, maxlen=int(max_len))

In [None]:
rnn = Sequential()

rnn.add(Embedding(vocab_len+1,10, input_length=int(max_len), mask_zero=True))
rnn.add(SimpleRNN(units=32, activation="tanh"))
rnn.add(Dense(units=32, activation="relu"))
rnn.add(Dropout(0.2))

rnn.add(Dense(24,activation='softmax'))

rnn.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
history=rnn.fit(seqmattrain,ytrain,epochs=25,batch_size=20)



Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [None]:
seqmattest[0]

array([  0,   0, 412, 145,  15,  48,  90,  35,   2, 119, 182, 193,  11,
         7,   5, 649, 649, 424,   8, 303, 351], dtype=int32)

In [None]:
yprob=rnn.predict(seqmattest)
yprob[0]



array([1.6337988e-05, 4.4784702e-08, 9.6961492e-01, 1.2566698e-05,
       2.5015682e-02, 2.2391765e-04, 3.0633251e-04, 3.2349789e-04,
       2.0135569e-05, 4.9529290e-06, 1.5749361e-08, 7.3317679e-08,
       2.8465544e-07, 5.0431518e-06, 4.3330705e-04, 1.7891165e-06,
       1.9478456e-04, 1.3971436e-07, 4.1136440e-04, 1.7360627e-03,
       1.3943565e-03, 1.3721984e-05, 2.6482789e-04, 5.8027886e-06],
      dtype=float32)

In [None]:
ypred=yprob.argmax(axis=1)
ypred

array([ 2,  9, 21,  4, 20,  6, 11, 21,  1,  9, 10, 16,  8,  6, 11,  8, 16,
       10,  7, 21, 19,  8, 22,  0, 17, 11,  0,  4,  6,  7, 20, 19, 16, 17,
       17,  6,  1,  8, 18,  4,  7,  7, 15, 17,  2, 11,  4, 20,  4, 21, 21,
        8,  2,  3,  7, 11,  1,  4,  8, 19,  0,  6,  4,  3, 22,  8, 23, 19,
       23, 20,  5, 11,  1, 20, 11, 19, 16,  6, 11, 14, 21,  1, 16,  6, 11,
        1,  9, 12,  2,  1, 21, 14, 23,  9, 11, 16,  2, 11, 11,  0, 23, 22,
       23,  8,  7,  6,  7,  1,  2, 18,  2, 18, 20, 17,  2, 17, 12, 10,  0,
       14, 17, 17,  1, 13, 11, 16, 10, 22,  4, 11, 12, 19, 23, 16,  8, 12,
       15, 21, 10,  4, 13,  0, 20, 19, 16, 18, 23,  3,  0, 19,  2,  7,  9,
        0, 22,  2, 10,  0, 10,  3,  2,  4, 12,  1, 16, 16, 10, 10,  2, 13,
        8,  9,  5, 11, 20, 21,  9,  9, 17,  0,  9, 16, 17, 22,  5, 14,  7,
       12,  4, 22, 18, 23,  7, 17,  8, 13, 19, 17, 15, 12, 17, 12, 18,  7,
       11, 15, 10,  6, 19,  3,  5, 12, 17,  6, 17,  8, 21,  5, 17,  0, 21,
       23,  9,  3,  1,  7

In [None]:
from sklearn.metrics import classification_report
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       1.00      0.88      0.94        17
           1       0.86      1.00      0.92        12
           2       0.53      0.69      0.60        13
           3       0.77      0.83      0.80        12
           4       0.53      0.57      0.55        14
           5       1.00      0.64      0.78        11
           6       0.31      0.40      0.35        10
           7       0.74      0.88      0.80        16
           8       0.79      0.71      0.75        21
           9       0.75      0.80      0.77        15
          10       0.87      0.68      0.76        19
          11       0.79      1.00      0.88        22
          12       0.82      0.93      0.87        15
          13       0.67      0.55      0.60        11
          14       0.64      0.58      0.61        12
          15       0.50      0.20      0.29        15
          16       0.57      0.57      0.57        14
          17       0.73    