In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import LabelEncoder
from gensim.models import KeyedVectors
import tensorflow as tf

import os

In [2]:
df = pd.read_csv('./dist/archive/dataset.csv')
df = df.fillna(" ")
df['Symptom']=""
for index, row in df.iterrows():
    symptoms = []
    for i in range(1, 18):  
        symptom = str(row[i]) 
        symptom = symptom.replace('_', ' ') 
        symptoms.append(symptom) 
    df.at[index, 'Symptom'] = ''.join(symptoms)  

for i in range(1, 18):
    df = df.drop(f"Symptom_{i}", axis = 1)

pretrained_model_path  = "./dist/glove.6B.300d/glove.6B.300d.txt"  
embeddings = KeyedVectors.load_word2vec_format(pretrained_model_path, binary=False, no_header=True)

  symptom = str(row[i])


In [3]:
embeddings

<gensim.models.keyedvectors.KeyedVectors at 0x222529b70b0>

In [4]:
def get_symptom_vector(tokens, embeddings, dim=300):
    vectors = [embeddings[word] for word in tokens if word in embeddings]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)  
    else:
        return np.zeros(dim)

In [5]:
df['Symptom_tokens'] = df['Symptom'].apply(lambda x: x.split())
df['Symptom_vector'] = df['Symptom_tokens'].apply(lambda x: get_symptom_vector(x, embeddings))

In [6]:
df

Unnamed: 0,Disease,Symptom,Symptom_tokens,Symptom_vector
0,Fungal infection,itching skin rash nodal skin eruptions dischro...,"[itching, skin, rash, nodal, skin, eruptions, ...","[0.2070357, 0.01904114, -0.0055758613, -0.1377..."
1,Fungal infection,skin rash nodal skin eruptions dischromic pa...,"[skin, rash, nodal, skin, eruptions, dischromi...","[0.24659766, -0.0013919994, -0.010601004, -0.1..."
2,Fungal infection,itching nodal skin eruptions dischromic patch...,"[itching, nodal, skin, eruptions, dischromic, ...","[0.07035, 0.08717581, -0.028830606, -0.0361490..."
3,Fungal infection,itching skin rash dischromic patches ...,"[itching, skin, rash, dischromic, patches]","[0.178511, -0.07981275, -0.05286075, -0.213686..."
4,Fungal infection,itching skin rash nodal skin eruptions ...,"[itching, skin, rash, nodal, skin, eruptions]","[0.30039498, 0.048597995, 0.050351497, -0.1581..."
...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,vomiting headache nausea spinning movements l...,"[vomiting, headache, nausea, spinning, movemen...","[0.11061343, 0.17821288, 0.11190355, -0.191091..."
4916,Acne,skin rash pus filled pimples blackheads scurr...,"[skin, rash, pus, filled, pimples, blackheads,...","[0.097137995, -0.1723305, -0.039824996, -0.223..."
4917,Urinary tract infection,burning micturition bladder discomfort foul s...,"[burning, micturition, bladder, discomfort, fo...","[0.05448175, 0.15117364, 0.23599534, -0.430794..."
4918,Psoriasis,skin rash joint pain skin peeling silver like...,"[skin, rash, joint, pain, skin, peeling, silve...","[0.059871808, 0.053128403, -0.13207932, -0.143..."


In [7]:
from sklearn.preprocessing import LabelEncoder

X = np.array(df['Symptom_vector'].tolist())
le = LabelEncoder()
y = le.fit_transform(df['Disease']) 

In [9]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding
from tensorflow.keras.utils import to_categorical

In [14]:
X.dtype

dtype('float32')

In [10]:
y_one_hot = to_categorical(y)

input_layer = Input(shape=(X.shape[1],))
hidden_layer = Dense(128, activation='relu')(input_layer)  
output_layer = Dense(len(le.classes_), activation='softmax')(hidden_layer) 

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y_one_hot, epochs=10, batch_size=32)

disease_embedding_layer = Model(inputs=input_layer, outputs=hidden_layer)
disease_embeddings = disease_embedding_layer.predict(X)

Epoch 1/10
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 691us/step - accuracy: 0.5083 - loss: 2.6959
Epoch 2/10
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 639us/step - accuracy: 0.9976 - loss: 0.3015
Epoch 3/10
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 638us/step - accuracy: 0.9997 - loss: 0.0867
Epoch 4/10
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 627us/step - accuracy: 1.0000 - loss: 0.0400
Epoch 5/10
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 643us/step - accuracy: 1.0000 - loss: 0.0229
Epoch 6/10
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 631us/step - accuracy: 1.0000 - loss: 0.0141
Epoch 7/10
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 641us/step - accuracy: 1.0000 - loss: 0.0096
Epoch 8/10
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 623us/step - accuracy: 1.0000 - loss: 0.0074
Epoch 9/10
[1m154/154[

In [68]:
text = 'Diarrhea fever'
vector = get_symptom_vector(text.split(), embeddings, dim = 300)
vector.shape

(300,)

In [69]:
vector.dtype

dtype('float32')

In [70]:
vector = np.expand_dims(vector, axis = 0)
vector.shape

(1, 300)

In [71]:
embedding = model.predict(vector)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step


In [72]:
pred = embedding.argmax()

In [73]:
pred

6

In [74]:
le.classes_[pred]

'Bronchial Asthma'

In [None]:
tf