# Importing and Preparing Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

In [3]:
train

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis,Unnamed: 133
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,(vertigo) Paroymsal Positional Vertigo,
4916,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,Acne,
4917,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Urinary tract infection,
4918,0,1,0,0,0,0,1,0,0,0,...,0,1,1,1,1,0,0,0,Psoriasis,


In [4]:
train.columns

Index(['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing',
       'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity',
       'ulcers_on_tongue',
       ...
       'scurring', 'skin_peeling', 'silver_like_dusting',
       'small_dents_in_nails', 'inflammatory_nails', 'blister',
       'red_sore_around_nose', 'yellow_crust_ooze', 'prognosis',
       'Unnamed: 133'],
      dtype='object', length=134)

In [5]:
# Remove the unnamed trailing column. Rebinding (instead of `inplace=True`) and
# `errors='ignore'` keep this cell idempotent: re-running it after the column
# is already gone no longer raises a KeyError.
train = train.drop(columns='Unnamed: 133', errors='ignore')

In [6]:
train['prognosis'].value_counts()

prognosis
Fungal infection                           120
Hepatitis C                                120
Hepatitis E                                120
Alcoholic hepatitis                        120
Tuberculosis                               120
Common Cold                                120
Pneumonia                                  120
Dimorphic hemmorhoids(piles)               120
Heart attack                               120
Varicose veins                             120
Hypothyroidism                             120
Hyperthyroidism                            120
Hypoglycemia                               120
Osteoarthristis                            120
Arthritis                                  120
(vertigo) Paroymsal  Positional Vertigo    120
Acne                                       120
Urinary tract infection                    120
Psoriasis                                  120
Hepatitis D                                120
Hepatitis B                                120
All

In [7]:
# Normalise the disease labels to lower case before they are encoded.
train = train.assign(prognosis=train['prognosis'].str.lower())

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Integer-encode the disease names; every remaining column is a model feature.
le = LabelEncoder()
y = le.fit_transform(train['prognosis'])
X = train.drop(columns='prognosis')

In [9]:
max_len = 41

# Row `k` of the identity matrix is the one-hot vector for class id `k`, so
# indexing it with the encoded labels yields one one-hot row per sample.
y_final = list(np.eye(max_len)[y])

In [10]:
# Collapse the list of one-hot rows into a single 2-D label matrix.
y = np.asarray(y_final)

In [11]:
# Hold out 10% of the rows for validation, stratified on the one-hot label rows.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

In [12]:
# Map every disease name known to the encoder to its integer class id.
class_ids = le.transform(le.classes_)
le_name_mapping = {name: class_id for name, class_id in zip(le.classes_, class_ids)}

In [13]:
le_name_mapping

{'(vertigo) paroymsal  positional vertigo': 0,
 'acne': 1,
 'aids': 2,
 'alcoholic hepatitis': 3,
 'allergy': 4,
 'arthritis': 5,
 'bronchial asthma': 6,
 'cervical spondylosis': 7,
 'chicken pox': 8,
 'chronic cholestasis': 9,
 'common cold': 10,
 'dengue': 11,
 'diabetes ': 12,
 'dimorphic hemmorhoids(piles)': 13,
 'drug reaction': 14,
 'fungal infection': 15,
 'gastroenteritis': 16,
 'gerd': 17,
 'heart attack': 18,
 'hepatitis a': 19,
 'hepatitis b': 20,
 'hepatitis c': 21,
 'hepatitis d': 22,
 'hepatitis e': 23,
 'hypertension ': 24,
 'hyperthyroidism': 25,
 'hypoglycemia': 26,
 'hypothyroidism': 27,
 'impetigo': 28,
 'jaundice': 29,
 'malaria': 30,
 'migraine': 31,
 'osteoarthristis': 32,
 'paralysis (brain hemorrhage)': 33,
 'peptic ulcer diseae': 34,
 'pneumonia': 35,
 'psoriasis': 36,
 'tuberculosis': 37,
 'typhoid': 38,
 'urinary tract infection': 39,
 'varicose veins': 40}

In [14]:
X_train.columns

Index(['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing',
       'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity',
       'ulcers_on_tongue',
       ...
       'pus_filled_pimples', 'blackheads', 'scurring', 'skin_peeling',
       'silver_like_dusting', 'small_dents_in_nails', 'inflammatory_nails',
       'blister', 'red_sore_around_nose', 'yellow_crust_ooze'],
      dtype='object', length=132)

# Model Building

In [15]:
from keras import Sequential
from keras.layers import Input, Dense, BatchNormalization, Dropout
from keras.callbacks import EarlyStopping

In [16]:
# Layer sizes are read from the training arrays so the network always matches
# the data prepared above (132 symptom columns, 41 one-hot classes here).
n_features = X_train.shape[1]
n_classes = y_train.shape[1]

model = Sequential()
# `shape` has to be a tuple. The previous `Input(132,)` passed the bare int 132
# (a trailing comma inside a call does not build a tuple), which only some
# Keras releases tolerate.
model.add(Input(shape=(n_features,)))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
# One softmax unit per disease class, matching the one-hot targets.
model.add(Dense(n_classes, activation='softmax'))

model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

In [17]:
# Train for 25 epochs, scoring the held-out validation split after each one.
history = model.fit(
    X_train,
    y_train,
    epochs=25,
    validation_data=(X_val, y_val),
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [18]:
# One hand-written symptom vector used as a smoke test of the trained model.
# It is wrapped in an ndarray (a batch with a single row) because a bare nested
# Python list is ambiguous to `predict`, which can interpret a list as a
# collection of separate model inputs.
sample_symptoms = np.array([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

# `predict` returns one probability row per sample; keep the only row.
prediction = model.predict(sample_symptoms)[0]



In [19]:
# Take the highest-probability class id and translate it back to its label.
best_class = np.argmax(prediction)
disease_name = le.inverse_transform([best_class])[0]

In [20]:
disease_name

'fungal infection'

# Testing

In [22]:
# Lower-case the test labels the same way the training labels were normalised.
test = test.assign(prognosis=test['prognosis'].str.lower())

In [23]:
# Reuse the encoder fitted on the training labels so class ids line up.
y_test = le.transform(test['prognosis'])
X_test = test.drop(columns='prognosis')

In [24]:
max_len = 41

# Row `k` of the identity matrix is the one-hot vector for class id `k`, so
# indexing it with the encoded test labels yields one one-hot row per sample.
y_final = list(np.eye(max_len)[y_test])

In [25]:
# Collapse the list of one-hot rows into a single 2-D test label matrix.
y_test = np.asarray(y_final)

In [26]:
model.evaluate(X_test, y_test)



[0.0041177840903401375, 1.0]

# Usage

In [27]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Symptom names in training-column order; position i is feature i of the
# vectors fed to the model.
columns = X.columns.to_numpy()

# spaCy pipeline used below to tokenise and lemmatise free-text input.
nlp = spacy.load("en_core_web_md")

In [36]:
import json

# Load the intents mapping used by `extract_features` (key -> list of phrases).
# The context manager closes the file handle, which the previous bare `open`
# never did; JSON text is read as UTF-8 regardless of the platform default.
with open('intents.json', encoding='utf-8') as f:
    intents = json.load(f)

In [29]:
from nltk.stem.porter import PorterStemmer

ps=PorterStemmer()

def stem(x):
    """Lower-case and stem each word in ``x``, returning them space-joined.

    Parameters
    ----------
    x : iterable of str
        Words (or phrases) to stem with the module-level ``ps`` stemmer.

    Returns
    -------
    str
        The stemmed items joined by single spaces, in their original order.
    """
    return " ".join(ps.stem(word.lower()) for word in x)

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
import re

def check_confidence(text, word_list):
    """Estimate how confidently ``text`` mentions one of the phrases in ``word_list``.

    Parameters
    ----------
    text : str
        Space-separated user input; ``extract_features`` passes it already
        stemmed.
    word_list : sequence of str
        Raw phrases for one intent. Each phrase is stemmed with ``stem``
        before it is compared.

    Returns
    -------
    int or float
        ``1`` when a stemmed phrase occurs in ``text`` (whole-word regex match
        or exact token match) or when a cosine similarity exceeds 0.8.
        Otherwise the mean of the per-phrase similarities, where any value
        above 0.6 is replaced by ``len(word_list) / 3``. ``0.0`` when there is
        nothing to compare against.
    """
    # Stem every phrase and drop those that stem to an empty string. With no
    # usable phrase the alternation below would degenerate to r'\b()\b', which
    # matches the empty string at any word boundary and would wrongly report
    # full confidence.
    stemmed_word_list = [word for word in (stem([x]) for x in word_list) if word]
    if not stemmed_word_list:
        return 0.0

    # Fast path: a stemmed phrase appears verbatim as whole word(s) in the text.
    word_pattern = r'\b(' + '|'.join(re.escape(word) for word in stemmed_word_list) + r')\b'
    if re.search(word_pattern, text, flags=re.IGNORECASE) is not None:
        return 1

    tokens = text.split(' ')
    inputs = len(tokens)
    unique_words = np.unique(stemmed_word_list).tolist()

    # Exact overlap between an input token and a phrase. The earlier
    # "any duplicate in the combined array" test also fired when the user
    # merely repeated a word, which made every intent match.
    if set(tokens).intersection(unique_words):
        return 1

    comparision_arr = np.array(tokens + unique_words)

    cv = CountVectorizer(max_features=100, stop_words='english')
    try:
        vectors = cv.fit_transform(comparision_arr).toarray()
    except ValueError:
        # Raised when no usable term is left to build a vocabulary from
        # (e.g. only stop words); treat that as "no evidence".
        return 0.0

    # NOTE(review): row 0 is the FIRST input token only, so later tokens are
    # not compared against the phrases here - confirm this is intended.
    distances = cosine_similarity(vectors)[0][inputs:]

    if np.any(distances > 0.8):
        return 1

    modified_confidences = [len(word_list)/3 if conf > 0.6 else conf for conf in distances]

    return np.mean(modified_confidences)

In [31]:
def extract_features(text):
    """Return the intent keys whose phrase lists match ``text``.

    The text is tokenised with ``nlp``; stop words and punctuation are dropped,
    the remaining lemmas are stemmed with ``stem``, and every key of
    ``intents`` whose ``check_confidence`` score exceeds 0.5 is kept.

    Parameters
    ----------
    text : str
        Free-text user input.

    Returns
    -------
    list
        Matching keys, in the iteration order of ``intents``.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_punct or tok.text.lower() in STOP_WORDS)
    ]
    stemmed_text = stem(lemmas)

    return [
        key
        for key in intents
        if check_confidence(stemmed_text, intents[key]) > 0.5
    ]

In [32]:
def preprocess_input(str):
    """Turn free text into a 0/1 feature vector aligned with ``columns``.

    Parameters
    ----------
    str : str
        Free-text user input, forwarded to ``extract_features``.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(columns)`` holding 1 at the position of
        every extracted feature that is also a column name, and 0 elsewhere.
    """
    detected = extract_features(str)

    tensor = np.zeros(len(columns))
    for feature in detected:
        # Features without a matching column are silently skipped.
        if feature in columns:
            position = np.where(columns == feature)[0][0]
            tensor[position] = 1

    return tensor

In [33]:
def make_predictions(str):
    """Predict the most likely diseases for a free-text symptom description.

    Parameters
    ----------
    str : str
        User's description of the symptoms. The name shadows the builtin but
        is kept so existing keyword callers keep working.

    Returns
    -------
    dict or tuple
        Mapping ``disease name -> confidence in percent`` (rounded to two
        decimals) for the three highest-scoring classes, best first. When no
        known symptom is recognised the pair ``([], [])`` is returned instead;
        that return shape is preserved for backward compatibility.
    """
    tensor = preprocess_input(str)

    # No symptom flag set means there is nothing to predict on. Testing
    # `any()` directly avoids the old uniqueness test, which would also have
    # bailed out on a vector where every flag is set.
    if not tensor.any():
        return [], []

    # -1 lets the feature count follow the vector instead of hard-coding 132.
    prediction = model.predict(tensor.reshape(1, -1))[0]

    # Indices of the (up to) three largest probabilities, best first.
    top_k = min(3, len(prediction))
    top_indices = np.argsort(prediction)[::-1][:top_k]

    disease_names = le.inverse_transform(top_indices)

    return {
        name: round(probability * 100, 2)
        for name, probability in zip(disease_names, prediction[top_indices])
    }

In [41]:
make_predictions('I have neck pain and dizziness')



{'cervical spondylosis': 96.55, 'osteoarthristis': 0.8, 'arthritis': 0.35}