In [1]:
import numpy as np
import pandas as pd
from sklearn import tree
import pickle
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [2]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [3]:
# load and visualize dataset 
df = pd.read_csv('raw_dataset/symptom_disease.csv')
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [4]:
# remove hyphen, trailing space 
for column in df.columns:
    df[column] = df[column].str.replace('_',' ')
    if df[column].dtype == 'Object':
        df[column] = df[column].str.strip()
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,
1,Fungal infection,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin rash,dischromic patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin rash,nodal skin eruptions,,,,,,,,,,,,,,


In [5]:
# create a new dataframe having columns as unique values from old df
select_df = df.iloc[:, 1:]
new_columns = pd.unique(select_df.values.ravel())
new_columns = [x for x in new_columns if str(x) != 'nan']
new_columns

['itching',
 'skin rash',
 'nodal skin eruptions',
 'dischromic  patches',
 'continuous sneezing',
 'shivering',
 'chills',
 'watering from eyes',
 'stomach pain',
 'acidity',
 'ulcers on tongue',
 'vomiting',
 'cough',
 'chest pain',
 'yellowish skin',
 'nausea',
 'loss of appetite',
 'abdominal pain',
 'yellowing of eyes',
 'burning micturition',
 'spotting  urination',
 'passage of gases',
 'internal itching',
 'indigestion',
 'muscle wasting',
 'patches in throat',
 'high fever',
 'extra marital contacts',
 'fatigue',
 'weight loss',
 'restlessness',
 'lethargy',
 'irregular sugar level',
 'blurred and distorted vision',
 'obesity',
 'excessive hunger',
 'increased appetite',
 'polyuria',
 'sunken eyes',
 'dehydration',
 'diarrhoea',
 'breathlessness',
 'family history',
 'mucoid sputum',
 'headache',
 'dizziness',
 'loss of balance',
 'lack of concentration',
 'stiff neck',
 'depression',
 'irritability',
 'visual disturbances',
 'back pain',
 'weakness in limbs',
 'neck pain',
 '

In [6]:
n, m = df.shape
symptoms_list = []
for i in range(n):
    val = select_df.iloc[i].values
    val = val.tolist()
    val = [x for x in val if str(x) != 'nan']
    symptoms_list.append(val)
symptoms_list

[['itching', 'skin rash', 'nodal skin eruptions', 'dischromic  patches'],
 ['skin rash', 'nodal skin eruptions', 'dischromic  patches'],
 ['itching', 'nodal skin eruptions', 'dischromic  patches'],
 ['itching', 'skin rash', 'dischromic  patches'],
 ['itching', 'skin rash', 'nodal skin eruptions'],
 ['skin rash', 'nodal skin eruptions', 'dischromic  patches'],
 ['itching', 'nodal skin eruptions', 'dischromic  patches'],
 ['itching', 'skin rash', 'dischromic  patches'],
 ['itching', 'skin rash', 'nodal skin eruptions'],
 ['itching', 'skin rash', 'nodal skin eruptions', 'dischromic  patches'],
 ['continuous sneezing', 'shivering', 'chills', 'watering from eyes'],
 ['shivering', 'chills', 'watering from eyes'],
 ['continuous sneezing', 'chills', 'watering from eyes'],
 ['continuous sneezing', 'shivering', 'watering from eyes'],
 ['continuous sneezing', 'shivering', 'chills'],
 ['shivering', 'chills', 'watering from eyes'],
 ['continuous sneezing', 'chills', 'watering from eyes'],
 ['contin

In [7]:
# create new df with columns as the unique values from the raw dataset
new_df = pd.DataFrame(columns=new_columns, index=df.index)
new_df.head()

Unnamed: 0,itching,skin rash,nodal skin eruptions,dischromic patches,continuous sneezing,shivering,chills,watering from eyes,stomach pain,acidity,...,bladder discomfort,foul smell of urine,continuous feel of urine,skin peeling,silver like dusting,small dents in nails,inflammatory nails,blister,red sore around nose,yellow crust ooze
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [8]:
new_df["symptoms_list"] = symptoms_list
new_df.head()

Unnamed: 0,itching,skin rash,nodal skin eruptions,dischromic patches,continuous sneezing,shivering,chills,watering from eyes,stomach pain,acidity,...,foul smell of urine,continuous feel of urine,skin peeling,silver like dusting,small dents in nails,inflammatory nails,blister,red sore around nose,yellow crust ooze,symptoms_list
0,,,,,,,,,,,...,,,,,,,,,,"[itching, skin rash, nodal skin eruptions, dis..."
1,,,,,,,,,,,...,,,,,,,,,,"[skin rash, nodal skin eruptions, dischromic ..."
2,,,,,,,,,,,...,,,,,,,,,,"[itching, nodal skin eruptions, dischromic pa..."
3,,,,,,,,,,,...,,,,,,,,,,"[itching, skin rash, dischromic patches]"
4,,,,,,,,,,,...,,,,,,,,,,"[itching, skin rash, nodal skin eruptions]"


In [9]:
# moving disease names (labels) to new df
new_df['disease'] = df['Disease']
new_df.head()

Unnamed: 0,itching,skin rash,nodal skin eruptions,dischromic patches,continuous sneezing,shivering,chills,watering from eyes,stomach pain,acidity,...,continuous feel of urine,skin peeling,silver like dusting,small dents in nails,inflammatory nails,blister,red sore around nose,yellow crust ooze,symptoms_list,disease
0,,,,,,,,,,,...,,,,,,,,,"[itching, skin rash, nodal skin eruptions, dis...",Fungal infection
1,,,,,,,,,,,...,,,,,,,,,"[skin rash, nodal skin eruptions, dischromic ...",Fungal infection
2,,,,,,,,,,,...,,,,,,,,,"[itching, nodal skin eruptions, dischromic pa...",Fungal infection
3,,,,,,,,,,,...,,,,,,,,,"[itching, skin rash, dischromic patches]",Fungal infection
4,,,,,,,,,,,...,,,,,,,,,"[itching, skin rash, nodal skin eruptions]",Fungal infection


In [10]:
# change all column to binary encoding format
for col in new_columns:
    new_df[col] = new_df["symptoms_list"].apply(lambda x:1 if col in x else 0)
    print(col)
new_df.head()

itching
skin rash
nodal skin eruptions
dischromic  patches
continuous sneezing
shivering
chills
watering from eyes
stomach pain
acidity
ulcers on tongue
vomiting
cough
chest pain
yellowish skin
nausea
loss of appetite
abdominal pain
yellowing of eyes
burning micturition
spotting  urination
passage of gases
internal itching
indigestion
muscle wasting
patches in throat
high fever
extra marital contacts
fatigue
weight loss
restlessness
lethargy
irregular sugar level
blurred and distorted vision
obesity
excessive hunger
increased appetite
polyuria
sunken eyes
dehydration
diarrhoea
breathlessness
family history
mucoid sputum
headache
dizziness
loss of balance
lack of concentration
stiff neck
depression
irritability
visual disturbances
back pain
weakness in limbs
neck pain
weakness of one body side
altered sensorium
dark urine
sweating
muscle pain
mild fever
swelled lymph nodes
malaise
red spots over body
joint pain
pain behind the eyes
constipation
toxic look (typhos)
belly pain
yellow urin

Unnamed: 0,itching,skin rash,nodal skin eruptions,dischromic patches,continuous sneezing,shivering,chills,watering from eyes,stomach pain,acidity,...,continuous feel of urine,skin peeling,silver like dusting,small dents in nails,inflammatory nails,blister,red sore around nose,yellow crust ooze,symptoms_list,disease
0,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[itching, skin rash, nodal skin eruptions, dis...",Fungal infection
1,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[skin rash, nodal skin eruptions, dischromic ...",Fungal infection
2,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[itching, nodal skin eruptions, dischromic pa...",Fungal infection
3,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[itching, skin rash, dischromic patches]",Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[itching, skin rash, nodal skin eruptions]",Fungal infection


In [11]:
# assign newly created dataframe to df
df = new_df.drop("symptoms_list", axis=1)
df.head()

Unnamed: 0,itching,skin rash,nodal skin eruptions,dischromic patches,continuous sneezing,shivering,chills,watering from eyes,stomach pain,acidity,...,foul smell of urine,continuous feel of urine,skin peeling,silver like dusting,small dents in nails,inflammatory nails,blister,red sore around nose,yellow crust ooze,disease
0,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


In [12]:
# store the processed dataset into a new csv file
import os
if not os.path.exists("processed_dataset/symptom_disease.csv"):
    df.to_csv("processed_dataset/symptom_disease.csv", encoding='utf-8', index=False)

In [13]:
# X, y variables assignment
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [14]:
# split dataset into training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [15]:
knn = KNeighborsClassifier()

param_grid = [
    {
        'weights': ['uniform'], 
        'n_neighbors': [i for i in range(3, 7)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(3, 7)], 
        'p': [i for i in range(1, 6)]
    }
]

grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

knn_best = grid_search.best_estimator_

y_pred_test = knn_best.predict(X_test)
y_pred_train = knn_best.predict(X_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
accuracy_train = accuracy_score(y_train, y_pred_train)

print("Best parameters:", grid_search.best_params_)
print("Accuracy for testing dataset:", accuracy_test)
print("Accuracy for training dataset:", accuracy_train)

Best parameters: {'n_neighbors': 3, 'weights': 'uniform'}
Accuracy for testing dataset: 1.0
Accuracy for training dataset: 1.0


In [16]:
import pickle
import pathlib

current_directory = os.getcwd()
chatbot_engine = os.path.abspath(os.path.join(current_directory, os.pardir))
sub_directory = "ml_model"
file_name = "knn_model.sav"
file_path = pathlib.PurePath(chatbot_engine, sub_directory, file_name)

knn = KNeighborsClassifier(n_neighbors=3, weights='uniform')
knn.fit(X, y)
pickle.dump(knn, open(file_path, "wb"))

In [17]:
X[:1]

Unnamed: 0,itching,skin rash,nodal skin eruptions,dischromic patches,continuous sneezing,shivering,chills,watering from eyes,stomach pain,acidity,...,bladder discomfort,foul smell of urine,continuous feel of urine,skin peeling,silver like dusting,small dents in nails,inflammatory nails,blister,red sore around nose,yellow crust ooze
0,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
symptoms = df.columns[:-1]
symptoms = [symptom for symptom in symptoms]
ret = 'itching' in symptoms
ret

True

In [19]:
model = pickle.load(open(file_path, 'rb'))
test_data = np.array([[0 for i in range(131)]])
y_pred = model.predict(test_data)
print(y_pred)

['Fungal infection']


