In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
# Load the training set and look at the symptoms and disease columns.
train = pd.read_csv("train.csv")
train["symptoms"]  # not the last expression of the cell, so nothing is rendered for it
train["disease"]   # last expression: rendered as the cell output

0        diabetes
1             flu
2        migraine
3          stroke
4    tuberculosis
5          dengue
Name: disease, dtype: object

In [20]:
# Turn each training row's symptom text into a TF-IDF feature vector.
# min_df=1 (the library default) drops no term for being too rare.
tfid_vectorizer = TfidfVectorizer(min_df=1)
X = tfid_vectorizer.fit_transform(train.symptoms)

# Show the learned vocabulary (one entry per feature column of X).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back to the old method only on
# releases that do not have the new one yet.
feature_names_fn = getattr(tfid_vectorizer, "get_feature_names_out", None)
if feature_names_fn is None:
    feature_names_fn = tfid_vectorizer.get_feature_names
# list(...) keeps the displayed value a plain Python list on every version
# (get_feature_names_out() returns a NumPy array).
list(feature_names_fn())

['abdominal_pain',
 'arm_weakness',
 'back_pain',
 'blurred_vision',
 'body_pain',
 'chest_pain_while_breathing',
 'chills',
 'confusion',
 'cough',
 'diarrhoea',
 'easy_bruising',
 'extreme_hunger',
 'eye_pain',
 'face_drooping',
 'fatigue',
 'feeling_very_hot_or_very_cold',
 'fever',
 'frequent_urination',
 'headache',
 'heaviness_over_chest_radiating_to_limbs',
 'increased_sensitivity_to_light_and_sound',
 'increased_thirst',
 'intense_headache',
 'irritability',
 'joint_pain',
 'loss_of_appetite',
 'loss_of_muscle',
 'malaise',
 'muscle_pain',
 'nausea',
 'night_sweats',
 'phlegm',
 'poor_concentration',
 'severe_headache',
 'severe_unintentional_weight_loss',
 'shortness_of_breath',
 'skin_rashes',
 'slow_healing_sores',
 'sore_throat',
 'speech_difficulty',
 'sweating',
 'swollen_lymph_nodes',
 'unexplained_weight_loss',
 'vomiting',
 'weakness']

In [21]:
# Encode every disease name as an integer class label.
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
Y = le.fit_transform(train["disease"])
# The distinct disease names; later cells index this array with predicted
# labels to get the names back.
le.classes_

array(['dengue', 'diabetes', 'flu', 'migraine', 'stroke', 'tuberculosis'],
      dtype=object)

In [22]:
# Train a multinomial naive Bayes classifier on the TF-IDF features (X)
# and the integer-encoded disease labels (Y).
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
# The trailing ';' keeps the notebook from echoing the fitted estimator.
clf.fit(X, Y);

In [23]:
# Load the symptom descriptions to classify and display them.
test = pd.read_csv('test.csv')
test["symptoms"]

0    body_pain chills cough fever headache sore_thr...
Name: symptoms, dtype: object

In [24]:
# Per-class probabilities for every test row; only the first row is kept
# and rescaled to percentages.
predicted = clf.predict_proba(tfid_vectorizer.transform(test["symptoms"]))
prob = [p * 100 for p in predicted[0].tolist()]

In [25]:
# Probability (in percent) of each disease, keyed by disease name
output = {disease: pct for disease, pct in zip(le.classes_, prob)}
print(pd.DataFrame(output.items()))

              0          1
0        dengue  21.692987
1      diabetes  19.169240
2           flu  20.144651
3      migraine  13.323208
4        stroke  12.483462
5  tuberculosis  13.186452


In [26]:
# Most probable disease: predict the class label for each test row, then
# map the integer labels back to disease names through le.classes_.
predicted = clf.predict(tfid_vectorizer.transform(test["symptoms"]))
print(le.classes_[predicted])

['dengue']
