In [73]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import pickle

In [59]:
df = pd.read_csv('Clean_medicine_Prediction.csv')

In [60]:
df

Unnamed: 0.1,Unnamed: 0,ID,Age,Gender,Symptoms,Duration,Disease,Medicine Given
0,0,1,5,Male,"stomach pain, sweating",5 days,Dengue,"Paracetamol, Fluids"
1,1,2,30,Other,"stomach pain, cough, vomiting",Months,Heart Attack,"Aspirin, Nitroglycerin"
2,2,5,1,Female,"sweating, nausea, fever, stomach pain",2 days,Alzheimer,Donepezil
3,3,7,51,Other,"body pain, itchy eyes, stomach pain",2 days,Allergy,Cetirizine
4,4,8,45,Male,"thirst, sneezing, fatigue",1 hour,Indigestion,"Antacid, ORS"
...,...,...,...,...,...,...,...,...
588,588,645,2,Female,"fatigue, sore throat",Months,Cold,"Cough Syrup, Antihistamine"
589,589,646,81,Male,"headache, joint pain",2 days,Diabetes,Metformin
590,590,647,17,Male,"stomach pain, cough, body pain",3 days,Indigestion,"Antacid, ORS"
591,591,648,41,Female,"rash, itchy eyes, sneezing, stomach pain",1 hour,Indigestion,"Antacid, ORS"


In [61]:
columns = ['Gender' , 'Symptoms' , 'Duration' , 'Disease' , 'Medicine Given']
for col in columns :
    df[col] = df[col].astype('string')

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593 entries, 0 to 592
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      593 non-null    int64 
 1   ID              593 non-null    int64 
 2   Age             593 non-null    int64 
 3   Gender          593 non-null    string
 4   Symptoms        593 non-null    string
 5   Duration        593 non-null    string
 6   Disease         593 non-null    string
 7   Medicine Given  593 non-null    string
dtypes: int64(3), string(5)
memory usage: 37.2 KB


In [63]:
df = df.drop(columns=["ID"])

In [64]:
# Encode Gender
df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})

In [65]:
# Confirm sizes
print(len(df['Symptoms']), len(df['Disease']))  # Should both be 593


593 593


In [66]:
# Assign features and target
X = df['Symptoms']
y = df['Disease']


In [67]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [68]:

# Build pipeline
model = make_pipeline(CountVectorizer(), MultinomialNB())


In [69]:
# Train model
model.fit(X_train, y_train)


In [70]:
# Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


                precision    recall  f1-score   support

       Allergy       0.13      0.17      0.15        12
     Alzheimer       0.00      0.00      0.00        13
   Chikungunya       0.00      0.00      0.00        10
          Cold       0.20      0.09      0.12        11
        Dengue       0.08      0.09      0.09        11
      Diabetes       0.08      0.08      0.08        12
           Flu       0.05      0.09      0.06        11
Food Poisoning       0.14      0.14      0.14        14
  Heart Attack       0.12      0.10      0.11        10
   Indigestion       0.14      0.07      0.09        15

      accuracy                           0.08       119
     macro avg       0.10      0.08      0.08       119
  weighted avg       0.10      0.08      0.09       119



In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593 entries, 0 to 592
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      593 non-null    int64  
 1   Age             593 non-null    int64  
 2   Gender          390 non-null    float64
 3   Symptoms        593 non-null    string 
 4   Duration        593 non-null    string 
 5   Disease         593 non-null    string 
 6   Medicine Given  593 non-null    string 
dtypes: float64(1), int64(2), string(4)
memory usage: 32.6 KB


In [74]:
# Save the model to a file
with open("disease_predictor.pkl", "wb") as file:
    pickle.dump(model, file)

In [75]:
# Create a map of disease to most common medicine
med_map = df.groupby('Disease')['Medicine Given'].agg(lambda x: x.mode()[0]).to_dict()

# Save medicine map
with open("medicine_map.pkl", "wb") as file:
    pickle.dump(med_map, file)