In [2]:
# Import necessary libraries
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


In [3]:
# Define symptoms and their corresponding medicines
symptoms = [
    "fever", "runny nose", "fungal infection", "stomach acid",
    "diarrhea", "pain", "bacterial infection", "headache",
    "muscle pain", "sneezing", "itchy eyes", "skin rash",
    "heartburn", "nausea", "stomach cramps", "joint pain",
    "inflammation", "sore throat", "cough", "respiratory infection"
]

medicines = {
    "fever": "Paracetamol",
    "runny nose": "Cetirizine",
    "fungal infection": "Cetirizine",
    "stomach acid": "Aciloc",
    "diarrhea": "Lomotil",
    "pain": "Diclofenac",
    "bacterial infection": "Azithromycin",
    "headache": "Paracetamol",
    "muscle pain": "Paracetamol",
    "sneezing": "Cetirizine",
    "itchy eyes": "Cetirizine",
    "skin rash": "Cetirizine",
    "heartburn": "Aciloc",
    "nausea": "Aciloc",
    "stomach cramps": "Lomotil",
    "joint pain": "Diclofenac",
    "inflammation": "Diclofenac",
    "sore throat": "Azithromycin",
    "cough": "Azithromycin",
    "respiratory infection": "Azithromycin"
}


In [4]:
# Generate a dataset with 5000 rows
data = []
for _ in range(5000):
    selected_symptoms = random.sample(symptoms, random.randint(1, 3))
    suggested_medicine = list({medicines[symptom] for symptom in selected_symptoms})
    row = [" ".join(selected_symptoms)]  # Combine symptoms into one string
    row.append(", ".join(suggested_medicine))  # Medicine(s)
    data.append(row)

# Create DataFrame
columns = ["Symptoms", "Medicine"]
df = pd.DataFrame(data, columns=columns)

# Display the first few rows of the dataset
df.head()


Unnamed: 0,Symptoms,Medicine
0,fungal infection headache,"Paracetamol, Cetirizine"
1,pain skin rash,"Diclofenac, Cetirizine"
2,skin rash,Cetirizine
3,nausea joint pain inflammation,"Aciloc, Diclofenac"
4,diarrhea heartburn,"Aciloc, Lomotil"


In [5]:
# Split the dataset into train, validation, and test sets
train_data, temp_data = train_test_split(df, test_size=0.2, random_state=42)  # 80% train
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)  # 10% val, 10% test

# Print the sizes of the datasets
print(f"Training Data: {len(train_data)} rows")
print(f"Validation Data: {len(val_data)} rows")
print(f"Test Data: {len(test_data)} rows")


Training Data: 4000 rows
Validation Data: 500 rows
Test Data: 500 rows


In [6]:
# Convert symptoms text into numerical feature vectors using CountVectorizer
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data["Symptoms"])
X_val = vectorizer.transform(val_data["Symptoms"])
X_test = vectorizer.transform(test_data["Symptoms"])

# Display the feature names
print(f"Feature Names: {vectorizer.get_feature_names_out()}")


Feature Names: ['acid' 'bacterial' 'cough' 'cramps' 'diarrhea' 'eyes' 'fever' 'fungal'
 'headache' 'heartburn' 'infection' 'inflammation' 'itchy' 'joint'
 'muscle' 'nausea' 'nose' 'pain' 'rash' 'respiratory' 'runny' 'skin'
 'sneezing' 'sore' 'stomach' 'throat']


In [7]:
# Encode medicines as target labels
y_train = train_data["Medicine"]
y_val = val_data["Medicine"]
y_test = test_data["Medicine"]

# Display the first few labels
print("Sample Training Labels:")
print(y_train.head())


Sample Training Labels:
4227         Aciloc, Cetirizine
4676                 Cetirizine
800         Diclofenac, Lomotil
3671        Paracetamol, Aciloc
4193    Paracetamol, Diclofenac
Name: Medicine, dtype: object


In [8]:
# Train a Decision Tree Classifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

print("Model training completed.")


Model training completed.


In [9]:
# Validate the model on the validation dataset
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")


Validation Accuracy: 93.40%


In [10]:
# Test the model on the test dataset
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Test Accuracy: 91.60%


In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# Calculate Precision, Recall, and F1-Score on the validation set with zero_division set to 0
val_precision = precision_score(y_val, y_val_pred, average='weighted', zero_division=0)
val_recall = recall_score(y_val, y_val_pred, average='weighted', zero_division=0)
val_f1 = f1_score(y_val, y_val_pred, average='weighted', zero_division=0)

print(f"Validation Precision: {val_precision:.2f}")
print(f"Validation Recall: {val_recall:.2f}")
print(f"Validation F1-Score: {val_f1:.2f}")

# Detailed classification report with zero_division set to 0
print("\nValidation Classification Report:")
print(classification_report(y_val, y_val_pred, zero_division=0))


Validation Precision: 0.93
Validation Recall: 0.93
Validation F1-Score: 0.92

Validation Classification Report:
                                       precision    recall  f1-score   support

                               Aciloc       1.00      1.00      1.00        28
                   Aciloc, Cetirizine       1.00      1.00      1.00        26
                   Aciloc, Diclofenac       0.88      1.00      0.93         7
       Aciloc, Diclofenac, Cetirizine       1.00      0.62      0.77         8
                      Aciloc, Lomotil       1.00      1.00      1.00         5
          Aciloc, Lomotil, Cetirizine       1.00      1.00      1.00         1
                         Azithromycin       1.00      1.00      1.00        39
                 Azithromycin, Aciloc       1.00      1.00      1.00        18
     Azithromycin, Aciloc, Cetirizine       1.00      1.00      1.00         5
     Azithromycin, Aciloc, Diclofenac       0.00      0.00      0.00         4
        Azithromyc

In [17]:
# Calculate Precision, Recall, and F1-Score on the test set with zero_division set to 0
test_precision = precision_score(y_test, y_test_pred, average='weighted', zero_division=0)
test_recall = recall_score(y_test, y_test_pred, average='weighted', zero_division=0)
test_f1 = f1_score(y_test, y_test_pred, average='weighted', zero_division=0)

print(f"Test Precision: {test_precision:.2f}")
print(f"Test Recall: {test_recall:.2f}")
print(f"Test F1-Score: {test_f1:.2f}")

# Detailed classification report with zero_division set to 0
print("\nTest Classification Report:")
print(classification_report(y_test, y_test_pred, zero_division=0))


Test Precision: 0.92
Test Recall: 0.92
Test F1-Score: 0.91

Test Classification Report:
                                       precision    recall  f1-score   support

                               Aciloc       1.00      1.00      1.00        23
                   Aciloc, Cetirizine       0.94      1.00      0.97        29
                   Aciloc, Diclofenac       0.94      1.00      0.97        16
       Aciloc, Diclofenac, Cetirizine       1.00      0.80      0.89         5
          Aciloc, Diclofenac, Lomotil       0.00      0.00      0.00         1
                      Aciloc, Lomotil       1.00      1.00      1.00         7
          Aciloc, Lomotil, Cetirizine       1.00      1.00      1.00         3
                         Azithromycin       1.00      1.00      1.00        40
                 Azithromycin, Aciloc       0.95      1.00      0.97        19
     Azithromycin, Aciloc, Cetirizine       1.00      0.80      0.89        10
     Azithromycin, Aciloc, Diclofenac     

In [13]:
# Predict medicines for new symptoms
new_symptoms = ["fever runny nose", "stomach cramps diarrhea", "headache muscle pain"]
new_symptom_vectors = vectorizer.transform(new_symptoms)
predictions = model.predict(new_symptom_vectors)

# Display predictions
for i, symptoms in enumerate(new_symptoms):
    print(f"Symptoms: {symptoms} => Predicted Medicine: {predictions[i]}")


Symptoms: fever runny nose => Predicted Medicine: Paracetamol, Cetirizine
Symptoms: stomach cramps diarrhea => Predicted Medicine: Lomotil
Symptoms: headache muscle pain => Predicted Medicine: Paracetamol


In [14]:
import joblib

# Save the trained model and vectorizer
joblib.dump(model, "medicine_classifier.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")

print("Model and vectorizer saved.")


Model and vectorizer saved.


In [15]:
# Load the trained model and vectorizer
loaded_model = joblib.load("medicine_classifier.pkl")
loaded_vectorizer = joblib.load("vectorizer.pkl")

# Predict using the loaded model
sample_symptoms = ["cough sore throat", "skin rash itchy eyes"]
sample_vectors = loaded_vectorizer.transform(sample_symptoms)
sample_predictions = loaded_model.predict(sample_vectors)

# Display predictions
for i, symptoms in enumerate(sample_symptoms):
    print(f"Symptoms: {symptoms} => Predicted Medicine: {sample_predictions[i]}")


Symptoms: cough sore throat => Predicted Medicine: Azithromycin
Symptoms: skin rash itchy eyes => Predicted Medicine: Cetirizine
