In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

class MultinomialNaiveBayes:
    """Multinomial Naive Bayes classifier over whitespace-separated words.

    Documents are plain strings that are tokenised with ``str.split()``.
    After ``fit`` the instance exposes:

    * ``classes``     -- list of labels in order of first appearance in ``y``
    * ``class_probs`` -- dict mapping label -> prior probability
    * ``vocab``       -- set of every word seen during training
    * ``word_probs``  -- dict mapping label -> {word: P(word | label)} with
      add-one (Laplace) smoothing
    """

    # Probability used for a word that is absent from the training
    # vocabulary. The same value is applied to every class, so it lowers all
    # scores equally.
    _UNSEEN_WORD_PROB = 1e-10

    def __init__(self):
        self.class_probs = None
        self.word_probs = None
        self.classes = None
        self.vocab = None

    def fit(self, X, y):
        """Estimate class priors and per-class word likelihoods.

        Args:
            X: iterable of document strings.
            y: iterable of labels, one per document.

        Raises:
            ValueError: if ``X`` and ``y`` do not have the same length.
        """
        # Materialise both inputs: they are traversed several times below, and
        # a one-shot iterable (generator, map object, ...) would otherwise be
        # exhausted after the first pass.
        labels = list(y)
        documents = [text.split() for text in X]
        if len(documents) != len(labels):
            raise ValueError(
                f"X and y must have the same length "
                f"(got {len(documents)} and {len(labels)})"
            )

        # Count occurrences of each class
        class_counts = defaultdict(int)
        for label in labels:
            class_counts[label] += 1

        # Prior probability of each class
        total_samples = len(labels)
        self.classes = list(class_counts.keys())
        self.class_probs = {
            label: count / total_samples for label, count in class_counts.items()
        }

        # Count occurrences of each word in each class, collecting the
        # vocabulary in the same pass
        word_counts = {label: defaultdict(int) for label in self.classes}
        vocab = set()
        for label, words in zip(labels, documents):
            counts = word_counts[label]
            for word in words:
                counts[word] += 1
            vocab.update(words)
        self.vocab = vocab

        # Likelihood of each vocabulary word given each class
        vocab_size = len(vocab)
        self.word_probs = {label: defaultdict(float) for label in self.classes}
        for label in self.classes:
            counts = word_counts[label]
            denominator = sum(counts.values()) + vocab_size
            likelihoods = self.word_probs[label]
            for word in vocab:
                # Laplace smoothing; .get avoids inserting zero entries
                likelihoods[word] = (counts.get(word, 0) + 1) / denominator

    def predict(self, X):
        """Return a list with the predicted label for every document in ``X``."""
        return [self.predict_single(text) for text in X]

    def predict_single(self, text):
        """Return the label with the highest log-posterior score for ``text``.

        Ties are resolved in favour of the class that appeared first in the
        training labels.

        Raises:
            ValueError: if the classifier has not been fitted on any data.
        """
        if not self.classes:
            raise ValueError("MultinomialNaiveBayes must be fitted before predicting")

        words = text.split()
        scores = {}
        for label in self.classes:
            likelihoods = self.word_probs[label]
            score = np.log(self.class_probs[label])
            for word in words:
                score += np.log(likelihoods.get(word, self._UNSEEN_WORD_PROB))
            scores[label] = score
        return max(scores, key=scores.get)

# Read the CSV and keep only rows that have both a symptom text and a label
data = pd.read_csv("data567.csv").dropna(subset=['Symptoms', 'Disease'])

# Text column used as input, label column used as target
X = data['Symptoms']
y = data['Disease']

# Build the classifier and fit it on every remaining row
nb_classifier = MultinomialNaiveBayes()
nb_classifier.fit(X, y)

# Predict on the same rows the model was fitted on (no held-out split is made)
predicted = nb_classifier.predict(X)



# Example sentences to classify
input_text = [
    "bone spurs have developed around my joints.",
]

# Show each sentence with its predicted label, followed by a blank line
for text in input_text:
    predicted_label = nb_classifier.predict_single(text)
    print(f"Text: {text}", f"Predicted Disease: {predicted_label}", "", sep="\n")


Text: bone spurs have developed around my joints.
Predicted Disease: Osteoarthritis



In [2]:
# Fraction of rows whose predicted label equals the recorded label.
# NOTE: `predicted` comes from the same rows the model was fitted on, so this
# is a training-set figure rather than an estimate on unseen data.
is_match = predicted == y
accuracy = np.mean(is_match)
print("Accuracy:", accuracy)

Accuracy: 0.9992175273865415


In [3]:
import pickle

# Persist the fitted classifier to disk with pickle.
# NOTE: pickle stores a reference to the class, not its code, so the
# MultinomialNaiveBayes definition must be available when loading.
with open('nb_classifier.pkl', 'wb') as model_file:
    pickle.dump(nb_classifier, model_file)


In [5]:
import pickle

# Restore the classifier from 'nb_classifier.pkl'.
# Caution: unpickling can execute arbitrary code, so only load pickle files
# from a source you trust.
with open('nb_classifier.pkl', 'rb') as model_file:
    nb_classifier_loaded = pickle.load(model_file)

# Example sentences to classify
input_text = [
    "why am i waking up multiple times during the night to urinate. is it a disease?.",
]

# Show each sentence with its predicted label, followed by a blank line
for text in input_text:
    predicted_label = nb_classifier_loaded.predict_single(text)
    print(f"Text: {text}", f"Predicted Disease: {predicted_label}", "", sep="\n")


Text: why am i waking up multiple times during the night to urinate. is it a disease?.
Predicted Disease: Diabetes

