In [3]:
# dataset = https://www.kaggle.com/datasets/niyarrbarman/symptom2disease

In [4]:
import pandas as pd

In [5]:
df = pd.read_csv('/content/drive/MyDrive/Datasets/Symptom2Disease.csv')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1200 non-null   int64 
 1   label       1200 non-null   object
 2   text        1200 non-null   object
dtypes: int64(1), object(2)
memory usage: 28.3+ KB


In [12]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize the lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Remove non-alphabetic characters and convert to lowercase
    text = re.sub('[^a-zA-Z]', ' ', text).lower()
    # Tokenize the text
    words = text.split()
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

df['text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,text
0,experiencing skin rash arm leg torso past week...
1,skin peeling especially knee elbow scalp peeli...
2,experiencing joint pain finger wrist knee pain...
3,silver like dusting skin especially lower back...
4,nail small dent pit often feel inflammatory te...
...,...
1195,shaking trembling lost sense taste smell exhau...
1196,particularly crevice skin skin rash irritation...
1197,regularly experience intense urge want urinate...
1198,trouble breathing especially outside start fee...


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=1500)

X = tfidf_vectorizer.fit_transform(df['text']).toarray()
y = df['label']

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.925

Classification Report:
                                  precision    recall  f1-score   support

                           Acne       1.00      1.00      1.00         7
                      Arthritis       1.00      1.00      1.00        10
               Bronchial Asthma       1.00      1.00      1.00        11
           Cervical spondylosis       0.70      1.00      0.82         7
                    Chicken pox       1.00      0.92      0.96        12
                    Common Cold       1.00      1.00      1.00        12
                         Dengue       1.00      0.83      0.91        12
          Dimorphic Hemorrhoids       0.78      1.00      0.88         7
               Fungal infection       1.00      1.00      1.00        13
                   Hypertension       1.00      0.90      0.95        10
                       Impetigo       1.00      1.00      1.00        11
                       Jaundice       1.00      1.00      1.00        11
         

In [20]:
def predict_disease(symptom_description):
    processed_symptom = preprocess_text(symptom_description)
    symptom_vector = tfidf_vectorizer.transform([processed_symptom]).toarray()
    prediction = model.predict(symptom_vector)
    return prediction[0]

new_symptom = "I have a fever, cold and headace"
predicted_disease = predict_disease(new_symptom)
print(f"Predicted Disease: {predicted_disease}")

Predicted Disease: Common Cold
