Import Libraries

In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

Dataset Preprocessing

In [2]:
# Load dataset
df = pd.read_csv("Symptom2Disease.csv")


In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."


In [4]:
df.shape

(1200, 3)

In [5]:
df['label'].nunique()

24

In [55]:
duplicate_names = df['label'].value_counts()
duplicate_names = duplicate_names[duplicate_names > 1]
duplicate_names


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Psoriasis,50
Varicose Veins,50
Typhoid,50
Chicken pox,50
Impetigo,50
Dengue,50
Fungal infection,50
Common Cold,50
Pneumonia,50
Dimorphic Hemorrhoids,50


In [6]:
basic_stopwords = set([
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
    'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
    'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them',
    'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this',
    'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
    'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
    'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
    'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to',
    'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
    'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
    'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too',
    'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'
])

# Preprocessing function
def simple_preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in basic_stopwords]
    return ' '.join(tokens)

# Preprocess symptom texts
df['clean_symptoms'] = df['text'].apply(simple_preprocess)
# df['clean_treatments'] = df['Treatments'].apply(simple_preprocess)

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,clean_symptoms
0,0,Psoriasis,I have been experiencing a skin rash on my arm...,experiencing skin rash arms legs torso past we...
1,1,Psoriasis,"My skin has been peeling, especially on my kne...",skin peeling especially knees elbows scalp pee...
2,2,Psoriasis,I have been experiencing joint pain in my fing...,experiencing joint pain fingers wrists knees p...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp...",silver like dusting skin especially lower back...
4,4,Psoriasis,"My nails have small dents or pits in them, and...",nails small dents pits often feel inflammatory...


Vectorization

In [8]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(df['clean_symptoms'])
y = df['label']


Test and Train Model

In [9]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Logistic Regression Model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)

# Naive Bayes Model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_preds = nb_model.predict(X_test)

Model Evaluation

In [10]:
# Evaluation
print("\n Logistic Regression Accuracy:", accuracy_score(y_test, lr_preds))

print("\n Naive Bayes Accuracy:", accuracy_score(y_test, nb_preds))


 Logistic Regression Accuracy: 0.9791666666666666

 Naive Bayes Accuracy: 0.9541666666666667


Model Prediction Function

In [15]:
# Logistic Regression model for prediction
def predict_disease_lr(symptom_text):
    cleaned = simple_preprocess(symptom_text)
    vector = vectorizer.transform([cleaned])
    prediction = lr_model.predict(vector)
    return prediction[0]

# Naive Bayes model for prediction
def predict_disease_nb(symptom_text):
    cleaned = simple_preprocess(symptom_text)
    vector = vectorizer.transform([cleaned])
    prediction = nb_model.predict(vector)
    return prediction[0]

# Example usage
user_input = "cough and cold"
print("Symptoms:", user_input)
print("Disease Prediction LR: ", predict_disease_lr(user_input))
print("Disease Prediction NB: ", predict_disease_nb(user_input))

Symptoms: cough and cold
Disease Prediction LR:  Bronchial Asthma
Disease Prediction NB:  Pneumonia
