In [1]:
import pandas as pd

# Load Dataset
dataset_path = "/content/Symptom2Disease.csv"  # Ensure the correct file path
df = pd.read_csv(dataset_path)

# Display first few rows
print("Dataset Preview:\n", df.head())

# Display column names
print("\nColumn Names in Dataset:", df.columns)


Dataset Preview:
    Unnamed: 0      label                                               text
0           0  Psoriasis  I have been experiencing a skin rash on my arm...
1           1  Psoriasis  My skin has been peeling, especially on my kne...
2           2  Psoriasis  I have been experiencing joint pain in my fing...
3           3  Psoriasis  There is a silver like dusting on my skin, esp...
4           4  Psoriasis  My nails have small dents or pits in them, and...

Column Names in Dataset: Index(['Unnamed: 0', 'label', 'text'], dtype='object')


In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



In [3]:
import nltk
from nltk.corpus import stopwords
import pandas as pd
import re

# Download stopwords if not already available
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    print("Downloading 'stopwords' resource...")
    nltk.download("stopwords")
    stop_words = set(stopwords.words("english"))

# Load Dataset
dataset_path = "/content/Symptom2Disease.csv"  # Ensure the correct file path
try:
    df = pd.read_csv(dataset_path)
except FileNotFoundError:
    print(" File not found. Please check the dataset path.")
    raise

# Standardize column names (Remove spaces, convert to lowercase)
df.columns = df.columns.str.strip().str.lower()

# Print available columns
print("\n Available Columns in Dataset:", df.columns.tolist())

# Auto-detect symptom and disease columns
possible_symptom_cols = ["symptom", "text", "description"]
possible_disease_cols = ["disease", "label", "condition"]

# Identify columns dynamically
symptom_col = next((col for col in df.columns if any(key in col for key in possible_symptom_cols)), None)
disease_col = next((col for col in df.columns if any(key in col for key in possible_disease_cols)), None)

# Assign default names if not detected
if not symptom_col or not disease_col:
    print(" Column names not recognized. Using default: 'text' as symptoms, 'label' as disease.")
    symptom_col = "text" if "text" in df.columns else df.columns[0]
    disease_col = "label" if "label" in df.columns else df.columns[1]

print(f"\n Detected Columns: Symptoms -> '{symptom_col}', Disease -> '{disease_col}'")

# Text Preprocessing Function
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r"\W", " ", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    words = text.split()  # Faster than word_tokenize()
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return " ".join(words)

# Apply Preprocessing
df["cleaned_symptoms"] = df[symptom_col].astype(str).apply(preprocess_text)

print("\n Preprocessing Completed!")
print(df.head())


Downloading 'stopwords' resource...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.



 Available Columns in Dataset: ['unnamed: 0', 'label', 'text']

 Detected Columns: Symptoms -> 'text', Disease -> 'label'

 Preprocessing Completed!
   unnamed: 0      label                                               text  \
0           0  Psoriasis  I have been experiencing a skin rash on my arm...   
1           1  Psoriasis  My skin has been peeling, especially on my kne...   
2           2  Psoriasis  I have been experiencing joint pain in my fing...   
3           3  Psoriasis  There is a silver like dusting on my skin, esp...   
4           4  Psoriasis  My nails have small dents or pits in them, and...   

                                    cleaned_symptoms  
0  experiencing skin rash arms legs torso past we...  
1  skin peeling especially knees elbows scalp pee...  
2  experiencing joint pain fingers wrists knees p...  
3  silver like dusting skin especially lower back...  
4  nails small dents pits often feel inflammatory...  


In [6]:
# Convert text into numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["cleaned_symptoms"])
y = df[disease_col]

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# Define multiple ML models
models = {
    "Naïve Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=2000),
    "SVM (Support Vector Machine)": SVC(kernel="linear"),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
}

results = {}

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results[model_name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average="weighted"),
        "Recall": recall_score(y_test, y_pred, average="weighted"),
        "F1 Score": f1_score(y_test, y_pred, average="weighted")
    }

# Display Results
print("\n Model Performance Comparison:")
for model, metrics in results.items():
    print(f"\n🔹 {model}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")



 Model Performance Comparison:

🔹 Naïve Bayes:
Accuracy: 0.9542
Precision: 0.9617
Recall: 0.9542
F1 Score: 0.9506

🔹 Logistic Regression:
Accuracy: 0.9792
Precision: 0.9812
Recall: 0.9792
F1 Score: 0.9790

🔹 SVM (Support Vector Machine):
Accuracy: 0.9792
Precision: 0.9812
Recall: 0.9792
F1 Score: 0.9792

🔹 Random Forest:
Accuracy: 0.9750
Precision: 0.9779
Recall: 0.9750
F1 Score: 0.9741

🔹 Gradient Boosting:
Accuracy: 0.8375
Precision: 0.8949
Recall: 0.8375
F1 Score: 0.8536


In [8]:
from sklearn.model_selection import GridSearchCV


In [9]:
# Define hyperparameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

# Define hyperparameter grid for Gradient Boosting
gb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 10]
}

# Run GridSearchCV for Random Forest
rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
rf_grid.fit(X_train, y_train)

# Run GridSearchCV for Gradient Boosting
gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
gb_grid.fit(X_train, y_train)

# Print best parameters
print(f" Best Random Forest Parameters: {rf_grid.best_params_}")
print(f" Best Gradient Boosting Parameters: {gb_grid.best_params_}")

# Evaluate the best models
rf_best = rf_grid.best_estimator_
gb_best = gb_grid.best_estimator_

rf_pred = rf_best.predict(X_test)
gb_pred = gb_best.predict(X_test)

print(f"\n Optimized Random Forest Accuracy: {accuracy_score(y_test, rf_pred):.4f}")
print(f" Optimized Gradient Boosting Accuracy: {accuracy_score(y_test, gb_pred):.4f}")


Fitting 3 folds for each of 27 candidates, totalling 81 fits
Fitting 3 folds for each of 18 candidates, totalling 54 fits
 Best Random Forest Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
 Best Gradient Boosting Parameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}

 Optimized Random Forest Accuracy: 0.9750
 Optimized Gradient Boosting Accuracy: 0.8417


In [10]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [11]:
# Convert labels to numbers
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_dl = label_encoder.fit_transform(y)

# Tokenize the text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df["cleaned_symptoms"])
X_dl = tokenizer.texts_to_sequences(df["cleaned_symptoms"])

# Pad sequences to ensure same length
X_dl = pad_sequences(X_dl, maxlen=50)


In [12]:
# Build LSTM Model
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=50),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_dl, y_dl, epochs=10, batch_size=32, validation_split=0.2)


Epoch 1/10




[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 180ms/step - accuracy: 0.1398 - loss: 3.1377 - val_accuracy: 0.0000e+00 - val_loss: 4.3316
Epoch 2/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 105ms/step - accuracy: 0.2087 - loss: 2.7959 - val_accuracy: 0.0000e+00 - val_loss: 5.4126
Epoch 3/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 99ms/step - accuracy: 0.5030 - loss: 1.9868 - val_accuracy: 0.0000e+00 - val_loss: 6.4614
Epoch 4/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 102ms/step - accuracy: 0.7633 - loss: 1.1690 - val_accuracy: 0.0000e+00 - val_loss: 7.9131
Epoch 5/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 97ms/step - accuracy: 0.8640 - loss: 0.6553 - val_accuracy: 0.0000e+00 - val_loss: 7.7595
Epoch 6/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 140ms/step - accuracy: 0.9203 - loss: 0.4309 - val_accuracy: 0.0000e+00 - val_loss: 8.1576
Epoch 7/10
[1m3

<keras.src.callbacks.history.History at 0x7e8a29612c10>

In [13]:
pip install flask




In [14]:
import pickle
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

#  Load Dataset
dataset_path = "/content/Symptom2Disease.csv"  # Ensure correct file path
df = pd.read_csv(dataset_path)

#  Standardize column names
df.columns = df.columns.str.strip().str.lower()

#  Detect relevant columns
symptom_col = "text" if "text" in df.columns else df.columns[1]
disease_col = "label" if "label" in df.columns else df.columns[2]

#  Preprocessing function
stop_words = set(stopwords.words("english"))
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r"\W", " ", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    words = text.split()  # Uses split instead of word_tokenize()
    words = [word for word in words if word not in stop_words]
    return " ".join(words)

#  Apply Preprocessing
df["cleaned_symptoms"] = df[symptom_col].astype(str).apply(preprocess_text)

#  TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["cleaned_symptoms"])
y = df[disease_col]

#  Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#  Train Model (Using Naïve Bayes)
model = MultinomialNB()
model.fit(X_train, y_train)

#  Save Model and Vectorizer
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print(" Model and Vectorizer saved successfully!")


 Model and Vectorizer saved successfully!


In [15]:
!pip install flask-ngrok streamlit pandas scikit-learn nltk


Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Collecting streamlit
  Downloading streamlit-1.44.1-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Downloading streamlit-1.44.1-py3-none-any.whl (9.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014

In [16]:
import pandas as pd
import re
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

#  Ensure NLTK resources
nltk.download("stopwords")

#  Load Dataset
df = pd.read_csv("Symptom2Disease.csv")

#  Detect Columns
symptom_col = "text"
disease_col = "label"

#  Preprocessing Function
stop_words = set(stopwords.words("english"))
def preprocess_text(text):
    text = re.sub(r"\W", " ", str(text).lower())  # Remove special characters
    return " ".join([word for word in text.split() if word not in stop_words])

#  Apply Preprocessing
df["cleaned_symptoms"] = df[symptom_col].apply(preprocess_text)

#  Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["cleaned_symptoms"])
y = df[disease_col]

#  Train Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = MultinomialNB()
model.fit(X_train, y_train)

#  Save Model & Vectorizer
pickle.dump(model, open("model.pkl", "wb"))
pickle.dump(vectorizer, open("vectorizer.pkl", "wb"))

print(" Model and Vectorizer saved successfully!")


 Model and Vectorizer saved successfully!


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
import pickle

#  Load Model and Vectorizer
model = pickle.load(open("model.pkl", "rb"))
vectorizer = pickle.load(open("vectorizer.pkl", "rb"))

#  Function to Predict Disease
def predict_disease(symptoms):
    processed_text = " ".join(symptoms.lower().split())  # Simple cleaning
    vectorized_text = vectorizer.transform([processed_text])
    return model.predict(vectorized_text)[0]

#  Example Usage
symptoms = input("Enter your symptoms (comma-separated): ")
predicted_disease = predict_disease(symptoms)
print(f" Predicted Disease: {predicted_disease}")


Enter your symptoms (comma-separated): Joint pain
 Predicted Disease: Dengue
