# Disease Prediction 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [2]:
df = pd.read_csv(r"C:\Users\ragha\OneDrive\Desktop\Major\dataset.csv")
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [3]:
df = df.fillna("")

In [4]:
# Combine symptom columns into one
df['Symptom'] = ""
for i in range(1, 18):
    df['Symptom'] += df[f"Symptom_{i}"]

df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,Symptom
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,itching skin_rash nodal_skin_eruptions dischro...
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,skin_rash nodal_skin_eruptions dischromic _pa...
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,itching nodal_skin_eruptions dischromic _patches
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,,itching skin_rash dischromic _patches
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,,itching skin_rash nodal_skin_eruptions


In [5]:
# Drop individual symptom columns
df.drop(columns=[f"Symptom_{i}" for i in range(1, 18)], inplace=True)
df.head()

Unnamed: 0,Disease,Symptom
0,Fungal infection,itching skin_rash nodal_skin_eruptions dischro...
1,Fungal infection,skin_rash nodal_skin_eruptions dischromic _pa...
2,Fungal infection,itching nodal_skin_eruptions dischromic _patches
3,Fungal infection,itching skin_rash dischromic _patches
4,Fungal infection,itching skin_rash nodal_skin_eruptions


In [6]:
# Define features and target
X = df['Disease']
Y = df['Symptom']

In [7]:
# Split data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, shuffle=True, random_state=44)

# SVM

In [8]:
# Build the pipeline
text_clf = Pipeline([('tfidf', TfidfVectorizer()),('clf', LinearSVC()),])

In [9]:
# Train the model
text_clf.fit(Y_train, X_train)

In [10]:
# Make predictions
predictions = text_clf.predict(Y_test)

# Evaluate the model
print(metrics.confusion_matrix(X_test, predictions))
print(metrics.classification_report(X_test, predictions))
print(metrics.accuracy_score(X_test, predictions))

[[33  0  0 ...  0  0  0]
 [ 0 33  0 ...  0  0  0]
 [ 0  0 35 ...  0  0  0]
 ...
 [ 0  0  0 ... 31  0  0]
 [ 0  0  0 ...  0 31  0]
 [ 0  0  0 ...  0  0 25]]
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        33
                                   AIDS       1.00      1.00      1.00        33
                                   Acne       1.00      1.00      1.00        35
                    Alcoholic hepatitis       1.00      1.00      1.00        33
                                Allergy       1.00      1.00      1.00        25
                              Arthritis       1.00      1.00      1.00        29
                       Bronchial Asthma       1.00      1.00      1.00        30
                   Cervical spondylosis       1.00      1.00      1.00        35
                            Chicken pox       1.00      1.00      1.00        35
                    Chronic chole

In [11]:
# Predict disease for new symptoms
input_symptoms = ["headache", "nausea", "blurred vision"]
processed_input = text_clf['tfidf'].transform([" ".join(input_symptoms)])
predicted_disease = text_clf['clf'].predict(processed_input)
print(f"The predicted disease is: {predicted_disease[0]}")


The predicted disease is: (vertigo) Paroymsal  Positional Vertigo


# Medication

In [12]:
# Load and prepare data
data = pd.read_csv(r"C:\Users\ragha\OneDrive\Desktop\Major\medicaldata.csv")
data['DateOfBirth'] = pd.to_datetime(data['DateOfBirth'], errors='coerce')
data['DateOfBirth'].fillna(data['DateOfBirth'].median(), inplace=True)
categorical_columns = ['Gender', 'Symptoms', 'Causes', 'Disease', 'Medicine']
for column in categorical_columns:
    data[column].fillna(data[column].mode()[0], inplace=True)
data['Name'].fillna('Unknown', inplace=True)


  data['DateOfBirth'] = pd.to_datetime(data['DateOfBirth'], errors='coerce')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['DateOfBirth'].fillna(data['DateOfBirth'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mode()[0], inplace=True)
The behavior will change i

# SVM

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Ensure compatibility with latest Pandas
pd.options.mode.copy_on_write = True

# Assuming 'data' is a DataFrame
X = data['Symptoms']  # Features (Symptoms)
y = data['Medicine']  # Target variable (Medicine)

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=44)

# Apply TF-IDF Vectorization for text processing and SVM Classifier
text_clf_svm = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('clf', SVC(kernel='linear', random_state=42)),
])

# Fit the model
text_clf_svm.fit(X_train, y_train)

# Make predictions
predictions_svm = text_clf_svm.predict(X_test)

# Print evaluation metrics
print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, predictions_svm))
print("\nClassification Report:")
print(metrics.classification_report(y_test, predictions_svm))
print("Accuracy Score:", metrics.accuracy_score(y_test, predictions_svm))
print("F1 Score:", metrics.f1_score(y_test, predictions_svm, average='macro'))
print("Precision Score:", metrics.precision_score(y_test, predictions_svm, average='macro'))

# Get medicine recommendation for new symptoms
input_symptoms = ["fever cough"]  # Example input
recommended_medicine = text_clf_svm.predict(input_symptoms)
print(f"The recommended medicine is: {recommended_medicine[0]}")


# NB

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics


# Features and target
X = data['Symptoms']
y = data['Medicine']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=44)

# TF-IDF and Naive Bayes pipeline
text_clf_nb = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('clf', MultinomialNB()),
])

# Fit the model
text_clf_nb.fit(X_train, y_train)

# Make predictions
predictions_nb = text_clf_nb.predict(X_test)

# Evaluation metrics
print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, predictions_nb))
print("\nClassification Report:")
print(metrics.classification_report(y_test, predictions_nb))
print("Accuracy Score:", metrics.accuracy_score(y_test, predictions_nb))
print("F1 Score:", metrics.f1_score(y_test, predictions_nb, average='macro'))
print("Precision Score:", metrics.precision_score(y_test, predictions_nb, average='macro'))

# Predict recommended medicine
input_symptoms = ["headache", "nausea", "blurred vision"]
processed_input = text_clf_nb['tfidf'].transform([" ".join(input_symptoms)])
recommended_medicine = text_clf_nb['clf'].predict(processed_input)
print(f"The recommended medicine is: {recommended_medicine[0]}")


# DT

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Assuming 'data' is a DataFrame
X = data['Symptoms']  # Features (Symptoms)
y = data['Medicine']  # Target variable (Medicine)

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=44)

# Apply TF-IDF Vectorization for text processing and Decision Tree Classifier
text_clf_dt = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('clf', DecisionTreeClassifier(random_state=42)),
])

# Fit the model
text_clf_dt.fit(X_train, y_train)

# Make predictions
predictions_dt = text_clf_dt.predict(X_test)

# Print evaluation metrics
print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, predictions_dt))
print("\nClassification Report:")
print(metrics.classification_report(y_test, predictions_dt))
print("Accuracy Score:", metrics.accuracy_score(y_test, predictions_dt))
print("F1 Score:", metrics.f1_score(y_test, predictions_dt, average='macro'))
print("Precision Score:", metrics.precision_score(y_test, predictions_dt, average='macro'))

# Get medicine recommendation for new symptoms
input_symptoms = ["fever cough"]  # Example input
recommended_medicine = text_clf_dt.predict(input_symptoms)
print(f"The recommended medicine is: {recommended_medicine[0]}")

# Random Forest

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Assuming 'data' is a DataFrame
X = data['Symptoms']  # Features (Symptoms)
y = data['Medicine']  # Target variable (Medicine)

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=44)

# Apply TF-IDF Vectorization for text processing and Random Forest Classifier
text_clf_rf = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Fit the model
text_clf_rf.fit(X_train, y_train)

# Make predictions
predictions_rf = text_clf_rf.predict(X_test)

# Print evaluation metrics
print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, predictions_rf))
print("\nClassification Report:")
print(metrics.classification_report(y_test, predictions_rf))
print("Accuracy Score:", metrics.accuracy_score(y_test, predictions_rf))
print("F1 Score:", metrics.f1_score(y_test, predictions_rf, average='macro'))
print("Precision Score:", metrics.precision_score(y_test, predictions_rf, average='macro'))

# Get medicine recommendation for new symptoms
input_symptoms = ["fever cough"]  # Example input
recommended_medicine = text_clf_rf.predict(input_symptoms)
print(f"The recommended medicine is: {recommended_medicine[0]}")

Confusion Matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 2 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Classification Report:
                      precision    recall  f1-score   support

      Dimenhydrinate       0.00      0.00      0.00         2
     Hydration, Rest       0.00      0.00      0.00         1
          Sertraline       0.00      0.00      0.00         1
   Albuterol Inhaler       0.00      0.00      0.00         2
            Antacids       0.67      1.00      0.80         2
   Antibiotics, Rest       1.00      1.00      1.00         6
       Antihistamine       1.00      1.00      1.00         3
    Antivirals, Rest       0.50      1.00      0.67         2
        CPAP Machine       0.33      1.00      0.50         1
   Cognitive Therapy       1.00      1.00      1.00         2
      Dimenhydrinate       0.60      1.00      0.75         3
  Fluids, Bland Diet       1.00      1.00      1.00         2
        Fluids, Rest       1.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# improved RF

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, precision_score
from sklearn.model_selection import GridSearchCV

# Features and target
X = data['Symptoms']
y = data['Medicine']

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=44)

# Define the pipeline with TF-IDF and Random Forest
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.9, min_df=0.01, ngram_range=(1, 2))),
    ('clf', RandomForestClassifier(random_state=42))
])

# Hyperparameter tuning
param_grid = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [None, 10, 20, 30],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

# Grid Search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Predictions
predictions = best_model.predict(X_test)

# Evaluation
print("Best Parameters:", grid_search.best_params_)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, predictions))
print("\nClassification Report:")
print(classification_report(y_test, predictions))
print("Accuracy Score:", accuracy_score(y_test, predictions))
print("F1 Score:", f1_score(y_test, predictions, average='macro'))
print("Precision Score:", precision_score(y_test, predictions, average='macro'))

# Predict for new symptoms
recommended_medicine = best_model.predict(input_symptoms)
print(f"The recommended medicine is: {recommended_medicine[0]}")


In [14]:
import tkinter as tk
from tkinter import messagebox
import pickle  # To load your trained model
import numpy as np

with open("disease_model.pkl", "wb") as file:
    pickle.dump(text_clf, file)
    
with open("disease_model_med.pkl", "wb") as file:
    pickle.dump(text_clf_rf, file)

In [15]:
def predict_disease():
    try:
        # Retrieve and preprocess inputs from entry fields
        inputs = [entry.get().strip() for entry in entries]
        print("User inputs:", inputs)  # Debugging input values

        # Preprocess strings if needed
        inputs = [x.lower() for x in inputs]  # Example: Convert to lowercase
        
        # Convert to array
#         inputs_array = np.array(inputs).reshape(1, -1)
        print("Inputs array:", inputs)  # Debugging preprocessed array
        
        # Make prediction
        #         input_symptoms = ["headache nausea blurred vision"]
        predicted_disease = text_clf.predict(inputs)
        recommended_medicine = text_clf_rf.predict(inputs)
#         print(f"The predicted disease is: {predicted_disease[0]}")
        
#         prediction = text_clf.predict(inputs)
        messagebox.showinfo("Prediction Result", f"Predicted Disease: {predicted_disease[0]}")
        messagebox.showinfo("Prediction Result", f"Recommended Medicines/ Precautions: {recommended_medicine[0]}")
    
    except Exception as e:
        messagebox.showerror("Error", f"An error occurred: {e}")

from PIL import Image, ImageTk

# Initialize the Tkinter root window
root = tk.Tk()
root.title("Disease Prediction")
root.geometry("950x400")  # Adjust the size as needed

# Add an image at the top of the GUI
image_ref = None

# Add an image at the top of the GUI
def add_image():
    global image_ref  # Ensure the image object is not garbage collected
    # Load the image using PIL
    try:
        img = Image.open("C:\\Users\\HP\\OneDrive\\Documents\\minor_srm\\disease_img.webp")  # Replace with your image path
        img = img.resize((300, 150), Image.Resampling.LANCZOS)  # Resize the image to fit the top area
        image_ref = ImageTk.PhotoImage(img)

        # Create a label to display the image
        img_label = tk.Label(root, image=image_ref)
        img_label.pack(pady=10)  # Add some padding for spacing
    except Exception as e:
        print(f"Error loading image: {e}")

# Call the function to add the image
add_image()
# Define labels and entry fields for user input
labels = [
    "Symptom 1", "Symptom 2", "Symptom 3", "Symptom 4", "Symptom 5",
    "Symptom 6", "Symptom 7", "Symptom 8", "Symptom 9", "Symptom 10"
]  # Update with actual feature names

entries = []

# Create a frame to organize the grid layout
frame = tk.Frame(root)
frame.pack(padx=10, pady=10)

# Add labels and entry fields in a grid
for idx, label_text in enumerate(labels):
    row, col = divmod(idx, 5)  # Change 5 to the number of columns you prefer
    label = tk.Label(frame, text=label_text, font=("Arial", 12))
    label.grid(row=row * 2, column=col, padx=5, pady=5, sticky="w")
    entry = tk.Entry(frame, font=("Arial", 12))
    entry.grid(row=row * 2 + 1, column=col, padx=5, pady=5, sticky="w")
    entries.append(entry)

# Prediction Button
predict_button = tk.Button(root, text="Predict", command=predict_disease, font=("Arial", 14), bg="blue", fg="white")
predict_button.pack(pady=20)

# Run the Tkinter event loop
root.mainloop()


Error loading image: [Errno 2] No such file or directory: 'C:\\Users\\HP\\OneDrive\\Documents\\minor_srm\\disease_img.webp'
User inputs: ['cough', '', '', '', '', '', '', '', '', '']
Inputs array: ['cough', '', '', '', '', '', '', '', '', '']
User inputs: ['chest pain', 'hair fall', 'skin burn', '', '', '', '', '', '', '']
Inputs array: ['chest pain', 'hair fall', 'skin burn', '', '', '', '', '', '', '']


try


import tkinter as tk
from tkinter import filedialog, messagebox
import numpy as np
import cv2
from tensorflow.keras.models import load_model
from PIL import Image, ImageTk

# Load Machine Learning and CNN models
disease_model = load_model("symptom_disease_model.h5")  # Symptom-based model
cnn_model = load_model("cnn_mri_model.h5")  # CNN Model for MRI/CT Analysis

def predict_disease():
    selected_option = var.get()
    
    if selected_option == "Symptoms":
        symptoms = [entry.get().strip() for entry in entries]
        if any(symptoms):
            prediction = disease_model.predict([symptoms])  # Use ML model for prediction
            messagebox.showinfo("Prediction Result", f"Predicted Disease: {prediction[0]}")
        else:
            messagebox.showwarning("Warning", "Please enter at least one symptom.")
    
    elif selected_option == "MRI/CT Scan":
        file_path = filedialog.askopenfilename(filetypes=[("Image Files", "*.png;*.jpg;*.jpeg")])
        if file_path:
            image = cv2.imread(file_path)
            image = cv2.resize(image, (224, 224)) / 255.0  # Resize and normalize
            image = np.expand_dims(image, axis=0)  # Prepare for CNN input
            
            prediction = cnn_model.predict(image)  # Use CNN model for prediction
            predicted_class = np.argmax(prediction, axis=1)  # Get predicted disease class
            messagebox.showinfo("Prediction Result", f"Predicted Disease from MRI/CT: {predicted_class}")

# GUI Setup
root = tk.Tk()
root.title("Disease Prediction System")
root.geometry("500x400")

var = tk.StringVar(value="Symptoms")

tk.Label(root, text="Choose Prediction Mode:", font=("Arial", 12)).pack()
tk.Radiobutton(root, text="Symptoms-Based Prediction", variable=var, value="Symptoms").pack()
tk.Radiobutton(root, text="MRI/CT Scan Analysis", variable=var, value="MRI/CT Scan").pack()

# Symptom Input Fields
entries = []
for i in range(5):
    tk.Label(root, text=f"Symptom {i+1}:").pack()
    entry = tk.Entry(root)
    entry.pack()
    entries.append(entry)

# Predict Button
tk.Button(root, text="Predict", command=predict_disease).pack()

root.mainloop()

