In [None]:
import requests
from bs4 import BeautifulSoup
import csv

# Index page listing the conditions; every matching link becomes one CSV row.
url = 'https://www.nhs.uk/conditions/'
# requests applies no timeout by default, so a stalled connection would hang this
# cell indefinitely; fail after 30 seconds instead.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    base_url = 'https://www.nhs.uk'
    diseases = []

    # Keep only anchors directly inside the list items whose href starts with
    # "/conditions/"; the hrefs are site-relative, so prefix them with base_url.
    for link in soup.select('ul.nhsuk-list > li > a[href^="/conditions/"]'):
        disease_name = link.text.strip()
        disease_link = base_url + link['href']
        diseases.append([disease_name, disease_link])

    # newline='' lets the csv module control line endings itself.
    with open('diseasesscrap.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Disease Name', 'Link'])
        writer.writerows(diseases)

    print(f"Scraped {len(diseases)} diseases and their links.")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


Scraped 1154 diseases and their links.


In [None]:
import csv
import requests
from bs4 import BeautifulSoup
import time
import random

def _extract_symptoms(soup):
    """Return ``(description, items)`` from the page's ``<h2 id="symptoms">`` section.

    ``description`` is the stripped text of the first ``<p>`` sibling after the
    heading and ``items`` the stripped ``<li>`` texts of the first ``<ul>``
    sibling after it. Each is empty when the heading or that element is missing.
    """
    symptoms_header = soup.find('h2', id='symptoms')
    if not symptoms_header:
        return "", []

    paragraph = symptoms_header.find_next_sibling('p')
    description = paragraph.text.strip() if paragraph else ""

    list_element = symptoms_header.find_next_sibling('ul')
    if not list_element:
        return description, []
    return description, [li.text.strip() for li in list_element.find_all('li')]


def scrape_primary_structure(csv_file_path, output_file, timeout=30):
    """Append one symptoms record per distinct page listed in a CSV file.

    Args:
        csv_file_path: CSV with ``Disease Name`` and ``Link`` columns.
        output_file: Text file the records are appended to (opened in ``'a'``
            mode, so earlier content is kept).
        timeout: Seconds to wait for each HTTP request before giving up.

    Each record consists of the page's ``<h1>`` text (or the CSV name when the
    page has no ``<h1>``), the symptoms paragraph, the comma-joined symptoms
    list and a ``---`` separator line. Pages that cannot be fetched are reported
    and skipped so a single failure does not abort the remaining rows.
    """
    # Alias rows in the CSV point at the same page; fetch and record each URL once.
    seen_urls = set()

    with open(csv_file_path, 'r', encoding='utf-8', newline='') as csv_file, \
            open(output_file, 'a', encoding='utf-8') as file:
        for row in csv.DictReader(csv_file):
            disease_name, url = row['Disease Name'], row['Link']

            if url in seen_urls:
                print(f"Skipping {disease_name}: {url} was already scraped.")
                continue
            seen_urls.add(url)

            print(f"Scraping {disease_name} from {url}...")

            try:
                # requests has no default timeout; without one a stalled
                # connection would block the whole crawl.
                response = requests.get(url, timeout=timeout)
            except requests.RequestException as exc:
                print(f"Failed to retrieve {url}. Error: {exc}")
                continue

            if response.status_code != 200:
                print(f"Failed to retrieve {url}. Status code: {response.status_code}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # Prefer the page title, but keep the CSV name if the page has no <h1>.
            heading = soup.find('h1')
            if heading:
                disease_name = heading.text.strip()

            symptoms_paragraph, symptoms_list = _extract_symptoms(soup)

            file.write(f"Disease Name: {disease_name}\n")
            file.write(f"Symptoms Description: {symptoms_paragraph}\n")
            file.write(f"Symptoms List: {', '.join(symptoms_list)}\n")
            file.write("---\n")

            print(f"Finished scraping {disease_name}")
            # Random pause between successful page fetches.
            time.sleep(random.uniform(1, 5))

# Name/link CSV to crawl; the first cell writes a file with this name.
csv_file_path = 'diseasesscrap.csv'
# The scraper opens its output in append mode, so re-running this cell adds the
# records to primary_structure.txt again instead of replacing them.
scrape_primary_structure(csv_file_path, 'primary_structure.txt')

Scraping AAA screening, see Abdominal aortic aneurysm (AAA) screening from https://www.nhs.uk/conditions/abdominal-aortic-aneurysm-screening/...
Finished scraping Abdominal aortic aneurysm (AAA) screening
Scraping AAA, see Abdominal aortic aneurysm from https://www.nhs.uk/conditions/abdominal-aortic-aneurysm/...
Finished scraping Abdominal aortic aneurysm
Scraping Abdominal aortic aneurysm from https://www.nhs.uk/conditions/abdominal-aortic-aneurysm/...
Finished scraping Abdominal aortic aneurysm
Scraping Abdominal aortic aneurysm (AAA) screening from https://www.nhs.uk/conditions/abdominal-aortic-aneurysm-screening/...
Finished scraping Abdominal aortic aneurysm (AAA) screening
Scraping Abortion from https://www.nhs.uk/conditions/abortion/...
Finished scraping Abortion
Scraping Acanthosis nigricans from https://www.nhs.uk/conditions/acanthosis-nigricans/...
Finished scraping Acanthosis nigricans
Scraping Achalasia from https://www.nhs.uk/conditions/achalasia/...
Finished scraping Acha

In [None]:
import json
import numpy as np
import pandas as pd
import pickle
import ast
from math import log
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

def load_data():
    """Load the structured disease JSON and the unique-diseases CSV.

    Reads ``structured_diseases-2.json`` with ``json.load`` and
    ``unique_diseases_data.csv`` with pandas. The CSV's ``Symptoms`` column is
    parsed from its string form with ``ast.literal_eval``; non-string cells
    (for example missing values) become an empty list.

    Returns:
        tuple: ``(data, df_unique)`` on success. If anything fails, the error is
        printed and ``(None, None)`` is returned so the caller can stop cleanly.
    """
    try:
        # Open the JSON explicitly as UTF-8: the platform default encoding is
        # locale-dependent and can fail on or corrupt non-ASCII text.
        with open('structured_diseases-2.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
        df_unique = pd.read_csv('unique_diseases_data.csv')
        df_unique["Symptoms"] = df_unique["Symptoms"].apply(
            lambda x: ast.literal_eval(x) if isinstance(x, str) else []
        )
        return data, df_unique
    except Exception as e:
        # Deliberately broad: missing files, malformed JSON/CSV and unparsable
        # symptom strings are all reported the same way.
        print(f"Error loading data: {e}")
        return None, None

data, df_unique = load_data()
if data is None or df_unique is None:
    # exit() is the interactive-shell helper installed by `site`; inside a
    # notebook it shuts the kernel down. Raising SystemExit only stops this cell
    # (and still terminates a plain script).
    raise SystemExit("Required data files could not be loaded.")

# Lower-cased disease name -> list of lower-cased symptom names.
disease_symptoms = {
    entry['Disease']['Name'].strip().lower():
        [s.strip().lower() for s in entry['Disease']['Symptoms']['List']]
    for entry in data
}

# Sorted vocabulary of every symptom mentioned by any disease.
all_symptoms = sorted({s for symptoms in disease_symptoms.values() for s in symptoms})

# Document frequency: in how many diseases each symptom occurs. Counted in one
# pass over the lists instead of rescanning every list for every symptom.
symptom_doc_freq = {}
for listed_symptoms in disease_symptoms.values():
    for listed_symptom in set(listed_symptoms):
        symptom_doc_freq[listed_symptom] = symptom_doc_freq.get(listed_symptom, 0) + 1

# Smoothed inverse document frequency: log(N / (1 + df)).
symptom_idf = {
    symptom: log(len(disease_symptoms) / (1 + symptom_doc_freq[symptom]))
    for symptom in all_symptoms
}

def symptoms_to_vector(symptoms):
    """Encode ``symptoms`` as presence flags followed by IDF weights.

    The result has ``2 * len(all_symptoms)`` entries: first a 1/0 flag for each
    vocabulary symptom, then that symptom's ``symptom_idf`` weight (0 when the
    symptom is absent). Names are stripped and lower-cased before matching;
    names outside ``all_symptoms`` contribute nothing.
    """
    wanted = {name.strip().lower() for name in symptoms}

    presence_flags = []
    idf_weights = []
    for vocab_symptom in all_symptoms:
        if vocab_symptom in wanted:
            presence_flags.append(1)
            idf_weights.append(symptom_idf.get(vocab_symptom, 0))
        else:
            presence_flags.append(0)
            idf_weights.append(0)

    return presence_flags + idf_weights

# Training set: each disease contributes its single symptom vector repeated
# max(20, 2 * number_of_symptoms) times, labelled with the disease name.
# NOTE(review): all rows of one disease are identical, so the 20% hold-out below
# contains vectors that are also in the training split; the printed accuracies
# say nothing about partial or unseen symptom combinations.
X, y = [], []
for disease, symptoms in disease_symptoms.items():
    base_vector = symptoms_to_vector(symptoms)
    n_copies = max(20, len(symptoms) * 2)
    X.extend([base_vector] * n_copies)
    y.extend([disease] * n_copies)

X = np.array(X)
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate classifiers, keyed by the label used in the accuracy report.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier()
}

# Fit every candidate; the strict ">" keeps the earliest model among ties.
model_accuracies = {}
best_model = None
best_accuracy = 0

for name, model in models.items():
    acc = model.fit(X_train, y_train).score(X_test, y_test)
    model_accuracies[name] = acc
    print(f"Accuracy of {name}: {acc:.4f}")

    if acc > best_accuracy:
        best_model, best_accuracy = model, acc

print(f"\nBest Model: {type(best_model).__name__} with Accuracy: {best_accuracy:.4f}")

# Persist the winning model together with the vocabulary and IDF table needed
# to rebuild feature vectors at prediction time.
with open("DiseasePredictor.pkl", "wb") as f:
    pickle.dump(
        {'model': best_model, 'symptom_idf': symptom_idf, 'all_symptoms': all_symptoms},
        f,
    )

def predict_disease(user_symptoms):
    """Predict the most likely disease for a list of symptom names.

    Args:
        user_symptoms: Iterable of symptom strings.

    Returns:
        tuple: ``(disease, risk_level, presence, top_3_predictions)`` where
        ``top_3_predictions`` is a list of ``(disease, probability_percent)``
        pairs ordered from most to least likely. When the top probability is
        20% or lower, or when none of the symptoms is in the vocabulary, the
        first three items are ``"No specific disease found"``,
        ``"Normal Symptoms"`` and ``"No"``; in the unrecognised-symptom case
        the prediction list is empty.
    """
    user_vector = np.array(symptoms_to_vector(user_symptoms)).reshape(1, -1)

    # An all-zero vector means no entered symptom is known (or none was given).
    # The classifier would still name a disease for it, so stop here instead of
    # reporting a prediction that is based on no evidence.
    if not user_vector.any():
        return "No specific disease found", "Normal Symptoms", "No", []

    predicted_probabilities = best_model.predict_proba(user_vector)[0]
    sorted_indices = np.argsort(predicted_probabilities)[::-1]

    top_3_predictions = [(best_model.classes_[i], predicted_probabilities[i] * 100) for i in sorted_indices[:3]]

    predicted_disease, prob = top_3_predictions[0]

    # Map the top probability (in percent) to a risk label.
    if prob > 80:
        risk_level = "High Risk - Immediate Medical Attention Needed"
    elif prob > 50:
        risk_level = "Moderate Risk - Consult a Doctor"
    elif prob > 20:
        risk_level = "Low Risk - Monitor Symptoms"
    else:
        return "No specific disease found", "Normal Symptoms", "No", top_3_predictions

    presence = "Yes" if prob > 50 else "No"
    return predicted_disease, risk_level, presence, top_3_predictions

def next_steps(probability):
    """Return the follow-up advice for a likelihood given in percent.

    Above 80 the advice is to see a doctor immediately, above 50 to monitor and
    consult if needed, and otherwise to keep a healthy lifestyle.
    """
    advice_by_threshold = (
        (80, "Consult a doctor IMMEDIATELY."),
        (50, "Monitor symptoms and consult a doctor if needed."),
    )
    # Thresholds are checked from highest to lowest; the first one exceeded wins.
    for threshold, advice in advice_by_threshold:
        if probability > threshold:
            return advice
    return "Maintain a healthy lifestyle and monitor yourself."

if __name__ == "__main__":
    # Interactive entry point: read comma-separated symptoms, predict, report.
    user_input = input("\nEnter your symptoms (separated by commas): ")
    symptoms_list = [sym.strip().lower() for sym in user_input.split(',') if sym.strip()]

    if not symptoms_list:
        print("No valid symptoms entered.")
        # exit() is the interactive-shell helper and shuts a notebook kernel
        # down; raising SystemExit stops this cell (or script) without that.
        raise SystemExit

    predicted_disease, risk_level, presence, top_predictions = predict_disease(symptoms_list)

    if predicted_disease == "No specific disease found":
        print("\nNo specific disease detected. Your symptoms are normal.")
        print("\nNext Steps:")
        print("Maintain a healthy lifestyle and observe any new symptoms.")
    else:
        print("\nPredicted Diseases and Probabilities:")
        for disease, prob in top_predictions:
            print(f"- {disease}: {prob:.2f}%")

        # The first entry is the most likely disease; its probability drives the advice.
        most_likely_disease, highest_prob = top_predictions[0]
        steps = next_steps(highest_prob)

        print(f"\nMost Likely Disease: {most_likely_disease}")
        print(f"Disease Likelihood: {highest_prob:.2f}%")
        print(f"Risk Level: {risk_level}")
        print(f"Disease Present: {presence}")

        print("\nNext Steps:")
        print(steps)


Accuracy of Random Forest: 0.9950
Accuracy of SVM: 0.9950
Accuracy of Logistic Regression: 0.9950
Accuracy of Decision Tree: 0.9950
Accuracy of KNN: 0.9965

Best Model: KNeighborsClassifier with Accuracy: 0.9965

Enter your symptoms (separated by commas): fever, vomit,rashes

Predicted Diseases and Probabilities:
- erythema multiforme: 100.00%
- zika virus: 0.00%
- eye cancer: 0.00%

Most Likely Disease: erythema multiforme
Disease Likelihood: 100.00%
Risk Level: High Risk - Immediate Medical Attention Needed
Disease Present: Yes

Next Steps:
Consult a doctor IMMEDIATELY.


In [None]:
import json
import numpy as np
import pandas as pd
import pickle
import ast
from math import log
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

def load_data():
    """Read the disease JSON and the unique-diseases CSV used for training.

    ``structured_diseases-2.json`` is parsed with ``json.load`` and
    ``unique_diseases_data.csv`` with ``pd.read_csv``. Every string in the CSV's
    ``Symptoms`` column is turned into a Python object with
    ``ast.literal_eval``; any non-string cell is replaced by ``[]``.

    NOTE(review): an identical ``load_data`` is defined in the previous cell;
    whichever cell runs last provides the active definition.

    Returns:
        tuple: ``(data, df_unique)``, or ``(None, None)`` after printing the
        error when loading or parsing fails.
    """
    def _parse_symptoms(cell):
        # Non-string cells (e.g. missing values) carry no list to parse.
        return ast.literal_eval(cell) if isinstance(cell, str) else []

    try:
        # Read the JSON as UTF-8 rather than the locale-dependent default
        # encoding, which may not decode non-ASCII text correctly.
        with open('structured_diseases-2.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
        df_unique = pd.read_csv('unique_diseases_data.csv')
        df_unique["Symptoms"] = df_unique["Symptoms"].apply(_parse_symptoms)
        return data, df_unique
    except Exception as e:
        # Broad on purpose: every load/parse failure is reported the same way
        # and signalled to the caller through the (None, None) result.
        print(f"Error loading data: {e}")
        return None, None

data, df_unique = load_data()
if data is None or df_unique is None:
    # Calling exit() in a notebook shuts the kernel down; it is meant for the
    # interactive shell. SystemExit halts this cell or script instead.
    raise SystemExit("Required data files could not be loaded.")

# Normalised lookup: lower-cased disease name -> lower-cased symptom names.
disease_symptoms = {
    entry['Disease']['Name'].strip().lower():
        [s.strip().lower() for s in entry['Disease']['Symptoms']['List']]
    for entry in data
}

# Every distinct symptom, sorted so feature positions are stable.
all_symptoms = sorted({s for symptoms in disease_symptoms.values() for s in symptoms})

# Number of diseases that list each symptom, gathered in a single pass rather
# than by scanning all disease lists once per symptom.
symptom_doc_freq = {}
for listed_symptoms in disease_symptoms.values():
    for listed_symptom in set(listed_symptoms):
        symptom_doc_freq[listed_symptom] = symptom_doc_freq.get(listed_symptom, 0) + 1

# IDF with add-one smoothing in the denominator: log(N / (1 + df)).
n_diseases = len(disease_symptoms)
symptom_idf = {
    symptom: log(n_diseases / (1 + symptom_doc_freq[symptom]))
    for symptom in all_symptoms
}

def symptoms_to_vector(symptoms):
    """Turn symptom names into the model's feature list.

    The first ``len(all_symptoms)`` entries are 1/0 presence flags in
    vocabulary order; the remaining ``len(all_symptoms)`` entries hold the
    ``symptom_idf`` weight of each present symptom and 0 for absent ones.
    Matching is done on stripped, lower-cased names.
    """
    normalized = set()
    for raw_name in symptoms:
        normalized.add(raw_name.strip().lower())

    flags = [1 if term in normalized else 0 for term in all_symptoms]
    weights = [symptom_idf.get(term, 0) if term in normalized else 0 for term in all_symptoms]
    return flags + weights

# Build the samples: one symptom vector per disease, duplicated
# max(20, 2 * number_of_symptoms) times under that disease's label.
# NOTE(review): since a disease's rows are exact copies, the test split holds
# vectors the models have already seen, so the accuracies printed below do not
# measure generalisation.
X, y = [], []
for disease, symptoms in disease_symptoms.items():
    base_vector = symptoms_to_vector(symptoms)
    repeats = max(20, len(symptoms) * 2)
    X += [base_vector] * repeats
    y += [disease] * repeats

X, y = np.array(X), np.array(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Classifiers to compare; the keys are the names shown in the report.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier()
}

best_model = None
best_accuracy = 0
model_accuracies = {}

# Train each model in turn; a later model replaces the current best only when
# its accuracy is strictly higher.
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    model_accuracies[name] = acc
    print(f"Accuracy of {name}: {acc:.4f}")
    if not acc > best_accuracy:
        continue
    best_accuracy = acc
    best_model = model

print(f"\nBest Model: {best_model.__class__.__name__} with Accuracy: {best_accuracy:.4f}")

# Save the chosen model with the IDF table and vocabulary it was trained on.
with open("DiseasePredictor.pkl", "wb") as f:
    pickle.dump({'model': best_model, 'symptom_idf': symptom_idf, 'all_symptoms': all_symptoms}, f)

def predict_disease(user_symptoms):
    """Predict the most likely disease for a list of symptom names.

    An exact match between the entered symptoms and a disease's full symptom
    set short-circuits the model and is reported with 100% likelihood.
    Otherwise the trained model ranks the diseases.

    Args:
        user_symptoms: Iterable of symptom strings.

    Returns:
        tuple: ``(disease, risk_level, presence, top_predictions)`` where
        ``top_predictions`` is a list of ``(disease, probability_percent)``
        pairs. When no symptom is given or recognised, or the top probability
        is 20% or lower, the first three items are
        ``"No specific disease found"``, ``"Normal Symptoms"`` and ``"No"``;
        the prediction list is empty when nothing was recognised.
    """
    user_symptoms = {s.strip().lower() for s in user_symptoms}

    # Only try the exact match for a non-empty input: an empty set would
    # otherwise "match" any disease whose symptom list is empty and be
    # reported as high risk.
    if user_symptoms:
        for disease, symptoms in disease_symptoms.items():
            if set(symptoms) == user_symptoms:
                return disease, "High Risk - Immediate Medical Attention Needed", "Yes", [(disease, 100.0)]

    user_vector = np.array(symptoms_to_vector(user_symptoms)).reshape(1, -1)

    # An all-zero vector means none of the entered symptoms is in the
    # vocabulary; the classifier would still name a disease, so stop here.
    if not user_vector.any():
        return "No specific disease found", "Normal Symptoms", "No", []

    predicted_probabilities = best_model.predict_proba(user_vector)[0]
    sorted_indices = np.argsort(predicted_probabilities)[::-1]
    top_3_predictions = [(best_model.classes_[i], predicted_probabilities[i] * 100) for i in sorted_indices[:3]]
    predicted_disease, prob = top_3_predictions[0]

    # Map the top probability (in percent) to a risk label.
    if prob > 80:
        risk_level = "High Risk - Immediate Medical Attention Needed"
    elif prob > 50:
        risk_level = "Moderate Risk - Consult a Doctor"
    elif prob > 20:
        risk_level = "Low Risk - Monitor Symptoms"
    else:
        return "No specific disease found", "Normal Symptoms", "No", top_3_predictions

    presence = "Yes" if prob > 50 else "No"
    return predicted_disease, risk_level, presence, top_3_predictions

def next_steps(probability):
    """Choose the recommendation text for a disease likelihood in percent.

    Returns the urgent advice for values above 80, the monitoring advice for
    values above 50, and the general lifestyle advice for everything else.
    """
    if not probability > 50:
        return "Maintain a healthy lifestyle and monitor yourself."
    if probability > 80:
        return "Consult a doctor IMMEDIATELY."
    return "Monitor symptoms and consult a doctor if needed."

if __name__ == "__main__":
    # Prompt for comma-separated symptoms, run the prediction and print a report.
    user_input = input("\nEnter your symptoms (separated by commas): ")
    symptoms_list = [sym.strip().lower() for sym in user_input.split(',') if sym.strip()]

    if not symptoms_list:
        print("No valid symptoms entered.")
        # Raise SystemExit instead of calling exit(): exit() is intended for the
        # interactive shell and shuts a notebook kernel down.
        raise SystemExit

    predicted_disease, risk_level, presence, top_predictions = predict_disease(symptoms_list)

    if predicted_disease == "No specific disease found":
        print("\nNo specific disease detected. Your symptoms are normal.")
        print("\nNext Steps:")
        print("Maintain a healthy lifestyle and observe any new symptoms.")
    else:
        print("\nPredicted Diseases and Probabilities:")
        for disease, prob in top_predictions:
            print(f"- {disease}: {prob:.2f}%")

        # Advice is derived from the probability of the top-ranked disease.
        most_likely_disease, highest_prob = top_predictions[0]
        steps = next_steps(highest_prob)

        print(f"\nMost Likely Disease: {most_likely_disease}")
        print(f"Disease Likelihood: {highest_prob:.2f}%")
        print(f"Risk Level: {risk_level}")
        print(f"Disease Present: {presence}")

        print("\nNext Steps:")
        print(steps)


Accuracy of Random Forest: 0.9950
Accuracy of SVM: 0.9950
Accuracy of Logistic Regression: 0.9950
Accuracy of Decision Tree: 0.9950
Accuracy of KNN: 0.9965

Best Model: KNeighborsClassifier with Accuracy: 0.9965

Enter your symptoms (separated by commas): fever, excessive thirst, excessive urination,vomit

Predicted Diseases and Probabilities:
- diabetes insipidus: 100.00%
- zika virus: 0.00%
- eyelid problems: 0.00%

Most Likely Disease: diabetes insipidus
Disease Likelihood: 100.00%
Risk Level: High Risk - Immediate Medical Attention Needed
Disease Present: Yes

Next Steps:
Consult a doctor IMMEDIATELY.


In [None]:
import shutil
# Source pickle written by the training cell and the name of the copy to download.
model_filename = "DiseasePredictor.pkl"
download_filename = "DiseasePredictor_Download.pkl"

try:
    shutil.copy(model_filename, download_filename)
    print(f"Model file '{model_filename}' is ready for download as '{download_filename}'.")

    # Imported here, inside the try, so that a missing google.colab package is
    # handled below instead of failing the cell.
    from google.colab import files
    files.download(download_filename)

except FileNotFoundError:
    print(f"Error: {model_filename} not found. Train the model first.")
except ImportError:
    # The copy above already succeeded; only the browser download is unavailable.
    # Reporting this separately avoids a bare "No module named ..." error.
    print(f"Copy saved as '{download_filename}'; browser download is only available in Google Colab.")
except Exception as e:
    print(f"Error: {e}")

Model file 'DiseasePredictor.pkl' is ready for download as 'DiseasePredictor_Download.pkl'.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pickle

# Sanity check of the saved model bundle: load the pickle, list its top-level
# keys and confirm the three keys written by the training cell are present.
model_filename = "DiseasePredictor.pkl"

try:
    # SECURITY: pickle.load can execute arbitrary code while loading; only open
    # pickle files that come from a trusted source.
    # NOTE(review): this rebinds the notebook-level name `data`, which the
    # training cells use for the parsed disease JSON.
    with open(model_filename, "rb") as f:
        data = pickle.load(f)
    print("\n Model file loaded successfully!")
    print(" Available Modules inside Pickle File:")
    # If the loaded object has no .keys(), the AttributeError is reported by the
    # generic handler at the bottom.
    for key in data.keys():
        print(f"- {key}")

    # Keys the training cell stores in the bundle.
    required_keys = {"model", "symptom_idf", "all_symptoms"}
    missing_keys = required_keys - data.keys()

    if missing_keys:
        print(f"\n Warning: Missing keys in pickle file: {missing_keys}")
    else:
        print("\n Pickle file structure is valid!")

except FileNotFoundError:
    print(f"\n Error: '{model_filename}' not found. Please check the file path.")

except pickle.UnpicklingError:
    print("\n Error: Pickle file is corrupted or not a valid pickle format.")

except Exception as e:
    # Catch-all so the cell reports any other failure instead of raising.
    print(f"\n Unexpected error: {e}")



 Model file loaded successfully!
 Available Modules inside Pickle File:
- model
- symptom_idf
- all_symptoms

 Pickle file structure is valid!
