In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy

In [46]:
# Load the spaCy "en_core_web_sm" pipeline once per kernel; dis_sym's text
# preprocessing reads this module-level `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [47]:
def dis_sym(symp, threshold=0.7, csv_path="Disease_symptoms_new.csv"):
    """Train a symptom-text classifier and report the top-3 diseases per input.

    Every call reloads ``csv_path``, lemmatizes its ``Symptoms`` column with
    the module-level spaCy pipeline ``nlp``, fits a TF-IDF (unigrams and
    bigrams, ``min_df=2``) + logistic-regression model on an 80/20 split,
    prints the hold-out accuracy, and then scores each entry of ``symp``.
    A human-readable summary of every prediction is printed as well.

    Parameters
    ----------
    symp : str or iterable of str
        Free-text symptom description(s). A bare string is treated as a
        single description instead of being iterated character by character.
    threshold : float, default 0.7
        Minimum probability the best class must reach to be reported as the
        prediction; below it the prediction is the string ``"Uncertain"``.
    csv_path : str, default "Disease_symptoms_new.csv"
        CSV file providing the ``Symptoms`` and ``Disease`` columns.

    Returns
    -------
    list of dict
        One dict per input with the keys ``"Original Input"``,
        ``"Preprocessed"``, ``"Prediction"`` and ``"Top 3"`` (a list of
        ``(disease, probability)`` tuples, highest probability first).
        An empty list when ``symp`` is empty.
    """
    # A lone string would otherwise be scored one character at a time.
    if isinstance(symp, str):
        symp = [symp]
    symptoms_list = list(symp)
    if not symptoms_list:
        # Nothing to score, so skip the (expensive) training run entirely.
        return []

    # ===========================
    # Step 1: Load Data
    # ===========================
    df = pd.read_csv(csv_path)
    y = df['Disease']

    # ===========================
    # Step 2: Preprocess Text (Lemmatization)
    # ===========================
    def preprocess(text):
        """Lowercase ``text`` and keep only lemmas of alphabetic, non-stop-word tokens."""
        doc = nlp(str(text).lower())
        return " ".join(
            token.lemma_ for token in doc
            if not token.is_stop and token.is_alpha
        )

    X = df['Symptoms'].apply(preprocess)

    # ===========================
    # Step 3: Train/Test Split
    # ===========================
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
    except ValueError:
        # Stratification was rejected for this label distribution;
        # fall back to a plain random split with the same seed.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

    # ===========================
    # Step 4: Vectorize
    # ===========================
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),   # capture unigrams & bigrams
        min_df=2              # ignore very rare terms
    )
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # ===========================
    # Step 5: Train Classifier
    # ===========================
    model = LogisticRegression(max_iter=2000)
    model.fit(X_train_tfidf, y_train)

    # ===========================
    # Step 6: Evaluate
    # ===========================
    y_pred = model.predict(X_test_tfidf)
    print("Accuracy:", accuracy_score(y_test, y_pred))

    # ===========================
    # Step 7: Predict (Top-3 + Threshold)
    # ===========================
    cleaned = [preprocess(text) for text in symptoms_list]
    proba = model.predict_proba(vectorizer.transform(cleaned))
    classes = model.classes_

    preds = []
    for raw_sym, clean_sym, probs in zip(symptoms_list, cleaned, proba):
        # Indices of the (up to) three most probable classes, best first.
        top_idx = np.argsort(probs)[::-1][:3]
        best = top_idx[0]
        preds.append({
            "Original Input": raw_sym,
            "Preprocessed": clean_sym,
            # Only commit to a single disease when the model is confident enough.
            "Prediction": classes[best] if probs[best] >= threshold else "Uncertain",
            "Top 3": [(classes[i], float(probs[i])) for i in top_idx],
        })

    # ===========================
    # Step 8: Report
    # ===========================
    for res in preds:
        print(f"\nOriginal: {res['Original Input']}")
        print(f"Preprocessed: {res['Preprocessed']}")
        if res['Prediction'] != 'Uncertain':
            print(f"Predicted Disease: {res['Prediction']}")
        print("Top 3 likely diseases:")
        for disease, score in res['Top 3']:
            print(f"{disease}: {score:.2f}")

    return preds


In [48]:
def dis_test(lst_dis, csv_path='Disease_Tests.csv'):
    """Print the suggested diagnostic tests for each entry's top-3 diseases.

    Parameters
    ----------
    lst_dis : iterable of dict
        Prediction records shaped like the output of ``dis_sym``: each needs
        an ``'Original Input'`` key and a ``'Top 3'`` key holding
        ``(disease, score)`` pairs.
    csv_path : str, default 'Disease_Tests.csv'
        CSV file with ``Disease`` and ``Tests`` columns.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    df_test = pd.read_csv(csv_path)
    # Optional preview of the lookup table; `display` is the notebook-provided helper.
    display(df_test.head(10))

    for entry in lst_dis:
        print(f"\nSymptom Input: '{entry['Original Input']}'")

        for disease, _score in entry['Top 3']:
            matches = df_test.loc[df_test['Disease'] == disease, 'Tests']
            # Blank cells load as NaN and would make str.join raise; drop them
            # and coerce the rest to str so numeric cells are tolerated too.
            tests = ", ".join(matches.dropna().astype(str))
            if tests:
                print(f"Disease: {disease}, Suggested Tests: {tests}")
            else:
                print(f"Disease: {disease}, No tests found")


In [49]:
def dis_int(lst_int, csv_path='Disease_Intensity.csv'):
    """Print the intensity level of each entry's top-3 diseases and flag emergencies.

    Parameters
    ----------
    lst_int : iterable of dict
        Prediction records with a ``'Top 3'`` key holding ``(disease, score)``
        pairs. The input text is read from ``'Symptoms'`` when that key is
        present, otherwise from ``'Original Input'``.
    csv_path : str, default 'Disease_Intensity.csv'
        CSV file with ``Disease`` and ``Intensity`` columns.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Load intensity dataset
    df_intensity = pd.read_csv(csv_path)
    # Preview of the lookup table; `display` is the notebook-provided helper.
    display(df_intensity.head(10))

    for entry in lst_int:
        original_input = entry['Symptoms'] if 'Symptoms' in entry else entry['Original Input']
        print(f"\nSymptom Input: '{original_input}'")

        for disease, _score in entry['Top 3']:
            # Blank cells load as NaN; drop them and coerce to str so the
            # join below cannot raise.
            levels = (
                df_intensity.loc[df_intensity['Disease'] == disease, 'Intensity']
                .dropna()
                .astype(str)
                .tolist()
            )
            if not levels:
                print(f"Disease: {disease}, No intensity info found")
                continue

            intensity = ", ".join(levels)
            # Check every matching row individually: comparing the joined
            # string would miss the alert whenever a disease has several rows
            # (e.g. "Emergency, Moderate"). Whitespace and letter case are
            # ignored so "emergency " in the CSV still raises the alert.
            if any(level.strip().lower() == 'emergency' for level in levels):
                print(f"Disease: {disease}, Intensity: {intensity} 🚨 Trigger Emergency 🚨")
            else:
                print(f"Disease: {disease}, Intensity: {intensity}")

# Example run: a single free-text entry that lists two symptoms.
sym = ["cough, nausea"]
# dis_sym trains the classifier, prints a report, and returns one prediction
# record (dict) per input entry.
p = dis_sym(sym) 
# Look up and print the intensity for the top-3 diseases of each record.
dis_int(p)