In [5]:
# Basic imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# For wider display
pd.set_option('display.max_columns', None)
#sym_path = "../data/raw/symptomdatas/"
#loading the datasets
sym_dataset = pd.read_csv( "dataset.csv")
sym_desc = pd.read_csv( "symptom_Description.csv")
sym_prec = pd.read_csv( "symptom_precaution.csv")
sym_severity = pd.read_csv( "Symptom-severity.csv")

# Preview
sym_dataset.head()



FileNotFoundError: [Errno 2] No such file or directory: 'dataset.csv'

In [None]:
print(sym_dataset.shape)
sym_dataset.info()
sym_dataset.isnull().sum()


In [None]:
sym_dataset = sym_dataset.fillna("None")


In [None]:
symptom_cols = [col for col in sym_dataset.columns if col.startswith("Symptom")]
sym_dataset["all_symptoms"] = sym_dataset[symptom_cols].values.tolist()

sym_dataset[["Disease", "all_symptoms"]].head()


In [None]:
# 1Ô∏è‚É£ Ensure all_symptoms is converted to list properly
sym_dataset["all_symptoms"] = sym_dataset["all_symptoms"].apply(
    lambda s: str(s).replace(" ", "").split(",")
)

# 2Ô∏è‚É£ Remove "None" and empty values
sym_dataset["all_symptoms"] = sym_dataset["all_symptoms"].apply(
    lambda s: [x for x in s if x and x.lower() != "none"]
)


In [None]:
print(sym_dataset["all_symptoms"].head(10))
#sym_dataset.head()

In [None]:

symptom_cols = [col for col in sym_dataset.columns if col.startswith("Symptom")]

sym_dataset["all_symptoms"] = (
    sym_dataset[symptom_cols]
    .apply(lambda row: [x for x in row if x and x != "None"], axis=1)
)
sym_dataset[["Disease", "all_symptoms"]].head(10)

In [None]:
# Clean symptom names (strip spaces and fix formatting)
def clean_symptom_list(sym_list):
    cleaned = []
    for s in sym_list:
        s = s.strip()               # remove leading/trailing spaces
        s = s.replace("  ", " ")    # fix double spaces
        s = s.replace(" _", "_")    # fix misplaced underscores
        s = s.replace(" _", "_")
        cleaned.append(s)
    return cleaned

sym_dataset["all_symptoms"] = sym_dataset["all_symptoms"].apply(clean_symptom_list)

# Preview 10 rows again
sym_dataset[["Disease", "all_symptoms"]].sample(10)


In [None]:
# Get all unique symptoms across all rows
all_symptoms_set = set()
for symptoms in sym_dataset["all_symptoms"]:
    all_symptoms_set.update(symptoms)

all_symptoms = sorted(list(all_symptoms_set))
len(all_symptoms), all_symptoms[:20]

In [None]:
sym_dataset.shape

In [None]:
import pandas as pd

print("--- üè• DATA READINESS CHECK ---")

# 1. CHECK SHAPE (Volume)
rows, cols = sym_dataset.shape
print(f"1. Row Count: {rows}")
if rows < 4000:
    print("   ‚ö†Ô∏è WARNING: Your data looks shrunk! Did you drop duplicates? (Expected ~4920)")
else:
    print("   ‚úÖ SUCCESS: Full patient volume preserved.")

# 2. CHECK TEXT CONSISTENCY (Spot Check)
# We look at the first symptom column to see if text is clean (no spaces, lowercase).
# Adjust 'Symptom_1' to whatever your first symptom column is named.
check_col = [c for c in sym_dataset.columns if 'Symptom' in c][0] 
sample_val = sym_dataset[check_col].iloc[0]

print(f"\n2. Text Format Check (Sample from {check_col}): '{sample_val}'")
if " " in str(sample_val) and "_" not in str(sample_val):
    print("   ‚ö†Ô∏è WARNING: Found spaces. Recommended to replace with underscores (e.g., 'skin rash' -> 'skin_rash').")
elif str(sample_val).lower() != str(sample_val):
    print("   ‚ö†Ô∏è WARNING: Found uppercase letters. Recommended to lowercase everything.")
else:
    print("   ‚úÖ SUCCESS: Text looks standardized (lowercase/formatted).")

# 3. CHECK MISSING VALUES
# It is NORMAL to have NaNs in symptom columns (not everyone has every symptom).
# We just want to make sure the 'Disease' column is full.
missing_diseases = sym_dataset['Disease'].isnull().sum()
print(f"\n3. Missing Diseases: {missing_diseases}")
if missing_diseases > 0:
    print("   üö® CRITICAL: You have rows with no Disease label. Drop them.")
else:
    print("   ‚úÖ SUCCESS: All rows have a target disease.")

print("-" * 30)
print("VERDICT:")
if rows > 4000 and missing_diseases == 0:
    print("üöÄ READY FOR STEP 4 (Convert to Numbers)")
else:
    print("‚ùå NOT READY. Fix the issues above.")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

print("--- üîÑ MASTER FIX: BINARY ENCODING & RANDOM FOREST ---")

# 1. DEFINE COLUMNS
symptom_cols = [col for col in sym_dataset.columns if 'Symptom' in col]
FINAL_Y_COL = 'Disease'

# 2. CLEAN DATA
df_processed = sym_dataset.copy()
for col in symptom_cols:
    df_processed[col] = df_processed[col].astype(str).str.strip().str.replace(' ', '_').str.lower().replace('none', '')

# Add ID
df_processed['Instance_ID'] = df_processed.index 

# 3. BINARY TRANSFORMATION (The Robust Fix)
print("Transforming to Binary (1/0)...")
df_long = pd.melt(
    df_processed,
    id_vars=['Instance_ID', FINAL_Y_COL], 
    value_vars=symptom_cols,
    value_name='Symptom'
)
# Filter out empty
df_long = df_long[df_long['Symptom'] != ''].copy()

# *** FIX: IGNORE WEIGHTS. SET ALL PRESENT SYMPTOMS TO 1 ***
df_long['Present'] = 1

# Pivot
X_binary = df_long.pivot_table(
    index='Instance_ID', 
    columns='Symptom',
    values='Present',
    fill_value=0, # Fill missing symptoms with 0
    aggfunc='max'
)

# Align Y
Y_binary = df_processed.set_index('Instance_ID')[FINAL_Y_COL].loc[X_binary.index]

# Define X and Y
X = X_binary
Y = Y_binary

print(f"‚úÖ Data Prepared. X Shape: {X.shape}")
print("-" * 30)

# 4. TRAIN RANDOM FOREST
print("Training Random Forest...")
X_train, X_test, Y_train, Y_test = train_test_split(
    X.values, Y.values, 
    test_size=0.2, 
    random_state=42, 
    stratify=Y.values
)

rf_model_base = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_base.fit(X_train, Y_train)

# 5. EVALUATE
acc = rf_model_base.score(X_test, Y_test)
print(f"\nüèÜ MODEL ACCURACY: {acc:.4f}")

# Save variables for the Risk Step
Y_pred_proba = rf_model_base.predict_proba(X_test)
disease_labels = rf_model_base.classes_

print("‚úÖ Model Ready for Risk Integration.")

In [None]:
import pandas as pd
import numpy as np

# --- SIMPLE TEST FUNCTION ---
def test_model_prediction(symptom_list, model, feature_columns):
    """
    Creates a binary input vector from a list of symptoms and asks the model to predict.
    """
    # 1. Create an empty row (all 0s)
    input_data = {col: 0 for col in feature_columns}
    
    # 2. Mark the input symptoms as 1 (Present)
    # We handle potential spelling mismatches by checking if the column exists
    found_symptoms = []
    for s in symptom_list:
        # Clean the input string to match column format (lowercase, underscore)
        clean_s = s.strip().replace(' ', '_').lower()
        if clean_s in input_data:
            input_data[clean_s] = 1
            found_symptoms.append(clean_s)
        else:
            print(f"‚ö†Ô∏è Warning: Symptom '{s}' not found in model features.")
            
    # 3. Convert to DataFrame for prediction
    input_df = pd.DataFrame([input_data])
    
    # 4. Predict
    prediction = model.predict(input_df.values)[0]
    probability = np.max(model.predict_proba(input_df.values)[0])
    
    return prediction, probability, found_symptoms

# --- TEST CASES ---

# TEST 1: Malaria Symptoms
malaria_symptoms = ['chills', 'vomiting', 'high_fever', 'sweating', 'headache', 'nausea']
pred, prob, found = test_model_prediction(malaria_symptoms, rf_model_base, X.columns)

print("\n--- TEST 1: MALARIA SYMPTOMS ---")
print(f"Input Symptoms Found: {found}")
print(f"Model Prediction: {pred}")
print(f"Confidence: {prob:.2%}")

# TEST 2: Allergy Symptoms
allergy_symptoms = ['continuous_sneezing', 'shivering', 'chills', 'watering_from_eyes']
pred2, prob2, found2 = test_model_prediction(allergy_symptoms, rf_model_base, X.columns)

print("\n--- TEST 2: ALLERGY SYMPTOMS ---")
print(f"Input Symptoms Found: {found2}")
print(f"Model Prediction: {pred2}")
print(f"Confidence: {prob2:.2%}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# --- CONFUSION MATRIX DIAGNOSTIC ---

# 1. Generate Predictions
# FIX: X_test is already an array, so we remove .values
Y_pred = rf_model_base.predict(X_test)

# 2. Get the Unique Labels (Sorted)
# This forces the X-axis and Y-axis to use the exact same order,
# which creates the perfect diagonal line if the model is accurate.
unique_labels = sorted(rf_model_base.classes_)

# 3. Plot with Explicit Labels
fig, ax = plt.subplots(figsize=(12, 12))

ConfusionMatrixDisplay.from_predictions(
    Y_test, 
    Y_pred, 
    labels=unique_labels,  # <--- Force alignment
    cmap=plt.cm.Blues,
    xticks_rotation='vertical',
    normalize='true', # Shows percentages (0.0 to 1.0)
    ax=ax,
    include_values=False # Hides numbers to make the pattern clearer
)

plt.title("Diagnostic Confusion Matrix")
plt.show()

In [None]:
sym_dataset['Disease'].value_counts()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 5))
sns.countplot(x='Disease', data=sym_dataset)
plt.xticks(rotation=90) # Rotates names so you can read them
plt.title("Disease Distribution (Checking for Balance)")
plt.show()

In [None]:
import pandas as pd
import numpy as np
import os

print("--- üè• PHASE 4: WHO RISK INTEGRATION ---")

# ==========================================
# 1. LOAD WHO DATA
# ==========================================
# Adjust this path if your file is named differently
who_file_path = "../data/processed/who_mortality_sample.csv" 

try:
    # Attempt to load from file
    sampled_df = pd.read_csv(who_file_path)
    print(f"‚úÖ WHO Data Loaded. Shape: {sampled_df.shape}")
except FileNotFoundError:
    print(f"‚ö†Ô∏è File not found at {who_file_path}")
    print("   Checking variables... if 'sampled_df' is already in memory, we will use it.")
    if 'sampled_df' not in locals():
        raise ValueError("üö® WHO Data not found! Please load your WHO csv file.")

# ==========================================
# 2. CALCULATE RISK SCORES
# ==========================================
WHO_CAUSE_COL = 'Cause'

# Ensure death columns are numeric
death_cols = [c for c in sampled_df.columns if 'Deaths' in c]
for col in death_cols:
    sampled_df[col] = pd.to_numeric(sampled_df[col], errors='coerce').fillna(0)

# Sum deaths per row (Total deaths for that specific record)
sampled_df['Total_Deaths'] = sampled_df[death_cols].sum(axis=1)

# Group by ICD Code (Cause) and sum total deaths
who_agg = sampled_df.groupby(WHO_CAUSE_COL)['Total_Deaths'].sum().reset_index()

# Normalize: Score between 0 and 1 (1 = The Deadliest Disease)
who_agg['Risk_Score'] = who_agg['Total_Deaths'] / who_agg['Total_Deaths'].max()
WHO_Risk_DF = who_agg[[WHO_CAUSE_COL, 'Risk_Score']]

print(f"‚úÖ WHO Risk Scores Calculated for {len(WHO_Risk_DF)} causes.")
print("   Top 3 Riskiest Codes in Sample:")
print(WHO_Risk_DF.sort_values('Risk_Score', ascending=False).head(3))

# ==========================================
# 3. DEFINE MANUAL MAPPING (Disease -> ICD)
# ==========================================
disease_to_who_map = {
    '(vertigo) Paroymsal  Positional Vertigo': 'H81', 
    'AIDS': 'B24', 
    'Acne': 'L70', 
    'Alcoholic hepatitis': 'K70.1', 
    'Allergy': 'J30', 
    'Arthritis': 'M13.9', 
    'Bronchial Asthma': 'J45', 
    'Cervical spondylosis': 'M47.9', 
    'Chicken pox': 'B01', 
    'Chronic cholestasis': 'K76.9', 
    'Common Cold': 'J00', 
    'Dengue': 'A90', 
    'Diabetes ': 'E14', 
    'Dimorphic hemmorhoids(piles)': 'I84', 
    'Drug Reaction': 'T88.7', 
    'Fungal infection': 'B49', 
    'GERD': 'K21.9', 
    'Gastroenteritis': 'A09', 
    'Heart attack': 'I21', 
    'Hepatitis B': 'B18.1', 
    'Hepatitis C': 'B18.2', 
    'Hepatitis D': 'B18.8', 
    'Hepatitis E': 'B18.8', 
    'Hypertension ': 'I10', 
    'Hyperthyroidism': 'E05.9', 
    'Hypoglycemia': 'E16.2', 
    'Hypothyroidism': 'E03.9', 
    'Impetigo': 'L01', 
    'Jaundice': 'R17', 
    'Malaria': 'B54', 
    'Migraine': 'G43.9', 
    'Osteoarthristis': 'M19.9', 
    'Paralysis (brain hemorrhage)': 'I61.9', 
    'Peptic ulcer diseae': 'K27.9', 
    'Pneumonia': 'J18.9', 
    'Psoriasis': 'L40.9', 
    'Tuberculosis': 'A16.9', 
    'Typhoid': 'A01.0', 
    'Urinary tract infection': 'N39.0', 
    'Varicose veins': 'I83.9', 
    'hepatitis A': 'B15.9'
}
print("‚úÖ Disease Mapping Dictionary Ready.")

In [1]:
import pandas as pd
import numpy as np
import warnings

print("--- üöÄ BUILDING THE FINAL APPLICATION LAYER ---")

# --- 1. HELPER: SAFE REMEDY FORMATTER ---
# Keeps original medical advice but makes it sound professional.
def format_remedy_list_SAFE(raw_precaution_list):
    if not isinstance(raw_precaution_list, list) or not raw_precaution_list:
        return ["Consult a medical professional."]

    formatted = []
    for phrase in raw_precaution_list:
        if isinstance(phrase, str) and phrase.strip():
            # Capitalize and add period
            clean = phrase.strip()[0].upper() + phrase.strip()[1:]
            if not clean.endswith('.'): clean += '.'
            formatted.append(clean)
    
    # Add standard disclaimer
    formatted.append("If symptoms persist or worsen, seek immediate medical attention.")
    return list(set(formatted))

# --- 2. THE SMART PREDICTION FUNCTION ---
def predict_disease_smart(user_symptoms, top_k=5):
    """
    Full Pipeline: Input -> Binary Vector -> Model Prob -> Risk Adjust -> Output
    """
    
    # A. INPUT PREPARATION
    # Create zero vector matching the model's training features (X)
    # We use a try/except block in case X is not globally defined, checking rf_model_base features
    try:
        feature_names = X.columns
    except NameError:
        # Fallback if X variable is lost but model exists
        feature_names = rf_model_base.feature_names_in_

    input_data = {col: 0 for col in feature_names}
    
    matched_symptoms = []
    for s in user_symptoms:
        # Clean input to match feature names
        clean_s = s.strip().replace(' ', '_').lower()
        
        # Exact match check
        if clean_s in input_data:
            input_data[clean_s] = 1
            matched_symptoms.append(clean_s)
        # Loose match check (e.g. "fever" matches "high_fever")
        else:
            for col in feature_names:
                if clean_s in col:
                    input_data[col] = 1
                    matched_symptoms.append(col)
                    
    # B. GET RAW PROBABILITIES
    # Suppress warnings about feature names
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        input_vector = pd.DataFrame([input_data]).values
        probabilities = rf_model_base.predict_proba(input_vector)[0]
    
    disease_labels = rf_model_base.classes_
    
    # C. APPLY WHO RISK ADJUSTMENT
    # Map Model Labels -> ICD Codes -> Risk Scores
    risk_lookup = pd.Series(disease_labels).map(disease_to_who_map)
    
    # Get Risk Scores from the WHO dataframe
    Risk_Score_Series = WHO_Risk_DF.set_index(WHO_CAUSE_COL)['Risk_Score']
    
    # Create the Risk Vector (filling unknowns with a tiny score)
    min_risk = Risk_Score_Series.min() if not Risk_Score_Series.empty else 0.00001
    risk_vector = risk_lookup.map(Risk_Score_Series).fillna(min_risk).values
    
    # THE CORE FORMULA
    adjusted_scores = probabilities * risk_vector
    
    # D. RANK AND FORMAT OUTPUT
    top_indices = np.argsort(adjusted_scores)[::-1][:top_k]
    
    results = []
    for i in top_indices:
        d_name = disease_labels[i]
        
        # Lookup formatting
        lookup_key = d_name.lower().replace(' ', '_').replace('(', '').replace(')', '').strip('_')
        
        # 1. Get Description
        try: 
            desc = sym_desc.loc[sym_desc['Disease'] == lookup_key, 'Description'].values[0]
        except: 
            desc = "Description unavailable."
            
        # 2. Get Precautions
        try:
            pre_row = sym_prec.loc[sym_prec["Disease"] == lookup_key]
            raw_prec = pre_row.iloc[0][1:].dropna().tolist()
            pre = format_remedy_list_SAFE(raw_prec)
        except:
            pre = ["Consult a doctor."]
            
        results.append({
            "Disease": d_name,
            "Smart Score": round(adjusted_scores[i], 5),
            "Raw Probability": f"{probabilities[i]*100:.1f}%",
            "Description": desc,
            "Remedies": pre
        })
        
    return pd.DataFrame(results)

print("‚úÖ System is LIVE. Ready for testing.")

--- üöÄ BUILDING THE FINAL APPLICATION LAYER ---
‚úÖ System is LIVE. Ready for testing.


In [2]:
# Test: Respiratory Symptoms (Could be Cold, could be Pneumonia)
test_symptoms = ['high_fever', 'chills', 'cough', 'breathlessness', 'fatigue']

print("\n--- üß™ FINAL DIAGNOSIS TEST ---")
df_result = predict_disease_smart(test_symptoms)

# Display nicely
from IPython.display import display
display(df_result)


--- üß™ FINAL DIAGNOSIS TEST ---


NameError: name 'rf_model_base' is not defined

In [None]:
import pandas as pd
import numpy as np
import warnings

print("--- üõ†Ô∏è FIXING THE SYMPTOM MATCHER ---")

# 1. INSPECT THE ACTUAL FEATURES
# Let's see what the model is actually expecting.
model_features = list(X.columns)
print(f"Model expects {len(model_features)} symptoms. Examples: {model_features[:5]}")

def predict_disease_ROBUST(user_symptoms, top_k=5):
    # 1. Input Preparation (The Robust Matcher)
    input_data = {col: 0 for col in model_features}
    matched_list = []
    
    print(f"\nüîç Scanning for symptoms: {user_symptoms}")
    
    for s in user_symptoms:
        # Clean user input
        clean_s = s.strip().lower().replace(' ', '_')
        
        found = False
        # Strategy 1: Exact Match
        if clean_s in input_data:
            input_data[clean_s] = 1
            matched_list.append(f"{s} (Exact)")
            found = True
            
        # Strategy 2: Loose Match (Substring)
        # If "fever" is typed, it catches "high_fever"
        if not found:
            for col in model_features:
                if clean_s in col or col in clean_s:
                    input_data[col] = 1
                    matched_list.append(f"{col} (Matched via '{s}')")
                    found = True
    
    print(f"‚úÖ MATCHED FEATURES: {matched_list}")
    
    if not matched_list:
        print("‚ùå ERROR: No symptoms matched! The model is guessing blindly.")
        return None

    # 2. Predict Probabilities
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        input_vector = pd.DataFrame([input_data]).values
        probs = rf_model_base.predict_proba(input_vector)[0]
    
    disease_labels = rf_model_base.classes_
    
    # 3. Apply Risk (The Boost Formula)
    risk_lookup = pd.Series(disease_labels).map(disease_to_who_map)
    Risk_Score_Series_ICD = WHO_Risk_DF.set_index(WHO_CAUSE_COL)['Risk_Score']
    min_risk = 0.0
    risk_vector = risk_lookup.map(Risk_Score_Series_ICD).fillna(min_risk).values
    
    # Boost Score = Prob * (1 + Risk)
    adjusted_scores = probs * (1 + risk_vector)
    
    # 4. Rank
    top_indices = np.argsort(adjusted_scores)[::-1][:top_k]
    
    results = []
    for i in top_indices:
        d_name = disease_labels[i]
        
        # Lookup Description
        lookup = d_name.lower().replace(' ', '_').replace('(', '').replace(')', '').strip('_')
        try: desc = sym_desc.loc[sym_desc['Disease'] == lookup, 'Description'].values[0]
        except: desc = "N/A"
        
        try: 
            pre_row = sym_prec.loc[sym_prec["Disease"] == lookup]
            pre = pre_row.iloc[0][1:].dropna().tolist()
        except: pre = ["Consult Doctor"]
            
        results.append({
            "Disease": d_name,
            "Final Score": round(adjusted_scores[i], 4),
            "Raw Probability": f"{probs[i]*100:.1f}%",
            "Description": desc,
            "Remedies": pre
        })
        
    return pd.DataFrame(results)

# --- TEST RUN ---
test_symptoms = ['high_fever', 'chills', 'cough', 'breathlessness', 'fatigue']
result = predict_disease_ROBUST(test_symptoms)

if result is not None:
    display(result)

In [None]:
import pandas as pd

print("--- üßπ FINAL COSMETIC FIX: DESCRIPTIONS ---")

# 1. RELOAD REFERENCE TABLES
# We load fresh to avoid any previous cleaning confusion
sym_desc = pd.read_csv("../data/raw/symptomdatas/symptom_Description.csv")
sym_prec = pd.read_csv("../data/raw/symptomdatas/symptom_precaution.csv")

# 2. CLEAN THE KEYS TO MATCH MODEL
# Model predicts: "Bronchial Asthma" -> We convert to "bronchial_asthma" for lookup
# So we must ensure the reference tables use "bronchial_asthma" too.
def clean_key(text):
    return str(text).lower().strip().replace(' ', '_').replace('(', '').replace(')', '')

sym_desc['Disease_Key'] = sym_desc['Disease'].apply(clean_key)
sym_prec['Disease_Key'] = sym_prec['Disease'].apply(clean_key)

# 3. UPDATE THE PREDICTION FUNCTION LOOKUP
# We update the function one last time to use this new 'Disease_Key' column
def predict_final_display(user_symptoms, top_k=5):
    # (Reuse existing prediction logic from predict_disease_ROBUST)
    result_df = predict_disease_ROBUST(user_symptoms, top_k)
    
    # Post-process for descriptions
    if result_df is not None and not result_df.empty:
        clean_descs = []
        clean_precs = []
        
        for disease in result_df['Disease']:
            lookup = clean_key(disease)
            
            # Description Lookup
            try: 
                d = sym_desc.loc[sym_desc['Disease_Key'] == lookup, 'Description'].values[0]
            except: d = "Description not found."
            clean_descs.append(d)
            
            # Precaution Lookup
            try:
                p_row = sym_prec.loc[sym_prec['Disease_Key'] == lookup].iloc[0, 1:].dropna().tolist()
                # Basic formatting
                p_fmt = [x.strip().capitalize() for x in p_row]
                p_str = ", ".join(p_fmt)
            except: p_str = "Consult doctor."
            clean_precs.append(p_str)
            
        result_df['Description'] = clean_descs
        result_df['Remedies'] = clean_precs
        
    return result_df

# --- RUN FINAL TEST ---
test_symptoms = ['high_fever', 'chills', 'cough', 'breathlessness', 'fatigue']
print("\n--- ‚ú® FINAL PROJECT OUTPUT ---")
final_table = predict_final_display(test_symptoms)
display(final_table)

In [24]:
conversation_state = {
    "symptoms":[],
    "history":[]
}

In [25]:
SYMPTOM_SYNONYMS = {
    "fever": ["hot body", "high temperature", "temperature", "hotness", "heat"],
    "chills": ["cold", "shivering", "freezing"],
    "fatigue": ["weak", "tired", "exhausted"],
    "nausea": ["vomiting", "feel like vomiting", "urge to vomit", "queasy"],
    "headache": ["head pain", "migraine", "pounding head"]
}


In [26]:
def extract_symptoms(user_text):
    detected = []
    text = user_text.lower()

    for symp, syn_list in SYMPTOM_SYNONYMS.items():
        if symp in text:
            detected.append(symp)
        else:
            for s in syn_list:
                if s in text:
                    detected.append(symp)
                    break
    return detected


In [27]:
def update_conversation(user_text):
    new_symptoms = extract_symptoms(user_text)
    
    for s in new_symptoms:
        if s not in conversation_state["symptoms"]:
            conversation_state["symptoms"].append(s)
    
    conversation_state["history"].append(user_text)
    
    return new_symptoms


In [30]:
import pickle

# Save only the model
with open("smarthealth_model.pkl", "wb") as f:
    pickle.dump(rf_model_base, f)


In [31]:
# After you have your X from training
import pickle

# Suppose X is your training dataframe with 131 features
model_features = list(X.columns)

# Save to a pickle file
with open("smarthealth_features.pkl", "wb") as f:
    pickle.dump(model_features, f)

print("‚úÖ smarthealth_features.pkl saved successfully!")


‚úÖ smarthealth_features.pkl saved successfully!


In [32]:
pd.DataFrame(X.values, columns=X_binary.columns).to_csv("smarthealth_X_train.csv", index=False)


In [42]:
import pandas as pd
import numpy as np
import warnings
import re
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report

print("--- üöÄ INITIALIZING FINAL MEDICAL AGENT SYSTEM ---")

# ==========================================
# STEP 1: THE SYNONYM MATCHER (Natural Language Engine)
# ==========================================
symptom_synonyms = {
    # General Pain
    'ache': 'pain', 'hurts': 'pain', 'sore': 'pain', 'painful': 'pain',
    
    # Stomach/Abdominal
    'stomach_ache': 'stomach_pain', 'belly_ache': 'stomach_pain', 'tummy_pain': 'stomach_pain',
    'bloated': 'stomach_pain', 'abdominal': 'abdominal_pain',
    
    # Respiratory
    'hard_to_breathe': 'breathlessness', 'short_of_breath': 'breathlessness', 'panting': 'breathlessness',
    'runny_nose': 'runny_nose', 'sneezing': 'continuous_sneezing', 'phlegm': 'phlegm',
    
    # Temperature/Infection
    'shivering': 'chills', 'cold': 'chills', 'freezing': 'chills',
    'hot': 'high_fever', 'burning': 'high_fever', 'temp': 'high_fever', 'fever': 'high_fever',
    'sweat': 'sweating',
    
    # Digestion
    'puke': 'vomiting', 'throw_up': 'vomiting', 'nauseous': 'nausea',
    'poop': 'diarrhoea', 'loose_motion': 'diarrhoea',
    
    # Neurological/General
    'dizzy': 'dizziness', 'spinning': 'dizziness', 'lightheaded': 'dizziness',
    'weak': 'fatigue', 'tired': 'fatigue', 'exhausted': 'fatigue',
    'confused': 'altered_sensorium',
    
    # Skin
    'rash': 'skin_rash', 'itch': 'itching', 'scratch': 'itching', 'spots': 'nodal_skin_eruptions'
}

def extract_features(user_text, model_columns):
    """Step 1 Logic: Convert User Text -> Model Features"""
    user_text = user_text.lower().strip()
    found_features = set()
    
    # A. Synonym Mapping
    for phrase, mapped_col in symptom_synonyms.items():
        if phrase in user_text:
            found_features.add(mapped_col)
            
    # B. Direct Matching (Fuzzy)
    # Check if any actual column name appears in the text
    for col in model_columns:
        clean_col = col.replace('_', ' ')
        if clean_col in user_text or col in user_text:
            found_features.add(col)
            
    return list(found_features)

print("‚úÖ Step 1: Synonym Engine Ready.")


# ==========================================
# STEP 2: CALIBRATED CLASSIFIER (Probability Correction)
# ==========================================
# Random Forest is confident but often wrong about probability (e.g. says 0.9 when it means 0.6).
# Calibration fixes this so 80% confidence actually means 80% chance.

print("\n--- STEP 2 & 3: TRAINING CALIBRATED MODEL ---")

# 1. Split Data
X_train, X_test, Y_train, Y_test = train_test_split(
    X.values, Y.values, test_size=0.2, random_state=42, stratify=Y.values
)

# 2. Base Model
base_rf = RandomForestClassifier(n_estimators=100, random_state=42)

# 3. Calibrated Wrapper (Sigmoid Calibration)
calibrated_model = CalibratedClassifierCV(base_rf, method='sigmoid', cv=5)
calibrated_model.fit(X_train, Y_train)

print("‚úÖ Step 2: Model Calibrated & Trained.")


# ==========================================
# STEP 3: CROSS-VALIDATION (Robustness Check)
# ==========================================
# Verify the model is stable across different data splits

cv_scores = cross_val_score(calibrated_model, X.values, Y.values, cv=5, scoring='accuracy')
print(f"‚úÖ Step 3: Cross-Validation Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")


# ==========================================
# STEP 4 & 5: THE AGENT CLASS (Low Confidence Rejection + Response)
# ==========================================

class Dr_AI_Advanced:
    def __init__(self, model, feature_cols, who_risk_df, map_dict, desc_df, prec_df):
        self.model = model
        self.cols = feature_cols
        self.who_risk = who_risk_df.set_index('Cause')['Risk_Score']
        self.mapping = map_dict
        self.desc = desc_df
        self.prec = prec_df
        
        # Calculate Disease Profiles (for Relevance Scoring)
        # (Assuming sym_dataset is available globally, or passed in. 
        # Ideally passed in, but using global for brevity here)
        self.disease_profiles = {}
        for d in Y.unique():
             # Get symptoms that happen > 0 times for this disease
             d_rows = X[Y == d]
             symptoms = d_rows.columns[d_rows.sum() > 0].tolist()
             self.disease_profiles[d] = symptoms

    def consult(self, user_input):
        # A. EXTRACT
        symptoms = extract_features(user_input, self.cols)
        
        if not symptoms:
            return "I couldn't detect any specific symptoms. Please describe your physical condition (e.g., 'I have a headache and fever')."
            
        # Prepare Input Vector
        input_data = np.zeros((1, len(self.cols)))
        for s in symptoms:
            if s in self.cols:
                idx = list(self.cols).index(s)
                input_data[0, idx] = 1
        
        # B. PREDICT
        probs = self.model.predict_proba(input_data)[0]
        classes = self.model.classes_
        
        # C. CALCULATE SCORES (Risk + Relevance)
        final_scores = []
        
        for i, disease in enumerate(classes):
            prob = probs[i]
            
            # 1. Risk Boost
            try:
                icd = self.mapping.get(disease)
                risk_val = self.who_risk.get(icd, 0.0)
            except: risk_val = 0.0
            
            # 2. Relevance (Symptom Overlap)
            # How many of the User's symptoms fit this Disease's profile?
            profile = self.disease_profiles.get(disease, [])
            matches = sum(1 for s in symptoms if s in profile)
            relevance = matches / len(symptoms) if len(symptoms) > 0 else 0
            
            # MASTER FORMULA:
            # Score = Probability * (1 + Risk) * (1 + Relevance^2)
            # Relevance is squared to heavily punish diseases that don't match the symptoms.
            score = prob * (1 + risk_val) * (1 + (relevance**2))
            
            final_scores.append(score)
            
        # D. SELECT WINNER
        best_idx = np.argmax(final_scores)
        winner = classes[best_idx]
        confidence = probs[best_idx]
        
        # STEP 4: LOW CONFIDENCE REJECTION
        # If the model is less than 20% sure, do not guess.
        if confidence < 0.20:
            return (f"‚ö†Ô∏è **Uncertain Diagnosis**\n"
                    f"My analysis is inconclusive (Confidence: {confidence*100:.1f}%). "
                    f"Your symptoms ({', '.join(symptoms)}) do not strongly match any specific profile in my database. "
                    f"Please provide more details or consult a doctor.")

        # STEP 5: POLISHED RESPONSE TEMPLATE
        lookup = winner.lower().replace(' ', '_').replace('(', '').replace(')', '').strip('_')
        try: d_text = self.desc.loc[self.desc['Disease'] == lookup, 'Description'].values[0]
        except: d_text = "Details unavailable."
        
        try: 
            p_row = self.prec.loc[self.prec['Disease'] == lookup].iloc[0, 1:].dropna().tolist()
            remedies = ", ".join([x.strip().capitalize() for x in p_row])
        except: remedies = "Consult doctor."

        return (f"ü©∫ **DIAGNOSIS REPORT**\n"
                f"--------------------------------\n"
                f"**Detected:** {winner}\n"
                f"**Certainty:** {confidence*100:.1f}% (Risk-Adjusted)\n\n"
                f"**Why this result?**\n"
                f"You reported *{', '.join(symptoms)}*. This pattern strongly matches {winner}, "
                f"and I have prioritized it based on clinical risk factors.\n\n"
                f"**Overview:**\n{d_text}\n\n"
                f"**Recommended Actions:**\n{remedies}\n"
                f"--------------------------------\n"
                f"(‚ö†Ô∏è AI Recommendation Only. Not a substitute for professional medical advice.)")

# --- INSTANTIATE ---
agent = Dr_AI_Advanced(calibrated_model, X.columns, WHO_Risk_DF, disease_to_who_map, sym_desc, sym_prec)
print("‚úÖ Final Agent Online.")
```

### üß™ How to Test Your Final System

Use these lines to verify the steps:

```python
# 1. Test the "Rejection" (Step 4)
print(agent.consult("I have a weird feeling in my toe")) 
# Expect: "Uncertain Diagnosis" because 'toe' isn't in your symptoms.

# 2. Test the "Smart Logic" (Risk + Relevance)
print(bot.consult("I have a high fever and I am shivering"))
# Expect: Malaria (because shivering->chills matches Malaria's profile perfectly).

SyntaxError: invalid syntax (4213025606.py, line 203)

In [39]:
import pandas as pd
import numpy as np
import random
import re  # Regex for text cleaning
import warnings

class DrAI_Agent:
    def __init__(self, model, feature_columns, who_risk_df, mapping, desc_df, prec_df):
        self.model = model
        self.features = feature_columns
        self.who_risk = who_risk_df.set_index('Cause')['Risk_Score']
        self.mapping = mapping
        self.desc_df = desc_df
        self.prec_df = prec_df
        
        # üß† SYNONYM DICTIONARY: Translates human speak to model speak
        self.synonyms = {
            'ache': 'pain', 'aching': 'pain', 'hurt': 'pain',
            'dizzleness': 'dizziness', 'dizzy': 'dizziness', 'spinning': 'dizziness',
            'puke': 'vomiting', 'throw up': 'vomiting', 'vomit': 'vomiting',
            'hot': 'fever', 'temperature': 'fever', 'burning': 'fever',
            'shiver': 'shivering', 'cold': 'chills',
            'tired': 'fatigue', 'exhausted': 'fatigue', 'weak': 'fatigue',
            'breathing': 'breathlessness', 'breath': 'breathlessness',
            'rash': 'skin_rash', 'spots': 'nodal_skin_eruptions',
            'belly': 'stomach', 'tummy': 'stomach'
        }
        
        self.greetings = ["Hello, I've analyzed your narrative.", "I have processed your symptoms."]
        self.concerns = ["However, I am prioritizing a critical risk.", "I am flagging a serious possibility."]

    def _extract_symptoms_from_text(self, text):
        """
        The NLP Brain: Converts a sentence into a list of dataset features.
        """
        # 1. Clean Text: Lowercase, remove punctuation
        text = text.lower()
        text = re.sub(r'[^\w\s]', ' ', text) # Remove punctuation
        
        # 2. Tokenize and Apply Synonyms
        words = text.split()
        cleaned_words = [self.synonyms.get(w, w) for w in words]
        processed_text = " " + " ".join(cleaned_words) + " " # Add padding for matching
        
        # 3. Match against Model Features (X.columns)
        matched_features = []
        
        for feature in self.features:
            # Create variations of the feature name to search for
            # e.g., 'stomach_pain' -> matches "stomach pain", "stomach_pain"
            feature_clean = feature.replace('_', ' ')
            
            # Split feature into core keywords (e.g., "stomach", "pain")
            keywords = feature.split('_')
            
            # LOGIC: If ALL keywords of a feature exist in the text, it's a match
            # Example: "stomach aches" -> "stomach pain" -> contains "stomach" AND "pain" -> Match!
            if all(k in processed_text for k in keywords):
                matched_features.append(feature)
            elif feature in processed_text: # Direct match
                matched_features.append(feature)
                
        return list(set(matched_features)) # Remove duplicates

    def _generate_reasoning(self, top_disease, raw_prob, is_risk_intervention):
        intro = random.choice(self.greetings)
        if is_risk_intervention:
            return (f"{intro}\n\n"
                    f"Based purely on the symptoms you described, this statistically looks like a common condition. "
                    f"{random.choice(self.concerns)} "
                    f"I have elevated **{top_disease}** to the top of your diagnosis list.\n\n"
                    f"üß† **My Reasoning:** While the raw statistical match is {raw_prob}, "
                    f"the mortality risk associated with {top_disease} is too high to ignore. "
                    f"Safety first.")
        else:
            return (f"{intro}\n\n"
                    f"I am diagnosing **{top_disease}** with a raw confidence of {raw_prob}. "
                    f"Your description aligns strongly with the clinical patterns for this condition.")

    def consult(self, user_input):
        # 1. NLP Extraction
        matched_symptoms = self._extract_symptoms_from_text(user_input)
        
        if not matched_symptoms:
            print("--- ü©∫ Dr. AI ---")
            print("I read your message, but I couldn't identify specific symptoms I recognize.")
            print("Could you list them more simply? (e.g., 'I have a fever and headache')")
            return

        # 2. Prepare Vector
        input_data = {col: 0 for col in self.features}
        for s in matched_symptoms:
            input_data[s] = 1
        
        # 3. Predict (Brain)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            input_vector = pd.DataFrame([input_data]).values
            probs = self.model.predict_proba(input_vector)[0]
        
        labels = self.model.classes_
        
        # 4. Risk Adjustment (Wisdom)
        risk_vector = pd.Series(labels).map(self.mapping).map(self.who_risk).fillna(0.00001).values
        adjusted_scores = probs * (1 + risk_vector) # Boost Formula
        
        # 5. Decision
        raw_winner_idx = np.argmax(probs)
        smart_winner_idx = np.argmax(adjusted_scores)
        
        raw_winner = labels[raw_winner_idx]
        smart_winner = labels[smart_winner_idx]
        
        is_risk_intervention = (raw_winner != smart_winner)
        
        # 6. Report
        print("--- ü©∫ Dr. AI Medical Report ---")
        print(f"üìù **I extracted these symptoms from your story:**\n   {', '.join(matched_symptoms)}\n")
        
        print(self._generate_reasoning(smart_winner, f"{probs[smart_winner_idx]*100:.1f}%", is_risk_intervention))
        
        # Details
        lookup = smart_winner.lower().replace(' ', '_').replace('(', '').replace(')', '').strip('_')
        try:
            desc = self.desc_df.loc[self.desc_df['Disease'] == lookup, 'Description'].values[0]
            print(f"\nüìò **What is it?**\n{desc}")
        except: pass
        
        try:
            p_row = self.prec_df.loc[self.prec_df['Disease'] == lookup].iloc[0, 1:].dropna().tolist()
            # Quick formatting
            pre = [x.strip().capitalize() for x in p_row]
            print(f"\nüõ°Ô∏è **Immediate Recommendations:**")
            for p in pre: print(f"- {p}.")
        except: pass
        
        print("\n(‚ö†Ô∏è Disclaimer: I am an AI. Please visit a real doctor for confirmation.)")

# --- INSTANTIATE THE AGENT ---
agent = DrAI_Agent(rf_model_base, X.columns, WHO_Risk_DF, disease_to_who_map, sym_desc, sym_prec)
print("‚úÖ Agent Upgrade Complete: Ready for Natural Language.")

‚úÖ Agent Upgrade Complete: Ready for Natural Language.


In [41]:
import numpy as np
from sentence_transformers import SentenceTransformer, util

model_emb = SentenceTransformer('all-MiniLM-L6-v2')

# Your symptom list
symptoms_db = list(X.columns)

symptom_embeddings = model_emb.encode(symptoms_db, convert_to_tensor=True)

def expand_and_match_symptoms(user_text):
    user_embedding = model_emb.encode(user_text, convert_to_tensor=True)

    scores = util.cos_sim(user_embedding, symptom_embeddings)[0]
    top_indices = scores.argsort(descending=True)

    matched = []
    for idx in top_indices[:10]:   # top matches
        if float(scores[idx]) > 0.45:  # threshold
            matched.append(symptoms_db[idx])

    return list(set(matched))


ModuleNotFoundError: No module named 'sentence_transformers'

In [40]:
# --- TEST WITH NATURAL LANGUAGE ---

# Your complex user story
user_story = "i have been having a bit fatigued and dizzleness every early hours of the morning and then my stomach aches, what do you think is is wrong with me and what are the preacautions i should take"

# Run the consultation
agent.consult(user_story)

--- ü©∫ Dr. AI Medical Report ---
üìù **I extracted these symptoms from your story:**
   dizziness, fatigue

I have processed your symptoms.

I am diagnosing **Cervical spondylosis** with a raw confidence of 26.0%. Your description aligns strongly with the clinical patterns for this condition.

üìò **What is it?**
Cervical spondylosis is a general term for age-related wear and tear affecting the spinal disks in your neck. As the disks dehydrate and shrink, signs of osteoarthritis develop, including bony projections along the edges of bones (bone spurs).

üõ°Ô∏è **Immediate Recommendations:**
- Use heating pad or cold pack.
- Exercise.
- Take otc pain reliver.
- Consult doctor.

(‚ö†Ô∏è Disclaimer: I am an AI. Please visit a real doctor for confirmation.)


In [205]:
import pandas as pd
import numpy as np
import random
import re
import warnings

class DrAI_Context_Agent:
    def __init__(self, model, feature_columns, who_risk_df, mapping, desc_df, prec_df):
        self.model = model
        self.features = feature_columns
        self.who_risk = who_risk_df.set_index('Cause')['Risk_Score']
        self.mapping = mapping
        self.desc_df = desc_df
        self.prec_df = prec_df
        
        # 1. SYMPTOM TRANSLATOR (Human -> Dataset)
        self.synonyms = {
            'ache': 'pain', 'aching': 'pain', 'hurt': 'pain',
            'dizzleness': 'dizziness', 'dizzy': 'dizziness', 'spinning': 'dizziness',
            'puke': 'vomiting', 'throw up': 'vomiting', 'vomit': 'vomiting',
            'hot': 'fever', 'temperature': 'fever', 'burning': 'fever',
            'shiver': 'shivering', 'cold': 'chills',
            'tired': 'fatigue', 'exhausted': 'fatigue', 'weak': 'fatigue',
            'breathing': 'breathlessness', 'breath': 'breathlessness',
            'rash': 'skin_rash', 'spots': 'nodal_skin_eruptions',
            'belly': 'stomach', 'tummy': 'stomach', 'bloated': 'stomach_pain' 
        }
        
        # 2. CONTEXT PATTERNS (The "Detail" Catcher)
        self.context_patterns = {
            'Time': ['morning', 'night', 'evening', 'afternoon', 'waking up', 'bedtime', 'early'],
            'Severity': ['very', 'severe', 'extreme', 'bad', 'mild', 'bit', 'slight', 'unbearable'],
            'Frequency': ['often', 'rarely', 'always', 'sometimes', 'once in a while', 'constant', 'every'],
            'Duration': ['days', 'weeks', 'months', 'hours', 'long time']
        }

    def _extract_context(self, text):
        """Scans the text for non-symptom details (Time, Severity, etc.)"""
        found_context = {}
        text = text.lower()
        
        for category, keywords in self.context_patterns.items():
            matches = [k for k in keywords if k in text]
            if matches:
                found_context[category] = matches[0] # Take the first match
        
        return found_context

    def _extract_symptoms(self, text):
        """Extracts symptoms for the ML model"""
        text = text.lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        words = text.split()
        cleaned_words = [self.synonyms.get(w, w) for w in words]
        processed_text = " " + " ".join(cleaned_words) + " "
        
        matched_features = []
        for feature in self.features:
            feature_clean = feature.replace('_', ' ')
            keywords = feature.split('_')
            if all(k in processed_text for k in keywords):
                matched_features.append(feature)
            elif feature in processed_text:
                matched_features.append(feature)
        return list(set(matched_features))

    def _generate_detailed_reasoning(self, top_disease, raw_prob, context, symptoms, is_risk):
        """Generates a specific, human-like response using captured details."""
        
        # 1. Acknowledge Context (The "I heard you" part)
        context_ack = ""
        if 'Time' in context:
            context_ack += f"noting that symptoms worsen in the **{context['Time']}**, "
        if 'Severity' in context:
            context_ack += f"and taking into account that the discomfort is **{context['Severity']}**, "
            
        # 2. Formulate the Logic
        intro = f"I have analyzed your description, {context_ack}specifically focusing on the **{', '.join(symptoms)}**."
        
        if is_risk:
            logic = (f"\n\nWhile statistically these symptoms might suggest a common issue (Raw Match: {raw_prob}), "
                     f"the combination of symptoms you described‚Äîespecially in the context of your **{context.get('Time', 'daily routine')}**‚Äî"
                     f"warrants caution.\n\n"
                     f"üö® **Decision:** I am flagging **{top_disease}** as the priority. "
                     f"Even if the probability seems moderate, the mortality risk profile requires us to rule this out first.")
        else:
            logic = (f"\n\nBased on the clinical patterns, your symptoms strongly align with **{top_disease}** (Confidence: {raw_prob}). "
                     f"The timing ({context.get('Time', 'general')}) and severity you described are consistent with this diagnosis.")

        return intro + logic

    def consult(self, user_input):
        # 1. Extract Information
        matched_symptoms = self._extract_symptoms(user_input)
        context_details = self._extract_context(user_input)
        
        if not matched_symptoms:
            return "I couldn't identify specific symptoms. Could you describe exactly what hurts?"

        # 2. ML Prediction
        input_data = {col: 0 for col in self.features}
        for s in matched_symptoms: input_data[s] = 1
        
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            input_vector = pd.DataFrame([input_data]).values
            probs = self.model.predict_proba(input_vector)[0]
            
        labels = self.model.classes_
        
        # 3. Risk Adjustment
        risk_vector = pd.Series(labels).map(self.mapping).map(self.who_risk).fillna(0.0).values
        adjusted_scores = probs * (1 + risk_vector)
        
        # 4. Decision
        raw_winner = labels[np.argmax(probs)]
        smart_winner = labels[np.argmax(adjusted_scores)]
        
        # 5. Generate Report
        print("--- ü©∫ Dr. AI Contextual Analysis ---")
        
        # Reasoning Engine
        reasoning = self._generate_detailed_reasoning(
            smart_winner, 
            f"{probs[np.argmax(adjusted_scores)]*100:.1f}%", 
            context_details,
            matched_symptoms,
            (raw_winner != smart_winner)
        )
        print(reasoning)
        
        # Details
        lookup = smart_winner.lower().replace(' ', '_').replace('(', '').replace(')', '').strip('_')
        try:
            desc = self.desc_df.loc[self.desc_df['Disease'] == lookup, 'Description'].values[0]
            print(f"\nüìò **Condition Overview:** {desc}")
        except: pass
        
        # Customized Advice based on Context
        print(f"\nüõ°Ô∏è **Tailored Recommendations:**")
        try:
            p_row = self.prec_df.loc[self.prec_df['Disease'] == lookup].iloc[0, 1:].dropna().tolist()
            for p in p_row:
                # If they said "morning", add specific timing advice
                if 'morning' in context_details.get('Time', '') and 'medication' in p:
                     print(f"- {p.capitalize()} (Best taken after your morning meal).")
                # If they said "severe", add urgency
                elif 'severe' in context_details.get('Severity', '') and 'consult' in p:
                     print(f"- **URGENT:** {p.capitalize()} immediately due to reported severity.")
                else:
                    print(f"- {p.capitalize()}.")
        except: pass
        
        print("\n(‚ö†Ô∏è Disclaimer: AI assistance only. Please visit a clinic.)")

# --- LOAD AGENT ---
agent_smart = DrAI_Context_Agent(rf_model_base, X.columns, WHO_Risk_DF, disease_to_who_map, sym_desc, sym_prec)
print("‚úÖ Context-Aware Agent Online.")

‚úÖ Context-Aware Agent Online.


In [207]:
user_story = "i am having headache and I'm nausiating too and i'm hungry but no food appeases me what can be the problem"

agent_smart.consult(user_story)

--- ü©∫ Dr. AI Contextual Analysis ---
I have analyzed your description, specifically focusing on the **headache**.

Based on the clinical patterns, your symptoms strongly align with **Paralysis (brain hemorrhage)** (Confidence: 31.0%). The timing (general) and severity you described are consistent with this diagnosis.

üõ°Ô∏è **Tailored Recommendations:**

(‚ö†Ô∏è Disclaimer: AI assistance only. Please visit a clinic.)


In [177]:
import numpy as np

# 1. Sum all 'Deaths' columns to get Total Deaths per record
# Create a list of all columns starting with 'Deaths'
death_cols = [col for col in who_mortality_sample.columns if col.startswith('Deaths') and len(col) > 6]

# Replace NaN (missing values) with 0 before summing
who_mortality_sample[death_cols] = who_mortality_sample[death_cols].fillna(0)

# Calculate the total deaths per row
who_mortality_sample['Total_Deaths'] = who_mortality_sample[death_cols].sum(axis=1)

# 2. Aggregate Total Deaths by Cause (ICD-10 Code)
WHO_CAUSE_COL = 'Cause'
who_agg = who_mortality_sample.groupby(WHO_CAUSE_COL)['Total_Deaths'].sum().reset_index()
who_agg = who_agg.rename(columns={'Total_Deaths': 'Death_Count'})

# 3. Use Death Count as the Risk Factor
# Normalize the Death Count (our W_d) to a scale of 0 to 1
who_agg['Risk_Score'] = who_agg['Death_Count'] / who_agg['Death_Count'].max()

# 4. Store the final WHO Risk Data
WHO_Risk_DF = who_agg[[WHO_CAUSE_COL, 'Risk_Score']]

print("WHO Risk Factor (Normalized Death Count) calculated.")
print(WHO_Risk_DF.sort_values('Risk_Score', ascending=False).head())

WHO Risk Factor (Normalized Death Count) calculated.
     Cause  Risk_Score
94     AAA    1.000000
1034   I64    0.125355
275   C349    0.123077
1127   J18    0.063283
1033  I639    0.051921


In [144]:
import pandas as pd

# --- ASSUMPTION ---
# The 'disease_labels' variable contains the array of unique disease names
# that your Random Forest model (rf_model_base) was trained on.

print("--- DEFINITIVE LIST OF ALL DISEASES FOR MANUAL MAPPING ---")
print(f"Total Unique Diseases Found: {len(disease_labels)}")

# Convert the NumPy array of labels into a list and print them in a clear, readable format
all_disease_names = sorted(list(disease_labels))

# Print the list, one item per line, for easy copying into your mapping dictionary
for name in all_disease_names:
    print(f"'{name}': '',")

--- DEFINITIVE LIST OF ALL DISEASES FOR MANUAL MAPPING ---
Total Unique Diseases Found: 41
'(vertigo) Paroymsal  Positional Vertigo': '',
'AIDS': '',
'Acne': '',
'Alcoholic hepatitis': '',
'Allergy': '',
'Arthritis': '',
'Bronchial Asthma': '',
'Cervical spondylosis': '',
'Chicken pox': '',
'Chronic cholestasis': '',
'Common Cold': '',
'Dengue': '',
'Diabetes ': '',
'Dimorphic hemmorhoids(piles)': '',
'Drug Reaction': '',
'Fungal infection': '',
'GERD': '',
'Gastroenteritis': '',
'Heart attack': '',
'Hepatitis B': '',
'Hepatitis C': '',
'Hepatitis D': '',
'Hepatitis E': '',
'Hypertension ': '',
'Hyperthyroidism': '',
'Hypoglycemia': '',
'Hypothyroidism': '',
'Impetigo': '',
'Jaundice': '',
'Malaria': '',
'Migraine': '',
'Osteoarthristis': '',
'Paralysis (brain hemorrhage)': '',
'Peptic ulcer diseae': '',
'Pneumonia': '',
'Psoriasis': '',
'Tuberculosis': '',
'Typhoid': '',
'Urinary tract infection': '',
'Varicose veins': '',
'hepatitis A': '',


In [145]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

# --- STEP 1: DEFINE THE FINAL MANUAL MAPPING DICTIONARY ---

# This dictionary maps the EXACT Disease Name (from your model) to the ICD-10 Code (from WHO data).
disease_to_who_map = {
    # Keys must match the model's output (disease_labels) exactly
    '(vertigo) Paroymsal  Positional Vertigo': 'H81',
    'AIDS': 'B24',
    'Acne': 'L70',
    'Alcoholic hepatitis': 'K70.1',
    'Allergy': 'J30',
    'Arthritis': 'M13.9',
    'Bronchial Asthma': 'J45',
    'Cervical spondylosis': 'M47.9',
    'Chicken pox': 'B01',
    'Chronic cholestasis': 'K76.9',
    'Common Cold': 'J00',
    'Dengue': 'A90',
    'Diabetes ': 'E14',
    'Dimorphic hemmorhoids(piles)': 'I84',
    'Drug Reaction': 'T88.7',
    'Fungal infection': 'B49',
    'GERD': 'K21.9',
    'Gastroenteritis': 'A09',
    'Heart attack': 'I21',
    'Hepatitis B': 'B18.1',
    'Hepatitis C': 'B18.2',
    'Hepatitis D': 'B18.8',
    'Hepatitis E': 'B18.8',
    'Hypertension ': 'I10',
    'Hyperthyroidism': 'E05.9',
    'Hypoglycemia': 'E16.2',
    'Hypothyroidism': 'E03.9',
    'Impetigo': 'L01',
    'Jaundice': 'R17',
    'Malaria': 'B54',
    'Migraine': 'G43.9',
    'Osteoarthristis': 'M19.9',
    'Paralysis (brain hemorrhage)': 'I61.9',
    'Peptic ulcer diseae': 'K27.9',
    'Pneumonia': 'J18.9',
    'Psoriasis': 'L40.9',
    'Tuberculosis': 'A16.9',
    'Typhoid': 'A01.0',
    'Urinary tract infection': 'N39.0',
    'Varicose veins': 'I83.9',
    'hepatitis A': 'B15.9'
}

# Define the WHO cause column (used as the index for risk scores)
WHO_CAUSE_COL = 'Cause'

# --- STEP 2: CREATE THE ALIGNED RISK VECTOR (W_d) ---

# Re-index the Risk_Score_Series to be indexed by ICD Code
Risk_Score_Series = WHO_Risk_DF.set_index(WHO_CAUSE_COL)['Risk_Score']

# 1. Map Model Output (Disease Name) -> ICD Code (using the map above)
risk_lookup = pd.Series(disease_labels).map(disease_to_who_map)

# 2. Map ICD Code -> Final Risk Score (W_d)
# Fill missing values (for diseases not in WHO sample) with the lowest risk score
min_risk = Risk_Score_Series.min() if not Risk_Score_Series.empty else 0.00001
risk_vector_aligned = risk_lookup.map(Risk_Score_Series).fillna(min_risk).values
risk_vector = risk_vector_aligned.reshape(1, -1)

# --- STEP 3: CALCULATE ADJUSTED SCORES AND COMPARISON ---

# 1. Calculate Adjusted Scores = Probability * Risk Weight (W_d)
adjusted_scores = Y_pred_proba * risk_vector

# 2. Determine the prediction based on the highest ADJUSTED Score
adjusted_pred_indices = np.argmax(adjusted_scores, axis=1)
Y_adjusted_pred = [disease_labels[i] for i in adjusted_pred_indices]

# 3. Get the raw model prediction and create comparison table
raw_pred = rf_model_base.predict(X_test)
comparison_df = pd.DataFrame({
    'Actual_Disease': Y_test,
    'Raw_Model_Prediction': raw_pred,
    'Risk_Adjusted_Prediction': Y_adjusted_pred
})

comparison_df['Prediction_Changed'] = (comparison_df['Raw_Model_Prediction'] != comparison_df['Risk_Adjusted_Prediction'])

print("--- FINAL RISK-ADJUSTED PREDICTION RESULTS üëë ---")
print(f"Total predictions checked: {len(Y_test)}")
print(f"Total predictions CHANGED by WHO risk weighting: **{comparison_df['Prediction_Changed'].sum()}**")
print("-" * 50)
print("\nFirst 10 Cases Where Risk Integration Changed the Diagnosis:")
print(comparison_df[comparison_df['Prediction_Changed']].head(10))

--- FINAL RISK-ADJUSTED PREDICTION RESULTS üëë ---
Total predictions checked: 984
Total predictions CHANGED by WHO risk weighting: **768**
--------------------------------------------------

First 10 Cases Where Risk Integration Changed the Diagnosis:
                  Actual_Disease          Raw_Model_Prediction  \
1                  Drug Reaction                 Drug Reaction   
2   Dimorphic hemmorhoids(piles)  Dimorphic hemmorhoids(piles)   
3                Hyperthyroidism               Hyperthyroidism   
4                Osteoarthristis               Osteoarthristis   
7                    Hepatitis B                   Hepatitis B   
8                      Pneumonia                     Pneumonia   
10                 Drug Reaction                 Drug Reaction   
11  Dimorphic hemmorhoids(piles)  Dimorphic hemmorhoids(piles)   
12                  Tuberculosis                  Tuberculosis   
13                  Hypoglycemia                  Hypoglycemia   

                   R

In [146]:
import pandas as pd
import numpy as np

# Assuming Y is your final Series of disease labels (the Answer)
# If Y is not defined, use: Y = sym_dataset['Disease']
Y_unique_diseases = Y.unique()

print(f"Total Unique Diseases in Model: {len(Y_unique_diseases)}")

# --- 2. Check the Existing Mapping Coverage ---

# NOTE: The provided list is WHO_ICD_to_Description. We need the DISEASE_NAME_to_ICD.
# Assuming your existing manual mapping dictionary is named 'disease_to_who_map'
# If it's not defined, the next step will fail.
# If you don't have the dictionary defined yet, you must define it first!

# 3. Find Unmapped Diseases

# Get the set of all unique disease names from your model (Y)
all_diseases_set = set(Y_unique_diseases)

# Get the set of diseases already mapped (from the keys of your dictionary)
# If your dictionary is backwards (ICD -> Description), you must check if the descriptions match your disease names.

# Assuming your final mapping dictionary is defined and contains the mappings you've done so far.
try:
    mapped_diseases_set = set(disease_to_who_map.keys())
except NameError:
    # If the dictionary is not defined, we can't check for unmapped ones.
    print("\nüö® ERROR: 'disease_to_who_map' is not defined. Please define it first.")
    # Exit and ask the user to paste the dictionary definition.
    raise

unmapped_diseases = all_diseases_set - mapped_diseases_set

print("\n--- üéØ REMAINING UNMAPPED DISEASES (Needs an ICD Code Match) ---")
print(f"Total Unmapped Diseases: {len(unmapped_diseases)}")
print(sorted(list(unmapped_diseases)))

Total Unique Diseases in Model: 41

--- üéØ REMAINING UNMAPPED DISEASES (Needs an ICD Code Match) ---
Total Unmapped Diseases: 0
[]


In [147]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

# --- ASSUMPTIONS ---
# The dictionary disease_to_who_map is fully defined with all 41 diseases.
# rf_model_base, X_test, Y_test, Y_pred_proba, disease_labels, and WHO_Risk_DF are all defined.

# Define the WHO cause column (used as the index for risk scores)
WHO_CAUSE_COL = 'Cause'

# Re-index the Risk_Score_Series to be indexed by ICD Code
Risk_Score_Series = WHO_Risk_DF.set_index(WHO_CAUSE_COL)['Risk_Score']

# 1. Map Model Output (Disease Name) -> ICD Code (using the map)
risk_lookup = pd.Series(disease_labels).map(disease_to_who_map)

# 2. Map ICD Code -> Final Risk Score (W_d)
# Fill missing values (for unmapped diseases) with the lowest risk score
min_risk = Risk_Score_Series.min() if not Risk_Score_Series.empty else 0.00001
risk_vector_aligned = risk_lookup.map(Risk_Score_Series).fillna(min_risk).values
risk_vector = risk_vector_aligned.reshape(1, -1)

# 3. Calculate Adjusted Scores = Probability * Risk Weight (W_d)
adjusted_scores = Y_pred_proba * risk_vector

# 4. Generate Final Predictions
adjusted_pred_indices = np.argmax(adjusted_scores, axis=1)
Y_adjusted_pred = [disease_labels[i] for i in adjusted_pred_indices]

# 5. Comparison
raw_pred = rf_model_base.predict(X_test)
comparison_df = pd.DataFrame({
    'Actual_Disease': Y_test,
    'Raw_Model_Prediction': raw_pred,
    'Risk_Adjusted_Prediction': Y_adjusted_pred
})

comparison_df['Prediction_Changed'] = (comparison_df['Raw_Model_Prediction'] != comparison_df['Risk_Adjusted_Prediction'])

print("--- FINAL RISK-ADJUSTED PREDICTION RESULTS (TESTING SMARTNESS) üëë ---")
print(f"Total predictions checked: {len(Y_test)}")
print(f"Total predictions CHANGED by WHO risk weighting: **{comparison_df['Prediction_Changed'].sum()}**")
print("-" * 50)
print("\nFirst 10 Cases Where Risk Integration Changed the Diagnosis:")
# This table proves the "smartness" by showing the model prioritized a higher-risk illness.
print(comparison_df[comparison_df['Prediction_Changed']].head(10))

--- FINAL RISK-ADJUSTED PREDICTION RESULTS (TESTING SMARTNESS) üëë ---
Total predictions checked: 984
Total predictions CHANGED by WHO risk weighting: **768**
--------------------------------------------------

First 10 Cases Where Risk Integration Changed the Diagnosis:
                  Actual_Disease          Raw_Model_Prediction  \
1                  Drug Reaction                 Drug Reaction   
2   Dimorphic hemmorhoids(piles)  Dimorphic hemmorhoids(piles)   
3                Hyperthyroidism               Hyperthyroidism   
4                Osteoarthristis               Osteoarthristis   
7                    Hepatitis B                   Hepatitis B   
8                      Pneumonia                     Pneumonia   
10                 Drug Reaction                 Drug Reaction   
11  Dimorphic hemmorhoids(piles)  Dimorphic hemmorhoids(piles)   
12                  Tuberculosis                  Tuberculosis   
13                  Hypoglycemia                  Hypoglycemia   



In [148]:
import pandas as pd
import numpy as np
import warnings # Keep this import

def predict_top_diseases_RISK_ADJUSTED(symptoms_list, top_k=5):
    """
    Predicts top diseases using the trained model and adjusts ranking based on 
    the WHO-derived Risk Score (W_d), suppressing the harmless UserWarning.
    """
    
    # --- FIX: Temporarily suppress ALL warnings during the prediction call ---
    # This prevents the harmless "X has feature names" UserWarning.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore") 
        
        # --- 1. Prepare Input (Create DataFrame) ---
        input_data = {symptom: 0 for symptom in symptom_columns}
        for symptom in symptoms_list:
            if symptom in input_data:
                input_data[symptom] = 1
        input_df = pd.DataFrame([input_data])
        
        # 2. Get Raw Probabilities
        # input_df.values ensures the input is a NumPy array
        probabilities = model.predict_proba(input_df.values)[0]
    
    # WARNING SUPPRESSED - Continue with the rest of the logic
    
    disease_labels = model.classes_

    # 3. Apply WHO Risk Adjustment (W_d) (Logic remains the same)
    risk_lookup = pd.Series(disease_labels).map(disease_to_who_map)
    Risk_Score_Series_ICD = WHO_Risk_DF.set_index('Cause')['Risk_Score']
    min_risk = Risk_Score_Series_ICD.min()
    risk_vector = risk_lookup.map(Risk_Score_Series_ICD).fillna(min_risk).values
    
    # Calculate Adjusted Scores = Probability * Risk Score
    adjusted_scores = probabilities * risk_vector

    # 4. Get top K predictions based on the ADJUSTED SCORE
    top_indices = np.argsort(adjusted_scores)[::-1][:top_k]
    top_diseases = disease_labels[top_indices]
    
    # --- Output Formatting ---
    results = []
    for i in range(top_k):
        disease = top_diseases[i]
        
        # Standardize the predicted disease name for lookup keys
        lookup_key = disease.lower().replace(' ', '_').replace('(', '').replace(')', '').strip('_')

        # Lookup logic (assuming sym_desc and sym_prec are globally defined)
        try:
            desc = sym_desc.loc[sym_desc["Disease"] == lookup_key, "Description"].values[0]
        except:
            desc = "Description N/A."
        
        try:
            pre_row = sym_prec.loc[sym_prec["Disease"] == lookup_key]
            raw_precautions = pre_row.iloc[0][1:].dropna().tolist()
            # Assuming you use the SAFE FORMATTING function here:
            pre = format_remedy_list_SAFE(raw_precautions) 
        except:
            pre = ["Home Care N/A."]
            
        raw_prob_percent = probabilities[top_indices[i]] * 100
        
        results.append({
            "Rank": i + 1,
            "Disease": disease,
            "Adjusted Score (Rank Value)": round(adjusted_scores[top_indices[i]], 5),
            "Raw Probability (%)": round(raw_prob_percent, 2),
            "Description": desc,
            "Recommended Home Care": pre
        })

    return results

# --- EXECUTE THE TEST ---
# --- EXECUTE THE TEST AGAIN ---
test_symptoms = [
    'high_fever', 'chills', 'shivering', 'fatigue', 'cough', 'breathlessness', 'runny_nose'
]
print("--- TEST CASE: HIGH-RISK INFECTION SCENARIO (FIXED LOOKUP) ---")
final_diagnosis = predict_top_diseases_RISK_ADJUSTED(test_symptoms, top_k=5)
print(pd.DataFrame(final_diagnosis))
# You must execute this entire block (including the function definition)
# and then run your test call (e.g., final_diagnosis = predict_top_diseases_RISK_ADJUSTED(test_symptoms, top_k=5))

--- TEST CASE: HIGH-RISK INFECTION SCENARIO (FIXED LOOKUP) ---


NameError: name 'symptom_columns' is not defined

In [68]:
# --- Test Case 3: High-Risk Metabolic/Chronic Symptoms ---

test_symptoms_3 = [
    'polyuria',
    'excessive_hunger',
    'fatigue',
    'blurred_and_distorted_vision',
    'irregular_sugar_level',
    'fast_heart_rate'
]

print("\n--- TEST CASE 3: HIGH-RISK CHRONIC SCENARIO ---")
final_diagnosis_3 = predict_top_diseases_RISK_ADJUSTED(test_symptoms_3, top_k=5)
print(pd.DataFrame(final_diagnosis_3))


--- TEST CASE 3: HIGH-RISK CHRONIC SCENARIO ---
   Rank           Disease  Adjusted Score (Rank Value)  Raw Probability (%)  \
0     1   Gastroenteritis                      0.00020                  3.0   
1     2         Diabetes                       0.00019                 51.0   
2     3     Hypertension                       0.00004                  1.0   
3     4  Fungal infection                      0.00000                  1.0   
4     5      Tuberculosis                      0.00000                  0.0   

                                         Description  \
0  Gastroenteritis is an inflammation of the dige...   
1  Diabetes is a disease that occurs when your bl...   
2  Hypertension (HTN or HT), also known as high b...   
3  In humans, fungal infections occur when an inv...   
4  Tuberculosis (TB) is an infectious disease usu...   

                               Recommended Home Care  
0  [Try taking small sips of water., Stop eating ...  
1  [Exercise., Follow up., Ha

In [69]:
import pandas as pd
import numpy as np

# --- SAFEST HELPER FUNCTION: Preserves Original Medical Specificity ---
def format_remedy_list_SAFE(raw_precaution_list):
    """
    Formats raw precaution phrases by adding structure and definitive medical disclaimers,
    without altering the original specific instructions (e.g., 2 cups, 72 hours).
    """
    if not raw_precaution_list: 
        return ["Consult a medical professional immediately."]

    formatted_remedies = []
    
    # 1. Clean and capitalize each specific instruction
    for phrase in raw_precaution_list:
        phrase = phrase.strip()
        if phrase:
            # Capitalize the start and ensure a period is at the end
            clean_phrase = phrase[0].upper() + phrase[1:]
            if not clean_phrase.endswith('.'):
                clean_phrase += '.'
            formatted_remedies.append(clean_phrase)
    
    # 2. Add the critical medical disclaimer and persistent symptom advice
    
    # A generic high-priority warning (always safe to include)
    formatted_remedies.append("If symptoms persist beyond 72 hours, worsen, or cause severe discomfort, seek medical attention immediately.")
    
    # Remove duplicates (in case the source had identical entries)
    return list(set(formatted_remedies))


# --- REVISED PREDICTION FUNCTION (Using the Safe Formatter) ---

def predict_top_diseases_RISK_ADJUSTED(symptoms_list, top_k=5):
    # (Rest of the function logic remains the same for risk calculation)
    # ...
    
    # --- Output Formatting ---
    # ...
    
    for i in range(top_k):
        # ... (Prediction and lookup key logic) ...
        
        # 2. Fetch precautions (Using the new safe formatting function)
        try:
            pre_row = sym_prec.loc[sym_prec["Disease"] == lookup_key]
            raw_precautions = pre_row.iloc[0][1:].dropna().tolist()
            # *** APPLY THE NEW SAFE CONFIDENT FORMATTING HERE ***
            pre = format_remedy_list_SAFE(raw_precautions)
        except (IndexError, KeyError, pd.core.indexing.IndexingError):
            pre = ["Consult a medical professional if you have concerns."]
            
        # ... (Rest of the result append logic) ...

In [70]:
# --- Test Case 3: High-Risk Metabolic/Chronic Symptoms ---

test_symptoms_3 = [
    'polyuria',
    'excessive_hunger',
    'fatigue',
    'blurred_and_distorted_vision',
    'irregular_sugar_level',
    'fast_heart_rate'
]

print("\n--- TEST CASE 3: HIGH-RISK CHRONIC SCENARIO ---")
final_diagnosis_3 = predict_top_diseases_RISK_ADJUSTED(test_symptoms_3, top_k=5)
print(pd.DataFrame(final_diagnosis_3))


--- TEST CASE 3: HIGH-RISK CHRONIC SCENARIO ---


NameError: name 'lookup_key' is not defined