In [1]:
import pandas as pd

# Read the dataset CSV (path is relative to the current working directory).
df = pd.read_csv(filepath_or_buffer="enhanced_medical_dataset_100k.csv")


In [2]:
# Column names must match the CSV header exactly: the symptom text lives in
# "Symptoms" (there is no "Symptom_Description"), the vitals carry their units
# in the name, and the dataset has no weight column.
symptom_texts = df["Symptoms"]
vitals = df[[
    "Blood Pressure",
    "Heart Rate (bpm)",
    "Age",
    "Temperature (°F)",
    "Oxygen Saturation (%)",
]]

In [3]:
# Show the exact column names present in the loaded frame.
print(list(df.columns))


['Symptoms', 'Age', 'Gender', 'Severity', 'Temperature (°F)', 'Heart Rate (bpm)', 'Blood Pressure', 'Oxygen Saturation (%)', 'Disease']


In [4]:
# Extract symptom texts and vitals
# Missing symptoms become empty strings first: a bare astype(str) would turn
# NaN into the literal text "nan", which a text vectorizer would then treat as
# a real word.
symptom_texts = df["Symptoms"].fillna("").astype(str)

# Vital-sign columns, selected by their exact header names.
vitals = df.loc[:, [
    "Blood Pressure",
    "Heart Rate (bpm)",
    "Age",
    "Temperature (°F)",
    "Oxygen Saturation (%)",
]]



In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a default TF-IDF model on the symptom strings and encode them in one
# pass; the fitted vectorizer is kept so its vocabulary can be inspected.
vectorizer = TfidfVectorizer()
X_symptoms = vectorizer.fit_transform(raw_documents=symptom_texts)


In [6]:
# Report how many distinct terms the fitted vectorizer exposes as features.
print("TF-IDF feature count:", vectorizer.get_feature_names_out().size)


TF-IDF feature count: 96


In [7]:
from sklearn.preprocessing import StandardScaler

# Standardize only the numeric vitals. "Blood Pressure" is stored as text such
# as '132/67', which StandardScaler cannot convert to float, so it is left out
# here until it has been parsed into a number.
scaler = StandardScaler()
X_vitals = scaler.fit_transform(vitals.select_dtypes(include="number"))

In [8]:
def convert_bp(bp_str):
    """Collapse a "systolic/diastolic" reading into the mean of the two numbers.

    Example: "132/67" -> (132 + 67) / 2 = 99.5.

    Parameters
    ----------
    bp_str : str
        Reading formatted as two integers separated by a single "/".

    Returns
    -------
    float
        Arithmetic mean of the two integers, or NaN when the value is not a
        string in that format (missing value, wrong number of parts, or a
        part that is not an integer).
    """
    try:
        systolic, diastolic = map(int, bp_str.split('/'))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: value is not a string (NaN / None have no .split).
        # TypeError: a bytes-like value that cannot be split on a str separator.
        # ValueError: wrong number of parts, or a part that is not an integer.
        # The built-in NaN is used so the fallback does not depend on numpy
        # having been imported.
        return float("nan")
    return (systolic + diastolic) / 2

# Parse the raw "systolic/diastolic" text into one number per row.
# Guarded so that re-running this cell is a no-op: once the column is numeric
# its values are no longer "a/b" strings, and a second pass through convert_bp
# could not parse them.
if not pd.api.types.is_numeric_dtype(df["Blood Pressure"]):
    df["Blood Pressure"] = df["Blood Pressure"].apply(convert_bp)


In [9]:
# Option 1: drop rows with missing vitals (blood pressure that could not be
# parsed is NaN at this point). `df` is rebound rather than mutated with
# inplace=True.
# NOTE: symptom_texts / X_symptoms were built from the unfiltered frame; if any
# rows are dropped here they must be rebuilt from `df` to stay row-aligned
# with the vitals.
df = df.dropna(subset=["Blood Pressure"])

# Option 2: fill with mean if you'd prefer (assign the result back; calling
# fillna(inplace=True) on a selected column is deprecated in pandas)
# df["Blood Pressure"] = df["Blood Pressure"].fillna(df["Blood Pressure"].mean())


In [10]:
from sklearn.preprocessing import StandardScaler

# Re-select the vitals from the current frame, then standardize them with a
# freshly fitted scaler. The fitted scaler is kept alongside the scaled array.
vitals = df.loc[:, [
    "Blood Pressure",
    "Heart Rate (bpm)",
    "Age",
    "Temperature (°F)",
    "Oxygen Saturation (%)",
]]

scaler = StandardScaler().fit(vitals)
X_vitals = scaler.transform(vitals)
