In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

df1 = pd.read_csv("D:\IMP\Amdox Projects\Healthcare Recommendation System\Disease_symptom_and_patient_profile_dataset (1).csv")

df1.columns = df1.columns.str.strip().str.lower().str.replace(" ", "_")
print(df1.columns)

Index(['disease', 'fever', 'cough', 'fatigue', 'difficulty_breathing', 'age',
       'gender', 'blood_pressure', 'cholesterol_level', 'outcome_variable'],
      dtype='object')


In [8]:
df1.fillna(0, inplace=True)

In [9]:
df1['gender'] = df1['gender'].str.lower().replace({
    "m": "male",
    "f": "female"
})

encoder = OneHotEncoder(sparse_output=False)
gender_encoded = encoder.fit_transform(df1[['gender']])

gender_df = pd.DataFrame(
    gender_encoded,
    columns=encoder.get_feature_names_out(['gender'])
)

In [11]:
print(df1['blood_pressure'].unique())
print(df1['cholesterol_level'].unique())

['Low' 'Normal' 'High']
['Normal' 'Low' 'High']


In [12]:
bp_map = {"Low": 0, "Normal": 1, "High": 2, "low": 0, "normal": 1, "high": 2}
chol_map = {"Low": 0, "Normal": 1, "High": 2, "low": 0, "normal": 1, "high": 2}

df1['blood_pressure'] = df1['blood_pressure'].map(bp_map)
df1['cholesterol_level'] = df1['cholesterol_level'].map(chol_map)

In [13]:
scaler = StandardScaler()

df1['age_scaled'] = scaler.fit_transform(df1[['age']])
df1['bp_scaled'] = scaler.fit_transform(df1[['blood_pressure']])
df1['chol_scaled'] = scaler.fit_transform(df1[['cholesterol_level']])

In [14]:
feature_df = pd.concat([
    df1[['fever', 'cough', 'fatigue', 'difficulty_breathing',
         'age_scaled', 'bp_scaled', 'chol_scaled']],
    gender_df
], axis=1)

labels = df1['disease']

In [16]:
labels.value_counts()

Asthma                            23
Stroke                            16
Osteoporosis                      14
Hypertension                      10
Diabetes                          10
                                  ..
Autism Spectrum Disorder (ASD)     1
Hypoglycemia                       1
Fibromyalgia                       1
Eating Disorders (Anorexia,...     1
Williams Syndrome                  1
Name: disease, Length: 116, dtype: int64

In [36]:
risk_map = {
    # HIGH RISK
    "Stroke": "High", "Brain Tumor": "High", "Pancreatic Cancer": "High",
    "Lung Cancer": "High", "Liver Cancer": "High", "Kidney Cancer": "High",
    "Ovarian Cancer": "High", "Colorectal Cancer": "High", "Prostate Cancer": "High",
    "Thyroid Cancer": "High", "Breast Cancer": "High", "Bladder Cancer": "High",
    "Melanoma": "High", "Leukemia": "High", "Lymphoma": "High",
    "HIV/AIDS": "High", "Ebola Virus": "High", "Rabies": "High", "Sepsis": "High",
    "Chronic Obstructive Pulmonary Disease (COPD)": "High",
    "Pneumocystis Pneumonia (PCP)": "High",
    "Coronary Artery Disease": "High", "Myocardial Infarction (Heart Attack)": "High",
    "Chronic Kidney Disease": "High", "Kidney Disease": "High",
    "Parkinson's Disease": "High", "Alzheimer's Disease": "High", "Dementia": "High",
    "Hemophilia": "High", "Sickle Cell Anemia": "High", "Spina Bifida": "High",
    "Pancreatitis": "High", "Cirrhosis": "High", "Liver Disease": "High",

    # MEDIUM RISK
    "Asthma": "Medium", "Hypertension": "Medium", "Hyperthyroidism": "Medium",
    "Hypothyroidism": "Medium", "Diabetes": "Medium",
    "Rheumatoid Arthritis": "Medium", "Osteoporosis": "Medium",
    "Osteoarthritis": "Medium", "Crohn's Disease": "Medium",
    "Ulcerative Colitis": "Medium", "Eczema": "Medium",
    "Psoriasis": "Medium", "Migraine": "Medium", "Gastroenteritis": "Medium",
    "Epilepsy": "Medium", "Fibromyalgia": "Medium",
    "Multiple Sclerosis": "Medium", "Bipolar Disorder": "Medium",
    "Depression": "Medium", "Anxiety Disorders": "Medium",
    "Obsessive-Compulsive Disorder (OCD)": "Medium",
    "Sleep Apnea": "Medium", "Anemia": "Medium", "Hypoglycemia": "Medium",
    "Hyperglycemia": "Medium", "Lyme Disease": "Medium", "Tuberculosis": "Medium",
    "Polycystic Ovary Syndrome (PCOS)": "Medium", "Endometriosis": "Medium",

    # LOW RISK
    "Common Cold": "Low", "Influenza": "Low",
    "Allergic Rhinitis": "Low", "Sinusitis": "Low",
    "Conjunctivitis (Pink Eye)": "Low", "Tonsillitis": "Low",
    "Acne": "Low", "Chickenpox": "Low", "Rubella": "Low",
    "Measles": "Low", "Mumps": "Low", "Cholera": "Low",
    "Zika Virus": "Low", "Otitis Media (Ear Infection)": "Low",
    "Dengue Fever": "Low", "Pneumonia": "Low",
    "Bronchitis": "Low", "Urinary Tract Infection": "Low",
    "Appendicitis": "Low"
}

df1["risk_level"] = df1["disease"].map(risk_map)
df1["risk_level"].fillna("Medium", inplace=True)

In [37]:
df1["risk_level"].value_counts()

Medium    198
High       90
Low        61
Name: risk_level, dtype: int64

In [39]:
labels = df1["risk_level"]

In [40]:
feature_df = df1.drop(["risk_level", "disease"], axis=1)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    feature_df, labels, test_size=0.2, random_state=42, stratify=labels
)

In [42]:
np.save("X_train.npy", X_train)
np.save("X_test.npy", X_test)
np.save("y_train.npy", y_train)
np.save("y_test.npy", y_test)

feature_df.to_csv("clean_final_dataset.csv", index=False)

In [43]:
df1.to_csv("full_processed_dataset.csv", index=False)

In [46]:
print(feature_df.columns)

Index(['fever', 'cough', 'fatigue', 'difficulty_breathing', 'age', 'gender',
       'blood_pressure', 'cholesterol_level', 'outcome_variable', 'age_scaled',
       'bp_scaled', 'chol_scaled'],
      dtype='object')


In [47]:
print(df1.columns)

Index(['disease', 'fever', 'cough', 'fatigue', 'difficulty_breathing', 'age',
       'gender', 'blood_pressure', 'cholesterol_level', 'outcome_variable',
       'age_scaled', 'bp_scaled', 'chol_scaled', 'risk_level'],
      dtype='object')


In [48]:
print(df1.head)

<bound method NDFrame.head of          disease fever cough fatigue difficulty_breathing  age  gender  \
0      Influenza   Yes    No     Yes                  Yes   19  female   
1    Common Cold    No   Yes     Yes                   No   25  female   
2         Eczema    No   Yes     Yes                   No   25  female   
3         Asthma   Yes   Yes      No                  Yes   25    male   
4         Asthma   Yes   Yes      No                  Yes   25    male   
..           ...   ...   ...     ...                  ...  ...     ...   
344       Stroke   Yes    No     Yes                   No   80  female   
345       Stroke   Yes    No     Yes                   No   85    male   
346       Stroke   Yes    No     Yes                   No   85    male   
347       Stroke   Yes    No     Yes                   No   90  female   
348       Stroke   Yes    No     Yes                   No   90  female   

     blood_pressure  cholesterol_level outcome_variable  age_scaled  \
0         