In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from joblib import dump

In [9]:
df = pd.read_csv("../data/patient_dataset.csv")
print(df.columns.tolist())
df['Transfusion Count'] = df['Transfusion History'].str.extract(r'(\d+)').astype(float)

['Hemoglobin (Hb)', 'Serum Ferritin', 'RBC Count', 'MCV', 'MCH', 'MCHC', 'Reticulocyte Count', 'LDH', 'Transfusion History', 'Splenectomy Status', 'Heart Rate (BPM)', 'Blood Pressure (BP)', 'Oxygen Saturation (SpO2)', 'Body Temperature', 'Weight (kg)', 'Height (cm)', 'Age', 'Gender', 'Geographical Location', 'Family History', 'Compliance to Therapy', 'Future_Hb', 'Anemia_Risk']


In [10]:
df['Future_Hb'] = df['Hemoglobin (Hb)'].shift(-1)
df.dropna(inplace=True)
df['Anemia_Risk'] = df['Future_Hb'] < 8

In [5]:
features = [
    'Hemoglobin (Hb)',
    'Serum Ferritin',
    'Transfusion Count',     # ✅ use the numeric one
    'Compliance to Therapy',
    'Heart Rate (BPM)',
    'Oxygen Saturation (SpO2)',
    'Age'
]

X = df[features]
y = df['Anemia_Risk'].astype(int)


In [6]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [8]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
print(model.classes_)  # It should show: [0, 1]

[1]


In [15]:
dump(model, "../models/anemia_predictor.pkl")
dump(scaler, "../models/scaler.pkl")

['../models/scaler.pkl']