In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import joblib



# Load your correct path
df = pd.read_csv("data/synthetic_patients.csv")

# --- VERIFY NO MISSING VALUES ---
print(df.isna().sum())

# Target column
y = df["readmitted"]       # <-- YOUR TARGET (always 0/1, never NaN)

# Features
X = df.drop(["readmitted", "patient_id"], axis=1)

# Identify column types
numeric_features = [
    "age", "length_of_stay", "previous_admissions",
    "lab_glucose", "lab_hemoglobin", "cholesterol",
    "creatinine", "blood_pressure", "heart_rate", "temperature"
]

categorical_features = [
    "gender", "admission_type"
]

# Preprocessing
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# Model
model = RandomForestClassifier(n_estimators=200, random_state=42)

# Full pipeline
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train model
clf.fit(X_train, y_train)

print("ðŸŽ‰ Model trained successfully!")

# Save model + preprocessor
joblib.dump(clf, "model.pkl")
joblib.dump(preprocessor, "preprocessor.pkl")

print("âœ… Saved as model.pkl and preprocessor.pkl")


patient_id             0
age                    0
gender                 0
admission_type         0
length_of_stay         0
previous_admissions    0
lab_glucose            0
lab_hemoglobin         0
cholesterol            0
creatinine             0
blood_pressure         0
heart_rate             0
temperature            0
readmitted             0
dtype: int64
ðŸŽ‰ Model trained successfully!
âœ… Saved as model.pkl and preprocessor.pkl


In [23]:
import pickle
model_path = "/home/ec2-user/SageMaker/healthcare/model.pkl"
preprocessor_path = "/home/ec2-user/SageMaker/healthcare/preprocessor.pkl"


In [24]:
type(model), type(preprocessor)


(sklearn.ensemble._forest.RandomForestClassifier,
 sklearn.compose._column_transformer.ColumnTransformer)

In [25]:
import joblib

# Load preprocessor
preprocessor_path = "/home/ec2-user/SageMaker/healthcare/preprocessor.pkl"
preprocessor = joblib.load(preprocessor_path)
print("Preprocessor loaded successfully!")

# Load model
model_path = "/home/ec2-user/SageMaker/healthcare/model.pkl"
model = joblib.load(model_path)
print("Model loaded successfully!")


Preprocessor loaded successfully!
Model loaded successfully!


In [26]:
import joblib

model_path = "/home/ec2-user/SageMaker/healthcare/model.pkl"
preprocessor_path = "/home/ec2-user/SageMaker/healthcare/preprocessor.pkl"

model = joblib.load(model_path)
preprocessor = joblib.load(preprocessor_path)

print("Model and preprocessor loaded successfully!")


Model and preprocessor loaded successfully!
