In [2]:

# Patient Readmission Prediction


# 1. Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 2. Generate synthetic dataset
np.random.seed(42)
n_samples = 1000

data = pd.DataFrame({
    'PatientID': range(1, n_samples+1),
    'Age': np.random.randint(18, 90, n_samples),               
    'Gender': np.random.choice(['Male', 'Female'], n_samples), 
    'NumPrevAdmissions': np.random.randint(0, 10, n_samples),  
    'NumMedications': np.random.randint(1, 20, n_samples),     
    'PrimaryDiagnosis': np.random.choice(
        ['Diabetes', 'Heart Disease', 'Infection', 'Respiratory', 'Other'], n_samples
    ),
    'LengthOfStay': np.random.randint(1, 30, n_samples),       
    'HasChronicDisease': np.random.choice([0,1], n_samples),   
    'Readmitted': np.random.choice([0,1], n_samples, p=[0.7, 0.3])  
})

# save dataset
data.to_csv('synthetic_patient_data.csv', index=False)
print("Dataset generated. Preview:")
print(data.head())

# 3. Preprocess data
# Drop PatientID (not a feature)
data = data.drop(['PatientID'], axis=1)

# Fill missing values properly
numeric_cols = data.select_dtypes(include=np.number).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    data[col] = data[col].fillna(data[col].mode()[0])

# Encode categorical columns
for col in categorical_cols:
    data[col] = data[col].astype('category').cat.codes

# 4. Split features and target
X = data.drop('Readmitted', axis=1)
y = data['Readmitted']

# 5. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Initialize Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# 7. Train model
rf_model.fit(X_train, y_train)

# 8. Make predictions
y_pred = rf_model.predict(X_test)

# 9. Evaluate model
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("\n### Model Evaluation ###")
print("Accuracy:", acc)
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", report)


Dataset generated. Preview:
   PatientID  Age  Gender  NumPrevAdmissions  NumMedications PrimaryDiagnosis  \
0          1   69    Male                  7               1      Respiratory   
1          2   32    Male                  6               3        Infection   
2          3   89  Female                  8               2        Infection   
3          4   78    Male                  3               9    Heart Disease   
4          5   38    Male                  1              19         Diabetes   

   LengthOfStay  HasChronicDisease  Readmitted  
0            10                  0           0  
1            22                  0           1  
2            10                  1           0  
3            21                  0           1  
4            15                  0           0  

### Model Evaluation ###
Accuracy: 0.705
Confusion Matrix:
 [[140   3]
 [ 56   1]]
Classification Report:
               precision    recall  f1-score   support

           0       0.71     