In [None]:
# Binarise the target: 'NO' becomes 0 and every other `readmitted` value becomes 1.
not_no = df['readmitted'].ne('NO')
df['readmitted_binary'] = not_no.astype('int64')

# Show how many rows fall into each of the two classes.
class_counts = df['readmitted_binary'].value_counts()
print(class_counts)

In [None]:
# Remove `encounter_id` / `patient_nbr` (presumably row identifiers — confirm)
# and the raw `readmitted` label, which `readmitted_binary` now stands in for.
# NOTE(review): this mutates `df` in place, so running the cell a second time
# raises KeyError because the columns are already gone.
df.drop(
    columns=['encounter_id', 'patient_nbr', 'readmitted'],
    inplace=True,
)


In [None]:
# Columns with object or category dtype are the ones that get one-hot encoded.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# Expand each of those columns into indicator columns; `drop_first=True`
# omits the first level of every column.
df_encoded = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)

# Preview the first rows of the encoded frame.
df_encoded.head()


In [None]:
from sklearn.model_selection import train_test_split

# Separate the binary target from the feature matrix.
y = df_encoded['readmitted_binary']
X = df_encoded.drop(columns='readmitted_binary')

# Hold out 30% of the rows for testing; stratifying on `y` and fixing the seed
# keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

train_shape = X_train.shape
test_shape = X_test.shape
print("Training set size:", train_shape)
print("Testing set size:", test_shape)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a logistic-regression baseline on the full training feature set
# (`fit` returns the estimator itself, so construction and fitting are chained).
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Score the baseline on the held-out test split.
y_pred_lr = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)
print(lr_report)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest with a fixed seed on the full training feature set.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the forest on the held-out test split.
y_pred_rf = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
print(rf_report)


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate random-forest hyperparameters: 2 * 3 * 3 = 18 combinations.
param_grid = dict(
    n_estimators=[100, 150],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)



In [None]:
import numpy as np

# Number of highest-importance features to keep.
top_n = 100

# Rank feature positions by the fitted forest's importances, largest first.
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Column labels of the `top_n` best-ranked features.
top_features = X_train.columns[indices[:top_n]]

# Restrict both splits to those columns.
X_train_reduced, X_test_reduced = X_train[top_features], X_test[top_features]


In [None]:
# Search `param_grid` on the reduced feature set: 3-fold cross-validation,
# candidates ranked by accuracy, n_jobs=-1 for parallel execution.
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
    verbose=2,
)
grid_search.fit(X_train_reduced, y_train)

# Score the best estimator found by the search on the held-out test split.
best_rf = grid_search.best_estimator_
y_pred_best_rf = best_rf.predict(X_test_reduced)

best_rf_accuracy = accuracy_score(y_test, y_pred_best_rf)
best_rf_report = classification_report(y_test, y_pred_best_rf)
print("Tuned Random Forest Accuracy (Reduced Features):", best_rf_accuracy)
print(best_rf_report)


In [None]:
import os
from pathlib import Path

import joblib

# Directory for saved artifacts. The previous hard-coded
# "C:/Users/<name>/Downloads" path only existed on one machine; resolve it from
# the READMISSION_OUTPUT_DIR environment variable instead, falling back to the
# current user's Downloads folder, and create it if it is missing.
OUTPUT_DIR = Path(os.environ.get("READMISSION_OUTPUT_DIR", Path.home() / "Downloads"))
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Persist the tuned random forest found by the grid search.
joblib.dump(best_rf, OUTPUT_DIR / "random_forest_readmission_final.pkl")

In [None]:
# Save training and testing datasets for reproducibility.
import os
from pathlib import Path

# Directory for saved artifacts. The previous hard-coded
# "C:/Users/<name>/Downloads" path only existed on one machine; resolve it from
# the READMISSION_OUTPUT_DIR environment variable instead, falling back to the
# current user's Downloads folder, and create it if it is missing.
OUTPUT_DIR = Path(os.environ.get("READMISSION_OUTPUT_DIR", Path.home() / "Downloads"))
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Row indices are not written (index=False); only column headers and values are.
X_train_reduced.to_csv(OUTPUT_DIR / "X_train_reduced.csv", index=False)
X_test_reduced.to_csv(OUTPUT_DIR / "X_test_reduced.csv", index=False)
y_train.to_csv(OUTPUT_DIR / "y_train.csv", index=False)
y_test.to_csv(OUTPUT_DIR / "y_test.csv", index=False)
