In [1]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import warnings
import seaborn as sns

In [2]:
warnings.filterwarnings('ignore')

df = pd.read_csv("HDHI Admission data.csv")


In [3]:
# Replace 'EMPTY' strings with NaN
df.replace('EMPTY', pd.NA, inplace=True)

In [4]:
# Impute missing values
numerical_cols = df.select_dtypes(include=['number']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())


In [5]:
# Convert 'D.O.A' to datetime
df['D.O.A'] = pd.to_datetime(df['D.O.A'], errors='coerce')

# Sort by 'MRD No.' and 'D.O.A' to ensure the latest entry is first
df.sort_values(by=['MRD No.', 'D.O.A'], ascending=[True, False], inplace=True)


In [6]:
# Add 'Readmission Count' and 'Readmission' columns
df['Readmission Count'] = df.groupby('MRD No.')['MRD No.'].transform('count')
df['Readmission'] = df['Readmission Count'].apply(lambda x: 1 if x > 1 else 0)


In [7]:
# Drop duplicates keeping the oldest entry based on 'D.O.A'
df_latest = df.drop_duplicates(subset=['MRD No.'], keep='last')

In [8]:
# Drop unnecessary columns
to_drop = ['SNO', 'D.O.D', 'month year']
df_latest.drop(columns=to_drop, inplace=True)

In [9]:
# Encode categorical variables
label_encoders = {
    'GENDER': LabelEncoder(),
    'RURAL': LabelEncoder(),
    'TYPE OF ADMISSION-EMERGENCY/OPD': LabelEncoder(),
    'OUTCOME': LabelEncoder()
}

for col, le in label_encoders.items():
    if col in df_latest.columns:
        df_latest[col] = le.fit_transform(df_latest[col])

In [10]:
# Create input features and target
input_df = df_latest.drop(columns=['OUTCOME', 'D.O.A', 'MRD No.', 'Readmission', 'Readmission Count'])
target = df_latest['Readmission Count']


In [11]:
# Handle any remaining NaN values
input_df.fillna(0, inplace=True)

new_input_df = input_df[['GENDER', 'TYPE OF ADMISSION-EMERGENCY/OPD', 'RURAL', 'PLATELETS', 'GLUCOSE', 'TLC', 'UREA', 'HB', 'AGE', 'CREATININE', 'DURATION OF STAY', 'EF', 'BNP', 'duration of intensive unit stay']]


In [14]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(new_input_df, target, test_size=0.2, random_state=42)


In [15]:
# Define models and parameter grids for regression models
regression_models_and_parameters = {
    "Decision Tree Regressor": (Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', DecisionTreeRegressor())
    ]), {
        'regressor__max_depth': [None, 10, 20, 30],
        'regressor__min_samples_split': [2, 5, 10]
    }),

    "Random Forest Regressor": (Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', RandomForestRegressor())
    ]), {
        'regressor__n_estimators': [50, 100, 200],
        'regressor__max_depth': [None, 10, 20],
        'regressor__min_samples_split': [2, 5, 10]
    }),
    
    "KNN Regressor": (Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', KNeighborsRegressor())
    ]), {
        'regressor__n_neighbors': [3, 5, 7],
        'regressor__weights': ['uniform', 'distance']
    }),

    "Linear Regression": (Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', LinearRegression())
    ]), {})
}

In [16]:
# Iterate through regression models and perform GridSearchCV
for name, (pipeline, params) in regression_models_and_parameters.items():
    grid_search = GridSearchCV(pipeline, params, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Get R2 score and predictions
    r2_score = best_model.score(X_test, y_test)
    predictions = best_model.predict(X_test)
    
    print(f"\n{name} Best Model Parameters: {grid_search.best_params_}")
    print(f"{name} Best Model R2 Score: {r2_score}")
    print(f"{name} Predictions vs Actual:")
    print(pd.DataFrame({"Predicted": predictions, "Actual": y_test}))

    # Confusion Matrix and Classification Report (not applicable for regression but we can show other metrics)
    print(f"\n{name} Model Mean Squared Error:")

    print(mean_squared_error(y_test, predictions))


Fitting 5 folds for each of 12 candidates, totalling 60 fits

Decision Tree Regressor Best Model Parameters: {'regressor__max_depth': 10, 'regressor__min_samples_split': 10}
Decision Tree Regressor Best Model R2 Score: -0.127607590608535
Decision Tree Regressor Predictions vs Actual:
       Predicted  Actual
12118   1.283713       1
12618   1.348348       2
2997    1.193114       1
11637   1.250000       1
10367   1.000000       1
...          ...     ...
4969    1.154506       1
15453   1.212644       1
1653    1.406250       1
5219    1.193114       1
14817   1.193182       2

[2449 rows x 2 columns]

Decision Tree Regressor Model Mean Squared Error:
0.7047625230346098
Fitting 5 folds for each of 27 candidates, totalling 135 fits

Random Forest Regressor Best Model Parameters: {'regressor__max_depth': 10, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 200}
Random Forest Regressor Best Model R2 Score: 0.020168274445878542
Random Forest Regressor Predictions vs Actual:


In [18]:
if hasattr(best_model.named_steps['regressor'], 'feature_importances_'):
    importance = best_model.named_steps['regressor'].feature_importances_
    features = new_input_df.columns
    importance_df = pd.DataFrame({'Feature': features, 'Importance': importance})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)
    print(f"{name} Feature Importances:")
    print(importance_df)

    print(importance_df)
