In [None]:
def start_signature():
    """Print the three-line banner that marks the start of the notebook run."""
    rule = "=" * 40
    for line in (rule, "START OF NOTEBOOK — AT", rule):
        print(line)

start_signature()


In [None]:
import pandas as pd
import numpy as np

# Load the base table (transfers) and preview its first rows.
df_transfers = pd.read_csv(r'C:\Users\arpitha_work\Downloads\TRU MSCDS\Sem 3\Graduate Project\MIMIC\mimic-iv-3.1\hosp\transfers.csv')
df_transfers.head()

In [None]:
# Count the distinct patients, admissions and transfers in the transfers table.

unique_patients, unique_admissions, unique_transfers = (
    df_transfers[id_col].nunique()
    for id_col in ("subject_id", "hadm_id", "transfer_id")
)
for label, count in (
    ("the no of unique patients :", unique_patients),
    (" the no of unique admissions :", unique_admissions),
    (" the no of unique transfers :", unique_transfers),
):
    print(label, count)

In [None]:
df_transfers.info()

In [None]:
# Target variable: time spent in the Emergency Department (ED).
# Step 1 - keep only the ED rows; every other care unit is not needed.
# Step 2 (next cells) - derive LOS_ED, the length of stay in the ED.
# The goal is to predict how long a patient stays in the ED during a single
# hospital visit, so the target is built at the admission level.
# LOS_ED_hours = time a patient spends in the ED, i.e. outtime - intime.

units = ['Emergency Department', 'Emergency Department Observation']
is_ed_row = df_transfers['careunit'].isin(units)
df_transfers = df_transfers.loc[is_ed_row].copy()
# Dropping rows with a missing intime/outtime is currently disabled:
# df_transfers = df_transfers.dropna(subset =['intime','outtime'])

In [None]:
df_transfers.head()

In [None]:
# Parse both timestamps, then derive the per-transfer ED length of stay in hours.
for ts_col in ('intime', 'outtime'):
    df_transfers[ts_col] = pd.to_datetime(df_transfers[ts_col])

stay_duration = df_transfers['outtime'] - df_transfers['intime']
df_transfers['LOS_ED'] = stay_duration.dt.total_seconds() / 3600

In [None]:
# Collapse to one row per admission (hadm_id): an admission can contain several
# ED / ED-observation transfers, so their durations are summed.
# subject_id is carried along so patients can still be counted later.

df_transfers = (
    df_transfers
    .groupby('hadm_id', as_index=False)
    .agg(LOS_ED_hours=('LOS_ED', 'sum'), subject_id=('subject_id', 'first'))
)

# Two decimals are enough precision for a duration expressed in hours.
df_transfers['LOS_ED_hours'] = df_transfers['LOS_ED_hours'].round(2)

In [None]:
print (df_transfers.head())

In [None]:
df_transfers.info()
#df_transfers.isnull().any().any()
df_transfers.isnull().sum()


In [None]:
df_transfers['LOS_ED_hours'].describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Histogram with a KDE overlay of the admission-level ED length of stay.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_transfers['LOS_ED_hours'], bins=400, kde=True, color='blue', ax=ax)
ax.set_title('Distribution of ED Length of Stay (hours) with Density Curve')
ax.set_xlabel('LOS_ED_hours')
ax.set_ylabel('Density / Count')
plt.show()




In [None]:
df_transfers['LOS_ED_hours'].describe(percentiles=[0.25, 0.5, 0.75, 0.90, 0.95, 0.99])


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal boxplot of the target to show its spread and extreme values.
fig, ax = plt.subplots(figsize=(10, 4))
sns.boxplot(x=df_transfers['LOS_ED_hours'], color='lightblue', ax=ax)
ax.set_title('Boxplot of Target Variable')
ax.set_xlabel('Target')
plt.show()


In [None]:
# Tukey fences: values further than 1.5 * IQR outside the quartiles count as outliers.
los = df_transfers['LOS_ED_hours']
Q1, Q3 = los.quantile(0.25), los.quantile(0.75)
IQR = Q3 - Q1

lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

outliers = df_transfers[los.gt(upper_limit) | los.lt(lower_limit)]
n_outliers, n_total = len(outliers), len(df_transfers)
print(f"Outlier count: {n_outliers} ({n_outliers/n_total*100:.2f}% of total)")


In [None]:
# Recompute the upper Tukey fence and summarise the stays that exceed it.
los_hours = df_transfers['LOS_ED_hours']
q25, q75 = los_hours.quantile(0.25), los_hours.quantile(0.75)
upper_limit = q75 + 1.5 * (q75 - q25)
print("Upper limit (IQR method):", upper_limit)

los_hours[los_hours > upper_limit].describe()


In [None]:
# Side-by-side histograms: the full target vs. the target capped at upper_limit.
# Both panels share the same x-range (0 to the 99th percentile of the full target).
fig, axes = plt.subplots(1, 2, figsize=(14,6))
p99 = df_transfers['LOS_ED_hours'].quantile(0.99)
within_limit = df_transfers['LOS_ED_hours'] <= upper_limit

panels = [
    (df_transfers['LOS_ED_hours'], 'lightblue', 'Original Data'),
    (df_transfers.loc[within_limit, 'LOS_ED_hours'], 'salmon', 'Without Outliers'),
]
for ax, (values, colour, panel_title) in zip(axes, panels):
    sns.histplot(values, kde=True, bins=80, color=colour, stat='count', ax=ax)
    ax.set_xlim(0, p99)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()


In [None]:
# Highlight stays longer than 24 hours in the target distribution.
# The flag is computed from the target itself, so it is attached to a plot-only
# copy instead of being written into df_transfers: df_transfers is merged into
# the modelling table later, and a target-derived column there would leak the
# answer into the models.
LONG_STAY_HOURS = 24
los_plot_df = df_transfers.assign(is_outlier=df_transfers['LOS_ED_hours'] > LONG_STAY_HOURS)

plt.figure(figsize=(10,6))
sns.histplot(data=los_plot_df, x='LOS_ED_hours', hue='is_outlier', bins=100, stat='density',
             palette={False: 'skyblue', True: 'salmon'}, kde=True)
# Cut the x-axis at the 99th percentile so the long tail does not squash the plot.
plt.xlim(0, los_plot_df['LOS_ED_hours'].quantile(0.99))
plt.title('Distribution — Highlighting Outliers (>24)')
plt.show()


In [None]:
# Compare the raw target with its log1p transform.
fig, axes = plt.subplots(1, 2, figsize=(14,6))
ax_raw, ax_log = axes
target = df_transfers['LOS_ED_hours']

sns.histplot(target, kde=True, bins=80, stat='count', color='skyblue', ax=ax_raw)
ax_raw.set_xlim(0, target.quantile(0.99))
ax_raw.set_title('Raw Target (0–99th percentile)')

sns.histplot(np.log1p(target), kde=True, bins=80, stat='count', color='coral', ax=ax_log)
ax_log.set_title('Log(1+Target)')

plt.show()


In [None]:
df_patients = pd.read_csv(r'C:\Users\arpitha_work\Downloads\TRU MSCDS\Sem 3\Graduate Project\MIMIC\mimic-iv-3.1\hosp\patients.csv')
df_patients.info()

In [None]:
df_patients.head()

In [None]:
df_patients.isnull().sum()

In [None]:
# Distribution of anchor_age across the patients table.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df_patients['anchor_age'], bins=30, kde=True, color='skyblue', ax=ax)
ax.set_title('Distribution of Patient Age')
ax.set_xlabel('Age (years)')
ax.set_ylabel('Density / Count')
plt.show()


In [None]:
# Number of rows in the patients table per gender value.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='gender', data=df_patients, palette='pastel', ax=ax)
ax.set_title('Distribution of Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Number of Patients')
plt.show()


In [None]:
# Drop the patient columns that are not used in the rest of the notebook.
# errors='ignore' skips columns that are already gone, so re-running this cell
# does not raise a KeyError.
df_patients = df_patients.drop(
    columns=['anchor_year', 'anchor_year_group', 'dod'],
    errors='ignore',
)

In [None]:
df_merge1=df_transfers.merge(df_patients,on='subject_id',how='left')
df_merge1.head()

In [None]:
df_merge1.info()

In [None]:
df_merge1=df_merge1[df_merge1['anchor_age'] >=65]
df_merge1.head()

In [None]:
df_merge1.info()

In [None]:
# Scatter of ED length of stay against age with a fitted linear trend (red line).
fig, ax = plt.subplots(figsize=(8, 5))
sns.regplot(
    x='anchor_age',
    y='LOS_ED_hours',
    data=df_merge1,
    scatter_kws={'alpha':0.3},
    line_kws={'color':'red'},
    ax=ax,
)
ax.set_title('ED LOS vs Age with Trend Line')
ax.set_xlabel('Age (years)')
ax.set_ylabel('LOS_ED_hours')
plt.show()


In [None]:
# Correlation (pandas default method) between age and the ED length of stay.
age_col, los_col = df_merge1['anchor_age'], df_merge1['LOS_ED_hours']
corr = age_col.corr(los_col)
print("Correlation between age and ED LOS: {:.2f}".format(corr))


In [None]:
# ED length of stay split by gender.
fig, ax = plt.subplots(figsize=(6, 5))
sns.boxplot(x='gender', y='LOS_ED_hours', data=df_merge1, palette='viridis', ax=ax)
ax.set_title('ED LOS by Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('LOS_ED_hours')
plt.show()


In [None]:
df_admissions = pd.read_csv(r'C:\Users\arpitha_work\Downloads\TRU MSCDS\Sem 3\Graduate Project\MIMIC\mimic-iv-3.1\hosp\admissions.csv')
df_admissions.head()

In [None]:
df_admissions.info()

In [None]:
# Drop the admission columns that are not used in the rest of the notebook.
# errors='ignore' skips columns that are already gone, so re-running this cell
# does not raise a KeyError.
df_admissions = df_admissions.drop(
    columns=['dischtime', 'discharge_location', 'language',
             'edregtime', 'edouttime', 'hospital_expire_flag', 'deathtime'],
    errors='ignore',
)

In [None]:
df_admissions = df_admissions.dropna(subset=['marital_status'])


In [None]:
df_admissions.isnull().sum()

In [None]:
df_admissions.info()

In [None]:
#lets understand about provider

df_admissions['admit_provider_id'].nunique()

In [None]:
# Number of distinct patients (subject_id) seen by each admitting provider.
patients_per_doctor = (
    df_admissions
    .groupby('admit_provider_id')['subject_id']
    .nunique()
    .reset_index(name='unique_patients')
)

print(patients_per_doctor)

In [None]:
# Row counts of df_admissions per admission type.
# countplot counts table rows; df_admissions is loaded from admissions.csv, so
# each row is presumably one admission (not one patient) - the y label says so.
plt.figure(figsize=(6,4))
sns.countplot(x='admission_type', data=df_admissions, palette='Set2')
plt.title('Type of Admission - Distribution', pad=15)
plt.xlabel('Admission_Type')
plt.ylabel('No of Admissions')

# Rotate the tick labels because the category names overlap otherwise.
plt.xticks(rotation=15, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Row counts of df_admissions per admission location.
# countplot counts table rows; df_admissions is loaded from admissions.csv, so
# each row is presumably one admission (not one patient) - the y label says so.
plt.figure(figsize=(6,4))
sns.countplot(x='admission_location', data=df_admissions, palette='Set2')
plt.title(' Admission Location - Distribution', pad=20)
plt.xlabel('Admission_Location')
plt.ylabel('No of Admissions')

# Rotate the tick labels because the category names overlap otherwise.
plt.xticks(rotation=25, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Share of df_admissions rows per insurance value, drawn as a pie chart.
insurance_counts = df_admissions['insurance'].value_counts()

fig, ax = plt.subplots(figsize=(7, 7))
pie_style = dict(autopct='%1.1f%%', startangle=90, colors=plt.cm.Pastel1.colors)
ax.pie(insurance_counts, labels=insurance_counts.index, **pie_style)
ax.set_title('Insurance Type Distribution', pad=20)
plt.show()


In [None]:
df_merge2 = pd.merge(df_merge1, df_admissions, on=['hadm_id','subject_id'], how='inner')
# An inner join (rather than a left join) is used because only records that match in both tables are needed.

In [None]:
df_merge2.info()

In [None]:
# Horizontal boxplots of the ED length of stay, one box per insurance type.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(y='insurance', x='LOS_ED_hours', data=df_merge2, palette='Set2', ax=ax)
ax.set_title('ED LOS by Insurance Type', pad=15)
ax.set_xlabel('LOS_ED_hours')
ax.set_ylabel('Insurance Type')
plt.xticks(rotation=30)  # tilt the x tick labels
plt.show()


In [None]:
# Horizontal boxplots of the ED length of stay, one box per admission type.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(y='admission_type', x='LOS_ED_hours', data=df_merge2, palette='Set2', ax=ax)
ax.set_title('ED LOS by Admission Type', pad=15)
ax.set_xlabel('LOS_ED_hours')
ax.set_ylabel('Admission Type')
plt.xticks(rotation=30)  # tilt the x tick labels
plt.show()


In [None]:
df_merge2['LOS_ED_hours'].describe()

In [None]:
# Mean ED length of stay per race, longest first.
mean_los_race = (
    df_merge2
    .groupby('race')['LOS_ED_hours']
    .mean()
    .reset_index()
    .sort_values(by='LOS_ED_hours', ascending=False)
)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(y='race', x='LOS_ED_hours', data=mean_los_race, palette='pastel', ax=ax)
ax.set_title('Average ED LOS by Race (Sorted)', pad=15)
ax.set_xlabel('Average LOS_ED_hours')
ax.set_ylabel('Race')
plt.show()


In [None]:
# Mean ED length of stay per marital status, longest first.
# The result gets its own name instead of reusing `mean_los_race`, which
# belongs to the race breakdown and would otherwise be silently overwritten.
mean_los_marital = df_merge2.groupby('marital_status')['LOS_ED_hours'].mean().reset_index()
mean_los_marital = mean_los_marital.sort_values(by='LOS_ED_hours', ascending=False)

plt.figure(figsize=(8,5))
sns.barplot(
    y='marital_status',
    x='LOS_ED_hours',
    data=mean_los_marital,
    palette='viridis'
)
plt.title('Average ED LOS by marital_Status (Sorted)', pad=15)
plt.xlabel('Average LOS_ED_hours')
plt.ylabel('marital_status')
plt.show()


In [None]:
# Mean ED length of stay per admission location, longest first.
# The result gets its own name instead of reusing `mean_los_race`, which
# belongs to the race breakdown and would otherwise be silently overwritten.
mean_los_location = df_merge2.groupby('admission_location')['LOS_ED_hours'].mean().reset_index()
mean_los_location = mean_los_location.sort_values(by='LOS_ED_hours', ascending=False)

plt.figure(figsize=(8,5))
sns.barplot(
    y='admission_location',
    x='LOS_ED_hours',
    data=mean_los_location,
    palette='Blues'
)
plt.title('Average ED LOS by admission_location (Sorted)', pad=15)
plt.xlabel('Average LOS_ED_hours')
plt.ylabel('admission_location')
plt.show()


In [None]:
# Parse admittime (unparseable values become NaT) and derive calendar features.
admit_ts = pd.to_datetime(df_merge2['admittime'], errors='coerce')
df_merge2['admittime'] = admit_ts
df_merge2['admit_hour'] = admit_ts.dt.hour
df_merge2['admit_day'] = admit_ts.dt.dayofweek  # 0=Monday, 6=Sunday


In [None]:
# ED length of stay for each hour of the day at which the admission started.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='admit_hour', y='LOS_ED_hours', data=df_merge2, palette='pastel', ax=ax)
ax.set_title('ED LOS by Hour of Admission', pad=15)
ax.set_xlabel('Hour of Admission')
ax.set_ylabel('LOS_ED_hours')
plt.show()


In [None]:
# ED length of stay for each day of the week on which the admission started.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x='admit_day', y='LOS_ED_hours', data=df_merge2, palette='Set2', ax=ax)
ax.set_title('ED LOS by Day of Week', pad=15)
ax.set_xlabel('Day of Week (0=Mon)')
ax.set_ylabel('LOS_ED_hours')
plt.show()


In [None]:
# Scatter of ED length of stay against admission hour, with a linear trend
# line (red) drawn on the same axes.
hour_vs_los = dict(x='admit_hour', y='LOS_ED_hours', data=df_merge2)

ax = sns.scatterplot(color='skyblue', alpha=0.4, s=50, **hour_vs_los)
sns.regplot(scatter=False, color='red', line_kws={'lw':2}, ax=ax, **hour_vs_los)

ax.set_title('ED LOS by Hour of Admission with Trend', pad=15)
ax.set_xlabel('Hour of Admission (0=Midnight)')
ax.set_ylabel('LOS_ED_hours')
ax.set_xticks(range(0,24))
plt.show()


In [None]:
file_path = r'C:\Users\arpitha_work\Downloads\TRU MSCDS\Sem 3\Graduate Project\MIMIC\mimic-iv-3.1\hosp\diagnoses_icd.csv.gz'
df_diagnoses = pd.read_csv(file_path, compression ='gzip')
df_diagnoses.head()


In [None]:
# Number of distinct (non-null) diagnosis codes in the table.
n_unique_codes = df_diagnoses['icd_code'].nunique()
print(f'There are {n_unique_codes} unique ICD9 codes in this dataset.')

In [None]:
# ICD-9 chapters as inclusive ranges over the first three digits of the code.
# Defined once at module level so the table is not rebuilt for every row.
_ICD9_RANGES = (
    ((1, 139), 'infectious'),
    ((140, 239), 'neoplasms'),
    ((240, 279), 'endocrine'),
    ((280, 289), 'blood'),
    ((290, 319), 'mental'),
    ((320, 389), 'nervous'),
    ((390, 459), 'circulatory'),
    ((460, 519), 'respiratory'),
    ((520, 579), 'digestive'),
    ((580, 629), 'genitourinary'),
    ((630, 679), 'pregnancy'),
    ((680, 709), 'skin'),
    ((710, 739), 'muscular'),
    ((740, 759), 'congenital'),
    ((760, 779), 'prenatal'),
    ((780, 799), 'misc'),
    ((800, 999), 'injury'),
)

# ICD-10 chapters keyed by the leading letter, mapped onto the same short
# labels. 'D' is handled separately (D00-D49 neoplasms, D50-D89 blood).
# Letters not listed here (R symptoms, U, V-Y external causes, Z factors)
# fall back to 'misc', mirroring how ICD-9 780-799 / E / V codes are treated.
_ICD10_CHAPTERS = {
    'A': 'infectious', 'B': 'infectious',
    'C': 'neoplasms',
    'E': 'endocrine',
    'F': 'mental',
    'G': 'nervous', 'H': 'nervous',  # H = eye/ear, grouped with ICD-9 "sense organs"
    'I': 'circulatory',
    'J': 'respiratory',
    'K': 'digestive',
    'L': 'skin',
    'M': 'muscular',
    'N': 'genitourinary',
    'O': 'pregnancy',
    'P': 'prenatal',
    'Q': 'congenital',
    'S': 'injury', 'T': 'injury',
}


def _categorize_icd10_short(code_str):
    """Map a stripped, upper-cased ICD-10 code to a short category name."""
    if not code_str:
        return 'misc'
    letter = code_str[0]
    if letter == 'D':
        tens_digit = code_str[1:2]
        if tens_digit and tens_digit in '0123456789':
            return 'neoplasms' if int(tens_digit) <= 4 else 'blood'
        return 'misc'
    return _ICD10_CHAPTERS.get(letter, 'misc')


def categorize_icd9_short(code, icd_version=None):
    """Return the short diagnosis category name for an ICD code.

    Parameters
    ----------
    code : str or scalar
        ICD code, expected without a decimal point (e.g. "4019").
        Missing values (None / NaN) map to 'misc'.
        NOTE(review): a numeric dtype would lose leading zeros ("0389" -> 389)
        and change the category; this assumes codes arrive as strings.
    icd_version : int or str, optional
        Pass 10 to classify `code` by its ICD-10 chapter. Any other value,
        including the default None, uses the ICD-9 rules, so existing
        single-argument calls behave exactly as before.

    Returns
    -------
    str
        One of the short labels ('infectious', 'neoplasms', ..., 'injury'),
        or 'misc' for anything that cannot be classified.
    """
    if pd.isna(code):
        return 'misc'

    code_str = str(code).strip().upper()

    if icd_version is not None and str(icd_version).strip() in ('10', '10.0'):
        return _categorize_icd10_short(code_str)

    # ICD-9 supplementary E (external cause) and V (health factors) codes.
    if code_str.startswith(('E', 'V')):
        return 'misc'

    try:
        num = int(code_str[:3])
    except ValueError:
        # Non-numeric prefix (e.g. an ICD-10 code passed without icd_version=10).
        return 'misc'

    for (low, high), label in _ICD9_RANGES:
        if low <= num <= high:
            return label
    return 'misc'

In [None]:
# Label every diagnosis row with its short category.
# NOTE(review): categorize_icd9_short only recognises ICD-9 numeric ranges, so
# any non-ICD-9 codes in icd_code end up in 'misc' - confirm this is intended.
icd_categories = df_diagnoses['icd_code'].apply(categorize_icd9_short)
df_diagnoses['icd_category'] = icd_categories

# One row per admission, one column per category, values = number of diagnoses.
per_admission_counts = df_diagnoses.groupby(['hadm_id', 'icd_category']).size()
diag_counts = per_admission_counts.unstack(fill_value=0).reset_index()

In [None]:
# Attach the per-admission diagnosis-category counts.
# Diagnosis columns left over from a previous run of this cell are removed
# first; otherwise a re-run would create *_x / *_y duplicates and the integer
# cast in the following cell would fail on the missing column names.
diag_cols = [c for c in diag_counts.columns if c != 'hadm_id']
df_merge2 = (
    df_merge2
    .drop(columns=diag_cols, errors='ignore')
    .merge(diag_counts, on='hadm_id', how='left')
)

In [None]:

# Admissions without any diagnosis row have NaN counts after the left merge.
# Only the diagnosis count columns are zero-filled and cast to int. A
# frame-wide fillna(0) would also write a literal 0 into categorical and
# datetime columns, creating a bogus category and hiding missing values from
# the imputers used in the modelling pipeline.
diag_cols = [col for col in diag_counts.columns if col != 'hadm_id']
df_merge2[diag_cols] = df_merge2[diag_cols].fillna(0).astype(int)


print(df_merge2.shape)
df_merge2.info()

In [None]:
df_merge2.head()

In [None]:
# Diagnosis categories to compare.
diseases = [
    'blood', 'circulatory', 'congenital', 'digestive', 'endocrine',
    'genitourinary', 'infectious', 'injury', 'mental', 'misc',
    'muscular', 'neoplasms', 'nervous', 'respiratory', 'skin'
]

# Mean ED length of stay over the admissions with at least one diagnosis
# in the given category.
mean_los = {
    disease: df_merge2.loc[df_merge2[disease] > 0, 'LOS_ED_hours'].mean()
    for disease in diseases
}

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the mean ED length of stay per diagnosis category, with the
# value printed above every bar.
fig, ax = plt.subplots(figsize=(12, 6))
bars = sns.barplot(x=list(mean_los.keys()), y=list(mean_los.values()), palette='viridis', ax=ax)

for patch in bars.patches:
    bar_top = patch.get_height()
    bar_centre = patch.get_x() + patch.get_width() / 2
    bars.annotate(
        f'{bar_top:.2f}',          # 2 decimal places
        xy=(bar_centre, bar_top),
        xytext=(0, 5),             # 5 points above the bar
        textcoords='offset points',
        ha='center',
        va='bottom',
    )

plt.xticks(rotation=45)
ax.set_ylabel('Mean LOS in ED (hours)')
ax.set_xlabel('Disease Category')
ax.set_title('Mean ED Length of Stay by Disease Category')
plt.show()


In [None]:
df_merge2.info()

In [None]:
df_merge2.head()

In [None]:
##-----

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, VotingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score



# Build the feature matrix X and the target y.
# Besides the target and the identifiers, `is_outlier` is excluded when present:
# it is a flag computed from LOS_ED_hours itself, so using it as a feature would
# leak the target into the models and inflate every score.
NON_FEATURE_COLS = ["LOS_ED_hours", "hadm_id", "subject_id", "is_outlier"]
X = df_merge2.drop(columns=NON_FEATURE_COLS, errors="ignore").copy()
y = df_merge2["LOS_ED_hours"].astype(float).copy()


# The target is log1p-transformed before fitting, which needs y >= 0.
if (y < 0).any():
    raise ValueError("Found negative LOS_ED_hours values. Fix data before log transform.")


# Expand every datetime column into hour / weekday / month parts, then drop the
# raw timestamps. "datetime" / "datetimetz" select naive and timezone-aware
# datetime columns regardless of their resolution.
datetime_cols = X.select_dtypes(include=["datetime", "datetimetz"]).columns.tolist()

for col in datetime_cols:
    dt = pd.to_datetime(X[col], errors="coerce")
    X[col + "_hour"] = dt.dt.hour
    X[col + "_weekday"] = dt.dt.weekday
    X[col + "_month"] = dt.dt.month

X.drop(columns=datetime_cols, inplace=True, errors="ignore")


# Force categorical columns to plain strings so the encoder sees one type per
# column ...
cat_like = X.select_dtypes(include=["object", "string", "category"]).columns
X[cat_like] = X[cat_like].astype(str)


# ... and turn the stringified missing markers back into real NaN so the
# categorical imputer can handle them.
X[cat_like] = X[cat_like].replace({"nan": np.nan, "NaT": np.nan, "None": np.nan})


# NOTE(review): rows are admissions and the split is purely random, so a patient
# with several admissions can appear in both train and test - consider a split
# grouped by subject_id.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)






In [None]:


# The models are fitted on log1p(hours); predictions are mapped back with expm1
# when they are evaluated.
y_train_log, y_test_log = np.log1p(y_train), np.log1p(y_test)


# Numeric columns get median imputation; every other column is treated as
# categorical: most-frequent imputation followed by one-hot encoding.
num_cols = list(X_train.select_dtypes(include=[np.number]).columns)
cat_cols = [col for col in X_train.columns if col not in num_cols]

numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy="median"))])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # handle_unknown="ignore": categories unseen during fit encode as all zeros.
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    [
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ],
    remainder="drop",
)


In [None]:

# GradientBoostingRegressor is not part of the notebook's sklearn.ensemble
# import line, so it is imported here; without it this cell raises NameError on
# a fresh kernel.
from sklearn.ensemble import GradientBoostingRegressor

# Shallow, small forests keep the training time manageable.
rf = RandomForestRegressor(
    n_estimators=50,
    max_depth=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=1
)

et = ExtraTreesRegressor(
    n_estimators=80,
    max_depth=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=1
)

gbr_stack = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

ridge = Ridge(alpha=1.0, random_state=42)

# Unweighted average of the four base models.
voting = VotingRegressor(
    estimators=[("et", et), ("rf", rf), ("ridge", ridge),("gbr_stack", gbr_stack)]
)

# Weighted voting: gradient boosting carries the largest weight (3), followed by
# ExtraTrees (2), RandomForest (1) and Ridge (0.5).
weighted_voting = VotingRegressor(
    estimators=[("et", et), ("rf", rf), ("ridge", ridge),("gbr_stack", gbr_stack)],
    weights=[2, 1, 0.5,3]
)

# Models evaluated in the next cell (Ridge is only used inside the ensembles).
models = {
    "RandomForest": rf,
    "ExtraTrees": et,
    "GradientBoostingRegressor":gbr_stack,
    "VotingRegressor": voting,
    "WeightedVotingRegressor": weighted_voting
}


In [None]:

# Fit every model inside the shared preprocessing pipeline and collect its
# test-set metrics.
results = {}

for name, model in models.items():
    pipe = Pipeline([("preprocess", preprocess), ("model", model)])

    print(f"\nTraining: {name}")
    pipe.fit(X_train, y_train_log)

    # Predictions are on the log1p scale: map them back to hours and floor
    # negative values at 0.
    y_pred_log = pipe.predict(X_test)
    y_pred = np.clip(np.expm1(y_pred_log), 0, None)

    mse = mean_squared_error(y_test, y_pred)
    results[name] = {
        "MAE": mean_absolute_error(y_test, y_pred),
        "RMSE": np.sqrt(mse),
        "MSE": mse,
        "R2_hours": r2_score(y_test, y_pred),         # R2 on the hour scale
        "R2_log": r2_score(y_test_log, y_pred_log),   # R2 on the log1p scale
    }

results_df = pd.DataFrame(results).T.sort_values("MAE")
print("\n===== RESULTS (sorted by MAE) =====")
print(results_df)

In [None]:
# SIMPLE STACKING - same models 


from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge


# Base learners for the stack: larger and deeper forests than the ones used in
# the voting ensembles.
shared_tree_params = dict(max_depth=15, min_samples_leaf=3, random_state=42, n_jobs=1)
rf_stack = RandomForestRegressor(n_estimators=120, **shared_tree_params)
et_stack = ExtraTreesRegressor(n_estimators=180, **shared_tree_params)

# Ridge meta-learner combining the base learners' predictions (3-fold CV).
meta = Ridge(alpha=1.0, random_state=42)

simple_stacking = StackingRegressor(
    estimators=[("rf", rf_stack), ("et", et_stack)],
    final_estimator=meta,
    passthrough=False,
    cv=3,
    n_jobs=1,
)

stack_pipe = Pipeline([("preprocess", preprocess), ("model", simple_stacking)])

print("\nTraining: SimpleStacking")
stack_pipe.fit(X_train, y_train_log)

# Back-transform the log1p-scale predictions to hours, floored at 0.
y_pred_log_stack = stack_pipe.predict(X_test)
y_pred_stack = np.clip(np.expm1(y_pred_log_stack), 0, None)

mse = mean_squared_error(y_test, y_pred_stack)
results["SimpleStacking"] = {
    "MAE": mean_absolute_error(y_test, y_pred_stack),
    "RMSE": np.sqrt(mse),
    "MSE": mse,
    "R2_hours": r2_score(y_test, y_pred_stack),
    "R2_log": r2_score(y_test_log, y_pred_log_stack),
}

# Re-rank all models now that the stacking result has been added.
results_df = pd.DataFrame(results).T.sort_values("MAE")
print("\n===== RESULTS (INCLUDING STACKING) =====")
print(results_df)


In [None]:
# ## Slower variant of the modelling code, kept commented out for reference -- it takes long to run because of the CV folds ---


# import numpy as np
# import pandas as pd

# from sklearn.model_selection import train_test_split
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer

# from sklearn.ensemble import (
#     RandomForestRegressor,
#     ExtraTreesRegressor,
#     GradientBoostingRegressor,
#     AdaBoostRegressor,
#     HistGradientBoostingRegressor,
#     StackingRegressor,
#     VotingRegressor
# )

# from sklearn.linear_model import Ridge
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# X = df_merge2.drop(columns=["LOS_ED_hours", "hadm_id", "subject_id"], errors="ignore").copy()
# y = df_merge2["LOS_ED_hours"].astype(float).copy()


# if (y < 0).any():
#     raise ValueError("Found negative LOS_ED_hours values. Fix data before log transform.")



# datetime_cols = X.select_dtypes(include=["datetime64[ns]", "datetime64[ns, UTC]"]).columns.tolist()

# for col in datetime_cols:
#     dt = pd.to_datetime(X[col], errors="coerce")
#     X[col + "_hour"] = dt.dt.hour
#     X[col + "_weekday"] = dt.dt.weekday
#     X[col + "_month"] = dt.dt.month

# X.drop(columns=datetime_cols, inplace=True, errors="ignore")


# cat_like = X.select_dtypes(include=["object", "string", "category"]).columns
# X[cat_like] = X[cat_like].astype(str)

# X[cat_like] = X[cat_like].replace({
#     "nan": np.nan,
#     "NaT": np.nan,
#     "None": np.nan
# })



# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42
# )



# y_train_log = np.log1p(y_train)
# y_test_log = np.log1p(y_test)



# num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
# cat_cols = [c for c in X_train.columns if c not in num_cols]

# numeric_transformer = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy="median"))
# ])

# categorical_transformer = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy="most_frequent")),
#     ("onehot", OneHotEncoder(handle_unknown="ignore"))
# ])

# preprocess = ColumnTransformer(
#     transformers=[
#         ("num", numeric_transformer, num_cols),
#         ("cat", categorical_transformer, cat_cols)
#     ],
#     remainder="drop"
# )



# ensemble_models = {
#     "RandomForest": RandomForestRegressor(
#         n_estimators=300,
#         random_state=42,
#         n_jobs=4
#     ),
#     "ExtraTrees": ExtraTreesRegressor(
#         n_estimators=300,
#         random_state=42,
#         n_jobs=4
#     ),
#     "GradientBoosting": GradientBoostingRegressor(
#         n_estimators=300,
#         random_state=42
#     ),
#     "AdaBoost": AdaBoostRegressor(
#         n_estimators=400,
#         random_state=42
#     ),
#     "HistGradientBoosting": HistGradientBoostingRegressor(
#         random_state=42
#     )
# }

# voting = VotingRegressor(
#     estimators=[
#         ("rf", ensemble_models["RandomForest"]),
#         ("et", ensemble_models["ExtraTrees"]),
#         ("hgb", ensemble_models["HistGradientBoosting"])
#     ]
# )

# stacking = StackingRegressor(
#     estimators=[
#         ("rf", ensemble_models["RandomForest"]),
#         ("et", ensemble_models["ExtraTrees"]),
#         ("gbr", ensemble_models["GradientBoosting"]),
#         ("hgb", ensemble_models["HistGradientBoosting"])
#     ],
#     final_estimator=Ridge(alpha=1.0),
#     passthrough=False,
#     cv=3,
#     n_jobs=1
# )

# ensemble_models["VotingRegressor"] = voting
# ensemble_models["StackingRegressor"] = stacking



# results = {}

# for name, model in ensemble_models.items():
#     pipe = Pipeline(steps=[
#         ("preprocess", preprocess),
#         ("model", model)
#     ])

#     pipe.fit(X_train, y_train_log)

#     y_pred_log = pipe.predict(X_test)
#     y_pred = np.expm1(y_pred_log)
#     y_pred = np.clip(y_pred, 0, None)

#     mse = mean_squared_error(y_test, y_pred)
#     rmse = np.sqrt(mse)
#     mae = mean_absolute_error(y_test, y_pred)
#     r2_hours = r2_score(y_test, y_pred)
#     r2_log = r2_score(y_test_log, y_pred_log)

#     results[name] = {
#         "MAE": mae,
#         "RMSE": rmse,
#         "MSE": mse,
#         "R2_hours": r2_hours,
#         "R2_log": r2_log
#     }

# results_df = pd.DataFrame(results).T.sort_values("MAE")
# print(results_df)


In [None]:
def end_signature():
    """Print the three-line banner that marks the end of the notebook run."""
    rule = "=" * 40
    for line in (rule, "END OF NOTEBOOK — AT", rule):
        print(line)

end_signature()
