In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from holoviews.plotting.bokeh.styles import alpha
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
import catboost as cat
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore')

In [2]:
#importing CSV File
df = pd.read_csv('extractedMimic.csv')

In [3]:
df.head()

Unnamed: 0,icustay_id,hadm_id,intime,outtime,dbsource,suspected_infection_time_poe,suspected_infection_time_poe_days,specimen_poe,positiveculture_poe,antibiotic_time_poe,...,glucose_min1,glucose_max1,glucose_mean,rrt,subject_id,hadm_id.1,icustay_id.1,urineoutput,colloid_bolus,crystalloid_bolus
0,205941,156324,28/5/2157 14:26:21,30/5/2157 14:18:24,metavision,28/5/2157 15:30:00,-0.044201,MRSA SCREEN,0,28/5/2157 00:00:00,...,40.0,202.0,87.25,0,88883,156324,205941,0.0,,250.0
1,252848,163315,29/7/2196 02:26:17,29/7/2196 12:02:39,metavision,29/7/2196 04:57:00,-0.104664,MRSA SCREEN,1,29/7/2196 00:00:00,...,182.0,231.0,206.5,0,46154,163315,252848,0.0,,250.0
2,237901,180937,14/2/2145 17:55:07,23/2/2145 12:43:43,metavision,14/2/2145 21:20:00,-0.14228,BLOOD CULTURE,0,15/2/2145 00:00:00,...,123.0,185.0,151.285714,1,42682,180937,237901,0.0,,250.0
3,207491,143962,11/6/2159 12:47:02,14/6/2159 16:31:30,metavision,11/6/2159 12:11:00,0.025023,BLOOD CULTURE,0,11/6/2159 00:00:00,...,92.0,118.0,105.0,0,45111,143962,207491,4.0,,250.0
4,293063,118489,1/1/2135 17:28:33,2/1/2135 06:56:56,metavision,1/1/2135 15:55:00,0.064965,BLOOD CULTURE,0,1/1/2135 00:00:00,...,150.0,163.0,155.0,0,56648,118489,293063,5.0,,250.0


In [4]:
df.drop(df[df['age'] < 18].index, inplace=True)

df.describe()
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 4555 entries, 0 to 4558
Columns: 106 entries, icustay_id to crystalloid_bolus
dtypes: float64(57), int64(39), object(10)
memory usage: 3.7+ MB


In [5]:
X = df[['urineoutput', 'lactate_min','bun_mean','sysbp_min', 'metastatic_cancer', 'inr_max', 'age', 'sodium_max', 'aniongap_max', 'creatinine_min', 'spo2_mean']]

y = df['thirtyday_expire_flag']

In [6]:
X.isnull().sum()

urineoutput            0
lactate_min            0
bun_mean               0
sysbp_min              8
metastatic_cancer      0
inr_max              270
age                    0
sodium_max             0
aniongap_max          14
creatinine_min         2
spo2_mean              1
dtype: int64

In [7]:
X.shape

(4555, 11)

In [8]:
X.fillna(X.median(), inplace=True)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=40)


In [13]:
# Define the parameter grid for RandomizedSearchCV
param_dist = {
    'iterations': [ 400, 800, 1000, 1500, 1800],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 5, 6, 7],
    'l2_leaf_reg': [1, 3, 5, 7],
    'class_weights': [[1, 2], [1, 3], [1, 4]],  # Try different class weight combinations
    'border_count': [32, 64, 128],
    'bagging_temperature': [0, 0.5, 1]
}

# Initialize the CatBoost model
catboost_model = CatBoostClassifier(random_state=42, verbose=0)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=catboost_model,
                                   param_distributions=param_dist,
                                   n_iter=50,  # Increased number of iterations for better exploration
                                   cv=3,  # 3-fold cross-validation
                                   scoring='roc_auc',  # Optimize for AUC
                                   random_state=42,
                                   verbose=1,
                                   n_jobs=-1)  # Use all processors

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Best parameters from RandomizedSearchCV
best_params = random_search.best_params_

# Use best_params to define a narrower grid for GridSearchCV
param_grid = {
    'iterations': [best_params['iterations'] - 200, best_params['iterations'], best_params['iterations'] + 200],
    'learning_rate': [best_params['learning_rate'] * 0.5, best_params['learning_rate'], best_params['learning_rate'] * 1.5],
    'depth': [best_params['depth'] - 1, best_params['depth'], best_params['depth'] + 1],
    'l2_leaf_reg': [best_params['l2_leaf_reg'] - 2, best_params['l2_leaf_reg'], best_params['l2_leaf_reg'] + 2],
    'class_weights': [best_params['class_weights']],
    'border_count': [best_params['border_count']],
    'bagging_temperature': [best_params['bagging_temperature']]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=catboost_model,
                           param_grid=param_grid,
                           cv=5,  # 3-fold cross-validation
                           scoring='roc_auc',  # Optimize for AUC
                           verbose=1,
                           n_jobs=-1)  # Use all processors

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Retrieve the best estimator after fitting
best_model = grid_search.best_estimator_

# Predict probabilities and classes for the test set using the best model
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print evaluation metrics
print(f"Model Accuracy: {accuracy * 100:.5f}%")
print(f"Model AUC: {auc * 100:.5f}%")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

# Print the best hyperparameters
print("Best hyperparameters found: ", grid_search.best_params_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Model Accuracy: 84.10887%
Model AUC: 82.30275%
Confusion Matrix:
[[843  74]
 [107 115]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.92      0.90       917
           1       0.61      0.52      0.56       222

    accuracy                           0.84      1139
   macro avg       0.75      0.72      0.73      1139
weighted avg       0.83      0.84      0.84      1139

Best hyperparameters found:  {'bagging_temperature': 0.5, 'border_count': 64, 'class_weights': [1, 2], 'depth': 4, 'iterations': 1000, 'l2_leaf_reg': 3, 'learning_rate': 0.01}


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data = pd.read_csv('extractedMimic.csv')
for col in data.columns: # Check if the column is of type datetime by trying to convert 
    try: 
        data[col] = pd.to_datetime(data[col]) 
        # Convert datetime to timestamp difference from the first date in the column 
        data[col] = (data[col] - data[col].min()).dt.total_seconds() 
    except (ValueError, TypeError): # Column is not datetime; 
        # continue to the next step
        continue # Step 2: Encode categorical variables with Label Encoding 
label_encoders = {} 
for col in data.select_dtypes(include=['object']).columns: # Apply label encoding for categorical columns 
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le # Save encoder for future reference # Step 3: Fill missing values # You can choose different strategies (e.g., mean, median, mode) depending on the data characteristics 
data = data.fillna(data.median())

# Define features and target
# Assuming `mortality` is the target column for third-day mortality
X = data.drop(columns=['thirtyday_expire_flag'])
y = data['thirtyday_expire_flag']

# Preprocess data: handle missing values and standardize features
X = X.fillna(X.median())  # Simple missing value imputation
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 1. Filter Method: Univariate Selection with ANOVA F-test
select_k_best = SelectKBest(score_func=f_classif, k=20)  # Select top 20 features
X_selected = select_k_best.fit_transform(X_scaled, y)
selected_features_filter = X.columns[select_k_best.get_support()]

print("Selected features using Filter Method (Univariate Selection):")
print(selected_features_filter)

# 2. Wrapper Method: Recursive Feature Elimination (RFE) with RandomForestClassifier
rfe = RFE(estimator=RandomForestClassifier(n_estimators=100), n_features_to_select=20, step=1)
X_selected_rfe = rfe.fit_transform(X_scaled, y)
selected_features_rfe = X.columns[rfe.get_support()]

print("Selected features using Wrapper Method (RFE):")
print(selected_features_rfe)

# 3. Embedded Method: Lasso Regularization
lasso = Lasso(alpha=0.01)  # Adjust alpha as needed
lasso.fit(X_scaled, y)
selected_features_lasso = X.columns[(lasso.coef_ != 0)]

print("Selected features using Embedded Method (Lasso):")
print(selected_features_lasso)

# Combine selected features from each method for final model
final_selected_features = list(set(selected_features_filter) | set(selected_features_rfe) | set(selected_features_lasso))
X_final = X[final_selected_features]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.3, random_state=42)

# Train a final model with selected features
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy with selected features: {accuracy:.2f}")

  data[col] = pd.to_datetime(data[col])
  data[col] = pd.to_datetime(data[col])
  data[col] = pd.to_datetime(data[col])
  data[col] = pd.to_datetime(data[col])
  data[col] = pd.to_datetime(data[col])
  data[col] = pd.to_datetime(data[col])
  data[col] = pd.to_datetime(data[col])
  data[col] = pd.to_datetime(data[col])
  data[col] = pd.to_datetime(data[col])
  f = msb / msw


Selected features using Filter Method (Univariate Selection):
Index(['hospital_expire_flag', 'sepsis_explicit', 'severe_sepsis_explicit',
       'elixhauser_hospital', 'sofa', 'lods', 'aniongap_min', 'aniongap_max',
       'lactate_min', 'lactate_max', 'lactate_mean', 'bun_min', 'bun_max',
       'bun_mean', 'sysbp_min', 'meanbp_min', 'resprate_mean', 'spo2_min',
       'spo2_mean', 'urineoutput'],
      dtype='object')


ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [4]:
from sklearn.preprocessing import LabelEncoder

for col in data.columns: # Check if the column is of type datetime by trying to convert 
    try: 
        data[col] = pd.to_datetime(data[col]) 
        # Convert datetime to timestamp difference from the first date in the column 
        data[col] = (data[col] - data[col].min()).dt.total_seconds() 
    except (ValueError, TypeError): # Column is not datetime; 
        # continue to the next step
        continue # Step 2: Encode categorical variables with Label Encoding 
label_encoders = {} 
for col in data.select_dtypes(include=['object']).columns: # Apply label encoding for categorical columns 
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le 
data = data.fillna(data.median())

In [5]:

# Define features and target
# Assuming `mortality` is the target column for third-day mortality
X = data.drop(columns=['thirtyday_expire_flag'])
y = data['thirtyday_expire_flag']

# Preprocess data: handle missing values and standardize features
X = X.fillna(X.median())  # Simple missing value imputation
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 1. Filter Method: Univariate Selection with ANOVA F-test
select_k_best = SelectKBest(score_func=f_classif, k=20)  # Select top 20 features
X_selected = select_k_best.fit_transform(X_scaled, y)
selected_features_filter = X.columns[select_k_best.get_support()]

print("Selected features using Filter Method (Univariate Selection):")
print(selected_features_filter)

# 2. Wrapper Method: Recursive Feature Elimination (RFE) with RandomForestClassifier
rfe = RFE(estimator=RandomForestClassifier(n_estimators=100), n_features_to_select=20, step=1)
X_selected_rfe = rfe.fit_transform(X_scaled, y)
selected_features_rfe = X.columns[rfe.get_support()]

print("Selected features using Wrapper Method (RFE):")
print(selected_features_rfe)

# 3. Embedded Method: Lasso Regularization
lasso = Lasso(alpha=0.01)  # Adjust alpha as needed
lasso.fit(X_scaled, y)
selected_features_lasso = X.columns[(lasso.coef_ != 0)]

print("Selected features using Embedded Method (Lasso):")
print(selected_features_lasso)

# Combine selected features from each method for final model
final_selected_features = list(set(selected_features_filter) | set(selected_features_rfe) | set(selected_features_lasso))
X_final = X[final_selected_features]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.3, random_state=42)

# Train a final model with selected features
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy with selected features: {accuracy:.2f}")

  msb = ssbn / float(dfbn)


Selected features using Filter Method (Univariate Selection):
Index(['meanbp_mean', 'resprate_min', 'resprate_max', 'resprate_mean',
       'tempc_min', 'tempc_max', 'tempc_mean', 'spo2_min', 'spo2_max',
       'spo2_mean', 'glucose_min1', 'glucose_max1', 'glucose_mean', 'rrt',
       'subject_id', 'hadm_id.1', 'icustay_id.1', 'urineoutput',
       'colloid_bolus', 'crystalloid_bolus'],
      dtype='object')
Selected features using Wrapper Method (RFE):
Index(['meanbp_mean', 'resprate_min', 'resprate_max', 'resprate_mean',
       'tempc_min', 'tempc_max', 'tempc_mean', 'spo2_min', 'spo2_max',
       'spo2_mean', 'glucose_min1', 'glucose_max1', 'glucose_mean', 'rrt',
       'subject_id', 'hadm_id.1', 'icustay_id.1', 'urineoutput',
       'colloid_bolus', 'crystalloid_bolus'],
      dtype='object')
Selected features using Embedded Method (Lasso):
Index([], dtype='object')
Model Accuracy with selected features: 1.00


  model = cd_fast.enet_coordinate_descent(


In [8]:
# Necessary imports
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

# Load the dataset
df = pd.read_csv('extractedMimic.csv')

# Select features and target
X = df[['meanbp_mean', 'resprate_min', 'resprate_max', 'resprate_mean',
       'tempc_min', 'tempc_max', 'tempc_mean', 'spo2_min', 'spo2_max',
       'spo2_mean', 'glucose_min1', 'glucose_max1', 'glucose_mean', 'rrt',
       'subject_id', 'hadm_id.1', 'icustay_id.1', 'urineoutput',
       'colloid_bolus', 'crystalloid_bolus']]
y = df['thirtyday_expire_flag']

# Fill missing values with the median
X.fillna(X.median(), inplace=True)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print shapes of the resulting datasets
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

# Train the XGBoost model with hyperparameter tuning
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False, 
    eval_metric='logloss', 
    booster='gbtree', 
    objective='binary:logistic', 
    n_estimators=90,      # Number of trees
    max_depth=6,          # Depth of trees
    learning_rate=0.05,   # Learning rate
    subsample=0.8,        # Percentage of data for each tree
    colsample_bytree=0.8, # Feature subsampling
    gamma=0.05            # Regularization parameter
)

# Fit the model
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)  # Predicted labels
y_probs = xgb_model.predict_proba(X_test)[:, 1]  # Predicted probabilities for class 1

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_probs)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print evaluation metrics
print(f"Model Accuracy: {accuracy * 100:.5f}%")
print(f"Model AUC: {auc * 100:.5f}%")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)


Training set shape: (3647, 20)
Testing set shape: (912, 20)
Model Accuracy: 84.32018%
Model AUC: 80.50519%
Confusion Matrix:
[[721  19]
 [124  48]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.97      0.91       740
           1       0.72      0.28      0.40       172

    accuracy                           0.84       912
   macro avg       0.78      0.63      0.66       912
weighted avg       0.83      0.84      0.81       912



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.median(), inplace=True)
Parameters: { "use_label_encoder" } are not used.

