In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')


# File path
data_path = '/content/drive/My Drive/ColabNotebooks/Cluepoints/'  # Adjust the path
data_file = data_path + 'hospital_stay_data.csv'

# Load the dataset
df = pd.read_csv(data_file)




Mounted at /content/drive


In [None]:


# Fill missing values: Numerical with median, Categorical with mode
df.fillna(df.median(numeric_only=True), inplace=True)
for col in df.select_dtypes(include='object').columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

# 2.1 Convert 'Stay' Range to Mean Value
stay_mapping = {
    '0-10': 5, '11-20': 15, '21-30': 25, '31-40': 35, '41-50': 45,
    '51-60': 55, '61-70': 65, '71-80': 75, '81-90': 85, '91-100': 95, 'More than 100 Days': 110
}
df['Stay'] = df['Stay'].map(stay_mapping)

# 2.2 Convert 'Age' Range to Mean Value
age_mapping = {
    '0-10': 5, '11-20': 15, '21-30': 25, '31-40': 35, '41-50': 45, '51-60': 55, '61-70': 65,
    '71-80': 75, '81-90': 85, '91-100': 95
}
df['Age'] = df['Age'].map(age_mapping).fillna(df['Age'])  # If age is not in range, keep it

# Separate categorical and numerical features
categorical_cols = ['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital', 'Hospital_region_code', 'Department',
                    'Ward_Type', 'Ward_Facility_Code', 'patientid', 'City_Code_Patient', 'Type of Admission', 'Severity of Illness']
numerical_cols = ['Available Extra Rooms in Hospital', 'Age', 'Admission_Deposit', 'Stay', 'Visitors with Patient', 'Bed Grade']

# Apply LabelEncoder to categorical columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))  # Ensure the categorical columns are string type
    label_encoders[col] = le  # Store encoder


# Features and target
X = df.drop(columns=['Stay', 'case_id', 'Available Extra Rooms in Hospital', 'Ward_Facility_Code', 'patientid' , 'City_Code_Patient', 'Severity of Illness', 'Admission_Deposit']).values
y = df['Stay'].values

# Convert to PyTorch tensors (correct for regression)
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32).view(-1, 1)




# Split data: 70% train, 15% valid, 15% test
X_train, X_temp, y_train, y_temp = train_test_split(X, y_tensor, test_size=0.3, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Convert to PyTorch datasets
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
valid_dataset = TensorDataset(torch.tensor(X_valid, dtype=torch.float32), torch.tensor(y_valid, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))

# Create DataLoaders
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
  train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
  valid_dataset = TensorDataset(torch.tensor(X_valid, dtype=torch.float32), torch.tensor(y_valid, dtype=torch.long))
  test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))


In [None]:
class HospitalStayPredictor(nn.Module):
    def __init__(self, input_dim):
        super(HospitalStayPredictor, self).__init__()

        # Increased network capacity: More layers and neurons
        self.fc1 = nn.Linear(input_dim, 512)  # Increased neurons
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 32)
        self.out = nn.Linear(32, 1)  # Output layer

        # Activation function (LeakyReLU is better than ReLU)
        self.activation = nn.LeakyReLU(0.1)

        # Dropout for regularization (reduces overfitting)
        self.dropout = nn.Dropout(0.3)

        # Batch Normalization (stabilizes training)
        self.bn1 = nn.BatchNorm1d(512)
        self.bn2 = nn.BatchNorm1d(256)
        self.bn3 = nn.BatchNorm1d(128)
        self.bn4 = nn.BatchNorm1d(64)
        self.bn5 = nn.BatchNorm1d(32)

    def forward(self, x):
        x = self.dropout(self.activation(self.bn1(self.fc1(x))))
        x = self.dropout(self.activation(self.bn2(self.fc2(x))))
        x = self.dropout(self.activation(self.bn3(self.fc3(x))))
        x = self.dropout(self.activation(self.bn4(self.fc4(x))))
        x = self.dropout(self.activation(self.bn5(self.fc5(x))))
        x = self.out(x)  # Output layer (no activation since it's regression)
        return x

In [None]:


# Initialize model
input_dim = X_train.shape[1]
model = HospitalStayPredictor(input_dim)  # Remove num_classes


# Early stopping setup
num_epochs = 20
patience = 10  # Early stopping patience
best_val_loss = float('inf')
epochs_no_improve = 0

# Define loss function and optimizer
criterion = criterion = nn.SmoothL1Loss()  # Huber Loss
 #nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

# Learning rate scheduler (Reduce LR if validation loss stops improving)
scheduler = optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.0001, max_lr=0.01, step_size_up=5, mode='triangular2')
#scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, verbose=True)

# Training loop with early stopping and adaptive learning rate
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.float(), targets.float()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Compute validation loss
    model.eval()
    valid_loss = 0
    with torch.no_grad():
        for inputs, targets in valid_loader:
            inputs, targets = inputs.float(), targets.float()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            valid_loss += loss.item()

    # Calculate average losses
    train_loss /= len(train_loader)
    valid_loss /= len(valid_loader)
    print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}")

    # Adjust learning rate based on validation loss
    scheduler.step(valid_loss)

    # Check for early stopping
    if valid_loss < (0.5 * best_val_loss):
        best_val_loss = valid_loss
        torch.save(model.state_dict(), data_path + "best_model.pth")
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("Early stopping triggered.")
            break

torch.save(model.state_dict(), data_path + "hospital_stay_regression.pth")


Epoch 1: Train Loss: 30.1335, Valid Loss: 27.7695




Epoch 2: Train Loss: 13.5413, Valid Loss: 11.1669
Epoch 3: Train Loss: 11.9173, Valid Loss: 11.0021
Epoch 4: Train Loss: 11.8047, Valid Loss: 11.1078
Epoch 5: Train Loss: 11.7144, Valid Loss: 10.9571
Epoch 6: Train Loss: 11.6696, Valid Loss: 10.8713
Epoch 7: Train Loss: 11.6106, Valid Loss: 10.8459
Epoch 8: Train Loss: 11.5635, Valid Loss: 10.7827
Epoch 9: Train Loss: 11.5247, Valid Loss: 10.7925
Epoch 10: Train Loss: 11.4978, Valid Loss: 10.7969
Epoch 11: Train Loss: 11.4614, Valid Loss: 10.7817
Epoch 12: Train Loss: 11.4345, Valid Loss: 10.7074
Early stopping triggered.


In [None]:
# Initialize the model
input_dim = X_train.shape[1]
model = HospitalStayPredictor(input_dim)  # Ensure the model is defined

# Load best model
model.load_state_dict(torch.load(data_path + "hospital_stay_regression.pth"))
model.eval()

# Evaluate on test data
test_loss = 0
num_samples = 0

# Store examples for display
example_inputs = []
example_targets = []
example_predictions = []

with torch.no_grad():
    for inputs, targets in test_loader:
        outputs = model(inputs)
        loss = criterion(outputs, targets)  # MSE loss
        test_loss += loss.item() * inputs.size(0)  # Multiply by batch size to get total loss
        num_samples += inputs.size(0)

        # Store first 5 examples for printing
        example_inputs.extend(inputs[:5].tolist())  # Convert tensor to list
        example_targets.extend(targets[:5].squeeze().tolist())  # Ensure it's a flat list
        example_predictions.extend(outputs[:5].squeeze().tolist())  # Ensure it's a flat list

# Compute Mean Squared Error (MSE) and Root Mean Squared Error (RMSE)
mse = test_loss / num_samples
rmse = np.sqrt(mse)

print(f"Test MSE: {mse:.4f}")
print(f"Test RMSE: {rmse:.4f}")

# Print some example predictions
print("\nExample Predictions (First 5 Samples):")
for i in range(min(5, len(example_inputs))):
    print(f"Input: {example_inputs[i]}")
    print(f"Actual Stay: {float(example_targets[i]):.2f} days")  # Convert explicitly to float
    print(f"Predicted Stay: {float(example_predictions[i]):.2f} days\n")  # Convert explicitly to float


  model.load_state_dict(torch.load(data_path + "hospital_stay_regression.pth"))


Test MSE: 10.8314
Test RMSE: 3.2911

Example Predictions (First 5 Samples):
Input: [17.0, 4.0, 0.0, 0.0, 2.0, 3.0, 3.0, 1.0, 2.0, 75.0]
Actual Stay: 25.00 days
Predicted Stay: 23.73 days

Input: [9.0, 3.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 6.0, 75.0]
Actual Stay: 110.00 days
Predicted Stay: 63.51 days

Input: [1.0, 4.0, 0.0, 0.0, 2.0, 1.0, 3.0, 0.0, 4.0, 75.0]
Actual Stay: 15.00 days
Predicted Stay: 22.75 days

Input: [1.0, 4.0, 0.0, 0.0, 2.0, 2.0, 2.0, 1.0, 2.0, 35.0]
Actual Stay: 25.00 days
Predicted Stay: 24.10 days

Input: [10.0, 0.0, 9.0, 1.0, 2.0, 1.0, 3.0, 0.0, 2.0, 45.0]
Actual Stay: 15.00 days
Predicted Stay: 15.56 days



In [None]:
# Function to decode categorical features
def decode_categorical_features(encoded_sample, label_encoders, categorical_cols):
    decoded_sample = []
    num_features = len(encoded_sample)  # Get actual number of input features

    for i, col in enumerate(categorical_cols):
        if i >= num_features:  # Prevent index out of range
            break

        if col in label_encoders:
            try:
                encoded_value = int(encoded_sample[i])
                decoded_value = label_encoders[col].inverse_transform([encoded_value])[0]
                decoded_value = str(col) + ':' + decoded_value
            except ValueError:
                decoded_value = 'Decoding failed:' + str(col) + ' ' + str(encoded_sample[i])
        else:
            decoded_value = str(col) + ':' + str(encoded_sample[i])

        decoded_sample.append(decoded_value)

    return decoded_sample


# Print example predictions with decoded categorical values
print("\nExample Predictions (First 5 Samples with Decoded Categories):")
for i in range(min(5, len(example_inputs))):
    decoded_input = decode_categorical_features(example_inputs[i], label_encoders, categorical_cols)

    print(f"Decoded Input: {decoded_input}")
    print(f"Actual Stay: {float(example_targets[i]):.2f} days")
    print(f"Predicted Stay: {float(example_predictions[i]):.2f} days\n")



Example Predictions (First 5 Samples with Decoded Categories):
Decoded Input: ['case_id:100011', 'Hospital_code:13', 'Hospital_type_code:a', 'City_Code_Hospital:1', 'Hospital_region_code:Z', 'Department:radiotherapy', 'Ward_Type:S', 'Ward_Facility_Code:B', 'patientid:100', 'Decoding failed:City_Code_Patient 75.0']
Actual Stay: 25.00 days
Predicted Stay: 23.73 days

Decoded Input: ['case_id:100004', 'Hospital_code:12', 'Hospital_type_code:d', 'City_Code_Hospital:10', 'Hospital_region_code:Z', 'Department:gynecology', 'Ward_Type:S', 'Ward_Facility_Code:B', 'patientid:100001', 'Decoding failed:City_Code_Patient 75.0']
Actual Stay: 110.00 days
Predicted Stay: 63.51 days

Decoded Input: ['case_id:10', 'Hospital_code:13', 'Hospital_type_code:a', 'City_Code_Hospital:1', 'Hospital_region_code:Z', 'Department:anesthesia', 'Ward_Type:S', 'Ward_Facility_Code:A', 'patientid:10000', 'Decoding failed:City_Code_Patient 75.0']
Actual Stay: 15.00 days
Predicted Stay: 22.75 days

Decoded Input: ['case_

In [None]:
# Reverse mapping dictionary
reverse_stay_mapping = {
    (0, 10): '0-10', (11, 20): '11-20', (21, 30): '21-30', (31, 40): '31-40',
    (41, 50): '41-50', (51, 60): '51-60', (61, 70): '61-70', (71, 80): '71-80',
    (81, 90): '81-90', (91, 100): '91-100', (101, float('inf')): 'More than 100 Days'
}

# Function to map numerical Stay values back to categorical ranges
def map_stay_to_range(value):
    for (lower, upper), label in reverse_stay_mapping.items():
        if lower <= value <= upper:
            return label
    return "Unknown"  # Fallback in case of unexpected values

In [None]:
# Count correct predictions
correct_predictions = 0
total_predictions = len(example_inputs)

print("\nExample Predictions with Accuracy Check:")
for i in range(total_predictions):
    decoded_input = decode_categorical_features(example_inputs[i], label_encoders, categorical_cols)

    actual_range = map_stay_to_range(float(example_targets[i]))
    predicted_range = map_stay_to_range(float(example_predictions[i]))

    is_correct = actual_range == predicted_range  # True if ranges match
    if is_correct:
        correct_predictions += 1

    print(f"Decoded Input: {decoded_input}")
    print(f"Actual Stay: {actual_range} ({float(example_targets[i]):.2f} days)")
    print(f"Predicted Stay: {predicted_range} ({float(example_predictions[i]):.2f} days)")
    print(f"Match: {'✅' if is_correct else '❌'}\n")

# Calculate accuracy
accuracy = (correct_predictions / total_predictions) * 100
print(f"Prediction Accuracy: {accuracy:.2f}%")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Actual Stay: 51-60 (55.00 days)
Predicted Stay: 41-50 (44.56 days)
Match: ❌

Decoded Input: ['case_id:100012', 'Hospital_code:10', 'Hospital_type_code:e', 'City_Code_Hospital:10', 'Hospital_region_code:Y', 'Department:TB & Chest disease', 'Ward_Type:S', 'Ward_Facility_Code:B', 'patientid:10000', 'Decoding failed:City_Code_Patient 55.0']
Actual Stay: 21-30 (25.00 days)
Predicted Stay: 21-30 (24.65 days)
Match: ✅

Decoded Input: ['case_id:100014', 'Hospital_code:10', 'Hospital_type_code:c', 'City_Code_Hospital:1', 'Hospital_region_code:Z', 'Department:gynecology', 'Ward_Type:R', 'Ward_Facility_Code:B', 'patientid:100', 'Decoding failed:City_Code_Patient 65.0']
Actual Stay: 21-30 (25.00 days)
Predicted Stay: 21-30 (24.68 days)
Match: ✅

Decoded Input: ['case_id:100021', 'Hospital_code:1', 'Decoding failed:Hospital_type_code 8.0', 'City_Code_Hospital:1', 'Hospital_region_code:Z', 'Department:anesthesia', 'Ward_Type:R', 'Ward_