In [None]:
import pandas as pd
# Read the source CSV into a DataFrame. low_memory=False makes pandas parse the
# file in a single pass instead of in chunks.
df = pd.read_csv('HALT_score_included_cleaned.csv', low_memory=False)

In [None]:
# Preview the first five rows of the loaded data.
df.head()

In [None]:
# Many columns are mostly empty, so measure how sparse each column is before
# deciding whether it stays in the analysis.
# Share of missing entries per column, expressed as a percentage.
missing_percentage = df.isna().mean().mul(100)
# Names of the columns whose missing share is above 80%.
columns_to_drop = [name for name, pct in missing_percentage.items() if pct > 80]
print("Columns dropped:", columns_to_drop)

In [None]:
# Data cleaning: drop every column with more than 80% missing values, except the
# columns listed in essential_columns, which are kept regardless of how sparse
# they are.
# NOTE(review): the original list contained 'isserious1' twice; the duplicate is
# removed here (it had no effect on the result). If a different column was
# meant for the second entry — TODO confirm and add it.
essential_columns = ['timerandtodeath', 'causedeath', 'causedeathother', 'stillinhospday28', 'rebleedingnum', 'isserious1']
columns_to_drop2 = missing_percentage[missing_percentage > 80].index.difference(essential_columns).tolist()
cleaned_data = df.drop(columns=columns_to_drop2)
print(cleaned_data.head())

In [None]:
# Persist the cleaned dataset to disk, leaving out the row index.
cleaned_data.to_csv(path_or_buf='final_cleaned_data.csv', index=False)

In [None]:
# Count the missing entries in every column and keep only the columns that
# have at least one.
missing_data = cleaned_data.isna().sum().loc[lambda counts: counts > 0]
print(missing_data)

In [None]:
# Print the column names, non-null counts and dtypes of the cleaned frame.
cleaned_data.info()

In [None]:
# Frequency table for every text (object-dtype) column.
for col in cleaned_data.select_dtypes(include=['object']).columns:
    frequencies = cleaned_data[col].value_counts()
    print(f"{col}:\n{frequencies}\n")

In [None]:
#------------------------Different methodology below---------------------------#

In [None]:


# seaborn and matplotlib are imported here because this cell runs before the
# first cell that imports them; without these lines a fresh
# "Restart Kernel -> Run All" stops with a NameError on `sns`.
import seaborn as sns
import matplotlib.pyplot as plt

# Visualize missing data: the heatmap is drawn from the boolean "is missing"
# mask of the cleaned frame (rows x columns), without a colour bar.
sns.heatmap(cleaned_data.isnull(), cbar=False, cmap='viridis')
plt.title('Missing Data Heatmap')
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Start over from the raw file for the second methodology.
df = pd.read_csv('HALT_score_included_cleaned.csv', low_memory=False)

# Work on a copy so the freshly loaded frame stays untouched.
cleaned_data = df.copy()

# Numeric version of the 'rebleeding' column: 'No' -> 0, 'Yes' -> 1.
# Any other value (including missing) becomes NaN.
rebleeding_codes = {'No': 0, 'Yes': 1}
cleaned_data['rebleeding_encoded'] = cleaned_data['rebleeding'].map(rebleeding_codes)

# Show the original and encoded columns side by side as a sanity check.
print(cleaned_data.loc[:, ['rebleeding', 'rebleeding_encoded']].head())

In [None]:
# Bivariate plots of every numeric column against the encoded rebleeding outcome.
key_outcome1 = 'rebleeding_encoded'
numerical_cols = cleaned_data.select_dtypes(include=['int64', 'float64']).columns

# Pass 1: scatter plot with a fitted regression line, one figure per column.
for col in numerical_cols:
    if col == key_outcome1:
        continue  # the outcome is not plotted against itself
    sns.lmplot(data=cleaned_data, x=col, y=key_outcome1, aspect=1.5)
    plt.title(f'{col} vs {key_outcome1}')
    plt.show()

# Pass 2: box plot of each column, grouped by the outcome value.
for col in numerical_cols:
    if col == key_outcome1:
        continue
    plt.figure(figsize=(8, 4))
    sns.boxplot(data=cleaned_data, x=key_outcome1, y=col)
    plt.title(f'{key_outcome1} vs {col}')
    plt.show()

# Correlation matrix.
# The frame still contains text columns (e.g. 'rebleeding'). Since pandas 2.0,
# DataFrame.corr() no longer silently skips non-numeric columns and raises on
# them instead, so the frame is restricted to numeric/boolean columns first.
# This works on older and newer pandas alike.
numeric_frame = cleaned_data.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(16, 12))
sns.heatmap(numeric_frame.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Pair plot for numerical columns
sns.pairplot(cleaned_data[numerical_cols])
plt.show()

In [None]:
# Print the column names, non-null counts and dtypes, now including the
# 'rebleeding_encoded' column added above.
cleaned_data.info()

In [None]:
# Bivariate analysis, key outcome 2: build a binary 'death' column from 'causedeath'.
import seaborn as sns
import matplotlib.pyplot as plt

# 1 where a value is present in 'causedeath', 0 where it is missing.
cleaned_data['death'] = cleaned_data['causedeath'].notna().astype(int)

# Show the source column next to the derived flag as a sanity check.
print(cleaned_data.loc[:, ['causedeath', 'death']].head())

In [None]:
# Bivariate plots of every column against the binary 'death' outcome.
key_outcome2 = 'death'
numerical_cols = cleaned_data.select_dtypes(include=['int64', 'float64']).columns

# Pass 1: scatter plot with a fitted regression line, one figure per numeric column.
for col in numerical_cols:
    if col == key_outcome2:
        continue  # the outcome is not plotted against itself
    sns.lmplot(data=cleaned_data, x=col, y=key_outcome2, aspect=1.5)
    plt.title(f'{col} vs {key_outcome2}')
    plt.show()

# Pass 2: box plot of each numeric column, grouped by the outcome value.
for col in numerical_cols:
    if col == key_outcome2:
        continue
    plt.figure(figsize=(8, 4))
    sns.boxplot(data=cleaned_data, x=key_outcome2, y=col)
    plt.title(f'{key_outcome2} vs {col}')
    plt.show()

# Pass 3: for each text column, a heatmap of the category-by-outcome count table.
categorical_cols = cleaned_data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    count_matrix = pd.crosstab(cleaned_data[col], cleaned_data[key_outcome2])
    plt.figure(figsize=(10, 8))
    sns.heatmap(count_matrix, annot=True, fmt='d', cmap='coolwarm')
    plt.title(f'Heatmap of {col} by {key_outcome2}')
    plt.xlabel(key_outcome2)
    plt.ylabel(col)
    plt.show()


In [None]:
# Bivariate analysis, key outcome 3: build a binary 'long_hospital_stay' column
# from 'stillinhospday28'.
import seaborn as sns
import matplotlib.pyplot as plt

# 1 where 'stillinhospday28' holds any value, 0 where it is missing.
# NOTE(review): this marks every recorded value as a long stay regardless of
# what the value is — confirm that is the intended definition.
cleaned_data['long_hospital_stay'] = cleaned_data['stillinhospday28'].notna().astype(int)

# Show the derived flag next to its source column as a sanity check.
print(cleaned_data.loc[:, ['long_hospital_stay', 'stillinhospday28']].head())

In [None]:
# Bivariate plots of every column against the binary 'long_hospital_stay' outcome.
key_outcome3 = 'long_hospital_stay'
numerical_cols = cleaned_data.select_dtypes(include=['int64', 'float64']).columns

# Pass 1: scatter plot with a fitted regression line, one figure per numeric column.
for col in numerical_cols:
    if col == key_outcome3:
        continue  # the outcome is not plotted against itself
    sns.lmplot(data=cleaned_data, x=col, y=key_outcome3, aspect=1.5)
    plt.title(f'{col} vs {key_outcome3}')
    plt.show()

# Pass 2: box plot of each numeric column, grouped by the outcome value.
for col in numerical_cols:
    if col == key_outcome3:
        continue
    plt.figure(figsize=(8, 4))
    sns.boxplot(data=cleaned_data, x=key_outcome3, y=col)
    plt.title(f'{key_outcome3} vs {col}')
    plt.show()

# Pass 3: for each text column, a heatmap of the category-by-outcome count table.
categorical_cols = cleaned_data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    count_matrix = pd.crosstab(cleaned_data[col], cleaned_data[key_outcome3])
    plt.figure(figsize=(10, 8))
    sns.heatmap(count_matrix, annot=True, fmt='d', cmap='coolwarm')
    plt.title(f'Heatmap of {col} by {key_outcome3}')
    plt.xlabel(key_outcome3)
    plt.ylabel(col)
    plt.show()


In [None]:
# Correlation Matrix
# The frame still contains text columns. Since pandas 2.0, DataFrame.corr() no
# longer silently skips non-numeric columns and raises on them instead, so the
# frame is restricted to numeric/boolean columns first. This works on older and
# newer pandas alike.
numeric_frame = cleaned_data.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(16, 12))
sns.heatmap(numeric_frame.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Pair plot for numerical columns
sns.pairplot(cleaned_data[numerical_cols])
plt.show()

In [None]:
# Feature importance analysis, step 1: data preparation.
import pandas as pd

# Load the cleaned dataset
df = pd.read_csv('HALT_score_included_cleaned.csv', low_memory=False)

# Drop columns with more than 80% missing values, except essential ones
essential_columns = ['timerandtodeath', 'causedeath', 'causedeathother', 'stillinhospday28', 'rebleedingnum']
missing_percentage = df.isnull().mean() * 100
columns_to_drop2 = missing_percentage[missing_percentage > 80].index.difference(essential_columns).tolist()
cleaned_data = df.drop(columns=columns_to_drop2)

# Derive the missingness-based outcome flags BEFORE imputing.
# 'death' and 'long_hospital_stay' are defined as "the source column has a
# value". Once the imputation below has filled every gap, notnull() is True for
# every row and both outcomes would be a constant 1.
outcome_flags = {}
if 'timerandtodeath' in cleaned_data.columns:
    outcome_flags['death'] = cleaned_data['timerandtodeath'].notnull().astype(int)
if 'stillinhospday28' in cleaned_data.columns:
    outcome_flags['long_hospital_stay'] = cleaned_data['stillinhospday28'].notnull().astype(int)

# Impute missing values for numerical columns with median.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated and does not update
# the frame when pandas copy-on-write is active.
for col in cleaned_data.select_dtypes(include=['int64', 'float64']).columns:
    cleaned_data[col] = cleaned_data[col].fillna(cleaned_data[col].median())

# Impute missing values for categorical columns with mode.
# A column with no values at all has no mode; it is left as is instead of
# failing on mode()[0].
for col in cleaned_data.select_dtypes(include=['object']).columns:
    mode_values = cleaned_data[col].mode()
    if not mode_values.empty:
        cleaned_data[col] = cleaned_data[col].fillna(mode_values.iloc[0])

# Encode categorical variables
cleaned_data_encoded = pd.get_dummies(cleaned_data, drop_first=True)

# Add binary outcomes for analysis (same column order as before:
# rebleeding_encoded, death, long_hospital_stay).
if 'rebleeding' in cleaned_data.columns:
    cleaned_data_encoded['rebleeding_encoded'] = cleaned_data['rebleeding'].map({'No': 0, 'Yes': 1})
for outcome_name in ('death', 'long_hospital_stay'):
    if outcome_name in outcome_flags:
        cleaned_data_encoded[outcome_name] = outcome_flags[outcome_name]

# Verify the encoding. Only the outcome columns that were actually created are
# shown, so a missing optional source column does not raise a KeyError here.
outcome_columns = [
    name for name in ('rebleeding_encoded', 'death', 'long_hospital_stay')
    if name in cleaned_data_encoded.columns
]
print(cleaned_data_encoded[outcome_columns].head())

In [None]:
# Step 2: define features and target variables.
from sklearn.model_selection import train_test_split

# Columns that must not be used as predictors:
#  * the three outcome flags themselves;
#  * 'timerandtodeath', the column the 'death' flag is derived from (the later
#    versions of this analysis in the notebook exclude it as well);
#  * the dummy columns get_dummies() produced from the text column 'rebleeding'
#    (e.g. 'rebleeding_Yes'), which carry the same information as the
#    'rebleeding_encoded' target and would leak it into the model.
# The prefix 'rebleeding_' does not match 'rebleedingnum', which stays a feature.
outcome_columns = ['rebleeding_encoded', 'death', 'long_hospital_stay']
rebleeding_dummy_columns = [
    name for name in cleaned_data_encoded.columns
    if str(name).startswith('rebleeding_') and name not in outcome_columns
]
excluded_columns = outcome_columns + ['timerandtodeath'] + rebleeding_dummy_columns

# Define features (X) and target (y) for 'rebleeding' outcome.
# The same feature matrix is shared by all three outcomes below.
X_rebleeding = cleaned_data_encoded.drop(columns=excluded_columns, errors='ignore')
y_rebleeding = cleaned_data_encoded['rebleeding_encoded']

# Split the dataset into training and testing sets for 'rebleeding'
X_train_rebleeding, X_test_rebleeding, y_train_rebleeding, y_test_rebleeding = train_test_split(X_rebleeding, y_rebleeding, test_size=0.2, random_state=42)

# Define target (y) for 'death' outcome
y_death = cleaned_data_encoded['death']

# Split the dataset into training and testing sets for 'death'
X_train_death, X_test_death, y_train_death, y_test_death = train_test_split(X_rebleeding, y_death, test_size=0.2, random_state=42)

# Define target (y) for 'long hospital stay' outcome
y_long_hospital_stay = cleaned_data_encoded['long_hospital_stay']

# Split the dataset into training and testing sets for 'long hospital stay'
X_train_long_hospital_stay, X_test_long_hospital_stay, y_train_long_hospital_stay, y_test_long_hospital_stay = train_test_split(X_rebleeding, y_long_hospital_stay, test_size=0.2, random_state=42)

In [None]:
# Step 3: train the model and perform feature importance analysis.
# Feature importance for rebleeding.
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Train a Random Forest model for 'rebleeding' outcome
rf_model_rebleeding = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_rebleeding.fit(X_train_rebleeding, y_train_rebleeding)

# Extract feature importances
feature_importances_rebleeding = rf_model_rebleeding.feature_importances_
features_rebleeding = X_rebleeding.columns

# Create a DataFrame for visualization, most important feature first
importance_df_rebleeding = pd.DataFrame({'Feature': features_rebleeding, 'Importance': feature_importances_rebleeding})
importance_df_rebleeding = importance_df_rebleeding.sort_values(by='Importance', ascending=False)

# Filter to display only the top N most important features
top_n = 5  # You can adjust this value as needed
importance_df_rebleeding_top = importance_df_rebleeding.head(top_n)

# Plot the top-N frame. The original plotted the full frame, so the top-N
# filter above had no effect and the chart had one bar per encoded feature.
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df_rebleeding_top)
plt.title('Feature Importances for Rebleeding from Random Forest')
plt.show()

In [None]:
# Feature importance for death.
# Train a Random Forest model for 'death' outcome
rf_model_death = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_death.fit(X_train_death, y_train_death)

# Extract feature importances. The death split was built from the shared
# X_rebleeding feature matrix, so its columns are the feature names here.
feature_importances_death = rf_model_death.feature_importances_
features_death = X_rebleeding.columns

# Create a DataFrame for visualization, most important feature first
importance_df_death = pd.DataFrame({'Feature': features_death, 'Importance': feature_importances_death})
importance_df_death = importance_df_death.sort_values(by='Importance', ascending=False)

# Filter to display only the top N most important features
importance_df_death_top = importance_df_death.head(top_n)

# Plot the top-N frame. The original plotted the full frame, so the top-N
# filter above had no effect and the chart had one bar per encoded feature.
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df_death_top)
plt.title('Feature Importances for Death from Random Forest')
plt.show()

In [None]:
# Feature importance for long hospital stay.
# Train a Random Forest model for 'long hospital stay' outcome
rf_model_long_hospital_stay = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_long_hospital_stay.fit(X_train_long_hospital_stay, y_train_long_hospital_stay)

# Extract feature importances. The long-stay split was built from the shared
# X_rebleeding feature matrix, so its columns are the feature names here.
feature_importances_long_hospital_stay = rf_model_long_hospital_stay.feature_importances_
features_long_hospital_stay = X_rebleeding.columns

# Create a DataFrame for visualization, most important feature first
importance_df_long_hospital_stay = pd.DataFrame({'Feature': features_long_hospital_stay, 'Importance': feature_importances_long_hospital_stay})
importance_df_long_hospital_stay = importance_df_long_hospital_stay.sort_values(by='Importance', ascending=False)

# Filter to display only the top N most important features
importance_df_long_hospital_stay_top = importance_df_long_hospital_stay.head(top_n)

# Plot the top-N frame. The original plotted the full frame, so the top-N
# filter above had no effect and the chart had one bar per encoded feature.
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df_long_hospital_stay_top)
plt.title('Feature Importances for Long Hospital Stay from Random Forest')
plt.show()

In [None]:
# The earlier importance charts were not readable, so this version plots only
# the top features on a larger figure with bigger fonts.

import matplotlib.pyplot as plt
import seaborn as sns

# Fit a 100-tree random forest (fixed seed) on the 'rebleeding' training split.
rf_model_rebleeding = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_rebleeding, y_train_rebleeding
)

# Pair every feature name with its importance score, highest score first.
features_rebleeding = X_rebleeding.columns
feature_importances_rebleeding = rf_model_rebleeding.feature_importances_
importance_df_rebleeding = pd.DataFrame(
    {'Feature': features_rebleeding, 'Importance': feature_importances_rebleeding}
).sort_values(by='Importance', ascending=False)

# Keep only the first top_n rows for plotting.
top_n = 10  # adjust as needed
importance_df_rebleeding_top = importance_df_rebleeding.iloc[:top_n]

# Bar chart with importance on the x-axis and one bar per feature on the y-axis.
plt.figure(figsize=(15, 10))
sns.barplot(data=importance_df_rebleeding_top, x='Importance', y='Feature')
plt.title('Top Feature Importances for Rebleeding from Random Forest', fontsize=16)
plt.xlabel('Importance', fontsize=14)
plt.ylabel('Feature', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Fit a 100-tree random forest (fixed seed) on the 'death' training split.
rf_model_death = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_death, y_train_death
)

# Pair every feature name with its importance score, highest score first.
# The feature names come from X_rebleeding, the matrix the death split was built from.
features_death = X_rebleeding.columns
feature_importances_death = rf_model_death.feature_importances_
importance_df_death = pd.DataFrame(
    {'Feature': features_death, 'Importance': feature_importances_death}
).sort_values(by='Importance', ascending=False)

# Keep only the first top_n rows for plotting.
top_n = 10  # adjust as needed
importance_df_death_top = importance_df_death.iloc[:top_n]

# Bar chart with importance on the x-axis and one bar per feature on the y-axis.
plt.figure(figsize=(15, 10))
sns.barplot(data=importance_df_death_top, x='Importance', y='Feature')
plt.title('Top Feature Importances for Death from Random Forest', fontsize=16)
plt.xlabel('Importance', fontsize=14)
plt.ylabel('Feature', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Rotates the x-axis tick labels (the importance values); the feature names are on the y-axis.
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Fit a 100-tree random forest (fixed seed) on the 'long hospital stay' training split.
rf_model_long_hospital_stay = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_long_hospital_stay, y_train_long_hospital_stay
)

# Pair every feature name with its importance score, highest score first.
# The feature names come from X_rebleeding, the matrix the long-stay split was built from.
features_long_hospital_stay = X_rebleeding.columns
feature_importances_long_hospital_stay = rf_model_long_hospital_stay.feature_importances_
importance_df_long_hospital_stay = pd.DataFrame(
    {'Feature': features_long_hospital_stay, 'Importance': feature_importances_long_hospital_stay}
).sort_values(by='Importance', ascending=False)

# Keep only the first top_n rows for plotting (top_n is set in an earlier cell).
importance_df_long_hospital_stay_top = importance_df_long_hospital_stay.iloc[:top_n]

# Bar chart with importance on the x-axis and one bar per feature on the y-axis.
plt.figure(figsize=(15, 10))
sns.barplot(data=importance_df_long_hospital_stay_top, x='Importance', y='Feature')
plt.title('Top Feature Importances for Long Hospital Stay from Random Forest', fontsize=16)
plt.xlabel('Importance', fontsize=14)
plt.ylabel('Feature', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Rotates the x-axis tick labels (the importance values); the feature names are on the y-axis.
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the cleaned dataset
df = pd.read_csv('HALT_score_included_cleaned.csv', low_memory=False)

# Drop columns with more than 80% missing values, except essential ones
essential_columns = ['timerandtodeath', 'causedeath', 'causedeathother', 'stillinhospday28', 'rebleedingnum']
missing_percentage = df.isnull().mean() * 100
columns_to_drop2 = missing_percentage[missing_percentage > 80].index.difference(essential_columns).tolist()
cleaned_data = df.drop(columns=columns_to_drop2)

# Derive the missingness-based outcome flags BEFORE imputing.
# 'death' and 'long_hospital_stay' are defined as "the source column has a
# value". Once the imputation below has filled every gap, notnull() is True for
# every row and both outcomes would be a constant 1.
death_flag = cleaned_data['timerandtodeath'].notnull().astype(int)
long_hospital_stay_flag = cleaned_data['stillinhospday28'].notnull().astype(int)

# Impute missing values for numerical columns with median.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated and does not update
# the frame when pandas copy-on-write is active.
for col in cleaned_data.select_dtypes(include=['int64', 'float64']).columns:
    cleaned_data[col] = cleaned_data[col].fillna(cleaned_data[col].median())

# Impute missing values for categorical columns with mode.
# A column with no values at all has no mode; it is left as is instead of
# failing on mode()[0].
for col in cleaned_data.select_dtypes(include=['object']).columns:
    mode_values = cleaned_data[col].mode()
    if not mode_values.empty:
        cleaned_data[col] = cleaned_data[col].fillna(mode_values.iloc[0])

# Encode categorical variables
cleaned_data_encoded = pd.get_dummies(cleaned_data, drop_first=True)

# Add binary outcomes for analysis
cleaned_data_encoded['rebleeding_encoded'] = cleaned_data['rebleeding'].map({'No': 0, 'Yes': 1})
cleaned_data_encoded['death'] = death_flag
cleaned_data_encoded['long_hospital_stay'] = long_hospital_stay_flag

# Remove features with high cardinality (more than cardinality_threshold distinct values).
# NOTE(review): this runs after one-hot encoding, where every encoded
# categorical is already a two-valued dummy column. As written it can therefore
# only remove numeric columns with many distinct values, never a
# high-cardinality categorical. If the intent was to prune categoricals, the
# filter belongs before get_dummies() — TODO confirm.
cardinality_threshold = 50  # Adjust as needed
high_cardinality_cols = [col for col in cleaned_data_encoded.columns if cleaned_data_encoded[col].nunique() > cardinality_threshold]
cleaned_data_encoded = cleaned_data_encoded.drop(columns=high_cardinality_cols)

# Verify the encoding
print(cleaned_data_encoded[['rebleeding_encoded', 'death', 'long_hospital_stay']].head())

# Define features (X) and target (y) for 'rebleeding' outcome.
# Besides the outcome flags and 'timerandtodeath', the dummy columns that
# get_dummies() produced from the text column 'rebleeding' (e.g.
# 'rebleeding_Yes') are excluded: they carry the same information as the
# 'rebleeding_encoded' target and would leak it into the model.
# The prefix 'rebleeding_' does not match 'rebleedingnum', which stays a feature.
outcome_columns = ['rebleeding_encoded', 'death', 'long_hospital_stay']
rebleeding_dummy_columns = [
    name for name in cleaned_data_encoded.columns
    if str(name).startswith('rebleeding_') and name not in outcome_columns
]
X_rebleeding = cleaned_data_encoded.drop(
    columns=outcome_columns + ['timerandtodeath'] + rebleeding_dummy_columns,
    errors='ignore',
)
y_rebleeding = cleaned_data_encoded['rebleeding_encoded']

# Split the dataset into training and testing sets for 'rebleeding'
X_train_rebleeding, X_test_rebleeding, y_train_rebleeding, y_test_rebleeding = train_test_split(X_rebleeding, y_rebleeding, test_size=0.2, random_state=42)

# Train a Random Forest model for 'rebleeding' outcome
rf_model_rebleeding = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_rebleeding.fit(X_train_rebleeding, y_train_rebleeding)

# Extract feature importances
feature_importances_rebleeding = rf_model_rebleeding.feature_importances_
features_rebleeding = X_rebleeding.columns

# Create a DataFrame for visualization, most important feature first
importance_df_rebleeding = pd.DataFrame({'Feature': features_rebleeding, 'Importance': feature_importances_rebleeding})
importance_df_rebleeding = importance_df_rebleeding.sort_values(by='Importance', ascending=False)

# Filter to display only the top N most important features
top_n = 10  # You can adjust this value as needed
importance_df_rebleeding_top = importance_df_rebleeding.head(top_n)

# Plot feature importances
plt.figure(figsize=(15, 10))  # Increase figure size
sns.barplot(x='Importance', y='Feature', data=importance_df_rebleeding_top)
plt.title('Top Feature Importances for Rebleeding from Random Forest', fontsize=16)
plt.xlabel('Importance', fontsize=14)
plt.ylabel('Feature', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Rotates the x-axis tick labels (the importance values); the feature names are on the y-axis.
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Features and target for the 'death' outcome: every encoded column except the
# three outcome flags and 'timerandtodeath' (ignored if already absent).
# NOTE(review): columns encoded from 'causedeath' / 'causedeathother' remain in
# the features; judging by their names they may reveal the outcome — TODO confirm.
non_feature_columns = ['death', 'rebleeding_encoded', 'long_hospital_stay', 'timerandtodeath']
X_death = cleaned_data_encoded.drop(columns=non_feature_columns, errors='ignore')
y_death = cleaned_data_encoded['death']

# 80/20 train/test split with a fixed seed.
X_train_death, X_test_death, y_train_death, y_test_death = train_test_split(
    X_death, y_death, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest (fixed seed) on the training split.
rf_model_death = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_death, y_train_death
)

# Pair every feature name with its importance score, highest score first.
features_death = X_death.columns
feature_importances_death = rf_model_death.feature_importances_
importance_df_death = pd.DataFrame(
    {'Feature': features_death, 'Importance': feature_importances_death}
).sort_values(by='Importance', ascending=False)

# Keep only the first top_n rows for plotting (top_n is set in an earlier cell).
importance_df_death_top = importance_df_death.iloc[:top_n]

# Bar chart with importance on the x-axis and one bar per feature on the y-axis.
plt.figure(figsize=(15, 10))
sns.barplot(data=importance_df_death_top, x='Importance', y='Feature')
plt.title('Top Feature Importances for Death from Random Forest', fontsize=16)
plt.xlabel('Importance', fontsize=14)
plt.ylabel('Feature', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Rotates the x-axis tick labels (the importance values); the feature names are on the y-axis.
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Define features (X) and target (y) for 'long hospital stay' outcome.
# 'timerandtodeath' is excluded here as well, so this feature set matches the
# ones used for the 'rebleeding' and 'death' models (errors='ignore' covers the
# case where the column is already gone).
# NOTE(review): 'stillinhospday28', the column this target is derived from, is
# still among the features (directly or as dummy columns) — TODO confirm
# whether it should be excluded too.
X_long_hospital_stay = cleaned_data_encoded.drop(columns=['long_hospital_stay', 'rebleeding_encoded', 'death', 'timerandtodeath'], errors='ignore')
y_long_hospital_stay = cleaned_data_encoded['long_hospital_stay']

# Split the dataset into training and testing sets for 'long hospital stay'
X_train_long_hospital_stay, X_test_long_hospital_stay, y_train_long_hospital_stay, y_test_long_hospital_stay = train_test_split(X_long_hospital_stay, y_long_hospital_stay, test_size=0.2, random_state=42)

# Train a Random Forest model for 'long hospital stay' outcome
rf_model_long_hospital_stay = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_long_hospital_stay.fit(X_train_long_hospital_stay, y_train_long_hospital_stay)

# Extract feature importances
feature_importances_long_hospital_stay = rf_model_long_hospital_stay.feature_importances_
features_long_hospital_stay = X_long_hospital_stay.columns

# Create a DataFrame for visualization, most important feature first
importance_df_long_hospital_stay = pd.DataFrame({'Feature': features_long_hospital_stay, 'Importance': feature_importances_long_hospital_stay})
importance_df_long_hospital_stay = importance_df_long_hospital_stay.sort_values(by='Importance', ascending=False)

# Filter to display only the top N most important features
importance_df_long_hospital_stay_top = importance_df_long_hospital_stay.head(top_n)

# Plot feature importances
plt.figure(figsize=(15, 10))  # Increase figure size
sns.barplot(x='Importance', y='Feature', data=importance_df_long_hospital_stay_top)
plt.title('Top Feature Importances for Long Hospital Stay from Random Forest', fontsize=16)
plt.xlabel('Importance', fontsize=14)
plt.ylabel('Feature', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Rotates the x-axis tick labels (the importance values); the feature names are on the y-axis.
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the dataset
df = pd.read_csv('HALT_score_included_cleaned.csv', low_memory=False)

# Drop columns with more than 80% missing values, except essential ones
essential_columns = ['timerandtodeath', 'causedeath', 'causedeathother', 'stillinhospday28', 'rebleedingnum']
missing_percentage = df.isnull().mean() * 100
columns_to_drop2 = missing_percentage[missing_percentage > 80].index.difference(essential_columns).tolist()
cleaned_data = df.drop(columns=columns_to_drop2)

# Derive the missingness-based outcome flags BEFORE imputing.
# 'death' and 'long_hospital_stay' are defined as "the source column has a
# value". Once the imputation below has filled every gap, notnull() is True for
# every row and both outcomes would be a constant 1.
death_flag = cleaned_data['timerandtodeath'].notnull().astype(int)
long_hospital_stay_flag = cleaned_data['stillinhospday28'].notnull().astype(int)

# Impute missing values for numerical columns with median.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated and does not update
# the frame when pandas copy-on-write is active.
for col in cleaned_data.select_dtypes(include=['int64', 'float64']).columns:
    cleaned_data[col] = cleaned_data[col].fillna(cleaned_data[col].median())

# Impute missing values for categorical columns with mode.
# A column with no values at all has no mode; it is left as is instead of
# failing on mode()[0].
for col in cleaned_data.select_dtypes(include=['object']).columns:
    mode_values = cleaned_data[col].mode()
    if not mode_values.empty:
        cleaned_data[col] = cleaned_data[col].fillna(mode_values.iloc[0])

# Encode categorical variables
cleaned_data_encoded = pd.get_dummies(cleaned_data, drop_first=True)

# Add binary outcomes for analysis
cleaned_data_encoded['rebleeding_encoded'] = cleaned_data['rebleeding'].map({'No': 0, 'Yes': 1})
cleaned_data_encoded['death'] = death_flag
cleaned_data_encoded['long_hospital_stay'] = long_hospital_stay_flag

# Verify the encoding
print(cleaned_data_encoded[['rebleeding_encoded', 'death', 'long_hospital_stay']].head())

In [None]:
# Define features (X) and target (y) for 'rebleeding' outcome.
# Besides the outcome flags and 'timerandtodeath', the dummy columns that
# get_dummies() produced from the text column 'rebleeding' (e.g.
# 'rebleeding_Yes') are excluded: they carry the same information as the
# 'rebleeding_encoded' target and would leak it into the model.
# The prefix 'rebleeding_' does not match 'rebleedingnum', which stays a feature.
outcome_columns = ['rebleeding_encoded', 'death', 'long_hospital_stay']
rebleeding_dummy_columns = [
    name for name in cleaned_data_encoded.columns
    if str(name).startswith('rebleeding_') and name not in outcome_columns
]
X_rebleeding = cleaned_data_encoded.drop(
    columns=outcome_columns + ['timerandtodeath'] + rebleeding_dummy_columns,
    errors='ignore',
)
y_rebleeding = cleaned_data_encoded['rebleeding_encoded']

# Split the dataset into training and testing sets for 'rebleeding'
X_train_rebleeding, X_test_rebleeding, y_train_rebleeding, y_test_rebleeding = train_test_split(X_rebleeding, y_rebleeding, test_size=0.2, random_state=42)

# Train a Random Forest model for 'rebleeding' outcome
rf_model_rebleeding = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_rebleeding.fit(X_train_rebleeding, y_train_rebleeding)

# Extract feature importances
feature_importances_rebleeding = rf_model_rebleeding.feature_importances_
features_rebleeding = X_rebleeding.columns

# Create a DataFrame for visualization, most important feature first
importance_df_rebleeding = pd.DataFrame({'Feature': features_rebleeding, 'Importance': feature_importances_rebleeding})
importance_df_rebleeding = importance_df_rebleeding.sort_values(by='Importance', ascending=False)

# Filter to display only the top N most important features
top_n = 10  # You can adjust this value as needed
importance_df_rebleeding_top = importance_df_rebleeding.head(top_n)

# Plot feature importances
plt.figure(figsize=(15, 10))  # Increase figure size
sns.barplot(x='Importance', y='Feature', data=importance_df_rebleeding_top)
plt.title('Top Feature Importances for Rebleeding from Random Forest', fontsize=16)
plt.xlabel('Importance', fontsize=14)
plt.ylabel('Feature', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Rotates the x-axis tick labels (the importance values); the feature names are on the y-axis.
plt.xticks(rotation=45, ha='right')
plt.show()

# Features and target for the 'death' outcome: every encoded column except the
# three outcome flags and 'timerandtodeath' (ignored if already absent).
# NOTE(review): columns encoded from 'causedeath' / 'causedeathother' remain in
# the features; judging by their names they may reveal the outcome — TODO confirm.
non_feature_columns = ['death', 'rebleeding_encoded', 'long_hospital_stay', 'timerandtodeath']
X_death = cleaned_data_encoded.drop(columns=non_feature_columns, errors='ignore')
y_death = cleaned_data_encoded['death']

# 80/20 train/test split with a fixed seed.
X_train_death, X_test_death, y_train_death, y_test_death = train_test_split(
    X_death, y_death, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest (fixed seed) on the training split.
rf_model_death = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_death, y_train_death
)

# Pair every feature name with its importance score, highest score first.
features_death = X_death.columns
feature_importances_death = rf_model_death.feature_importances_
importance_df_death = pd.DataFrame(
    {'Feature': features_death, 'Importance': feature_importances_death}
).sort_values(by='Importance', ascending=False)

# Keep only the first top_n rows for plotting (top_n is set earlier in this cell).
importance_df_death_top = importance_df_death.iloc[:top_n]

# Bar chart with importance on the x-axis and one bar per feature on the y-axis.
plt.figure(figsize=(15, 10))
sns.barplot(data=importance_df_death_top, x='Importance', y='Feature')
plt.title('Top Feature Importances for Death from Random Forest', fontsize=16)
plt.xlabel('Importance', fontsize=14)
plt.ylabel('Feature', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Rotates the x-axis tick labels (the importance values); the feature names are on the y-axis.
plt.xticks(rotation=45, ha='right')
plt.show()

# Features and target for the 'long hospital stay' outcome: every encoded column
# except the three outcome flags and 'timerandtodeath' (ignored if already absent).
# NOTE(review): 'stillinhospday28', the column this target is derived from, is
# still among the features (directly or as dummy columns) — TODO confirm
# whether it should be excluded too.
non_feature_columns = ['long_hospital_stay', 'rebleeding_encoded', 'death', 'timerandtodeath']
X_long_hospital_stay = cleaned_data_encoded.drop(columns=non_feature_columns, errors='ignore')
y_long_hospital_stay = cleaned_data_encoded['long_hospital_stay']

# 80/20 train/test split with a fixed seed.
X_train_long_hospital_stay, X_test_long_hospital_stay, y_train_long_hospital_stay, y_test_long_hospital_stay = train_test_split(
    X_long_hospital_stay, y_long_hospital_stay, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest (fixed seed) on the training split.
rf_model_long_hospital_stay = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_long_hospital_stay, y_train_long_hospital_stay
)

# Pair every feature name with its importance score, highest score first.
features_long_hospital_stay = X_long_hospital_stay.columns
feature_importances_long_hospital_stay = rf_model_long_hospital_stay.feature_importances_
importance_df_long_hospital_stay = pd.DataFrame(
    {'Feature': features_long_hospital_stay, 'Importance': feature_importances_long_hospital_stay}
).sort_values(by='Importance', ascending=False)

# Keep only the first top_n rows for plotting (top_n is set earlier in this cell).
importance_df_long_hospital_stay_top = importance_df_long_hospital_stay.iloc[:top_n]

# Bar chart with importance on the x-axis and one bar per feature on the y-axis.
plt.figure(figsize=(15, 10))
sns.barplot(data=importance_df_long_hospital_stay_top, x='Importance', y='Feature')
plt.title('Top Feature Importances for Long Hospital Stay from Random Forest', fontsize=16)
plt.xlabel('Importance', fontsize=14)
plt.ylabel('Feature', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Rotates the x-axis tick labels (the importance values); the feature names are on the y-axis.
plt.xticks(rotation=45, ha='right')
plt.show()