<h1 style="color: brown;">1. Importing Libraries and Configuring Display Options</h1>

In [1]:
# Standard library
import time
from itertools import product

# Data manipulation and visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Numerical helpers
from scipy.special import softmax

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Evaluation
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, log_loss
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate

# Display configuration: never truncate the column list, and render every
# float with exactly three decimal places.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.3f}'.format

<h1 style="color: brown;">2. Exploring Dataset</h1>

<h2 style="color: purple;">2.1 Loading Healthcare Dataset</h2>

In [2]:
# Load the healthcare records; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='healthcare_dataset.csv')

<h2 style="color: purple;">2.2 Displaying Dataset Dimensions</h2> 

In [3]:
# df.shape is (rows, columns); unpack it so the message reads naturally.
n_rows, n_cols = df.shape
print(f'The data has {n_rows} rows and {n_cols} columns.')

The data has 55500 rows and 15 columns.


<h2 style="color: purple;">2.3 Displaying All Column Names</h2>

In [4]:
# Keep the full column index under a descriptive name
all_column_names = df.columns

# Print each column name on its own line, numbered from 1
print("All Columns:")
for position, column_name in enumerate(all_column_names):
    print(f"{position + 1}. {column_name}")

All Columns:
1. Name
2. Age
3. Gender
4. Blood Type
5. Medical Condition
6. Date of Admission
7. Doctor
8. Hospital
9. Insurance Provider
10. Billing Amount
11. Room Number
12. Admission Type
13. Discharge Date
14. Medication
15. Test Results


<h2 style="color: purple;">2.4 Identify Numeric and Non-Numeric Features</h2>  

In [5]:

# Partition the columns by dtype family (numeric vs. everything else)
numeric_column_names = df.select_dtypes(include='number').columns
non_numeric_column_names = df.select_dtypes(exclude='number').columns

# Category label -> list of column names, in display order
columns_dict = dict([
    ("Numeric Columns", numeric_column_names.tolist()),
    ("Non-Numeric Columns", non_numeric_column_names.tolist()),
])

# For each category, print a numbered list with the distinct-value count
for category in columns_dict:
    print(f"{category}:")
    for position, column_name in enumerate(columns_dict[category]):
        distinct_count = df[column_name].nunique()
        print(f"  {position + 1}. {column_name} (Unique Values: {distinct_count})")
    print()  # blank line separates the two categories


Numeric Columns:
  1. Age (Unique Values: 77)
  2. Billing Amount (Unique Values: 50000)
  3. Room Number (Unique Values: 400)

Non-Numeric Columns:
  1. Name (Unique Values: 49992)
  2. Gender (Unique Values: 2)
  3. Blood Type (Unique Values: 8)
  4. Medical Condition (Unique Values: 6)
  5. Date of Admission (Unique Values: 1827)
  6. Doctor (Unique Values: 40341)
  7. Hospital (Unique Values: 39876)
  8. Insurance Provider (Unique Values: 5)
  9. Admission Type (Unique Values: 3)
  10. Discharge Date (Unique Values: 1856)
  11. Medication (Unique Values: 5)
  12. Test Results (Unique Values: 3)



<h2 style="color: purple;">2.5 Displaying First 5 Rows of the Dataset</h2>  

In [6]:
# Preview the first five records (head() defaults to n=5)
df.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.782,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.318,458,Urgent,2022-10-09,Penicillin,Abnormal


<h2 style="color: purple;">2.6 Checking for Missing Data</h2>  

In [7]:
# Per-column null counts, summed into one overall figure
missing_total = df.isnull().sum().sum()
print(f'Total number of missing values: {missing_total}')

Total number of missing values: 0


<h2 style="color: purple;">2.7 Checking for Duplicate Data</h2> 

In [8]:
# duplicated() flags every repeat of an earlier, fully identical row
duplicate_row_count = df.duplicated().sum()
print(f'There are {duplicate_row_count} duplicate rows in the dataset.')

There are 534 duplicate rows in the dataset.


<h1 style="color: brown;">3. Preprocessing </h1> 

<h2 style="color: purple;">3.1 Removing Duplicate Rows</h2>


In [9]:
# Keep only the first occurrence of each fully identical row
df = df.drop_duplicates(keep='first')

<h2 style="color: purple;">3.2 Computing Length of Stay from Admission and Discharge Dates</h2>

In [10]:
# Parse both date columns so they can be subtracted
for date_col in ('Date of Admission', 'Discharge Date'):
    df[date_col] = pd.to_datetime(df[date_col])

# Length of stay = whole days between admission and discharge
stay_duration = df['Discharge Date'] - df['Date of Admission']
df['Length of Stay'] = stay_duration.dt.days

<h2 style="color: purple;">3.3 Dropping Unnecessary Columns</h2>

In [11]:
# Columns removed before modelling. The two date columns are no longer needed
# once 'Length of Stay' has been derived from them.
columns_to_drop = [
    'Room Number',
    'Name',
    'Date of Admission',
    'Doctor',
    'Hospital',
    'Discharge Date',
]
df = df.drop(columns=columns_to_drop)

<h2 style="color: purple;">3.4 Mapping Admission Types to Numerical Values</h2>

In [12]:
# Admission types listed in the order of their integer codes (1, 2, 3)
admission_levels = ['Elective', 'Urgent', 'Emergency']
admission_type_mapping = {
    level: code for code, level in enumerate(admission_levels, start=1)
}

# Replace the text labels with their integer codes
df['Admission Type'] = df['Admission Type'].map(admission_type_mapping)

# Sanity check: print how many rows fall under each code
print(df['Admission Type'].value_counts())

Admission Type
1    18473
2    18391
3    18102
Name: count, dtype: int64


<h2 style="color: purple;">3.5 Summary Statistics for Numeric Columns</h2>

In [13]:
# Descriptive statistics for every numeric column currently in df
numeric_columns = df.select_dtypes(include=['number'])
numeric_columns.describe()

Unnamed: 0,Age,Billing Amount,Admission Type,Length of Stay
count,54966.0,54966.0,54966.0,54966.0
mean,51.535,25544.306,1.993,15.499
std,19.606,14208.41,0.816,8.661
min,13.0,-2008.492,1.0,1.0
25%,35.0,13243.719,1.0,8.0
50%,52.0,25542.749,2.0,15.0
75%,68.0,37819.858,3.0,23.0
max,89.0,52764.277,3.0,30.0


<h2 style="color: purple;">3.6 Dropping Billing Amount Feature</h2>

In [14]:
# Billing Amount contains negative values (see the summary statistics) and is
# considered unhelpful compared to the other features, so it is removed.
df = df.drop(columns='Billing Amount')

<h2 style="color: purple;">3.7 Summary Statistics for Categorical Columns</h2>

In [15]:
# count / unique / top / freq for every non-numeric column
non_numeric_columns = df.select_dtypes(exclude=['number'])
print("Non-Numeric Columns Summary:")
non_numeric_columns.describe(include='all')

Non-Numeric Columns Summary:


Unnamed: 0,Gender,Blood Type,Medical Condition,Insurance Provider,Medication,Test Results
count,54966,54966,54966,54966,54966,54966
unique,2,8,6,5,5,3
top,Male,A-,Arthritis,Cigna,Lipitor,Abnormal
freq,27496,6898,9218,11139,11038,18437


<h2 style="color: purple;">3.8 Displaying Frequency of Categorical Data</h2>

In [16]:
# Frequency table for each categorical column, most common value first
for col in non_numeric_columns.columns:
    value_counts = df[col].value_counts()

    print(f"\nValue counts for {col}:")
    # Pair a 1-based rank with each (value, count) entry
    ranked = zip(range(1, len(value_counts) + 1), value_counts.index, value_counts)
    for idx, value, count in ranked:
        print(f"{idx}. {value}: {count}")
    print("-" * 50)


Value counts for Gender:
1. Male: 27496
2. Female: 27470
--------------------------------------------------

Value counts for Blood Type:
1. A-: 6898
2. A+: 6896
3. B+: 6885
4. AB+: 6882
5. AB-: 6874
6. B-: 6872
7. O+: 6855
8. O-: 6804
--------------------------------------------------

Value counts for Medical Condition:
1. Arthritis: 9218
2. Diabetes: 9216
3. Hypertension: 9151
4. Obesity: 9146
5. Cancer: 9140
6. Asthma: 9095
--------------------------------------------------

Value counts for Insurance Provider:
1. Cigna: 11139
2. Medicare: 11039
3. UnitedHealthcare: 11014
4. Blue Cross: 10952
5. Aetna: 10822
--------------------------------------------------

Value counts for Medication:
1. Lipitor: 11038
2. Ibuprofen: 11023
3. Aspirin: 10984
4. Paracetamol: 10965
5. Penicillin: 10956
--------------------------------------------------

Value counts for Test Results:
1. Abnormal: 18437
2. Normal: 18331
3. Inconclusive: 18198
--------------------------------------------------


<h2 style="color: purple;">3.9 Preprocessing Non-Numeric Columns for Machine Learning</h2>

In [17]:
# One encoder instance is reused; fit_transform refits it for each binary column
label_encoder = LabelEncoder()

# Encode every categorical column: binary columns become a single 0/1 column,
# columns with three or more levels are expanded into one indicator per level.
for col in non_numeric_columns.columns:
    unique_values_count = df[col].nunique()

    if unique_values_count < 2:
        # Neither encoding applies; the column is left untouched
        continue

    if unique_values_count == 2:
        df[col] = label_encoder.fit_transform(df[col])

        # Show which original label received which integer code
        mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
        print(f"\nLabel Encoding for column '{col}':")
        print(f"  {mapping}")
        continue

    # Three or more levels: swap the column for its indicator columns,
    # which are appended at the right-hand side of the frame
    df_encoded = pd.get_dummies(df[col], prefix=col, dtype=int)
    df = pd.concat([df.drop(columns=[col]), df_encoded], axis=1)

    print(f"\nOne-Hot Encoding applied to column '{col}':")
    print(f"  Created columns: {list(df_encoded.columns)}")


Label Encoding for column 'Gender':
  {'Female': 0, 'Male': 1}

One-Hot Encoding applied to column 'Blood Type':
  Created columns: ['Blood Type_A+', 'Blood Type_A-', 'Blood Type_AB+', 'Blood Type_AB-', 'Blood Type_B+', 'Blood Type_B-', 'Blood Type_O+', 'Blood Type_O-']

One-Hot Encoding applied to column 'Medical Condition':
  Created columns: ['Medical Condition_Arthritis', 'Medical Condition_Asthma', 'Medical Condition_Cancer', 'Medical Condition_Diabetes', 'Medical Condition_Hypertension', 'Medical Condition_Obesity']

One-Hot Encoding applied to column 'Insurance Provider':
  Created columns: ['Insurance Provider_Aetna', 'Insurance Provider_Blue Cross', 'Insurance Provider_Cigna', 'Insurance Provider_Medicare', 'Insurance Provider_UnitedHealthcare']

One-Hot Encoding applied to column 'Medication':
  Created columns: ['Medication_Aspirin', 'Medication_Ibuprofen', 'Medication_Lipitor', 'Medication_Paracetamol', 'Medication_Penicillin']

One-Hot Encoding applied to column 'Test Res

In [18]:
# Preview the frame now that every categorical column has been encoded
print("\nUpdated DataFrame with Encoded Columns:")
df.head(5)


Updated DataFrame with Encoded Columns:


Unnamed: 0,Age,Gender,Admission Type,Length of Stay,Blood Type_A+,Blood Type_A-,Blood Type_AB+,Blood Type_AB-,Blood Type_B+,Blood Type_B-,Blood Type_O+,Blood Type_O-,Medical Condition_Arthritis,Medical Condition_Asthma,Medical Condition_Cancer,Medical Condition_Diabetes,Medical Condition_Hypertension,Medical Condition_Obesity,Insurance Provider_Aetna,Insurance Provider_Blue Cross,Insurance Provider_Cigna,Insurance Provider_Medicare,Insurance Provider_UnitedHealthcare,Medication_Aspirin,Medication_Ibuprofen,Medication_Lipitor,Medication_Paracetamol,Medication_Penicillin,Test Results_Abnormal,Test Results_Inconclusive,Test Results_Normal
0,30,1,2,2,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1
1,62,1,3,6,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0
2,76,0,3,15,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1
3,28,0,1,30,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0
4,43,0,2,20,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0


<h1 style="color: brown;">4. Development and Assessment of Models</h1> 

<h2 style="color: purple;">4.1 Splitting Features and Target Variable</h2>


In [19]:
# 'Admission Type' is the prediction target; every other column is a feature
target_column = 'Admission Type'
y = df[target_column]
X = df.drop(columns=[target_column])

<h2 style="color: purple;">4.2 Splitting Data into Training and Test Sets</h2> 


In [20]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

<h2 style="color: purple;">4.3 Scaling Features using StandardScaler</h2>

In [21]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

<h1 style="color: brown;">5. Evaluating Models and Measuring Performance</h1>  

<h2 style="color: purple;">5.1 Finding the Best Model</h2> 

In [22]:
# Candidate classifiers compared on the hold-out split. Every estimator that
# accepts a seed is pinned to 42 so the comparison is repeatable.
model_list = [
    LogisticRegression(max_iter=10000, random_state=42),
    DecisionTreeClassifier(ccp_alpha=0.001, random_state=42),
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    AdaBoostClassifier(algorithm='SAMME', random_state=42),
    SVC(probability=True, random_state=42),  # probability=True needed for log_loss
    GaussianNB(),
    KNeighborsClassifier(),
]

# One (name, accuracy, f1, precision, recall, log loss) tuple per model
results = []

for model in model_list:
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)
    preds_proba = None

    # log_loss needs class probabilities; prefer predict_proba when it exists
    if hasattr(model, "predict_proba"):
        preds_proba = model.predict_proba(X_test_scaled)
    elif hasattr(model, "decision_function"):
        # Fallback for estimators that only expose decision scores: squash the
        # scores with a softmax. `softmax` is scipy.special.softmax from the
        # import cell; sklearn.preprocessing does not provide one.
        decision_scores = np.asarray(model.decision_function(X_test_scaled))
        if decision_scores.ndim == 1:
            # Binary estimators return one signed score per sample; pair it
            # with a zero score so softmax yields [1 - sigmoid(s), sigmoid(s)]
            decision_scores = np.column_stack(
                [np.zeros_like(decision_scores), decision_scores]
            )
        preds_proba = softmax(decision_scores, axis=1)

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average='macro')
    # zero_division=0 keeps the value that the default produces (0 for a class
    # that is never predicted) but without the UndefinedMetricWarning.
    prec = precision_score(y_test, preds, average='macro', zero_division=0)
    rec = recall_score(y_test, preds, average='macro')

    # Compute log loss only if predicted probabilities are available
    if preds_proba is not None:
        # Passing the fitted class order keeps the probability columns aligned
        # even if a class happens to be absent from y_test.
        ll = log_loss(y_test, preds_proba, labels=model.classes_)
    else:
        ll = np.nan  # not available

    results.append((model.__class__.__name__, acc, f1, prec, rec, ll))

# NOTE(review): the ranking below is computed on the test split, so choosing a
# model from it makes later test-set scores optimistic; consider ranking on a
# validation split or with cross-validation instead.
df_results = pd.DataFrame(results, columns=["Model", "Accuracy", "F1_macro", "Precision_macro", "Recall_macro", "Log_loss"])
df_results = df_results.sort_values(by="Accuracy", ascending=False)

print(df_results)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                        Model  Accuracy  F1_macro  Precision_macro  \
2      RandomForestClassifier     0.416     0.416            0.416   
7        KNeighborsClassifier     0.359     0.352            0.362   
5                         SVC     0.349     0.347            0.350   
3  GradientBoostingClassifier     0.341     0.335            0.341   
4          AdaBoostClassifier     0.340     0.321            0.340   
0          LogisticRegression     0.339     0.320            0.341   
6                  GaussianNB     0.338     0.329            0.340   
1      DecisionTreeClassifier     0.332     0.166            0.111   

   Recall_macro  Log_loss  
2         0.416     1.089  
7         0.359     5.355  
5         0.349     1.098  
3         0.341     1.100  
4         0.340     1.099  
0         0.340     1.099  
6         0.339     1.106  
1         0.333     1.099  


In [23]:
# Locate the row with the highest accuracy and pull out its name and score
top_label = df_results['Accuracy'].idxmax()
best_row = df_results.loc[top_label]
best_model_name, best_accuracy = best_row['Model'], best_row['Accuracy']

print(f"Best model based on accuracy: {best_model_name} with accuracy = {best_accuracy:.4f}")

Best model based on accuracy: RandomForestClassifier with accuracy = 0.4162


<h2 style="color: purple;">5.2 Manual Grid Search for Best Hyperparameters</h2>  

In [24]:
# train_test_split, accuracy_score, product and np all come from the import
# cell at the top of the notebook; they are not re-imported here.

# Carve a validation split out of the training data so the test set stays
# untouched during hyperparameter selection.
X_subtrain, X_val, y_subtrain, y_val = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42
)

# Hyperparameter grid; every combination of these values is evaluated
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'max_features': ['sqrt', 'log2', None]  # you can add more if you want
}

# Start below any reachable accuracy so the first combination is always
# recorded and best_params cannot stay None after the loop has run.
best_score = -np.inf
best_params = None

for combo in product(*param_grid.values()):
    # Re-attach the parameter names to this combination of values
    params = dict(zip(param_grid.keys(), combo))
    model = RandomForestClassifier(**params, random_state=42)
    model.fit(X_subtrain, y_subtrain)
    preds = model.predict(X_val)
    score = accuracy_score(y_val, preds)

    # Strict comparison: on ties the earliest combination is kept
    if score > best_score:
        best_score = score
        best_params = params

print("Best hyperparameters found (manual grid search):")
print(best_params)
print(f"Validation Accuracy with best params: {best_score:.4f}")

Best hyperparameters found (manual grid search):
{'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'max_features': None}
Validation Accuracy with best params: 0.4057


<h2 style="color: purple;">5.3 Cross-Validation Evaluation</h2>   

In [25]:
# Use the hyperparameters selected by the manual grid search cell directly.
# Re-typing them by hand here would silently go stale whenever the grid, the
# data or the seed changes.
if best_params is None:
    raise RuntimeError("best_params is not set; run the grid search cell first.")

# Initialize final model
final_model = RandomForestClassifier(**best_params, random_state=42)

# Scorer names handed to cross_validate; results come back as 'test_<key>'
scoring = {
    'accuracy': 'accuracy',
    'f1_macro': 'f1_macro',
    'precision_macro': 'precision_macro',
    'recall_macro': 'recall_macro',
    'log_loss': 'neg_log_loss'
}

# 5-fold cross-validation on the (already scaled) training split
cv_results = cross_validate(final_model, X_train_scaled, y_train, cv=5, scoring=scoring)

# Per-fold score arrays
accuracy_scores = cv_results['test_accuracy']
f1_scores = cv_results['test_f1_macro']
precision_scores = cv_results['test_precision_macro']
recall_scores = cv_results['test_recall_macro']
log_loss_scores = -cv_results['test_log_loss']  # scorer is negated; flip back to positive

# Print the per-fold scores and their mean for each metric
metric_report = [
    ("Accuracy", accuracy_scores),
    ("F1 (macro)", f1_scores),
    ("Precision (macro)", precision_scores),
    ("Recall (macro)", recall_scores),
    ("Log Loss", log_loss_scores),
]
for metric_label, fold_scores in metric_report:
    print(f"CV {metric_label} scores:", fold_scores)
    print(f"Mean CV {metric_label}:", np.mean(fold_scores))

CV Accuracy scores: [0.39929834 0.40207927 0.39181287 0.39220273 0.397141  ]
Mean CV Accuracy: 0.39650684084894616
CV F1 (macro) scores: [0.39914109 0.40159177 0.3915713  0.39178392 0.3966876 ]
Mean CV F1 (macro): 0.39615513590891305
CV Precision (macro) scores: [0.39937673 0.40209564 0.39230471 0.39209158 0.39696782]
Mean CV Precision (macro): 0.3965672944955095
CV Recall (macro) scores: [0.39917122 0.40176516 0.39159509 0.3919498  0.3968418 ]
Mean CV Recall (macro): 0.3962646157758687
CV Log Loss scores: [1.09450024 1.09270311 1.0923675  1.09808448 1.093994  ]
Mean CV Log Loss: 1.0943298651201936


<h2 style="color: purple;">5.4 Final Model Training and Test Set Evaluation</h2>   

In [26]:
# Refit the tuned model on the complete training split
final_model.fit(X_train_scaled, y_train)

# Predict the held-out test rows
y_test_preds = final_model.predict(X_test_scaled)

# Per-class precision / recall / F1 with four decimals.
# classification_report is imported with the other metrics in the import cell.
print("Final Test Set Evaluation:\n")
print(classification_report(y_test, y_test_preds, digits=4))

Final Test Set Evaluation:

              precision    recall  f1-score   support

           1     0.4019    0.4229    0.4121      5481
           2     0.4148    0.4280    0.4213      5486
           3     0.4187    0.3837    0.4004      5523

    accuracy                         0.4115     16490
   macro avg     0.4118    0.4115    0.4113     16490
weighted avg     0.4118    0.4115    0.4112     16490

