Load the Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

data = pd.read_csv('data/custom_covid19.csv') 
print("Data loaded with shape:", data.shape)


Data loaded with shape: (100000, 21)


##### 1. Create target variable

In [2]:
data['DIED'] = data['DATE_DIED'].apply(lambda x: 0 if x == '9999-99-99' else 1)


##### 2. Mark missing values

In [3]:
data.replace([97, 98, 99], np.nan, inplace=True)


##### 3. Convert boolean variables

In [4]:
bool_cols = ['INTUBED', 'PNEUMONIA', 'PREGNANT', 'DIABETES', 'COPD', 'ASTHMA', 
             'INMSUPR', 'HYPERTENSION', 'OTHER_DISEASE', 'CARDIOVASCULAR', 
             'OBESITY', 'RENAL_CHRONIC', 'TOBACCO', 'ICU']
data[bool_cols] = data[bool_cols].replace(2, 0)


##### 4. Create COVID status feature

In [5]:
data['COVID_POSITIVE'] = data['TEST_RESULT'].apply(lambda x: 1 if x in [1,2,3] else 0)

##### 5. Define features to keep/drop

In [6]:
features_to_keep = ['USMER', 'MEDICAL_UNIT', 'SEX', 'PATIENT_TYPE', 
                   'INTUBED', 'PNEUMONIA', 'AGE', 'DIABETES', 'COPD',
                   'ASTHMA', 'INMSUPR', 'HYPERTENSION', 'OTHER_DISEASE',
                   'CARDIOVASCULAR', 'OBESITY', 'RENAL_CHRONIC', 'TOBACCO',
                   'ICU', 'COVID_POSITIVE']

# 6. Separate features and target
X = data[features_to_keep]
y = data['DIED']

##### 6. Define preprocessing pipeline

In [7]:
numeric_features = ['AGE']
categorical_features = [col for col in features_to_keep 
                       if col not in numeric_features + ['DIED', 'COVID_POSITIVE']]

# Numeric pipeline (mean imputation + scaling)
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical pipeline (mode imputation)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# Combined preprocessor
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Verification
print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)

Numeric features: ['AGE']
Categorical features: ['USMER', 'MEDICAL_UNIT', 'SEX', 'PATIENT_TYPE', 'INTUBED', 'PNEUMONIA', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR', 'HYPERTENSION', 'OTHER_DISEASE', 'CARDIOVASCULAR', 'OBESITY', 'RENAL_CHRONIC', 'TOBACCO', 'ICU']


##### 8. Train-test split
 
Split the data into training and testing sets
Stratified split to maintain the same distribution of the target variable in both sets

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")
print(f"Class distribution (train): {pd.Series(y_train).value_counts(normalize=True)}")


Training set: (80000, 19), Test set: (20000, 19)
Class distribution (train): DIED
0    0.926625
1    0.073375
Name: proportion, dtype: float64


##### 9. Model Evaluation 


1. Model Evaluation Setup

In [9]:
# We'll evaluate three baseline classifiers:
# 1. Naive Bayes (probabilistic)
# 2. K-Nearest Neighbors (instance-based)
# 3. SVM with RBF kernel (maximum margin)

import os
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Markdown

# Visualization settings
plt.style.use('seaborn-v0_8')
plt.rcParams['figure.figsize'] = (8, 4)
sns.set_palette("husl")
# Ensure 'figures' directory exists
os.makedirs("figures", exist_ok=True)

2. Evaluation Function

In [10]:
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    """
    Evaluates a classification model and generates report-ready outputs.

    Parameters:
    - name: str, model name for display
    - model: sklearn classifier object
    - X_train, X_test, y_train, y_test: training/test data

    Returns:
    - Dictionary containing metrics and visualization paths
    """
    # Create pipeline and fit model
    clf = make_pipeline(preprocessor, model)
    clf.fit(X_train, y_train)
    
    # Generate predictions
    y_pred = clf.predict(X_test)
    
    # Create classification report and confusion matrix
    report = classification_report(y_test, y_pred, output_dict=True)
    cm = confusion_matrix(y_test, y_pred)
    
    # Create visualizations
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    # Confusion matrix heatmap
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1,
                xticklabels=['Survived', 'Died'],
                yticklabels=['Survived', 'Died'])
    ax1.set_title(f'{name} Confusion Matrix')
    ax1.set_xlabel('Predicted')
    ax1.set_ylabel('Actual')

    # Metrics bar plot
    metrics = ['precision', 'recall', 'f1-score']
    scores = [report['weighted avg'][m] for m in metrics]
    sns.barplot(x=metrics, y=scores, ax=ax2)
    ax2.set_title(f'{name} Performance Metrics')
    ax2.set_ylim(0, 1)

    # Save the figure
    fig_path = f'figures/{name.lower().replace(" ", "_")}_performance.png'
    plt.tight_layout()
    plt.savefig(fig_path)
    plt.close()

    # Display outputs in the notebook
    display(Markdown(f"## {name} Performance"))
    display(Markdown("### Classification Report"))
    print(classification_report(y_test, y_pred))

    display(Markdown("### Confusion Matrix"))
    print(cm)

    return {
        'model': name,
        'accuracy': report['accuracy'],
        'precision': report['weighted avg']['precision'],
        'recall': report['weighted avg']['recall'],
        'f1': report['weighted avg']['f1-score'],
        'figure_path': fig_path
    }


3. Model Definitions

In [11]:

#### Baseline Model Definitions
# We select three distinct algorithmic approaches:

models = [
    ('Naive Bayes', GaussianNB()),
    ('K-Nearest Neighbors', KNeighborsClassifier(n_neighbors=5)),
    ('Support Vector Machine', SVC(kernel='rbf', gamma='scale'))
]

4. Model Evaluation Execution

In [12]:
# === Evaluate All Baseline Models ===

model_results = []

for name, model in models:
    result = evaluate_model(name, model, X_train, X_test, y_train, y_test)
    model_results.append(result)


## Naive Bayes Performance

### Classification Report

              precision    recall  f1-score   support

           0       0.97      0.92      0.94     18532
           1       0.40      0.70      0.51      1468

    accuracy                           0.90     20000
   macro avg       0.69      0.81      0.73     20000
weighted avg       0.93      0.90      0.91     20000



### Confusion Matrix

[[16969  1563]
 [  438  1030]]


## K-Nearest Neighbors Performance

### Classification Report

              precision    recall  f1-score   support

           0       0.96      0.98      0.97     18532
           1       0.62      0.48      0.54      1468

    accuracy                           0.94     20000
   macro avg       0.79      0.73      0.75     20000
weighted avg       0.93      0.94      0.94     20000



### Confusion Matrix

[[18100   432]
 [  770   698]]


## Support Vector Machine Performance

### Classification Report

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     18532
           1       0.77      0.32      0.46      1468

    accuracy                           0.94     20000
   macro avg       0.86      0.66      0.71     20000
weighted avg       0.94      0.94      0.93     20000



### Confusion Matrix

[[18392   140]
 [  993   475]]


##### 10. Hyperparameter Optimization
We implement systematic hyperparameter tuning for all three models using GridSearchCV (exhaustive search) to find optimal parameters.

A. Naive Bayes (GaussianNB)
   

In [13]:
param_grid_nb = {
    'gaussiannb__var_smoothing': [1e-9, 1e-8, 1e-7]  # Note the stepname__param format
}

B. K-Nearest Neighbors (KNN)

In [14]:
param_grid_knn = {
    'kneighborsclassifier__n_neighbors': [3, 5, 7, 9],
    'kneighborsclassifier__weights': ['uniform', 'distance'],
    'kneighborsclassifier__p': [1, 2]
}

C. Support Vector Machine (SVM)

In [15]:
param_grid_svm = {
    'svc__C': [0.1, 1, 10],
    'svc__gamma': ['scale', 'auto', 0.01, 0.1],
    'svc__kernel': ['rbf']
}

##### 11. Implementation with Cross-Validation

In [None]:
from sklearn.model_selection import GridSearchCV

# Create pipelines with preprocessing + model
pipeline_nb = Pipeline([
    ('preprocessor', preprocessor),
    ('gaussiannb', GaussianNB())
])

pipeline_knn = Pipeline([
    ('preprocessor', preprocessor),
    ('kneighborsclassifier', KNeighborsClassifier())
])


pipeline_svm = Pipeline([
    ('preprocessor', preprocessor),
    ('svc', SVC())
])

# GridSearchCV setup
grid_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv=5, scoring='f1_weighted')
grid_knn = GridSearchCV(pipeline_knn, param_grid_knn, cv=5, scoring='f1_weighted')
grid_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=5, scoring='f1_weighted')

# Fit models
grid_nb.fit(X_train, y_train)
grid_knn.fit(X_train, y_train)
grid_svm.fit(X_train, y_train)

# Less expensive GridsearchCV
# param_grid_nb = {}  # No hyperparameters to tune for GaussianNB

# param_grid_knn = {
#     'kneighborsclassifier__n_neighbors': [3, 5, 7]
# }

# param_grid_svm = {
#     'svc__C': [0.1, 1, 10],
#     'svc__gamma': ['scale', 'auto']
# }


NameError: name 'Pipeline' is not defined