In [1]:
from utils import load_datasets, basic_preprocess
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import pandas as pd


In [2]:
datasets = load_datasets()


In [3]:
def train_and_evaluate_nb_xgb(df, target_column):
    df = basic_preprocess(df, target_column)

    X = df.drop(columns=[target_column])
    y = df[target_column]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    models = {
        "Naive Bayes": GaussianNB(),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    }

    for name, model in models.items():
        print(f"\n🔸 {name} on target `{target_column}`")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))


In [4]:
df1 = datasets['dataset_1'].copy()
train_and_evaluate_nb_xgb(df1, target_column='Severity_Severe')



🔸 Naive Bayes on target `Severity_Severe`
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     47641
           1       1.00      1.00      1.00     15719

    accuracy                           1.00     63360
   macro avg       1.00      1.00      1.00     63360
weighted avg       1.00      1.00      1.00     63360


🔸 XGBoost on target `Severity_Severe`
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     47641
           1       1.00      1.00      1.00     15719

    accuracy                           1.00     63360
   macro avg       1.00      1.00      1.00     63360
weighted avg       1.00      1.00      1.00     63360



In [5]:
df2 = datasets['dataset_2'].copy()

# Convert 'Deaths' column to numeric if needed
df2['Deaths'] = pd.to_numeric(df2['Deaths'], errors='coerce')

# Drop any rows where Deaths is missing
df2 = df2.dropna(subset=['Deaths'])

# Create the binary target column before preprocessing
median_death = df2['Deaths'].median()
df2['high_death'] = (df2['Deaths'] > median_death).astype(int)

# Now train using that new column
train_and_evaluate_nb_xgb(df2, target_column='high_death')




🔸 Naive Bayes on target `high_death`
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1801
           1       0.98      0.97      0.98      1821

    accuracy                           0.98      3622
   macro avg       0.98      0.98      0.98      3622
weighted avg       0.98      0.98      0.98      3622


🔸 XGBoost on target `high_death`
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1801
           1       1.00      1.00      1.00      1821

    accuracy                           1.00      3622
   macro avg       1.00      1.00      1.00      3622
weighted avg       1.00      1.00      1.00      3622



In [6]:
df3 = datasets['dataset_3'].copy()
df3['high_death'] = (df3['Deaths'] > df3['Deaths'].median()).astype(int)

train_and_evaluate_nb_xgb(df3, target_column='high_death')




🔸 Naive Bayes on target `high_death`
              precision    recall  f1-score   support

           0       0.78      0.98      0.87      5060
           1       0.97      0.70      0.81      4754

    accuracy                           0.84      9814
   macro avg       0.87      0.84      0.84      9814
weighted avg       0.87      0.84      0.84      9814


🔸 XGBoost on target `high_death`
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5060
           1       1.00      1.00      1.00      4754

    accuracy                           1.00      9814
   macro avg       1.00      1.00      1.00      9814
weighted avg       1.00      1.00      1.00      9814



In [7]:
df4 = datasets['dataset_4'].copy()
df4['vaccinated'] = (df4['total_vaccinations'] > df4['total_vaccinations'].median()).astype(int)
train_and_evaluate_nb_xgb(df4, target_column='vaccinated')



🔸 Naive Bayes on target `vaccinated`
              precision    recall  f1-score   support

           0       0.79      0.99      0.88     12950
           1       0.85      0.24      0.37      4353

    accuracy                           0.80     17303
   macro avg       0.82      0.61      0.63     17303
weighted avg       0.81      0.80      0.75     17303


🔸 XGBoost on target `vaccinated`
              precision    recall  f1-score   support

           0       0.89      0.93      0.91     12950
           1       0.77      0.67      0.72      4353

    accuracy                           0.87     17303
   macro avg       0.83      0.80      0.82     17303
weighted avg       0.86      0.87      0.86     17303



In [11]:
df5 = datasets['dataset_5'].copy()

# Keep only rows with ICU = 1 or 2
df5 = df5[df5['ICU'].isin([1, 2])]

# Recode: 1 = ICU, 2 = No ICU
df5['ICU'] = df5['ICU'].map({1: 1, 2: 0})

# Optional: Check balance
print(df5['ICU'].value_counts())

# Sample for faster training
df5_sample = df5.sample(n=2000, random_state=42)

# Train models
train_and_evaluate_nb_xgb(df5_sample, target_column='ICU')


ICU
0    175685
1     16858
Name: count, dtype: int64

🔸 Naive Bayes on target `ICU`
              precision    recall  f1-score   support

           0       0.91      0.98      0.95       364
           1       0.22      0.06      0.09        36

    accuracy                           0.90       400
   macro avg       0.57      0.52      0.52       400
weighted avg       0.85      0.90      0.87       400


🔸 XGBoost on target `ICU`
              precision    recall  f1-score   support

           0       0.93      0.96      0.94       364
           1       0.40      0.28      0.33        36

    accuracy                           0.90       400
   macro avg       0.67      0.62      0.64       400
weighted avg       0.88      0.90      0.89       400

