In [1]:
from utils import load_datasets, basic_preprocess
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd


In [2]:
datasets = load_datasets()


In [3]:
def train_and_evaluate(df, target_column):
    df = basic_preprocess(df, target_column)

    X = df.drop(columns=[target_column])
    y = df[target_column]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier()
    }

    for name, model in models.items():
        print(f"\n🔸 {name} on target `{target_column}`")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))


In [4]:
df1 = datasets['dataset_1'].copy()
train_and_evaluate(df1, target_column='Severity_Severe')



🔸 Logistic Regression on target `Severity_Severe`
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     47641
           1       1.00      1.00      1.00     15719

    accuracy                           1.00     63360
   macro avg       1.00      1.00      1.00     63360
weighted avg       1.00      1.00      1.00     63360


🔸 Random Forest on target `Severity_Severe`
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     47641
           1       1.00      1.00      1.00     15719

    accuracy                           1.00     63360
   macro avg       1.00      1.00      1.00     63360
weighted avg       1.00      1.00      1.00     63360



In [5]:
df2 = datasets['dataset_2'].copy()

# Convert 'Deaths' column to numeric if needed
df2['Deaths'] = pd.to_numeric(df2['Deaths'], errors='coerce')

# Drop any rows where Deaths is missing
df2 = df2.dropna(subset=['Deaths'])

# Create the binary target column before preprocessing
median_death = df2['Deaths'].median()
df2['high_death'] = (df2['Deaths'] > median_death).astype(int)

# Now train using that new column
train_and_evaluate(df2, target_column='high_death')



🔸 Logistic Regression on target `high_death`
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1801
           1       1.00      1.00      1.00      1821

    accuracy                           1.00      3622
   macro avg       1.00      1.00      1.00      3622
weighted avg       1.00      1.00      1.00      3622


🔸 Random Forest on target `high_death`
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1801
           1       1.00      1.00      1.00      1821

    accuracy                           1.00      3622
   macro avg       1.00      1.00      1.00      3622
weighted avg       1.00      1.00      1.00      3622



In [6]:
df3 = datasets['dataset_3'].copy()
df3['high_death'] = (df3['Deaths'] > df3['Deaths'].median()).astype(int)
train_and_evaluate(df3, target_column='high_death')



🔸 Logistic Regression on target `high_death`
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5060
           1       1.00      1.00      1.00      4754

    accuracy                           1.00      9814
   macro avg       1.00      1.00      1.00      9814
weighted avg       1.00      1.00      1.00      9814


🔸 Random Forest on target `high_death`
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5060
           1       1.00      1.00      1.00      4754

    accuracy                           1.00      9814
   macro avg       1.00      1.00      1.00      9814
weighted avg       1.00      1.00      1.00      9814



In [7]:
df4 = datasets['dataset_4'].copy()
df4['vaccinated'] = (df4['total_vaccinations'] > df4['total_vaccinations'].median()).astype(int)
train_and_evaluate(df4, target_column='vaccinated')



🔸 Logistic Regression on target `vaccinated`
              precision    recall  f1-score   support

           0       0.83      0.88      0.85     12950
           1       0.56      0.44      0.50      4353

    accuracy                           0.77     17303
   macro avg       0.69      0.66      0.67     17303
weighted avg       0.76      0.77      0.76     17303


🔸 Random Forest on target `vaccinated`
              precision    recall  f1-score   support

           0       0.91      0.92      0.92     12950
           1       0.76      0.73      0.74      4353

    accuracy                           0.87     17303
   macro avg       0.83      0.82      0.83     17303
weighted avg       0.87      0.87      0.87     17303



In [8]:
df5 = datasets['dataset_5'].copy()

# Sample 20,000 rows for quick modeling
df5_sample = df5.sample(n=20000, random_state=42)

train_and_evaluate(df5_sample, target_column='ICU')



🔸 Logistic Regression on target `ICU`


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           1       0.60      0.10      0.17        60
           2       0.92      0.99      0.96       668
          97       1.00      1.00      1.00      3236
          99       1.00      1.00      1.00        36

    accuracy                           0.99      4000
   macro avg       0.88      0.77      0.78      4000
weighted avg       0.98      0.99      0.98      4000


🔸 Random Forest on target `ICU`
              precision    recall  f1-score   support

           1       0.46      0.22      0.30        60
           2       0.93      0.98      0.95       668
          97       1.00      1.00      1.00      3236
          99       1.00      1.00      1.00        36

    accuracy                           0.98      4000
   macro avg       0.85      0.80      0.81      4000
weighted avg       0.98      0.98      0.98      4000

