In [41]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline

In [2]:
# Read the modelling table from cleaned_data.csv (relative path, so it is
# resolved against the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="cleaned_data.csv")

In [38]:
# Feature matrix: every column except the two vaccine targets.
# Label: H1N1 vaccination only (seasonal_vaccine is dropped, not predicted here).
X = np.array(df.drop(columns=["h1n1_vaccine", "seasonal_vaccine"]))
y = np.array(df["h1n1_vaccine"])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline tree. Only the depth cap and the seed differ from scikit-learn's
# defaults, so only those are passed. In particular `monotonic_cst` is NOT
# passed: the keyword exists only in scikit-learn >= 1.4, and naming it at its
# default value (None) makes the constructor raise TypeError on older installs
# without affecting the fitted tree.
model = DecisionTreeClassifier(max_depth=5, random_state=42)

model.fit(X_train, y_train)

# Per-class precision/recall/F1 on both splits, to compare fit vs. generalisation.
print("Classification Report for Train set")
print(classification_report(y_train, model.predict(X_train)))
print("\nClassification Report for Test set")
print(classification_report(y_test, model.predict(X_test)))

Classification Report for Train set
              precision    recall  f1-score   support

         0.0       0.86      0.95      0.90     16821
         1.0       0.67      0.41      0.51      4544

    accuracy                           0.83     21365
   macro avg       0.76      0.68      0.70     21365
weighted avg       0.82      0.83      0.82     21365


Classification Report for Test set
              precision    recall  f1-score   support

         0.0       0.86      0.95      0.90      4212
         1.0       0.68      0.41      0.52      1130

    accuracy                           0.84      5342
   macro avg       0.77      0.68      0.71      5342
weighted avg       0.82      0.84      0.82      5342



In [48]:
# Hyperparameter candidates for the tree. The "dt__" prefix addresses the
# pipeline step named "dt".
param_grid = {
    'dt__criterion': ['gini', 'entropy'],
    'dt__max_depth': [3, 4, 5, 6, 7, 10],
    'dt__min_samples_split': [2, 3, 4, 5, 7, 10],
    'dt__min_samples_leaf': [1, 2, 3, 4, 5]
}

# Single-step pipeline wrapping a seeded decision tree.
pipeline = Pipeline(steps=[('dt', DecisionTreeClassifier(random_state=42))])

# Exhaustive search over the grid, scored by ROC AUC with 5-fold
# cross-validation, using all available cores.
grid_search = GridSearchCV(pipeline, param_grid, scoring='roc_auc', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)

Best parameters found:  {'dt__criterion': 'gini', 'dt__max_depth': 5, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2}


In [45]:
# Evaluate the configuration selected by the grid search.
#
# Take the winner straight from `grid_search` instead of retyping its
# hyperparameters by hand: a hand-copied constructor silently goes stale as soon
# as the grid or the data changes. GridSearchCV is constructed with its default
# refit=True, so `best_estimator_` has already been refit on all of X_train with
# the best parameters; no extra fit is needed here.
#
# Note: `model` is the fitted Pipeline (single step "dt"), which exposes the same
# predict / predict_proba interface as the tree it wraps.
model = grid_search.best_estimator_

# Per-class precision/recall/F1 on both splits, to compare fit vs. generalisation.
print("Classification Report for Train set")
print(classification_report(y_train, model.predict(X_train)))
print("\nClassification Report for Test set")
print(classification_report(y_test, model.predict(X_test)))

Classification Report for Train set
              precision    recall  f1-score   support

         0.0       0.86      0.95      0.90     16821
         1.0       0.67      0.41      0.51      4544

    accuracy                           0.83     21365
   macro avg       0.76      0.68      0.70     21365
weighted avg       0.82      0.83      0.82     21365


Classification Report for Test set
              precision    recall  f1-score   support

         0.0       0.86      0.95      0.90      4212
         1.0       0.68      0.41      0.52      1130

    accuracy                           0.84      5342
   macro avg       0.77      0.68      0.71      5342
weighted avg       0.82      0.84      0.82      5342



In [46]:
# ROC AUC on the held-out split, scored with column 1 of predict_proba
# (the probability of the second entry in the model's classes_).
test_scores = model.predict_proba(X_test)[:, 1]
roc_auc_score(y_true=y_test, y_score=test_scores)

0.8192712141458454