In [1]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import utilsfunc as uf
from config import SELECTED_FEATURES, TARGETS
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
import pickle
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

In [None]:
# Read the feature table, the per-column value mappings, and the label table.
df = pd.read_csv("training_set_features.csv", index_col="respondent_id")
mappings = uf.read_json("mapping.json")
results = pd.read_csv("training_set_labels.csv", index_col="respondent_id")

# Keep only the configured feature columns and attach the label columns beside them.
df = pd.concat([df[SELECTED_FEATURES], results], axis=1)

# Re-encode every column named in the mapping file through its own value map.
for column, value_map in mappings.items():
    df[column] = df[column].map(value_map)

# The first entry of TARGETS is the label modelled here; every target column is
# removed from the feature matrix.
y = df[TARGETS[0]]
X = df.drop(columns=TARGETS)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

26707


In [14]:
# Median-impute the configured feature columns; any other column is passed
# through unchanged.
median_imputer = SimpleImputer(strategy="median")
Column_transformer = ColumnTransformer(
    transformers=[("imputer", median_imputer, SELECTED_FEATURES)],
    remainder="passthrough",
)

In [15]:
# Hyperparameter grid for the search; the "dt__" prefix targets the pipeline
# step registered under the name "dt".
params = dict(
    dt__criterion=['gini', 'entropy'],
    dt__max_depth=[3, 4, 5, 6, 7, 10],
    dt__min_samples_split=[2, 3, 4, 5, 7, 10],
    dt__min_samples_leaf=[1, 2, 3, 4, 5],
)

In [None]:
# Steps run in the listed order: the project's DropMissingTransformer, the
# imputing column transformer, chi2-based selection of 10 features, then the tree.
pipeline_steps = [
    ('drop_missing', uf.DropMissingTransformer(threshold=0.2)),
    ('column_transformer', Column_transformer),
    ('selectkbest', SelectKBest(chi2, k=10)),
    ('dt', DecisionTreeClassifier(random_state=42)),
]
pipeline_model = Pipeline(steps=pipeline_steps)

In [None]:
# Exhaustive search over `params`, scored by ROC AUC with 5-fold cross-validation;
# n_jobs=-1 requests all available cores.
search_settings = dict(param_grid=params, scoring='roc_auc', cv=5, n_jobs=-1)
grid_search = GridSearchCV(pipeline_model, **search_settings)

# Left as the final expression so the fitted search object is shown as the cell output.
grid_search.fit(X_train, y_train)

In [18]:
# Best cross-validated score (ROC AUC, per the search's `scoring`) found by the grid search.
best_cv_auc = grid_search.best_score_
print(best_cv_auc)

0.8161766518310996


In [20]:
# Hard-label metrics on the held-out split.
print("\nClassification Report for Test set")
test_predictions = grid_search.predict(X_test)
print(classification_report(y_test, test_predictions))

# ROC AUC is computed from the second predict_proba column (index 1); the value
# is left as the final expression so it is displayed as the cell output.
positive_class_proba = grid_search.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, positive_class_proba)


Classification Report for Test set
              precision    recall  f1-score   support

           0       0.86      0.95      0.90      4212
           1       0.69      0.41      0.51      1130

    accuracy                           0.84      5342
   macro avg       0.77      0.68      0.71      5342
weighted avg       0.82      0.84      0.82      5342



0.8198540411298523

In [2]:
# Reload the raw inputs; here the full, unfiltered feature table is split.
df = pd.read_csv("training_set_features.csv", index_col="respondent_id")
# NOTE(review): `mappings` is loaded but not used in the cells visible here —
# confirm whether the pipeline built later reads mapping.json on its own.
mappings = uf.read_json("mapping.json")
results = pd.read_csv("training_set_labels.csv", index_col="respondent_id")

# Narrow the label table to the first configured target column.
results = results[TARGETS[0]]

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df, results, random_state=42, test_size=0.2
)

In [3]:
# Import only the name this notebook uses: a wildcard import would pull every
# public name of the module into the notebook namespace, where it could silently
# shadow names bound by the import cell and hides where each name comes from.
from DecisionTreePipeline import DTPipeline

# Build the modelling pipeline from the training features via the project's
# DTPipeline helper.
dt_pipeline = DTPipeline()
pipeline_model = dt_pipeline.build_pipeline(X_train)

In [4]:
# Hyperparameter grid for the second search. The "selectkbest__" / "dt__"
# prefixes presumably match step names in the pipeline returned by
# DTPipeline.build_pipeline — TODO confirm against that module.
params = dict(
    selectkbest__k=[5, 10, 15, 'all'],
    dt__criterion=['gini', 'entropy'],
    dt__max_depth=[4, 5, 6],
    dt__min_samples_split=[2, 3, 4, 5],
    dt__min_samples_leaf=[3, 4, 5],
)

In [5]:
# Exhaustive search over `params`, scored by ROC AUC with 5-fold cross-validation;
# n_jobs=-1 requests all available cores.
search_settings = dict(param_grid=params, scoring='roc_auc', cv=5, n_jobs=-1)
grid_search = GridSearchCV(pipeline_model, **search_settings)

# Left as the final expression so the fitted search object is shown as the cell output.
grid_search.fit(X_train, y_train)

ValueError: 
All the 1440 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1440 fits failed with the following error:
Traceback (most recent call last):
  File "c:\SandBox\Flue Shot\Flue-Shot-Machine-Learning\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 833, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\SandBox\Flue Shot\Flue-Shot-Machine-Learning\.venv\Lib\site-packages\sklearn\base.py", line 1336, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\SandBox\Flue Shot\Flue-Shot-Machine-Learning\.venv\Lib\site-packages\sklearn\pipeline.py", line 613, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "c:\SandBox\Flue Shot\Flue-Shot-Machine-Learning\.venv\Lib\site-packages\sklearn\pipeline.py", line 547, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ~~~~~~~~~~~~~~~~~~~~~~~~^
        cloned_transformer,
        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
        params=step_params,
        ^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\SandBox\Flue Shot\Flue-Shot-Machine-Learning\.venv\Lib\site-packages\joblib\memory.py", line 326, in __call__
    return self.func(*args, **kwargs)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "c:\SandBox\Flue Shot\Flue-Shot-Machine-Learning\.venv\Lib\site-packages\sklearn\pipeline.py", line 1484, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "c:\SandBox\Flue Shot\Flue-Shot-Machine-Learning\.venv\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "c:\SandBox\Flue Shot\Flue-Shot-Machine-Learning\.venv\Lib\site-packages\sklearn\base.py", line 910, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ~~~~~~~~^^^^^^^^^^^^^^^^^^^^
  File "c:\SandBox\Flue Shot\Flue-Shot-Machine-Learning\.venv\Lib\site-packages\sklearn\base.py", line 1336, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\SandBox\Flue Shot\Flue-Shot-Machine-Learning\.venv\Lib\site-packages\sklearn\feature_selection\_univariate_selection.py", line 563, in fit
    X, y = validate_data(
           ~~~~~~~~~~~~~^
        self, X, y, accept_sparse=["csr", "csc"], multi_output=True
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\SandBox\Flue Shot\Flue-Shot-Machine-Learning\.venv\Lib\site-packages\sklearn\utils\validation.py", line 2919, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\SandBox\Flue Shot\Flue-Shot-Machine-Learning\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1314, in check_X_y
    X = check_array(
        X,
    ...<12 lines>...
        input_name="X",
    )
  File "c:\SandBox\Flue Shot\Flue-Shot-Machine-Learning\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1074, in check_array
    _assert_all_finite(
    ~~~~~~~~~~~~~~~~~~^
        array,
        ^^^^^^
    ...<2 lines>...
        allow_nan=ensure_all_finite == "allow-nan",
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\SandBox\Flue Shot\Flue-Shot-Machine-Learning\.venv\Lib\site-packages\sklearn\utils\validation.py", line 133, in _assert_all_finite
    _assert_all_finite_element_wise(
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
        X,
        ^^
    ...<4 lines>...
        input_name=input_name,
        ^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\SandBox\Flue Shot\Flue-Shot-Machine-Learning\.venv\Lib\site-packages\sklearn\utils\validation.py", line 182, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
SelectKBest does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


In [14]:
# Diagnostic: fit and apply every step except the last two, then count the NaN
# entries that remain in the transformed training data.
preprocessing_steps = pipeline_model[:-2]
preprocessed_train = preprocessing_steps.fit_transform(X_train, y_train)
np.isnan(preprocessed_train).sum()

np.int64(8549)

In [None]:
# Best cross-validated score (ROC AUC, per the search's `scoring`) found by the grid search.
best_cv_auc = grid_search.best_score_
print(best_cv_auc)

0.8162074508686962
