In [26]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import utilsfunc as uf
from config import SELECTED_FEATURES, TARGET
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
import pickle

In [18]:
# Load the survey features and their labels; both files are indexed by respondent_id.
df = pd.read_csv("training_set_features.csv", index_col="respondent_id")
mappings = uf.read_json("mapping.json")
results = pd.read_csv("training_set_labels.csv", index_col="respondent_id")

# Put features and labels side by side, aligned on the shared index.
df = pd.concat([df, results], axis=1)

# Keep the selected features AND the target column(s). Selecting only
# SELECTED_FEATURES drops the label unless it is listed there, and the
# model-fitting cell later reads df[TARGET].
target_cols = [TARGET] if isinstance(TARGET, str) else list(TARGET)
keep_cols = list(SELECTED_FEATURES) + [
    col for col in target_cols if col not in SELECTED_FEATURES
]
# .copy() yields an independent frame, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
df = df[keep_cols].copy()

# Recode every mapped column through its lookup table.
# NOTE(review): assuming each mapping is a dict, values absent from it become
# NaN after .map() - confirm that mapping.json covers every category.
for col, mapping in mappings.items():
    df[col] = df[col].map(mapping)

In [33]:
# Median-impute every selected feature; any other column is passed through untouched.
# Each entry of `transformers` must be a single (name, transformer, columns) tuple.
# The original closed the parenthesis right after "imputer", which produced a flat
# list of three separate items instead of one tuple.
Column_transformer = ColumnTransformer(
    transformers=[
        ("imputer", SimpleImputer(strategy="median"), SELECTED_FEATURES),
    ],
    remainder="passthrough",
)

In [34]:
# Hyper-parameter grid for the search. The "dt__" prefix routes each entry to
# the pipeline step named "dt".
params = dict(
    dt__criterion=['gini', 'entropy'],               # split-quality measure
    dt__max_depth=[3, 4, 5, 6, 7, 10],               # maximum tree depth
    dt__min_samples_split=[2, 3, 4, 5, 7, 10],       # samples needed to split a node
    dt__min_samples_leaf=[1, 2, 3, 4, 5],            # samples required in a leaf
)

In [35]:
# End-to-end model: column preprocessing -> keep the 10 features with the
# highest chi-squared score -> decision tree with a fixed seed.
# NOTE(review): chi2 rejects negative inputs, so the preprocessed features are
# presumably all non-negative - confirm.
pipeline_model = Pipeline(
    steps=[
        ('column_transformer', Column_transformer),
        ('selectkbest', SelectKBest(score_func=chi2, k=10)),
        ('dt', DecisionTreeClassifier(random_state=42)),
    ]
)

In [None]:
# Exhaustive search over `params` with 5-fold cross-validation, scored by
# ROC AUC and parallelised over all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=pipeline_model,
    param_grid=params,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)

# NOTE(review): verify that TARGET is not listed in SELECTED_FEATURES;
# otherwise the label is fed to the model as a feature.
grid_search.fit(df[SELECTED_FEATURES], df[TARGET])

# The best mean cross-validated score is exposed as `best_score_` (singular).
# `best_scores_` is not a GridSearchCV attribute and raised AttributeError
# only after the whole search had finished.
print(grid_search.best_score_)