In [None]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import utilsfunc as uf
from config import SELECTED_FEATURES, TARGETS
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
import pickle
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

ModuleNotFoundError: No module named 'DropMissingTransformer'

In [None]:
df = pd.read_csv("training_set_features.csv", index_col="respondent_id")
mappings = uf.read_json("mapping.json")
results = pd.read_csv("training_set_labels.csv", index_col="respondent_id")

df = df[SELECTED_FEATURES]
df = pd.concat([df, results], axis=1)

for col, mapping in mappings.items():
    df[col] = df[col].map(mapping)

X = df.drop(columns=TARGETS)
y = df[TARGETS[0]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

26707


In [14]:
Column_transformer = ColumnTransformer(
    transformers=[
        ("imputer", SimpleImputer(strategy="median"), SELECTED_FEATURES)
    ], 
    remainder="passthrough"
)

In [15]:
params = {
    'dt__criterion': ['gini', 'entropy'],
    'dt__max_depth': [3, 4, 5, 6, 7, 10],
    'dt__min_samples_split': [2, 3, 4, 5, 7, 10],
    'dt__min_samples_leaf': [1, 2, 3, 4, 5]
}

In [None]:
pipeline_model = Pipeline([
    ('drop_missing', uf.DropMissingTransformer(threshold=0.2)),
    ('column_transformer', Column_transformer),
    ('selectkbest', SelectKBest(score_func=chi2, k=10)),
    ('dt', DecisionTreeClassifier(random_state=42))
])

In [None]:
grid_search = GridSearchCV(
    estimator=pipeline_model,
    param_grid=params,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

In [18]:
print(grid_search.best_score_)

0.8161766518310996


In [20]:
print("\nClassification Report for Test set")
print(classification_report(y_test, grid_search.predict(X_test)))
roc_auc_score(y_test, grid_search.predict_proba(X_test)[:, 1])


Classification Report for Test set
              precision    recall  f1-score   support

           0       0.86      0.95      0.90      4212
           1       0.69      0.41      0.51      1130

    accuracy                           0.84      5342
   macro avg       0.77      0.68      0.71      5342
weighted avg       0.82      0.84      0.82      5342



0.8198540411298523