In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import itertools
from typing import List
import json
import time
import multiprocessing

# from google.colab import drive
# drive.mount('/content/drive')

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Seed passed to both the train/test split and the classifier so runs are repeatable.
RANDOM_SEED = 42

# Input CSV and the directory holding the per-experiment JSON results.
DATASET_PATH = 'dataset_filtered_1_450.csv'
OUTPUT_PATH = 'dt_grid_search_results/'

# Label column, and the columns that must never be used as model inputs
# (the list includes the label itself).
TARGET_COLUMN = "rest_int"
REMOVE_LIST = ["user_id", 'rest_int', 'exp', 'seq']

# Number of folds used for the grid search and for re-scoring the best model.
CROSS_VALIDATION = 3
# Leave one core free, but never go below one worker: on a single-core machine
# cpu_count() - 1 evaluates to 0, which is not a valid n_jobs value.
N_JOBS = max(1, multiprocessing.cpu_count() - 1)

In [3]:
# Load the dataset once; the feature list and the grid-search function read this module-level frame.
data = pd.read_csv(filepath_or_buffer=DATASET_PATH)

In [4]:
# Candidate model inputs: every dataset column not named in REMOVE_LIST
# (REMOVE_LIST also contains the target column).
features_list = [column for column in data.columns if column not in REMOVE_LIST]

len(features_list)

1350

In [5]:
def grid_search_on_features(list_of_features: List[str], target_column: str, experiment_id: int) -> dict:
    """Tune a decision tree on the given feature subset and summarise the best model.

    Reads the module-level ``data`` frame, holds out 30% of the rows as a test set,
    standardises the features with a scaler fitted on the training split only, and
    runs an exhaustive ``GridSearchCV`` over ``DecisionTreeClassifier`` settings.

    Args:
        list_of_features: Columns of ``data`` used as model inputs.
        target_column: Column of ``data`` used as the label.
        experiment_id: Identifier copied into the returned record.

    Returns:
        A dict with the keys ``experiment_id``, ``used_features``,
        ``cross_validation_scores``, ``best_estimator``, ``feature_importances``,
        ``classification_results``, ``random_seed`` and ``performance_time``.
        ``feature_importances`` maps feature name to importance and is ordered from
        most to least important, so its first key is the top-ranked feature.
        ``cross_validation_scores`` uses the same metric the grid search optimises.
        ``performance_time`` is the duration of the grid-search fit in seconds.
    """
    # One metric for both model selection and the reported CV scores; without an
    # explicit `scoring`, cross_val_score would fall back to the estimator's default
    # scorer and the stored scores would not match what the search optimised.
    scoring = "f1_weighted"

    X = data[list_of_features]
    y = data[target_column]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_SEED)

    # Fit the scaler on the training split only, then apply the same transform to the test split.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    start = time.time()

    classifier = DecisionTreeClassifier(random_state=RANDOM_SEED)

    param_grid = {
        'criterion': ['gini', 'entropy', 'log_loss'],
        'splitter': ['best', 'random'],
        'max_depth': [None, 10, 20, 30, 50, 100],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': [None, 'sqrt', 'log2'],
        'max_leaf_nodes': [None, 10, 20],
        'min_weight_fraction_leaf': [0.0, 0.1, 0.2]
    }

    grid_search = GridSearchCV(classifier,
                            param_grid,
                            cv=CROSS_VALIDATION,
                            scoring=scoring,
                            n_jobs=N_JOBS)

    grid_search.fit(X_train, y_train)

    end = time.time()

    best_model = grid_search.best_estimator_
    cv_scores = cross_val_score(best_model, X_train, y_train, cv=CROSS_VALIDATION, scoring=scoring)
    y_pred = best_model.predict(X_test)
    classification_results = classification_report(y_test, y_pred, output_dict=True)

    # Rank features by importance, highest first. sorted() is stable, so features with
    # equal importance keep their relative order from list_of_features.
    ranked_features = sorted(
        zip(list_of_features, best_model.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )
    best_features = dict(ranked_features)

    return {
        "experiment_id": experiment_id,
        # Copy so the record is not altered if the caller later mutates its own list.
        "used_features": list(list_of_features),
        "cross_validation_scores": cv_scores.tolist(),
        "best_estimator": best_model.get_params(),
        "feature_importances": best_features,
        "classification_results": classification_results,
        "random_seed": RANDOM_SEED,
        "performance_time": end - start
        }

In [6]:
def save_results(experiment_data: dict, experiment_id: int, output_dir=None) -> None:
    """Write one experiment record to ``<output_dir>/experiment_<experiment_id>.json``.

    Args:
        experiment_data: JSON-serialisable record to store.
        experiment_id: Number used to build the file name.
        output_dir: Target directory (str or path-like). When omitted, the
            module-level ``OUTPUT_PATH`` is used. The directory is created if it
            does not exist yet.

    The file is written as UTF-8 with non-ASCII characters kept as-is and a
    4-space indent. An existing file with the same name is overwritten.
    """
    import os  # local import: the notebook's import cell does not bring in `os`

    target_dir = OUTPUT_PATH if output_dir is None else output_dir
    # Create the directory up front so the first experiment does not fail on a fresh checkout.
    os.makedirs(target_dir, exist_ok=True)

    file_path = os.path.join(target_dir, f"experiment_{experiment_id}.json")
    with open(file_path, "w", encoding="utf-8") as json_file:
        json.dump(experiment_data, json_file, ensure_ascii=False, indent=4)

In [7]:
# Iterative elimination: run a grid search on the current feature set, persist the
# result, drop the single most important feature, and repeat until one feature is left.
current_features = list(features_list)
idx = 0

while len(current_features) > 1:
    experiment_results = grid_search_on_features(
        list_of_features=current_features,
        target_column=TARGET_COLUMN,
        experiment_id=idx,
    )
    save_results(experiment_data=experiment_results, experiment_id=idx)

    # "feature_importances" is ordered most-important-first, so its first key is the top feature.
    best_feature = next(iter(experiment_results["feature_importances"]))
    current_features.remove(best_feature)

    idx += 1
    print(f'Step:{idx}, features left: {len(current_features)}')

Step:1, features left: 1349
Step:2, features left: 1348
Step:3, features left: 1347
Step:4, features left: 1346
Step:5, features left: 1345
Step:6, features left: 1344
Step:7, features left: 1343
Step:8, features left: 1342
Step:9, features left: 1341
Step:10, features left: 1340
Step:11, features left: 1339
Step:12, features left: 1338
Step:13, features left: 1337
Step:14, features left: 1336
Step:15, features left: 1335
Step:16, features left: 1334
Step:17, features left: 1333
Step:18, features left: 1332
Step:19, features left: 1331
Step:20, features left: 1330
Step:21, features left: 1329
Step:22, features left: 1328
Step:23, features left: 1327
Step:24, features left: 1326
Step:25, features left: 1325
Step:26, features left: 1324
Step:27, features left: 1323
Step:28, features left: 1322
Step:29, features left: 1321
Step:30, features left: 1320
Step:31, features left: 1319
Step:32, features left: 1318
Step:33, features left: 1317
Step:34, features left: 1316
Step:35, features left: