In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
import dice_ml
from dice_ml.utils import helpers # helper functions
from dice_ml import Data,Model,Dice
import numpy as np
from xgboost import XGBClassifier
import time
import threading
from joblib import Parallel, delayed
import os
import numpy as np
from dataLoader import DataLoader
from plotter import Plotter





ModuleNotFoundError: No module named 'sklearn'

In [None]:

class HyperparameterTuner:
    """Small wrapper that runs a ``GridSearchCV`` and remembers the finished search.

    Parameters
    ----------
    model
        Estimator (or pipeline) passed to ``GridSearchCV`` as ``estimator``.
    param_grid
        Parameter grid passed to ``GridSearchCV`` as ``param_grid``.
    scoring
        Forwarded to ``GridSearchCV`` as ``scoring`` (default ``'precision'``).
    cv
        Forwarded to ``GridSearchCV`` as ``cv`` (default ``5``).
    verbose
        Forwarded to ``GridSearchCV`` as ``verbose`` (default ``1``).
    n_jobs
        Forwarded to ``GridSearchCV`` as ``n_jobs`` (default ``-1``).

    Attributes
    ----------
    grid_search
        The search object from the most recent *successful* ``tune`` call.
        ``None`` before the first call and after a call that raised.
    """

    def __init__(self, model, param_grid, scoring='precision', cv=5, verbose=1, n_jobs=-1):
        self.model = model
        self.param_grid = param_grid
        self.scoring = scoring
        self.cv = cv
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.grid_search = None

    def tune(self, X, y):
        """Run the grid search on ``X`` / ``y``.

        Returns
        -------
        tuple
            ``(best_params_, best_score_)`` taken from the fitted search.

        Raises
        ------
        Exception
            Anything raised while building or fitting the search propagates
            unchanged; ``grid_search`` is then left as ``None``.
        """
        # Forget any earlier result first, so a failed run can never be
        # mistaken for a completed one by get_best_estimator().
        self.grid_search = None
        search = GridSearchCV(
            estimator=self.model,
            param_grid=self.param_grid,
            scoring=self.scoring,
            cv=self.cv,
            verbose=self.verbose,
            n_jobs=self.n_jobs,
        )
        search.fit(X, y)
        # Publish the search only once fitting has succeeded.
        self.grid_search = search
        return search.best_params_, search.best_score_

    def get_best_estimator(self):
        """Return ``best_estimator_`` of the last successful search.

        Raises
        ------
        ValueError
            If no ``tune`` call has completed successfully yet.
        """
        # Explicit None check: do not rely on the truthiness of the search object.
        if self.grid_search is None:
            raise ValueError("You need to run the tune method first.")
        return self.grid_search.best_estimator_

# Example usage:
if __name__ == "__main__":
    # Read the heart-disease CSV through the project's DataLoader helper.
    dataLoader = DataLoader("heart_statlog_cleveland_hungary_final.csv")
    df_cvd = dataLoader.load_data()

    # The label is expected to live in a column called 'target';
    # every other column is treated as a feature.
    y = df_cvd['target']
    X = df_cvd.drop(columns='target')

    # Hold out 20% of the rows for testing; random_state pins the shuffle.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Columns scaled as numeric; everything else is one-hot encoded.
    numerical = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
    categorical = X_train.columns.difference(numerical)

    # Per-type preprocessing pipelines.
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    categorical_transformer = Pipeline(
        steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
    )

    # Route each column group to its preprocessing pipeline.
    transformations = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical),
            ('cat', categorical_transformer, categorical),
        ]
    )

    # Preprocessing followed by the classifier under tuning.
    pipeline = Pipeline(
        steps=[
            ('preprocessor', transformations),
            ('classifier', XGBClassifier()),
        ]
    )

    # Search space; the 'classifier__' prefix addresses the pipeline's
    # 'classifier' step.
    param_grid = {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 6, 9],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__subsample': [0.8, 1.0],
        'classifier__colsample_bytree': [0.8, 1.0],
    }

    # Run the grid search on the training split and report the winner.
    tuner = HyperparameterTuner(pipeline, param_grid)
    best_params, best_score = tuner.tune(X_train, y_train)
    print("Best Parameters:", best_params)
    print("Best Score:", best_score)

    # Keep the best pipeline around for the evaluation cells below.
    best_model = tuner.get_best_estimator()
    print("Best Estimator:", best_model)

In [None]:
# Fit the selected pipeline on the whole training split.
best_model.fit(X_train, y_train)

# Score the held-out split. F1, precision and recall use the
# support-weighted average over classes.
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(
    y_test, y_pred, average='weighted'
)
precision = precision_score(
    y_test, y_pred, average='weighted'
)
recall = recall_score(
    y_test, y_pred, average='weighted'
)

# One metric per output line.
print(
    f"Test Accuracy: {accuracy}",
    f"Test F1 Score: {f1}",
    f"Test Precision: {precision}",
    f"Test Recall: {recall}",
    sep="\n",
)

In [9]:
# Hold out 20% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Columns scaled as numeric; every remaining column is one-hot encoded.
numerical = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical = X_train.columns.difference(numerical)

# Per-type preprocessing pipelines.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Route each column group to its preprocessing pipeline.
transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical),
        ('cat', categorical_transformer, categorical),
    ]
)

# Unfitted pipeline with hard-coded XGBoost settings.
# NOTE(review): these look like the grid-search winners — confirm.
pipeline = Pipeline(
    steps=[
        ('preprocessor', transformations),
        (
            'classifier',
            XGBClassifier(
                max_depth=3,
                learning_rate=0.01,
                n_estimators=300,
                subsample=0.8,
                colsample_bytree=1.0,
            ),
        ),
    ]
)



In [10]:
# True positives on the test split: rows predicted as class 1 whose label is 1.
# reset_index(drop=True) renumbers the rows 0..n-1 and discards the old index
# directly. Unlike reset_index() followed by drop(['index']), it does not fail
# when the index has a name or when a column called 'index' already exists.
X_high_risk_tp = X_test[(y_pred == 1) & (y_test == 1)].reset_index(drop=True)

In [None]:
# Number of rows in the true-positive subset.
X_high_risk_tp.shape[0]