In [26]:
import pandas as pd
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from dask.distributed import wait

import numpy as np
import time
import psutil
import coiled

In [27]:
cluster = coiled.Cluster(n_workers=60)
client = cluster.get_client()

[2025-05-13 01:28:51,769][INFO    ][coiled] Fetching latest package priorities...
[2025-05-13 01:28:51,771][INFO    ][coiled.package_sync] Resolving your local Python312 Python environment...
[2025-05-13 01:28:52,550][INFO    ][coiled.package_sync] Scanning 129 python packages...
[2025-05-13 01:28:53,611][INFO    ][coiled] Running pip check...
[2025-05-13 01:28:54,723][INFO    ][coiled] Validating environment...
[2025-05-13 01:28:59,967][INFO    ][coiled] Creating wheel for ~\AppData\Roaming\Python\Python312\site-packages\win32\lib...
[2025-05-13 01:28:59,972][INFO    ][coiled] Creating wheel for ~\AppData\Roaming\Python\Python312\site-packages\pythonwin...
[2025-05-13 01:28:59,979][INFO    ][coiled] Creating wheel for ~\AppData\Roaming\Python\Python312\site-packages\win32...
[2025-05-13 01:28:59,986][INFO    ][coiled] Uploading coiled_local_lib...
[2025-05-13 01:29:00,886][INFO    ][coiled] Uploading coiled_local_pythonwin...
[2025-05-13 01:29:01,536][INFO    ][coiled] Uploading coile

In [28]:
start_time = time.time()

In [29]:
# Load dataset
df = pd.read_csv("pdc_dataset_with_target.csv")

num_duplicates = df.duplicated().sum()
filtered_df_no_duplicates = df.drop_duplicates()
rows_removed_due_to_duplicates = df.shape[0] - filtered_df_no_duplicates.shape[0]
df = filtered_df_no_duplicates
filtered_df_no_duplicates.shape[0], rows_removed_due_to_duplicates

(40045, 955)

In [30]:
df.isnull().sum()
df = df.dropna()
df.isnull().sum()

feature_1    0
feature_2    0
feature_3    0
feature_4    0
feature_5    0
feature_6    0
feature_7    0
target       0
dtype: int64

In [31]:
# Column definitions
numerical_cols = ['feature_1', 'feature_2', 'feature_4', 'feature_6','feature_7']
categorical_cols = ['feature_3', 'feature_5']

In [32]:
def remove_outliers_iqr(df, cols):
    filtered_df = df.copy()
    for col in cols:
        Q1 = filtered_df[col].quantile(0.25)
        Q3 = filtered_df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        filtered_df = filtered_df[(filtered_df[col] >= lower_bound) & (filtered_df[col] <= upper_bound)]
    return filtered_df

df = remove_outliers_iqr(df, numerical_cols)



In [33]:
X = df.drop('target', axis=1)
y = df['target']

# Column definitions
numerical_cols = ['feature_1', 'feature_2', 'feature_4', 'feature_7']
categorical_cols = ['feature_3', 'feature_5']

# Preprocessing pipelines
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols)
])

# Apply preprocessing once
X_processed = preprocessor.fit_transform(X)

# Split
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)


In [34]:
end_time = time.time()
print(f"Total execution time: {end_time - start_time:.2f} seconds")

Total execution time: 0.43 seconds


In [35]:
# Logging setup
resource_logs = {
    'cpu': [],
    'mem': [],
    'threads': [],
    'time': []
}

def log_resources():
    cpu = psutil.cpu_percent(interval=1)
    mem = psutil.virtual_memory().percent
    threads = psutil.Process().num_threads()
    resource_logs['cpu'].append(cpu)
    resource_logs['mem'].append(mem)
    resource_logs['threads'].append(threads)

In [36]:
backend_storage = optuna.storages.InMemoryStorage()
dask_storage = optuna.integration.DaskStorage(storage=backend_storage)

study = optuna.create_study(
    direction="maximize",
    storage=dask_storage,  # This makes the study Dask-enabled
    sampler=optuna.samplers.RandomSampler(),
)

# Define Optuna objective
def objective(trial):
    params = {
        'eval_metric': 'logloss',
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0)
    }
    
    model = xgb.XGBClassifier(n_jobs=-1, **params)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    probas = model.predict_proba(X_test)[:, 1]
    
    acc = accuracy_score(y_test, preds)
    auc = roc_auc_score(y_test, probas)
    cm = confusion_matrix(y_test, preds)
    report = classification_report(y_test, preds, output_dict=True)
    
    # Store additional metrics in the trial
    trial.set_user_attr("roc_auc", auc)
    trial.set_user_attr("confusion_matrix", cm.tolist())
    trial.set_user_attr("classification_report", report)
    
    return acc

futures = [
    client.submit(study.optimize, objective, n_trials=5, pure=False) for _ in range(100)
]

_ = wait(futures)


print("Best Params:", study.best_params)
best_trial = study.best_trial

print("Best Accuracy:", best_trial.value)
print("ROC AUC:", best_trial.user_attrs["roc_auc"])
print("Confusion Matrix:", best_trial.user_attrs["confusion_matrix"])
print("Classification Report:")
from pprint import pprint
pprint(best_trial.user_attrs["classification_report"])

  dask_storage = optuna.integration.DaskStorage(storage=backend_storage)
  mod.loads(out, buffers=buffers)


Best Params: {'max_depth': 4, 'learning_rate': 0.014471185363828597, 'n_estimators': 267, 'subsample': 0.9228559072464841, 'colsample_bytree': 0.6267972881208569}
Best Accuracy: 0.6032140490390987
ROC AUC: 0.5030952814352807
Confusion Matrix: [[3635, 3], [2392, 6]]
Classification Report:
{'0': {'f1-score': 0.752198654940507,
       'precision': 0.6031192964990875,
       'recall': 0.9991753710830127,
       'support': 3638.0},
 '1': {'f1-score': 0.004985459077690071,
       'precision': 0.6666666666666666,
       'recall': 0.0025020850708924102,
       'support': 2398.0},
 'accuracy': 0.6032140490390987,
 'macro avg': {'f1-score': 0.37859205700909854,
               'precision': 0.634892981582877,
               'recall': 0.5008387280769525,
               'support': 6036.0},
 'weighted avg': {'f1-score': 0.4553435781215814,
                  'precision': 0.6283655843820986,
                  'recall': 0.6032140490390987,
                  'support': 6036.0}}


In [37]:
end_time = time.time()
print(f"Total execution time: {end_time - start_time:.2f} seconds")

Total execution time: 127.23 seconds


In [38]:
client.close()
cluster.close()

[2025-05-13 01:32:56,806][INFO    ][coiled] Cluster 876327 deleted successfully.
