In [None]:
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold

# Function to load dataset
def load_data(data_path, labels_path, sep=r'\s+', header=None, label_transform={-1: 0}):
    """Load a feature table and its matching label table.

    Both files are read with ``pd.read_csv`` using the same ``sep`` and
    ``header`` settings; the label values are then remapped through
    ``DataFrame.replace``.

    Parameters
    ----------
    data_path : path or file-like object
        Source of the feature table, passed straight to ``pd.read_csv``.
    labels_path : path or file-like object
        Source of the label table, passed straight to ``pd.read_csv``.
    sep : str, default r'\\s+'
        Column separator (a regex matching runs of whitespace by default).
    header : int or None, default None
        Header row forwarded to ``pd.read_csv``; ``None`` means the files
        have no header line.
    label_transform : dict, default {-1: 0}
        Mapping handed to ``DataFrame.replace`` on the labels. The default
        dict is only read, never mutated, so sharing it across calls is safe.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(data, labels)`` in that order.
    """
    # A raw string is used for the default separator: '\s' is not a valid
    # Python string escape and triggers a warning in non-raw literals.
    data = pd.read_csv(data_path, header=header, sep=sep)
    labels = pd.read_csv(labels_path, header=header, sep=sep).replace(label_transform)
    return data, labels

# Load training and validation datasets
# NOTE(review): the *_path variables are not defined in the cells visible here;
# they must already exist in the kernel namespace — confirm they are set earlier.
train_data, train_labels = load_data(train_data_path, train_labels_path)
valid_data, valid_labels = load_data(valid_data_path, valid_labels_path)

# Combine training and validation datasets into one pool for cross-validation
X = pd.concat([train_data, valid_data], ignore_index=True)
y = pd.concat([train_labels, valid_labels], ignore_index=True)

# Prepare for stratified 5-fold cross-validation.
# StratifiedKFold uses the y passed to kf.split(X, y) to keep the class ratio in
# every fold (plain KFold ignores y). Shuffling with a fixed seed mixes the
# concatenated train/validation rows reproducibly instead of slicing them into
# contiguous blocks.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import optuna.integration.lightgbm as lgb
from lightgbm import early_stopping
from lightgbm import log_evaluation


# Function to train a GBDT model and evaluate the results
def GBDT_evaluation(X_train, y_train, X_valid, y_valid, X_test, y_test):
    """Fit a binary GBDT model and score it on held-out data.

    The model is trained on ``(X_train, y_train)`` with early stopping
    monitored on ``(X_valid, y_valid)``, then evaluated on
    ``(X_test, y_test)``.

    Note that ``lgb`` is imported from ``optuna.integration.lightgbm`` in this
    notebook, so ``lgb.train`` is Optuna's LightGBM integration rather than
    plain ``lightgbm.train``.

    Returns
    -------
    tuple
        ``(accuracy, auc)`` computed on the test set.
    """
    # Booster configuration: binary objective evaluated with log-loss
    booster_params = dict(
        objective='binary',
        metric='binary_logloss',
        verbosity=-1,
        boosting_type='gbdt',
    )

    # Wrap the training and validation splits as LightGBM datasets
    train_set = lgb.Dataset(X_train, label=y_train)
    valid_set = lgb.Dataset(X_valid, label=y_valid)

    # Stop after 100 rounds without validation improvement; log every 100 rounds
    training_callbacks = [early_stopping(100), log_evaluation(100)]
    booster = lgb.train(
        booster_params,
        train_set,
        valid_sets=[valid_set],
        callbacks=training_callbacks,
    )

    # Score the test set at the best iteration found during training
    test_scores = booster.predict(X_test, num_iteration=booster.best_iteration)
    # Round the predicted scores to the nearest integer to get hard labels
    test_labels = np.rint(test_scores)

    return accuracy_score(y_test, test_labels), roc_auc_score(y_test, test_scores)

In [None]:
# Initialize lists to store the per-fold results
accuracies = []
aucs = []

for train_index, test_index in kf.split(X, y):
    # Split the dataset into the current fold
    X_train_all, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train_all, y_test = y.iloc[train_index], y.iloc[test_index]

    # Further split the fold's training portion into LightGBM training and
    # validation sets (the validation part drives early stopping)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train_all, y_train_all, test_size=0.25, random_state=42)

    # Train the model and evaluate on the test set
    accuracy, auc = GBDT_evaluation(X_train, y_train, X_valid, y_valid, X_test, y_test)

    # Store the results
    accuracies.append(accuracy)
    aucs.append(auc)

# Keep GBDT-specific references and report the cross-validated scores: the
# generic `accuracies` / `aucs` names are re-initialized by the PCR cell further
# down, which would otherwise discard these results without showing them.
gbdt_accuracies, gbdt_aucs = accuracies, aucs
print("GBDT accuracy: %0.3f +/- %0.3f" % (np.mean(accuracies), np.std(accuracies)))
print("GBDT AUC: %0.3f +/- %0.3f" % (np.mean(aucs), np.std(aucs)))

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np

# Fresh containers for this model's per-fold scores
accuracies, aucs = [], []

# Pipeline: standardize the features, project onto principal components,
# then fit a logistic-regression classifier on those components
pcr_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('clf', LogisticRegression()),
])

# Search grid: 1..30 principal components crossed with 15 log-spaced values of
# the regularization parameter C between 1e-4 and 1e4
param_grid = dict(
    pca__n_components=range(1, 31),
    clf__C=np.logspace(-4, 4, 15),
)

for train_index, test_index in kf.split(X, y):
    # Carve out the current fold; labels are flattened to 1-D arrays
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = np.ravel(y.iloc[train_index].values)
    y_test = np.ravel(y.iloc[test_index].values)

    # Inner grid search (with its own cross-validation) on the training part only
    search = GridSearchCV(pcr_pipeline, param_grid, n_jobs=-1)
    search.fit(X_train, y_train)

    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)

    # Score the held-out fold with the best pipeline found by the search
    best_model = search.best_estimator_
    y_prob = best_model.predict_proba(X_test)[:, 1]
    y_pred = best_model.predict(X_test)

    # Compute and record this fold's metrics
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    accuracies.append(accuracy)
    aucs.append(auc)