In [None]:
import os
# Change the notebook's working directory with the IPython `%cd` magic.
# The later cells use relative paths ('Experiment/...', 'Gene Data/...'),
# which resolve against this directory.
# NOTE(review): presumably a mounted Google Drive folder in Colab — confirm.
%cd /content/gdrive/MyDrive/NCS
# Record and echo the resulting working directory as a sanity check.
path = os.getcwd()
print(path)



In [None]:
# Train an SVM for every dataset and every top-N feature subset; save all results plus the row with the highest test F1 score.
import os
import pandas as pd
import logging
import time
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)

# Dataset identifiers to process; each one is used as a sub-directory name
# under 'Gene Data' and under the results folder.
datasets = [20711, 21050, 21122, 29354, 30784, 31312, 31552, 32537, 33315, 37364, 39582, 39716, 44077]

# Name of the results sub-folder below 'Experiment/Selected Features/Results'.
folders = '01.07.BestF'

def get_svm_model():
    """Return a new, unfitted RBF-kernel SVC with this experiment's fixed hyper-parameters."""
    svm_params = {
        'C': 100000,
        'kernel': 'rbf',
        'gamma': 'scale',
        'class_weight': 'balanced',
        'random_state': 42,
    }
    return SVC(**svm_params)
# For every dataset: load the LIME feature ranking and the expression data,
# then train/evaluate an SVM on the first N ranked features for N = 10..total,
# and write all rows plus the row with the highest test F1 score to CSV.
# NOTE(review): the best row is selected on the held-out test split itself, so
# that score is an optimistic estimate; the cross-validation scores computed
# inside the loop are currently not persisted.
for dataset in datasets:
    logging.info(f"Processing dataset: {dataset}")
    print(f"Processing dataset: {dataset}")

    start_dataset_time = time.time()

    # Per-dataset results directory: it holds the LIME ranking (input) and
    # receives the CSV files written at the end of this iteration.
    output_folder = os.path.join('Experiment', 'Selected Features', 'Results', folders, str(dataset))
    lime_mean_importance_file = os.path.join(output_folder, 'lime_mean_feature_importance.csv')

    if not os.path.exists(lime_mean_importance_file):
        logging.error(f"File {lime_mean_importance_file} not found, skipping dataset {dataset} - Reason: LIME file does not exist.")
        continue

    lime_importance_df = pd.read_csv(lime_mean_importance_file)
    total_features = lime_importance_df['Feature'].nunique()
    # 'Feature' values are used as positional column indices into the data.
    # Assumed to be ordered most-important first — TODO confirm.
    ranked_features = lime_importance_df['Feature'].astype(int).values

    file_path = os.path.join('Gene Data', str(dataset), 'data.trn.gz')
    if not os.path.exists(file_path):
        logging.error(f"File {file_path} not found, skipping dataset {dataset} - Reason: Data file does not exist.")
        continue

    # Whitespace-separated table without a header; the last column is the label.
    # Raw string: '\s' is not a valid escape sequence in a normal string literal.
    df = pd.read_csv(file_path, header=None, sep=r'\s+')
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # Drop classes with a single sample (presumably because the stratified
    # split below needs at least two members per class).
    class_counts = y.value_counts()
    classes_to_keep = class_counts[class_counts >= 2].index
    if len(classes_to_keep) < 2:
        logging.error(f"After removing infrequent classes, there aren't enough classes for classification in dataset {dataset} - Reason: Not enough classes.")
        continue

    keep_mask = y.isin(classes_to_keep)
    X = X[keep_mask]
    y = y[keep_mask]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    end_preprocessing_time = time.time()
    preprocessing_time = end_preprocessing_time - start_dataset_time

    best_result = None
    all_results = []  # One row per evaluated feature count
    total_training_time = 0  # Must be reset per dataset before it is accumulated below

    # These do not depend on the number of selected features, so build them once.
    cv_method = KFold(n_splits=10, shuffle=True, random_state=42) if len(X_train) > 300 else LeaveOneOut()
    scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']

    for top_n_features in range(10, total_features + 1):
        top_features = ranked_features[:top_n_features]

        X_train_top_selected = X_train.iloc[:, top_features]
        X_test_top_selected = X_test.iloc[:, top_features]

        pipeline_cv = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', get_svm_model())
        ])

        # 'Training Time' is the wall-clock time of this cross-validation run.
        # NOTE(review): the per-fold scores in cv_results (test_accuracy,
        # test_precision_weighted, test_recall_weighted, test_f1_weighted)
        # are not written to the result rows — consider persisting them.
        start_train_time = time.time()
        cv_results = cross_validate(
            pipeline_cv, X_train_top_selected, y_train, cv=cv_method, scoring=scoring,
            n_jobs=-1, return_train_score=False
        )
        training_time = time.time() - start_train_time
        total_training_time += training_time

        # Refit on the full training split and score on the held-out split.
        pipeline_final = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', get_svm_model())
        ])
        pipeline_final.fit(X_train_top_selected, y_train)
        y_test_pred = pipeline_final.predict(X_test_top_selected)

        current_result = {
            'Top N Features': top_n_features,
            'Test Accuracy': accuracy_score(y_test, y_test_pred),
            'Test Precision': precision_score(y_test, y_test_pred, average='weighted', zero_division=0),
            'Test Recall': recall_score(y_test, y_test_pred, average='weighted', zero_division=0),
            'Test F1 Score': f1_score(y_test, y_test_pred, average='weighted', zero_division=0),
            'Training Time': training_time,
            'Preprocessing Time (s)': preprocessing_time,
            'Total Training Time (s)': total_training_time,
            'Dataset Processing Time (s)': time.time() - start_dataset_time
        }
        all_results.append(current_result)
        # Strict '>' keeps the smallest feature count among equal F1 scores.
        if best_result is None or current_result['Test F1 Score'] > best_result['Test F1 Score']:
            best_result = current_result

    os.makedirs(output_folder, exist_ok=True)
    all_results_output_file = os.path.join(output_folder, f'SVM_{dataset}_All_Results.csv')
    if all_results:
        pd.DataFrame(all_results).to_csv(all_results_output_file, index=False)
        logging.info(f"All results for dataset {dataset} have been saved to file {all_results_output_file}.")

    # Save the single best row (highest test F1 score).
    results_output_file = os.path.join(output_folder, f'SVM_{dataset}_Best_CV.csv')
    if best_result:
        pd.DataFrame([best_result]).to_csv(results_output_file, index=False)
        logging.info(f"Best result for dataset {dataset} has been saved to file {results_output_file}.")
    else:
        # Reached when fewer than 10 features are ranked, so the loop never ran.
        logging.warning(f"No sufficient result to save for dataset {dataset}.")

    logging.info(f"Preprocessing time for dataset {dataset}: {preprocessing_time:.2f} seconds.")
    logging.info(f"Total training time for dataset {dataset}: {total_training_time:.2f} seconds.")
    logging.info(f"Dataset processing time for dataset {dataset}: {time.time() - start_dataset_time:.2f} seconds.")

logging.info('All datasets have been processed.')


In [None]:
# Run through all directories
import matplotlib.pyplot as plt
import pandas as pd
import os

# Root folder containing one sub-directory of result CSVs per dataset.
base_path = 'Experiment/Selected Features/Results/01.07.BestF'

# Sorted so the figures appear in a stable order (os.listdir order is arbitrary).
directories = sorted(
    d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))
)

# RGBA line colours; cycled when a directory holds more result files than colours.
colors = [(0.8, 0.4, 0.2, 1.0), (0.5, 0.7, 0.4, 1.0), (0.1, 0.6, 0.8, 1.0), (0.7, 0.6, 0.2, 0.6), (0.5, 0.2, 0.6, 1.0)]

# One figure per dataset directory: test F1 score against the number of top
# features, one curve per '*_All_Results.csv' file, with each curve's maximum
# highlighted in red.
for directory in directories:
    path1 = os.path.join(base_path, directory)

    csv_files = sorted(file for file in os.listdir(path1) if file.endswith('_All_Results.csv'))
    if not csv_files:
        # Nothing to plot; also avoids using an undefined/stale maximum below.
        continue

    plt.figure(figsize=(12, 8))
    overall_max_f1 = None  # Highest F1 over every curve drawn in this figure
    for idx, file in enumerate(csv_files):
        file_path = os.path.join(path1, file)
        dff_result = pd.read_csv(file_path)
        if dff_result.empty:
            continue

        max_f1_index = dff_result['Test F1 Score'].idxmax()
        max_f1 = dff_result.loc[max_f1_index, 'Test F1 Score']
        max_f1_feature = dff_result.loc[max_f1_index, 'Top N Features']
        label_name = file.replace('_All_Results.csv', '')

        plt.plot(dff_result['Top N Features'], dff_result['Test F1 Score'], marker='o',
                 label=f"{label_name}", color=colors[idx % len(colors)])

        plt.scatter(max_f1_feature, max_f1, color='red',
                    label=f"{label_name} Max F1: {max_f1:.4f} (Features: {int(max_f1_feature)})", s=100)
        plt.text(max_f1_feature, max_f1, f"{label_name}", fontsize=9, color='black', ha='right', va='bottom')

        if overall_max_f1 is None or max_f1 > overall_max_f1:
            overall_max_f1 = max_f1

    if overall_max_f1 is None:
        # Every result file was empty: discard the blank figure.
        plt.close()
        continue

    plt.title(f"Test F1 Score vs. Top N Features ({directory})", fontsize=16)
    plt.xlabel("Top N Features", fontsize=14)
    plt.ylabel("Test F1 Score", fontsize=14)
    # Dashed reference line at the best F1 across all curves of this figure.
    plt.axhline(y=overall_max_f1, color='gray', linestyle='--', linewidth=0.7)
    plt.legend(fontsize=10, loc='best', bbox_to_anchor=(1, 1))
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()


Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Train with RF model 
import os
import pandas as pd
import logging
import time
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut, cross_validate
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)
# Dataset identifiers to process; each one is used as a sub-directory name
# under 'Gene Data' and under the results folder.
datasets = [20685, 20711, 21050, 21122, 29354, 30784, 31312, 31552, 32537, 33315, 37364, 39582, 39716, 44077]
# Name of the results sub-folder below 'Experiment/Selected Features/Results'.
folders = '01.07.BestF'

def get_rf_model():
    """Return a new, unfitted random forest with this experiment's fixed hyper-parameters."""
    forest_params = {
        'n_estimators': 200,
        'max_depth': 10,
        'class_weight': 'balanced',
        'random_state': 42,
        'n_jobs': -1,
    }
    return RandomForestClassifier(**forest_params)

# For every dataset: load the LIME feature ranking and the expression data,
# then train/evaluate a random forest on the first N ranked features for
# N = 10..total, and write all rows plus the row with the highest test F1
# score to CSV.
# NOTE(review): the best row is selected on the held-out test split itself, so
# that score is an optimistic estimate; the cross-validation scores computed
# inside the loop are currently not persisted.
for dataset in datasets:
    logging.info(f"Processing dataset: {dataset}")
    print(f"Processing dataset: {dataset}")

    start_dataset_time = time.time()

    # Per-dataset results directory: it holds the LIME ranking (input) and
    # receives the CSV files written at the end of this iteration.
    output_folder = os.path.join('Experiment', 'Selected Features', 'Results', folders, str(dataset))
    lime_mean_importance_file = os.path.join(output_folder, 'lime_mean_feature_importance.csv')

    if not os.path.exists(lime_mean_importance_file):
        logging.error(f"File {lime_mean_importance_file} not found, skipping dataset {dataset} - Reason: LIME file does not exist.")
        continue

    lime_importance_df = pd.read_csv(lime_mean_importance_file)
    total_features = lime_importance_df['Feature'].nunique()
    # 'Feature' values are used as positional column indices into the data.
    # Assumed to be ordered most-important first — TODO confirm.
    ranked_features = lime_importance_df['Feature'].astype(int).values

    file_path = os.path.join('Gene Data', str(dataset), 'data.trn.gz')
    if not os.path.exists(file_path):
        logging.error(f"File {file_path} not found, skipping dataset {dataset} - Reason: Data file does not exist.")
        continue

    # Whitespace-separated table without a header; the last column is the label.
    # Raw string: '\s' is not a valid escape sequence in a normal string literal.
    df = pd.read_csv(file_path, header=None, sep=r'\s+')
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # Drop classes with a single sample (presumably because the stratified
    # split below needs at least two members per class).
    class_counts = y.value_counts()
    classes_to_keep = class_counts[class_counts >= 2].index
    if len(classes_to_keep) < 2:
        logging.error(f"After removing infrequent classes, not enough classes for classification in dataset {dataset} - Reason: Not enough classes.")
        continue

    keep_mask = y.isin(classes_to_keep)
    X = X[keep_mask]
    y = y[keep_mask]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    end_preprocessing_time = time.time()
    preprocessing_time = end_preprocessing_time - start_dataset_time

    best_result = None
    all_results = []  # One row per evaluated feature count
    total_training_time = 0

    # These do not depend on the number of selected features, so build them once.
    cv_method = KFold(n_splits=10, shuffle=True, random_state=42) if len(X_train) > 300 else LeaveOneOut()
    scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']

    for top_n_features in range(10, total_features + 1):
        top_features = ranked_features[:top_n_features]

        X_train_top_selected = X_train.iloc[:, top_features]
        X_test_top_selected = X_test.iloc[:, top_features]

        pipeline_cv = Pipeline([
            ('classifier', get_rf_model())
        ])

        # 'Training Time' is the wall-clock time of this cross-validation run.
        # NOTE(review): the per-fold scores in cv_results (test_accuracy,
        # test_precision_weighted, test_recall_weighted, test_f1_weighted)
        # are not written to the result rows — consider persisting them.
        start_train_time = time.time()
        cv_results = cross_validate(
            pipeline_cv, X_train_top_selected, y_train, cv=cv_method, scoring=scoring,
            n_jobs=-1, return_train_score=False
        )
        training_time = time.time() - start_train_time
        total_training_time += training_time

        # Refit on the full training split and score on the held-out split.
        pipeline_final = Pipeline([
            ('classifier', get_rf_model())
        ])
        pipeline_final.fit(X_train_top_selected, y_train)
        y_test_pred = pipeline_final.predict(X_test_top_selected)

        current_result = {
            'Top N Features': top_n_features,
            'Test Accuracy': accuracy_score(y_test, y_test_pred),
            'Test Precision': precision_score(y_test, y_test_pred, average='weighted', zero_division=0),
            'Test Recall': recall_score(y_test, y_test_pred, average='weighted', zero_division=0),
            'Test F1 Score': f1_score(y_test, y_test_pred, average='weighted', zero_division=0),
            'Training Time': training_time,
            'Preprocessing Time (s)': preprocessing_time,
            'Total Training Time (s)': total_training_time,
            'Dataset Processing Time (s)': time.time() - start_dataset_time
        }
        all_results.append(current_result)

        # Strict '>' keeps the smallest feature count among equal F1 scores.
        if best_result is None or current_result['Test F1 Score'] > best_result['Test F1 Score']:
            best_result = current_result

    os.makedirs(output_folder, exist_ok=True)
    all_results_output_file = os.path.join(output_folder, f'RF_{dataset}_All_Results.csv')
    if all_results:
        pd.DataFrame(all_results).to_csv(all_results_output_file, index=False)
        logging.info(f"All results for dataset {dataset} have been saved to file {all_results_output_file}.")

    # Save the single best row (highest test F1 score).
    results_output_file = os.path.join(output_folder, f'RF_{dataset}_Best_CV.csv')
    if best_result:
        pd.DataFrame([best_result]).to_csv(results_output_file, index=False)
        logging.info(f"Best result for dataset {dataset} has been saved to file {results_output_file}.")
    else:
        # Reached when fewer than 10 features are ranked, so the loop never ran.
        logging.warning(f"No sufficient result to save for dataset {dataset}.")

    logging.info(f"Preprocessing time for dataset {dataset}: {preprocessing_time:.2f} seconds.")
    logging.info(f"Total training time for dataset {dataset}: {total_training_time:.2f} seconds.")
    logging.info(f"Dataset processing time for dataset {dataset}: {time.time() - start_dataset_time:.2f} seconds.")

logging.info('All datasets have been processed.')


In [None]:
# Train with XGBoost 

import os
import pandas as pd
import logging
import time
import cupy as cp
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)
# Name of the results sub-folder below 'Experiment/Selected Features/Results'.
folders = '01.07.BestF'
def get_xgboost_model():
    """Return a new, unfitted XGBoost classifier with this experiment's fixed hyper-parameters."""
    booster_params = {
        'n_estimators': 50,
        'random_state': 42,
        'learning_rate': 0.1,
        'max_depth': 10,
        'tree_method': 'hist',
        'device': 'cuda',  # Run on GPU
        'eval_metric': 'mlogloss',
    }
    return XGBClassifier(**booster_params)

# For every dataset: load the LIME feature ranking and the expression data,
# then fit/evaluate an XGBoost classifier on the first N ranked features for
# N = 10..total, and write all rows plus the row with the highest test F1
# score to CSV.
# `datasets` is not assigned in this cell; it is taken from whichever earlier
# cell last defined it.
# NOTE(review): the best row is selected on the held-out test split itself, so
# that score is an optimistic estimate.
for dataset in datasets:
    logging.info(f"Processing dataset: {dataset}")
    print(f"Processing dataset: {dataset}")

    start_dataset_time = time.time()

    # Per-dataset results directory: it holds the LIME ranking (input) and
    # receives the CSV files written at the end of this iteration.
    output_folder = os.path.join('Experiment', 'Selected Features', 'Results', folders, str(dataset))
    lime_mean_importance_file = os.path.join(output_folder, 'lime_mean_feature_importance.csv')

    if not os.path.exists(lime_mean_importance_file):
        logging.error(f"File {lime_mean_importance_file} not found, skipping dataset {dataset} - Reason: LIME file does not exist.")
        continue
    lime_importance_df = pd.read_csv(lime_mean_importance_file)
    total_features = lime_importance_df['Feature'].nunique()
    # 'Feature' values are used as positional column indices into the data.
    # Assumed to be ordered most-important first — TODO confirm.
    ranked_features = lime_importance_df['Feature'].astype(int).values

    file_path = os.path.join('Gene Data', str(dataset), 'data.trn.gz')
    if not os.path.exists(file_path):
        logging.error(f"File {file_path} not found, skipping dataset {dataset} - Reason: Data file does not exist.")
        continue

    # Whitespace-separated table without a header; the last column is the label.
    # Raw string: '\s' is not a valid escape sequence in a normal string literal.
    df = pd.read_csv(file_path, header=None, sep=r'\s+')
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # Drop classes with a single sample (presumably because the stratified
    # split below needs at least two members per class).
    class_counts = y.value_counts()
    classes_to_keep = class_counts[class_counts >= 2].index
    if len(classes_to_keep) < 2:
        logging.error(f"After removing infrequent classes, not enough classes for classification in dataset {dataset} - Reason: Not enough classes.")
        continue

    keep_mask = y.isin(classes_to_keep)
    X = X[keep_mask]
    y = y[keep_mask]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Plain host arrays are handed to the estimator, which is configured with
    # device='cuda' in get_xgboost_model(). No manual GPU copy is made here:
    # copying to the device only to copy back before fit/predict adds work
    # and cannot handle non-numeric labels.
    X_train_values = X_train.to_numpy()
    X_test_values = X_test.to_numpy()

    # Encode the labels as consecutive integers starting at 0.
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train.to_numpy())
    y_test_encoded = label_encoder.transform(y_test.to_numpy())

    end_preprocessing_time = time.time()
    preprocessing_time = end_preprocessing_time - start_dataset_time

    best_result = None
    all_results = []  # One row per evaluated feature count
    total_training_time = 0

    for top_n_features in range(10, total_features + 1):
        top_features = ranked_features[:top_n_features]
        X_train_top_selected = X_train_values[:, top_features]
        X_test_top_selected = X_test_values[:, top_features]

        # No cross-validation is run in this cell: the model is fitted once on
        # the training split and scored on the held-out split.
        model = get_xgboost_model()
        start_train_time = time.time()
        model.fit(X_train_top_selected, y_train_encoded)
        training_time = time.time() - start_train_time
        total_training_time += training_time

        y_test_pred = model.predict(X_test_top_selected)

        current_result = {
            'Top N Features': top_n_features,
            'Test Accuracy': accuracy_score(y_test_encoded, y_test_pred),
            'Test Precision': precision_score(y_test_encoded, y_test_pred, average='weighted', zero_division=0),
            'Test Recall': recall_score(y_test_encoded, y_test_pred, average='weighted', zero_division=0),
            'Test F1 Score': f1_score(y_test_encoded, y_test_pred, average='weighted', zero_division=0),
            'Training Time': training_time,
            'Preprocessing Time (s)': preprocessing_time,
            'Total Training Time (s)': total_training_time,
            'Dataset Processing Time (s)': time.time() - start_dataset_time
        }
        all_results.append(current_result)

        # Strict '>' keeps the smallest feature count among equal F1 scores.
        if best_result is None or current_result['Test F1 Score'] > best_result['Test F1 Score']:
            best_result = current_result

    os.makedirs(output_folder, exist_ok=True)

    all_results_output_file = os.path.join(output_folder, f'XGBoost_{dataset}_All_Results.csv')
    if all_results:
        pd.DataFrame(all_results).to_csv(all_results_output_file, index=False)
        logging.info(f"All results for dataset {dataset} have been saved to file {all_results_output_file}.")

    # Save the single best row (highest test F1 score).
    results_output_file = os.path.join(output_folder, f'XGBoost_{dataset}_Best_CV.csv')
    if best_result:
        pd.DataFrame([best_result]).to_csv(results_output_file, index=False)
        logging.info(f"Best result for dataset {dataset} has been saved to file {results_output_file}.")
    else:
        # Reached when fewer than 10 features are ranked, so the loop never ran.
        logging.warning(f"No sufficient result to save for dataset {dataset}.")

    logging.info(f"Preprocessing time for dataset {dataset}: {preprocessing_time:.2f} seconds.")
    logging.info(f"Total training time for dataset {dataset}: {total_training_time:.2f} seconds.")
    logging.info(f"Dataset processing time for dataset {dataset}: {time.time() - start_dataset_time:.2f} seconds.")

logging.info('All datasets have been processed.')


Đang xử lý dataset: 20685


In [None]:
## Train with gradient Boosting
import os
import pandas as pd
import logging
import time
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from joblib import Parallel, delayed

# Dataset identifiers processed by this cell; each one is used as a
# sub-directory name under 'Gene Data' and under the results folder.
datasets = [20685, 20711]
def get_gradient_boosting_model():
    """Return a new, unfitted gradient-boosting classifier with this experiment's fixed hyper-parameters."""
    gb_params = {
        'n_estimators': 50,
        'learning_rate': 0.01,
        'max_depth': 3,
        'random_state': 42,
    }
    return GradientBoostingClassifier(**gb_params)

def train_and_evaluate_top_n_features(top_n_features, X_train, X_test, y_train, y_test, lime_importance_df, preprocessing_time, start_dataset_time):
    """Fit a gradient-boosting model on the first N ranked features and score it on the test split.

    Args:
        top_n_features: How many leading entries of the 'Feature' column to use.
        X_train, X_test: DataFrames indexed positionally by the feature values.
        y_train, y_test: Labels for the two splits.
        lime_importance_df: DataFrame whose 'Feature' column holds integer
            column positions.
        preprocessing_time: Value copied unchanged into the result row.
        start_dataset_time: time.time() reference used for the elapsed-time column.

    Returns:
        A dict with the feature count, weighted test metrics and timings.
    """
    ranked_columns = lime_importance_df['Feature'].astype(int).values
    chosen_columns = ranked_columns[:top_n_features]
    train_subset = X_train.iloc[:, chosen_columns]
    test_subset = X_test.iloc[:, chosen_columns]

    classifier = get_gradient_boosting_model()
    fit_started = time.time()
    classifier.fit(train_subset, y_train)
    fit_seconds = time.time() - fit_started

    predictions = classifier.predict(test_subset)
    weighted = {'average': 'weighted', 'zero_division': 0}

    # Keys are inserted in the order they should appear as CSV columns.
    row = {'Top N Features': top_n_features}
    row['Test Accuracy'] = accuracy_score(y_test, predictions)
    row['Test Precision'] = precision_score(y_test, predictions, **weighted)
    row['Test Recall'] = recall_score(y_test, predictions, **weighted)
    row['Test F1 Score'] = f1_score(y_test, predictions, **weighted)
    row['Training Time (s)'] = fit_seconds
    row['Preprocessing Time (s)'] = preprocessing_time
    # Same value as 'Training Time (s)': this function only sees its own fit.
    row['Total Training Time (s)'] = fit_seconds
    row['Dataset Processing Time (s)'] = time.time() - start_dataset_time
    return row

# For every dataset: load the LIME feature ranking and the expression data,
# evaluate gradient boosting on the first N ranked features for N = 10..total
# in parallel, and write all rows plus the row with the highest test F1 score
# to CSV.
# `folders` is not assigned in this cell; it is taken from whichever earlier
# cell last defined it.
# NOTE(review): the best row is selected on the held-out test split itself, so
# that score is an optimistic estimate.
for dataset in datasets:
    logging.info(f"Processing dataset: {dataset}")
    print(f"Processing dataset: {dataset}")

    start_dataset_time = time.time()

    # Per-dataset results directory: it holds the LIME ranking (input) and
    # receives the CSV files written at the end of this iteration.
    output_folder = os.path.join('Experiment', 'Selected Features', 'Results', folders, str(dataset))
    lime_mean_importance_file = os.path.join(output_folder, 'lime_mean_feature_importance.csv')

    if not os.path.exists(lime_mean_importance_file):
        logging.error(f"File {lime_mean_importance_file} not found, skipping dataset {dataset}.")
        continue

    lime_importance_df = pd.read_csv(lime_mean_importance_file)
    total_features = lime_importance_df['Feature'].nunique()

    file_path = os.path.join('Gene Data', str(dataset), 'data.trn.gz')
    if not os.path.exists(file_path):
        logging.error(f"File {file_path} not found, skipping dataset {dataset}.")
        continue

    # Whitespace-separated table without a header; the last column is the label.
    # Raw string: '\s' is not a valid escape sequence in a normal string literal.
    df = pd.read_csv(file_path, header=None, sep=r'\s+')
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # Drop classes with a single sample (presumably because the stratified
    # split below needs at least two members per class).
    class_counts = y.value_counts()
    classes_to_keep = class_counts[class_counts >= 2].index
    if len(classes_to_keep) < 2:
        logging.error(f"After removing infrequent classes, not enough classes for classification in dataset {dataset}.")
        continue

    keep_mask = y.isin(classes_to_keep)
    X = X[keep_mask]
    y = y[keep_mask]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Encode the labels as consecutive integers starting at 0.
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    end_preprocessing_time = time.time()
    preprocessing_time = end_preprocessing_time - start_dataset_time

    # One independent fit per feature count, spread over all available cores.
    start_parallel_time = time.time()
    results = Parallel(n_jobs=-1)(
        delayed(train_and_evaluate_top_n_features)(
            top_n_features, X_train, X_test, y_train_encoded, y_test_encoded,
            lime_importance_df, preprocessing_time, start_dataset_time
        )
        for top_n_features in range(10, total_features + 1)
    )
    end_parallel_time = time.time()

    if not results:
        # Fewer than 10 ranked features: nothing was evaluated, so there is no
        # best row to pick (max() on an empty list would raise ValueError).
        logging.warning(f"No sufficient result to save for dataset {dataset}.")
        continue

    os.makedirs(output_folder, exist_ok=True)
    all_results_output_file = os.path.join(output_folder, f'GradientBoosting_{dataset}_All_Results_100.csv')
    pd.DataFrame(results).to_csv(all_results_output_file, index=False)
    logging.info(f"All results have been saved to {all_results_output_file}.")

    # max() returns the first maximum, i.e. the smallest feature count among
    # equal F1 scores, because results are ordered by feature count.
    best_result = max(results, key=lambda x: x['Test F1 Score'])
    best_result_output_file = os.path.join(output_folder, f'GradientBoosting_{dataset}_Best_CV_100.csv')
    pd.DataFrame([best_result]).to_csv(best_result_output_file, index=False)
    logging.info(f"The best result has been saved to {best_result_output_file}.")

    logging.info(f"Preprocessing time for dataset {dataset}: {preprocessing_time:.2f} seconds.")
    logging.info(f"Parallel processing time for dataset {dataset}: {end_parallel_time - start_parallel_time:.2f} seconds.")
    logging.info(f"Total processing time for dataset {dataset}: {time.time() - start_dataset_time:.2f} seconds.")

logging.info('All datasets have been processed.')
