In [None]:
from google.colab import drive
import os
# Mount Google Drive so the project folder is reachable from the Colab runtime.
drive.mount('/content/gdrive')
# Switch the working directory to the project folder; the relative paths used
# by the later cells (e.g. 'Gene Data/...') are resolved against it.
%cd /content/gdrive/MyDrive/NCS
# Print the resulting working directory as a sanity check.
path = os.getcwd()
print(path)

In [None]:

import gzip
import pandas as pd
# Quick inspection of one training file (dataset 21050): overall shape, label
# distribution, presence of label 13, and number of missing labels.
file_path = 'Gene Data/21050/data.trn.gz'
# The file is read as gzip-compressed, whitespace-delimited text without a
# header row. The separator is a raw string because '\s' inside a plain
# literal is an invalid escape sequence and triggers a warning on recent
# Python versions.
with gzip.open(file_path, 'rt') as f:
    df = pd.read_csv(f, header=None, sep=r'\s+')

print(df.shape)
# Every column except the last one is a feature; the last column is the label.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

unique_labels = y.unique()
print("Các nhãn duy nhất trong tập dữ liệu:", unique_labels)
num_classes = y.nunique()

label_counts = y.value_counts()
print("Số lượng mẫu cho mỗi nhãn:")
print(label_counts)

# Report how many samples carry label 13, if that label occurs at all.
if 13 in label_counts.index:
    print(f"Số lượng mẫu cho nhãn 13: {label_counts[13]}")
else:
    print("Nhãn 13 không có trong tập dữ liệu.")

# Count rows whose label is NaN.
missing_labels = y.isnull().sum()
print(f"Số lượng mẫu không có nhãn (NaN): {missing_labels}")


In [None]:
# Feature selection with SVM (mutual information), RF, DT
import os
import logging
import warnings
import pandas as pd
import time
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, mutual_info_classif
# Suppress every Python warning raised from here on.
warnings.filterwarnings('ignore')

# (dataset id, number of features to select) pairs processed by this cell.
datasets_and_n_SF = [
    (20685, 545),
    (20711, 111),
    (21050, 72),
    (21122, 271),
    (29354, 28),
    (30784, 171),
    (31312, 213),
    (31552, 79),
    (32537, 96),
    (33315, 483),
    (37364, 59),
    (39582, 644),
    (39716, 124),
    (44077, 227),
    (36895, 39),
]

def load_data(file_path):
    """Read a whitespace-delimited, header-less data file into a DataFrame.

    Args:
        file_path: Path of the file to read.

    Returns:
        The parsed DataFrame (columns are labelled 0..n-1), or None when the
        file does not exist.
    """
    try:
        # Raw string: '\s' inside a plain literal is an invalid escape sequence
        # and triggers a warning on recent Python versions.
        df = pd.read_csv(file_path, header=None, sep=r'\s+')
        logging.info(f"Đọc dữ liệu từ file: {file_path}")
        return df
    except FileNotFoundError:
        logging.error(f"Không tìm thấy file: {file_path}")
        return None

def scale_data(X):
    """Standardize ``X`` with a StandardScaler fitted on ``X`` itself.

    Returns the transformed data and logs that scaling took place.
    """
    standardized = StandardScaler().fit_transform(X)
    logging.info("Dữ liệu đã được chuẩn hóa.")
    return standardized
def select_features_with_rf(X_scaled, y, n_SF, n_estimators=100):
    """Rank features by Random Forest importance and keep the top ``n_SF``.

    Args:
        X_scaled: Feature matrix; it is indexed as ``X_scaled[:, idx]``.
        y: Class labels.
        n_SF: Number of features to keep.
        n_estimators: Number of trees in the forest.

    Returns:
        A 4-tuple ``(X_selected, indices, importances, elapsed_time)``: the
        reduced matrix, the kept column positions (most important first), the
        importance of every feature, and the seconds spent fitting and
        selecting.
    """
    t0 = time.time()

    forest = RandomForestClassifier(n_estimators=n_estimators, random_state=42).fit(X_scaled, y)
    importances = forest.feature_importances_
    # Positions sorted by descending importance, truncated to the requested count.
    ranked = importances.argsort()[::-1]
    top_idx = ranked[:n_SF]
    reduced = X_scaled[:, top_idx]

    duration = time.time() - t0  # end of the timed section
    logging.info(f"Chọn {len(top_idx)} đặc trưng quan trọng nhất bằng Random Forest. Thời gian: {duration:.2f} giây.")
    return reduced, top_idx, importances, duration

def select_features_with_dt(X_scaled, y, n_SF):
    """Rank features by Decision Tree importance and keep the top ``n_SF``.

    Args:
        X_scaled: Feature matrix; it is indexed as ``X_scaled[:, idx]``.
        y: Class labels.
        n_SF: Number of features to keep.

    Returns:
        A 4-tuple ``(X_selected, indices, importances, elapsed_time)``: the
        reduced matrix, the kept column positions (most important first), the
        importance of every feature, and the seconds spent fitting and
        selecting.
    """
    t0 = time.time()  # start of the timed section

    tree = DecisionTreeClassifier(random_state=42).fit(X_scaled, y)
    importances = tree.feature_importances_
    # Positions sorted by descending importance, truncated to the requested count.
    ranked = importances.argsort()[::-1]
    top_idx = ranked[:n_SF]
    reduced = X_scaled[:, top_idx]

    duration = time.time() - t0  # end of the timed section
    logging.info(f"Chọn {len(top_idx)} đặc trưng quan trọng nhất bằng Decision Tree. Thời gian: {duration:.2f} giây.")
    return reduced, top_idx, importances, duration

def select_features_with_svm(X_scaled, y, k):
    """Keep the ``k`` features with the highest mutual information with ``y``.

    Despite the function name and the log message, no SVM is fitted here: the
    selection is a univariate ``SelectKBest`` filter scored with
    ``mutual_info_classif``.

    Args:
        X_scaled: Feature matrix.
        y: Class labels.
        k: Number of features to keep.

    Returns:
        A 3-tuple ``(X_selected, selector, elapsed_time)``: the reduced matrix,
        the fitted ``SelectKBest`` instance (use ``get_support()`` for the
        mask of kept columns), and the seconds spent fitting.
    """
    from functools import partial  # local import keeps this block self-contained

    start_time = time.time()  # start of the timed section
    # The mutual-information estimator is randomized; seed it with the same
    # value the Random Forest / Decision Tree selectors use so that the
    # selected features are reproducible between runs.
    seeded_mutual_info = partial(mutual_info_classif, random_state=42)
    mi = SelectKBest(seeded_mutual_info, k=k)
    X_selected = mi.fit_transform(X_scaled, y)
    elapsed_time = time.time() - start_time  # end of the timed section
    logging.info(f"Chọn {k} đặc trưng quan trọng nhất bằng SVM với Mutual Information. Thời gian: {elapsed_time:.2f} giây.")
    return X_selected, mi, elapsed_time


def save_selected_feature_names_to_csv(X, selected_features, output_directory, filename):
    """Write the labels of the selected columns of ``X`` to a one-column CSV.

    Args:
        X: DataFrame whose ``columns`` provide the feature labels.
        selected_features: Indexer applied to ``X.columns`` (the callers pass
            either integer positions or a boolean support mask).
        output_directory: Directory the CSV is written to.
        filename: Name of the CSV file.
    """
    target_path = os.path.join(output_directory, filename)
    names_frame = pd.DataFrame(X.columns[selected_features], columns=['Selected Features'])
    names_frame.to_csv(target_path, index=False)
    logging.info(f"Tên các đặc trưng đã được chọn lưu vào file: {target_path}")


def save_feature_extraction_times(output_directory, times_dict, filename):
    """Write the timing table ``times_dict`` to ``output_directory/filename`` as CSV.

    ``times_dict`` is converted with ``pd.DataFrame`` and saved without the
    index column.
    """
    destination = os.path.join(output_directory, filename)
    timings = pd.DataFrame(times_dict)
    timings.to_csv(destination, index=False)
    logging.info(f"Thời gian trích đặc trưng lưu vào file: {destination}")

# Run the three feature selectors on every configured dataset and store the
# selected feature names and the selection timings in a per-dataset folder.
for dataset, n_SF in datasets_and_n_SF:
    file_path = f'Gene Data/{dataset}/data.trn.gz'
    output_directory = os.path.join('Thuc Nghiem', 'Selected Features', 'Results', '08.10_ML', str(dataset))
    log_file = os.path.join(output_directory, 'experiment_log.log')
    os.makedirs(output_directory, exist_ok=True)

    # Configure logging BEFORE the first logging call: a module-level
    # logging.info() on an unconfigured root logger installs a default handler,
    # after which a plain basicConfig() is silently ignored. force=True also
    # replaces the handler of the previous iteration, so every dataset really
    # gets its own log file.
    logging.basicConfig(filename=log_file, level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s', force=True)
    logging.info(f"Bắt đầu xử lý dataset: {dataset} với {n_SF} đặc trưng cần chọn.")
    logging.info(f"Bắt đầu quá trình chọn lọc đặc trưng cho dataset {dataset}.")

    df = load_data(file_path)
    if df is not None:
        # Every column except the last one is a feature; the last column is the label.
        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]
        X_scaled = scale_data(X)
        times_dict = {'Method': [], 'Time (seconds)': []}

        # Random Forest importance ranking.
        X_selected_rf, selected_features_rf, importances_rf, rf_time = select_features_with_rf(X_scaled, y, n_SF)
        times_dict['Method'].append('Random Forest')
        times_dict['Time (seconds)'].append(rf_time)
        logging.info(f"Số đặc trưng được chọn với Random Forest: {len(selected_features_rf)}")

        # Decision Tree importance ranking.
        X_selected_dt, selected_features_dt, importances_dt, dt_time = select_features_with_dt(X_scaled, y, n_SF)
        times_dict['Method'].append('Decision Tree')
        times_dict['Time (seconds)'].append(dt_time)
        logging.info(f"Số đặc trưng được chọn với Decision Tree: {len(selected_features_dt)}")

        # Mutual-information filter (labelled "SVM" in file names and logs).
        X_selected_svm, mi_selector, svm_time = select_features_with_svm(X_scaled, y, k=n_SF)
        times_dict['Method'].append('SVM (Mutual Information)')
        times_dict['Time (seconds)'].append(svm_time)
        logging.info(f"Số đặc trưng được chọn với SVM: {mi_selector.get_support().sum()}")

        # Persist the column labels chosen by each method, then the timings.
        save_selected_feature_names_to_csv(X, selected_features_rf, output_directory, filename=f'{n_SF}_rf_selected_feature_names.csv')
        save_selected_feature_names_to_csv(X, selected_features_dt, output_directory, filename=f'{n_SF}_dt_selected_feature_names.csv')
        save_selected_feature_names_to_csv(X, mi_selector.get_support(), output_directory, filename=f'{n_SF}_svm_selected_feature_names.csv')
        save_feature_extraction_times(output_directory, times_dict, filename=f'{n_SF}_feature_extraction_times.csv')

    logging.info(f"Hoàn tất xử lý dataset: {dataset}")



In [None]:
### Train and evaluate with 4 algorithms (SVM, Random Forest, XGBoost, Gradient Boosting)

import os
import pandas as pd
import numpy as np
import logging
import time
import warnings
from sklearn.model_selection import StratifiedKFold, LeaveOneOut
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Suppress every Python warning raised from here on.
warnings.filterwarnings('ignore')

def load_data(file_path):
    """Read a gzip-compressed, whitespace-delimited, header-less data file.

    The files are parsed the same way as in the feature-selection step
    (``header=None`` and a whitespace separator), so columns are labelled
    0..n-1 and match the integer feature labels stored by that step.

    Args:
        file_path: Path of the gzip file to read.

    Returns:
        The parsed DataFrame, or None when reading fails for any reason (the
        error is logged).
    """
    try:
        df = pd.read_csv(file_path, header=None, sep=r'\s+', compression='gzip')
        return df
    except Exception as e:
        logging.error(f"Lỗi khi tải dữ liệu từ file {file_path}: {e}")
        return None

def load_selected_features(selected_features_file):
    """Return the first column of a CSV file as a Python list.

    Any failure while reading is logged and reported by returning None.
    """
    try:
        feature_names = pd.read_csv(selected_features_file).iloc[:, 0].tolist()
    except Exception as e:
        logging.error(f"Lỗi khi tải danh sách đặc trưng từ file {selected_features_file}: {e}")
        return None
    else:
        return feature_names
def scale_data(X):
    """Return ``X`` standardized by a StandardScaler fitted on ``X`` itself."""
    return StandardScaler().fit_transform(X)


def save_results_to_csv(results_df, output_directory, filename):
    """Save ``results_df`` as ``output_directory/filename`` without the index.

    A failure while writing is logged instead of being raised.
    """
    destination = os.path.join(output_directory, filename)
    try:
        results_df.to_csv(destination, index=False)
    except Exception as e:
        logging.error(f"Lỗi khi lưu kết quả vào file {destination}: {e}")
    else:
        logging.info(f"Kết quả đã được lưu vào file {destination}")


def _score_fold(y_true, y_pred):
    """Return (accuracy, f1, recall, precision) for one set of predictions."""
    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='weighted', zero_division=1),
        recall_score(y_true, y_pred, average='weighted', zero_division=1),
        precision_score(y_true, y_pred, average='weighted', zero_division=1),
    )


def train_and_evaluate_models_cv(X, y, n_runs=10):
    """Cross-validate four classifiers and return their averaged scores.

    With at least 300 samples a 10-fold stratified split is used and the
    metrics are averaged over the folds; otherwise leave-one-out is used and
    the metrics are computed once over the pooled out-of-fold predictions
    (a single-sample fold cannot yield a meaningful weighted precision: with
    ``zero_division=1`` it is 1.0 even for a wrong prediction).

    Args:
        X: Feature matrix (array-like, samples x features).
        y: Class labels (array-like or pandas Series).
        n_runs: How many times the whole cross-validation is repeated.

    Returns:
        A DataFrame with one row per model and the columns ``model``,
        ``accuracy``, ``f1_score``, ``recall``, ``precision`` and
        ``training_time`` (seconds spent fitting and predicting over all
        folds), each averaged over the ``n_runs`` repetitions.
    """
    # Positional numpy indexing below must not depend on a pandas index.
    X = np.asarray(X)
    # Encode the labels as 0..K-1; the weighted metrics do not depend on the
    # label values, and contiguous integer labels suit every model.
    classes, y = np.unique(np.asarray(y).ravel(), return_inverse=True)

    # NOTE(review): 'gpu_hist' / gpu_id / use_label_encoder are version-dependent
    # XGBoost options - confirm they match the installed XGBoost release.
    # NOTE(review): XGBoost may still reject a training fold that lacks one of
    # the classes (possible with leave-one-out and singleton classes) - confirm.
    models = {
        'SVM': SVC(C=100000, kernel='rbf', gamma='scale', class_weight='balanced', random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=200, random_state=42),
        'XGBoost (GPU)': XGBClassifier(
            n_estimators=200, learning_rate=0.01, max_depth=3, min_child_weight=1, gamma=1,
            subsample=0.8, colsample_bytree=0.8, objective='multi:softmax', use_label_encoder=False,
            eval_metric='mlogloss', random_state=42, num_class=len(classes),
            tree_method='gpu_hist', gpu_id=0
        ),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
    }

    use_leave_one_out = len(y) < 300
    if not use_leave_one_out:
        logging.info("Sử dụng 10-fold Stratified cross-validation vì số lượng mẫu >= 300.")
        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    else:
        logging.info("Sử dụng leave-one-out cross-validation vì số lượng mẫu < 300.")
        cv = LeaveOneOut()

    metric_names = ['accuracy', 'f1_score', 'recall', 'precision', 'training_time']
    # history[model_name][metric] collects one value per run.
    history = {name: {metric: [] for metric in metric_names} for name in models}

    # NOTE(review): the splitter and every model use fixed seeds, so repeated
    # runs presumably differ only in timing - confirm whether per-run seeds
    # were intended.
    for run in range(n_runs):
        logging.info(f"Chạy lần thứ {run+1}")

        for model_name, model in models.items():
            fold_truth = []
            fold_pred = []

            start_train_time = time.time()
            for train_index, test_index in cv.split(X, y):
                model.fit(X[train_index], y[train_index])
                fold_truth.append(y[test_index])
                fold_pred.append(np.asarray(model.predict(X[test_index])).ravel())
            training_time = time.time() - start_train_time

            if use_leave_one_out:
                # Score all out-of-fold predictions together (see docstring).
                fold_truth = [np.concatenate(fold_truth)]
                fold_pred = [np.concatenate(fold_pred)]

            accuracy, f1, recall, precision = np.mean(
                [_score_fold(truth, pred) for truth, pred in zip(fold_truth, fold_pred)], axis=0
            )
            history[model_name]['accuracy'].append(accuracy)
            history[model_name]['f1_score'].append(f1)
            history[model_name]['recall'].append(recall)
            history[model_name]['precision'].append(precision)
            history[model_name]['training_time'].append(training_time)

    # Average every metric over the runs, separately for each model.
    aggregated_results = {'model': list(models)}
    for metric in metric_names:
        aggregated_results[metric] = [float(np.mean(history[name][metric])) for name in models]

    for i, model_name in enumerate(aggregated_results['model']):
        summary_log_message = (f"{model_name} - Average Accuracy: {aggregated_results['accuracy'][i]*100:.2f}%, "
                               f"Average F1 Score: {aggregated_results['f1_score'][i]*100:.2f}%, "
                               f"Average Recall: {aggregated_results['recall'][i]*100:.2f}%, "
                               f"Average Precision: {aggregated_results['precision'][i]*100:.2f}%, "
                               f"Average Training Time: {aggregated_results['training_time'][i]:.2f} seconds")
        print(summary_log_message)
        logging.info(summary_log_message)

    return pd.DataFrame(aggregated_results)

# (dataset id, number of selected features) pairs to train and evaluate.
# The closing bracket was missing, which made the whole cell a syntax error.
datasets_and_n_SF = [
    (29354, 35), (30784, 42), (31312, 195), (31552, 44),
    (32537, 171), (33315, 2321), (37364, 140),
    (39582, 441), (39716, 118), (44077, 23), (21122, 78),
]

# Train and evaluate the models on the Decision-Tree-selected features of
# every configured dataset and store one result CSV per dataset.
# NOTE(review): the folders here start with '1.Thuc Nghiem/1.Selected Features'
# while the feature-selection cell writes to 'Thuc Nghiem/Selected Features' -
# confirm which location holds the selected-feature files.
for dataset, n_SF in datasets_and_n_SF:
    file_path = f'Gene Data/{dataset}/data.trn.gz'
    selected_features_file = os.path.join('1.Thuc Nghiem', '1.Selected Features', 'Results', '08.10_ML', str(dataset), f'{n_SF}_dt_selected_feature_names.csv')
    output_directory = os.path.join('1.Thuc Nghiem', '1.Selected Features', 'Results', '08.10_ML', str(dataset))
    log_file = os.path.join(output_directory, 'experiment_log_with_selected_features.log')
    os.makedirs(output_directory, exist_ok=True)

    # Configure logging BEFORE the first logging call: a module-level
    # logging.info() on an unconfigured root logger installs a default handler,
    # after which a plain basicConfig() is silently ignored. force=True also
    # replaces the handler of the previous iteration, so every dataset really
    # gets its own log file.
    logging.basicConfig(filename=log_file, level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s', force=True)
    logging.info(f"Bắt đầu xử lý dataset: {dataset} với {n_SF} đặc trưng cần chọn.")
    logging.info(f"Bắt đầu quá trình huấn luyện và đánh giá với các đặc trưng đã chọn cho dataset {dataset}.")

    df = load_data(file_path)
    if df is None:
        logging.error(f"Không thể tải dữ liệu cho dataset {dataset}.")
        continue

    # Every column except the last one is a feature; the last column is the label.
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    selected_features = load_selected_features(selected_features_file)
    if selected_features is None:
        # The loader has already logged the reason; skip this dataset instead
        # of failing on X[None].
        continue

    X_selected = X[selected_features]
    X_scaled = scale_data(X_selected)
    results_df = train_and_evaluate_models_cv(X_scaled, y, n_runs=10)
    save_results_to_csv(results_df, output_directory, f'results_{dataset}_nSF_{n_SF}.csv')

In [None]:
## Grid search for the best SVM parameters; the recorded run reported Best parameters: {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
import os
import logging
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Output locations of this cell: a log file and the CSV holding the
# GridSearchCV results.
output_directory = os.path.join('Thuc Nghiem', 'Selected Features', 'Results', '04.10', '31552')
log_file = os.path.join(output_directory, 'experiment_log.log')
csv_file = os.path.join(output_directory, 'grid_search_results.csv')

os.makedirs(output_directory, exist_ok=True)

# force=True replaces handlers installed by earlier cells; without it
# basicConfig() is a no-op once the root logger is configured and nothing
# would be written to this cell's log file.
logging.basicConfig(filename=log_file, level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s', force=True)

# The original message announced Boruta feature selection, but this cell runs
# a hyper-parameter grid search for the SVM.
logging.info('Bắt đầu thực hiện quá trình tìm tham số tốt nhất cho SVM với GridSearchCV.')

def load_data(file_path):
    """Read a whitespace-delimited, header-less data file into a DataFrame.

    Args:
        file_path: Path of the file to read.

    Returns:
        The parsed DataFrame (columns are labelled 0..n-1), or None when the
        file does not exist.
    """
    try:
        # Raw string: '\s' inside a plain literal is an invalid escape sequence
        # and triggers a warning on recent Python versions.
        df = pd.read_csv(file_path, header=None, sep=r'\s+')
        logging.info(f"Đọc dữ liệu từ file: {file_path}")
        return df
    except FileNotFoundError:
        logging.error(f"Không tìm thấy file: {file_path}")
        return None


def scale_data(X):
    """Standardize ``X`` with a StandardScaler fitted on ``X`` itself.

    Returns the transformed data and logs that scaling took place.
    """
    standardized = StandardScaler().fit_transform(X)
    logging.info("Dữ liệu đã được chuẩn hóa.")
    return standardized

# Define the input explicitly instead of relying on a `file_path` left over
# from an earlier cell.
# NOTE(review): the dataset id is taken from the '31552' results folder used
# by this cell - confirm it is the intended dataset.
file_path = 'Gene Data/31552/data.trn.gz'

df = load_data(file_path)
if df is not None:
    # Features are every column except the last one; the last column is the
    # label. (The previous slice `1:` kept the label inside X and dropped the
    # first feature.)
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training split only and reuse it for the test
    # split, so both splits share the same scaling.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    logging.info("Dữ liệu đã được chuẩn hóa.")

    svc = SVC(kernel='rbf', class_weight='balanced', random_state=42)
    param_grid = {
        'C': [1000, 10000, 100000, 1000000],
        'gamma': [0.01, 0.05, 0.005, 0.001, 0.0001],
        'kernel': ['rbf']
    }
    grid_search = GridSearchCV(svc, param_grid, cv=5, return_train_score=True)
    grid_search.fit(X_train_scaled, y_train)
    logging.info(f"Tham số tốt nhất: {grid_search.best_params_}")
    print("Best parameters:", grid_search.best_params_)

    # Score the refitted best model on the held-out split, which was
    # previously created but never used.
    test_accuracy = grid_search.score(X_test_scaled, y_test)
    logging.info(f"Độ chính xác trên tập kiểm tra: {test_accuracy:.4f}")
    print("Test accuracy:", test_accuracy)

    # Keep the full cross-validation table for later inspection.
    results_df = pd.DataFrame(grid_search.cv_results_)
    results_df.to_csv(csv_file, index=False)
    logging.info(f"Kết quả GridSearchCV đã được ghi vào file: {csv_file}")

else:
    print("Không thể đọc dữ liệu.")




Best parameters: {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
