In [2]:
# Mount Google Drive at /content/drive; the dataset and selected-feature CSV
# paths read later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Hyperparameter Tuning - Sub model 1

## Load datasets (only ATB_OT is loaded for Sub model 1; the other two loads are commented out)

In [3]:
import pandas as pd

# Read the ATB_OT training CSV from the train/balanced folder on the mounted Drive.
ATB_OT = pd.read_csv(
    r'/content/drive/MyDrive/Research/TB_new/Datasets/train/balanced/ATB_OT.csv'
)
# The two sibling datasets are kept for reference only and are not loaded in this notebook.
#PTB_EPTB = pd.read_csv(r'/content/drive/MyDrive/Research/TB_new/Datasets/train/balanced/PTB_EPTB.csv')
#LTB_OT = pd.read_csv(r'/content/drive/MyDrive/Research/TB_new/Datasets/train/balanced/LTB_OT.csv')

## Separate features and target

In [4]:
# Split ATB_OT into its label column (TB_Status) and the remaining feature columns.
y_ATB_OT = ATB_OT['TB_Status']
X_ATB_OT = ATB_OT.drop(columns='TB_Status')

# Same split for the other two datasets (disabled because they are not loaded above).
#X_PTB_EPTB = PTB_EPTB.drop(columns=['TB_Status'])
#y_PTB_EPTB = PTB_EPTB['TB_Status']

#X_LTB_OT = LTB_OT.drop(columns=['TB_Status'])
#y_LTB_OT = LTB_OT['TB_Status']

## Functions to select the best models with best hyperparameters

In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

In [None]:
    # NOTE(review): this cell holds only dictionary entries -- there is no
    # enclosing `name = {` ... `}` around them and the cell has no execution
    # count, so it cannot run as written. Presumably these are spare candidates
    # to paste into the `models` dict (they use the same "model"/"params"
    # layout) -- confirm before relying on them.
    "SVM": {
        "model": SVC(probability=True, random_state=42),
        "params": {
            # NOTE(review): if this dict is handed to GridSearchCV unchanged, every
            # value below is combined with every kernel, including kernels that
            # the inline comments say do not use degree/coef0.
            "C": [ 0.01, 0.1, 1],
            "kernel": ["linear", "rbf", "poly", "sigmoid"],
            "gamma": ["scale", "auto", 0.1],
            "degree": [2, 3, 4], # Only for poly kernel
            "coef0": [0, 0.1, 0.5] # Only for poly and sigmoid kernels
        }
    }
    ,
        "Naive Bayes": {
        "model": GaussianNB(),
        # Empty grid: no hyperparameter values are listed for this model.
        "params": {}
    },
    "LightGBM": {
        "model": LGBMClassifier(force_col_wise=True, verbose=-1, random_state=42),
        "params": {
            "learning_rate": [0.01, 0.1, 0.2],
            "n_estimators": [100, 200],
            "max_depth": [-1, 10]
        }
    },
    "XGBoost": {
        "model": XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42, verbosity=0),
        "params": {
            "learning_rate": [0.01, 0.1, 0.2],
            "n_estimators": [100, 200],
            "max_depth": [3, 5]
        }
    },
    "KNN": {
        "model": KNeighborsClassifier(),
        "params": {
            "n_neighbors": [3, 5, 7, 9],
            "weights": ["uniform", "distance"],
            "metric": ["euclidean", "manhattan"]
        }
    }

In [7]:
# Candidate estimators keyed by display name. Each entry pairs an unfitted
# estimator ("model") with the grid of values to search for it ("params").
models = {}
models["Random Forest"] = {
    "model": RandomForestClassifier(random_state=42),
    "params": {
        "n_estimators": [50, 100, 200, 300],
        "max_depth": [None, 5, 10, 20],
        "min_samples_split": [2, 5, 10],
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"],
        "max_features": ["sqrt", "log2", None],
    },
}

In [8]:
def tune_and_evaluate(X_train, y_train, X_test, y_test, model_dict):
    """Grid-search every candidate model and score each winner on the test set.

    For each entry in ``model_dict`` a 5-fold ``GridSearchCV`` scored with
    weighted F1 is fitted on the training data, the ten best parameter
    combinations are printed, and the best estimator found by the search is
    evaluated on the test data (accuracy plus weighted precision/recall/F1).

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``GridSearchCV.fit``.
    X_test, y_test
        Held-out features and labels used for the reported metrics.
    model_dict : dict
        Maps a display name to ``{"model": estimator, "params": param_grid}``.

    Returns
    -------
    tuple of (pandas.DataFrame, dict)
        The frame has one row per model and always carries the columns
        ``Model``, ``Accuracy``, ``Precision``, ``Recall``, ``F1 Score`` and
        ``Best Hyperparameters`` -- also when ``model_dict`` is empty, so that
        callers can index those columns without a ``KeyError``. The dict maps
        each model name to its best estimator.
    """
    result_columns = [
        "Model",
        "Accuracy",
        "Precision",
        "Recall",
        "F1 Score",
        "Best Hyperparameters",
    ]
    results = []
    best_models = {}

    for name, model_info in model_dict.items():
        print(f"\n{'='*70}")
        print(f"Tuning {name}...")
        print(f"{'='*70}")

        grid_search = GridSearchCV(
            model_info["model"],
            model_info["params"],
            cv=5,
            scoring='f1_weighted',
            verbose=0,
            n_jobs=-1
        )
        grid_search.fit(X_train, y_train)

        # Print the best-scoring parameter combinations from the search
        cv_results = pd.DataFrame(grid_search.cv_results_)
        cv_results = cv_results[['params', 'mean_test_score', 'std_test_score']]
        cv_results = cv_results.sort_values(by='mean_test_score', ascending=False)
        print(f"\nTop results for {name}:\n")
        print(cv_results.head(10).to_string(index=False))

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        best_models[name] = best_model

        # Evaluate on test set; zero_division=0 scores a class with no
        # predicted/true samples as 0 instead of emitting a warning.
        y_pred = best_model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        results.append({
            "Model": name,
            "Accuracy": acc,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            # Stored as text so the results frame stays a flat table.
            "Best Hyperparameters": str(best_params)
        })

        print(f"\nBest Parameters for {name}: {best_params}")
        print(f"Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

    # Explicit columns keep the schema stable even when no model was tuned.
    return pd.DataFrame(results, columns=result_columns), best_models


def visualize_results(results_df, title):
    """Show a grouped bar chart of the metric columns, one group per model.

    ``results_df`` is re-indexed by its "Model" column and its
    "Best Hyperparameters" column is dropped before plotting; ``title`` is
    used as the chart title.
    """
    metrics_by_model = results_df.set_index("Model").drop(columns="Best Hyperparameters")
    ax = metrics_by_model.plot(kind='bar', figsize=(12, 6), rot=0, colormap='viridis')
    ax.set_title(title)
    ax.set_ylabel("Score")
    ax.set_xlabel("Model")
    ax.legend(loc='lower right')
    # Horizontal guide lines only, so bar heights are easier to compare.
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

## Comparison between different algorithms for Sub model 1

In [9]:
# Load selected features for sub-model 1
selected_features_path = r'/content/drive/MyDrive/Research/TB_new/Saved_files/selected_features_ATB_OT.csv'
selected_features_df = pd.read_csv(selected_features_path)

# The features and the labels come from two different CSV files and are paired
# purely by row position, so fail fast if their row counts differ.
assert len(selected_features_df) == len(y_ATB_OT), (
    f"Row count mismatch: {len(selected_features_df)} feature rows vs "
    f"{len(y_ATB_OT)} labels"
)

# Hold out 20% for testing. Stratify on the label so the train and test splits
# keep the same TB_Status class proportions instead of leaving them to chance.
X_train_TB_Status, X_test_TB_Status, y_train_TB_Status, y_test_TB_Status = train_test_split(
    selected_features_df, y_ATB_OT, test_size=0.2, random_state=42, stratify=y_ATB_OT
)

# SMOTE for class imbalance (training split only; the test split is left untouched)
smote = SMOTE(random_state=42)
X_train_TB_Status_balanced, y_train_TB_Status_balanced = smote.fit_resample(X_train_TB_Status, y_train_TB_Status)

# Scaling: statistics are fitted on the training split and reused for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_TB_Status_balanced)
X_test_scaled = scaler.transform(X_test_TB_Status)

# Label encoding (fit the encoder on the training data first, then transform both)
label_encoder_TB_Status = LabelEncoder()
y_train_encoded = label_encoder_TB_Status.fit_transform(y_train_TB_Status_balanced)
y_test_encoded = label_encoder_TB_Status.transform(y_test_TB_Status)

# NOTE(review): SMOTE and scaling are applied before the 5-fold CV that
# tune_and_evaluate runs internally, so each CV validation fold has already
# been resampled/scaled together with its training folds. The CV scores may be
# optimistic; the test-set metrics are not affected. Consider moving both
# steps into an imblearn Pipeline -- TODO confirm.

# Tune and evaluate
results_TB_Status, best_models_TB_Status = tune_and_evaluate(X_train_scaled, y_train_encoded, X_test_scaled, y_test_encoded, models)

# Identify and visualize the best model (the results row with the highest test F1)
best_model_overall = results_TB_Status.loc[results_TB_Status["F1 Score"].idxmax()]
print("\nOverall Best Model:")
print(f"Model: {best_model_overall['Model']}")
print(f"Accuracy: {best_model_overall['Accuracy']:.4f}")
print(f"Precision: {best_model_overall['Precision']:.4f}")
print(f"Recall: {best_model_overall['Recall']:.4f}")
print(f"F1 Score: {best_model_overall['F1 Score']:.4f}")
print(f"Best Hyperparameters: {best_model_overall['Best Hyperparameters']}")

visualize_results(results_TB_Status, "Model Performance Comparison for TB_Status")


Tuning Random Forest...


KeyboardInterrupt: 

In [8]:
# Display the selected-feature table to inspect its columns and row count.
selected_features_df

Unnamed: 0,ILMN_1774071,ILMN_1812433,ILMN_2105441,ILMN_1654875,ILMN_1690241
0,0.619698,-0.273872,-0.551774,-0.534285,-0.497743
1,-0.448513,-0.107235,-0.666390,0.013370,-0.287773
2,0.087455,0.093232,-0.392231,0.136658,-0.290243
3,0.912294,-0.091173,-0.415240,-0.356234,-0.197253
4,-0.083857,-0.061896,-0.534328,-0.966808,-0.258965
...,...,...,...,...,...
1813,-0.068894,0.167008,-0.008675,1.059418,-0.744768
1814,-0.001118,0.198399,0.941275,-0.632220,1.454468
1815,-0.298423,-0.422926,-0.596934,0.020540,-0.392951
1816,-0.493525,-0.823420,-0.035475,0.186640,-0.204439
