In [1]:

# check xgboost version
import xgboost
print(xgboost.__version__)


2.1.2


In [2]:
# !pip install xgboost
# !pip install matplotlib
# !pip install seaborn
# !pip install scikit-learn


In [3]:
import numpy as np
import copy as cp
import matplotlib.pyplot as plt

import seaborn as sns
from typing import Tuple
from sklearn.metrics import confusion_matrix

def cross_val_predict(model, kfold, X, y ) -> Tuple[np.array, np.array, np.array]:

    model_ = cp.deepcopy(model)
    
    no_classes = len(np.unique(y))
    
    actual_classes = np.empty([0], dtype=int)
    actual_X_val = np.empty([0], dtype=int)
    predicted_classes = np.empty([0], dtype=int)
    predicted_proba = np.empty([0, no_classes]) 

    for train_ndx, test_ndx in kfold.split(X):

        train_X, train_y, test_X, test_y = X[train_ndx], y[train_ndx], X[test_ndx], y[test_ndx]

        actual_classes = np.append(actual_classes, test_y)
        actual_X_val = np.append(actual_X_val,test_X)

        model_.fit(train_X, train_y)
        predicted_classes = np.append(predicted_classes, model_.predict(test_X))

        try:
            predicted_proba = np.append(predicted_proba, model_.predict_proba(test_X), axis=0)
        except:
            predicted_proba = np.append(predicted_proba, np.zeros((len(test_X), no_classes), dtype=float), axis=0)

    return actual_classes, predicted_classes, predicted_proba, actual_X_val

In [4]:
def plot_confusion_matrix(actual_classes, predicted_classes, sorted_labels):

    matrix = confusion_matrix(actual_classes, predicted_classes, labels=sorted_labels)
    
    plt.figure(figsize=(12.8,6))
    sns.heatmap(matrix, annot=True, xticklabels=sorted_labels, yticklabels=sorted_labels, cmap="Blues", fmt="g")
    plt.xlabel('Predicted'); plt.ylabel('Actual'); plt.title('Confusion Matrix')

    plt.show()

In [10]:
# try cross validation
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(1) 
import csv
import sklearn
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold
# linear regression feature importance
# from sklearn.datasets import make_regression
from matplotlib import pyplot
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

# xgboost for feature importance on a classification problem
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from matplotlib import pyplot
# define dataset

data_name_vec = ['WT', 'IkBamut']
for data_name in data_name_vec :
    file_names = 'SimIkBamutClass'
    XFileName = file_names + '_X_codon_stim_' + data_name + '.csv'
    yFileName = file_names + '_y_codon_stim_' + data_name + '.csv'
    SaveFileName = file_names + '_RandomForest_conf_mat_' + data_name + '.csv'
            # SaveFileName = 'XGBRegression_only_randompara_' + FeatureLig + '_' + Ligand + '_' + DoseSymbol + '.csv'
            # SaveFileName = 'XGBRegression_with_randompara_' + FeatureLig + '_' + Ligand + '_' + DoseSymbol + '.csv'

    X = read_csv(XFileName,header = None)
    #print(X.shape)            
    y = read_csv(yFileName,header = None)
    #print(y.shape)

    # Convert to NumPy arrays
    X_np = X.to_numpy()
    y_np = y.to_numpy()

    print(X_np.shape)
    print(y_np.shape)
    # Ensure y is one-dimensional
    if y_np.ndim > 1 and y_np.shape[1] == 1:
        y_np = y_np.flatten()

    # Define the parameter grid for Random Forest
    param_dist = {
        'n_estimators': [100, 250, 500, 750, 1000],
        'max_depth': [None, 5, 10, 15, 20],
        'max_features': ['auto', 'sqrt', 'log2'],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }

    # Set up the cross-validation strategy
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)

    # Create the base model to tune
    rf = RandomForestClassifier(random_state=1)

    # Instantiate the random search model
    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        n_iter=50,  # Number of parameter settings that are sampled
        cv=kfold,
        verbose=1,
        random_state=1,
        n_jobs=-1  # Use all available cores
    )

    # Fit the random search model
    random_search.fit(X_np, y_np)

    # Use the best estimator for predictions
    best_model = random_search.best_estimator_

    # Evaluate using cross-validation
    actual_classes, predicted_classes, _, _ = cross_val_predict(best_model, kfold, X_np, y_np)

    #kfold = KFold(n_splits = 5, shuffle = True, random_state = 1)
    #model = RandomForestClassifier(random_state=1)
    # actual_classes, predicted_classes, _ , actual_X_val= cross_val_predict(model, kfold, X.to_numpy(), y.to_numpy())
    # plot_confusion_matrix(actual_classes, predicted_classes, ["TNF", "Pam3CSK", "CpG", "LPS", "PolyIC"])
    #print(actual_classes,predicted_classes)
    confu_mat = confusion_matrix(actual_classes,predicted_classes)
    np.savetxt(SaveFileName,confu_mat,  delimiter=",")  #fmt = '%d',

    print(data_name)
    print(confu_mat)
    #accuracy = model.score(actual_X_val, predicted_classes)
    #print(accuracy)

(3000, 6)
(3000, 1)
Fitting 5 folds for each of 50 candidates, totalling 250 fits


110 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise Invali

WT
[[988   8   4]
 [ 22 948  30]
 [  3  13 984]]
(3000, 6)
(3000, 1)
Fitting 5 folds for each of 50 candidates, totalling 250 fits


110 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
66 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise Invali

IkBamut
[[977  18   5]
 [ 26 951  23]
 [  3   6 991]]


In [11]:
# try cross validation
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(1) 
import csv
import sklearn
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold
# linear regression feature importance
# from sklearn.datasets import make_regression
from matplotlib import pyplot
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

# xgboost for feature importance on a classification problem
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from matplotlib import pyplot
# define dataset

data_name_vec = ['WT']
for data_name in data_name_vec :
    file_names = 'SimIkBamutClass'
    XFileName = file_names + '_X_codon_stim_' + data_name + '.csv'
    yFileName = file_names + '_y_codon_stim_' + data_name + '.csv'
    SaveFileName = file_names + '_RandomForest_conf_mat_' + data_name + '.csv'
            # SaveFileName = 'XGBRegression_only_randompara_' + FeatureLig + '_' + Ligand + '_' + DoseSymbol + '.csv'
            # SaveFileName = 'XGBRegression_with_randompara_' + FeatureLig + '_' + Ligand + '_' + DoseSymbol + '.csv'

    X = read_csv(XFileName,header = None)
    #print(X.shape)            
    y = read_csv(yFileName,header = None)
    #print(y.shape)

    # Convert to NumPy arrays
    X_np = X.to_numpy()
    y_np = y.to_numpy()

    print(X_np.shape)
    print(y_np.shape)
    # Ensure y is one-dimensional
    if y_np.ndim > 1 and y_np.shape[1] == 1:
        y_np = y_np.flatten()

    # Define the parameter grid for Random Forest
    param_dist = {
        'n_estimators': [100, 250, 500, 750, 1000],
        'max_depth': [None, 5, 10, 15, 20],
        'max_features': ['auto', 'sqrt', 'log2'],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }

    # Set up the cross-validation strategy
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)

    # Create the base model to tune
    rf = RandomForestClassifier(random_state=1)

    # Instantiate the random search model
    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        n_iter=5,  # Number of parameter settings that are sampled
        cv=kfold,
        verbose=1,
        random_state=1,
        n_jobs=-1  # Use all available cores
    )

    # Fit the random search model
    random_search.fit(X_np, y_np)

    # Use the best estimator for predictions
    best_model = random_search.best_estimator_

    # Evaluate using cross-validation
    actual_classes, predicted_classes, _, _ = cross_val_predict(best_model, kfold, X_np, y_np)
    # predicted_classes = best_model.predict(X_np)
    # actual_classes = y_np

    #kfold = KFold(n_splits = 5, shuffle = True, random_state = 1)
    #model = RandomForestClassifier(random_state=1)
    # actual_classes, predicted_classes, _ , actual_X_val= cross_val_predict(model, kfold, X.to_numpy(), y.to_numpy())
    # plot_confusion_matrix(actual_classes, predicted_classes, ["TNF", "Pam3CSK", "CpG", "LPS", "PolyIC"])
    #print(actual_classes,predicted_classes)
    print(actual_classes.shape)
    print(predicted_classes.shape)
    confu_mat = confusion_matrix(actual_classes,predicted_classes)
    # np.savetxt(SaveFileName,confu_mat,  delimiter=",")  #fmt = '%d',

    print(data_name)
    print(confu_mat)
    #accuracy = model.score(actual_X_val, predicted_classes)
    #print(accuracy)



XFileName_IkBamut = file_names + '_X_codon_stim_' + 'IkBamut' + '.csv'
yFileName_IkBamut = file_names + '_y_codon_stim_' + 'IkBamut' + '.csv'
SaveFileName_IkBamut = file_names + '_RandomForest_conf_mat_' + 'WT_Train_IkBamut' + '.csv'
            # SaveFileName = 'XGBRegression_only_randompara_' + FeatureLig + '_' + Ligand + '_' + DoseSymbol + '.csv'
            # SaveFileName = 'XGBRegression_with_randompara_' + FeatureLig + '_' + Ligand + '_' + DoseSymbol + '.csv'

X_IkBamut = read_csv(XFileName_IkBamut,header = None)
print(X_IkBamut.shape)            
y_IkBamut = read_csv(yFileName_IkBamut,header = None)
print(y_IkBamut.shape)

    # Convert to NumPy arrays
X_np_IkBamut = X_IkBamut.to_numpy()
y_np_IkBamut = y_IkBamut.to_numpy()

    # Ensure y is one-dimensional
if y_np_IkBamut.ndim > 1 and y_np_IkBamut.shape[1] == 1:
    y_np_IkBamut = y_np_IkBamut.flatten()

y_pred_IkBamut = best_model.predict(X_np_IkBamut)
    # Evaluate using cross-validation

    #kfold = KFold(n_splits = 5, shuffle = True, random_state = 1)
    #model = RandomForestClassifier(random_state=1)
    # actual_classes, predicted_classes, _ , actual_X_val= cross_val_predict(model, kfold, X.to_numpy(), y.to_numpy())
    # plot_confusion_matrix(actual_classes, predicted_classes, ["TNF", "Pam3CSK", "CpG", "LPS", "PolyIC"])
    #print(actual_classes,predicted_classes)
confu_mat = confusion_matrix(y_np_IkBamut,y_pred_IkBamut)
np.savetxt(SaveFileName_IkBamut,confu_mat,  delimiter=",")  #fmt = '%d',

print('IkBamut')
print(confu_mat)
    #accuracy = model.score(actual_X_val, predicted_classes)
    #print(accuracy)

(3000, 6)
(3000, 1)
Fitting 5 folds for each of 5 candidates, totalling 25 fits


10 fits failed out of a total of 25.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidPa

(3000,)
(3000,)
WT
[[978  12  10]
 [ 31 919  50]
 [ 10   9 981]]
(3000, 6)
(3000, 1)
IkBamut
[[689 269  42]
 [  4 965  31]
 [  0  21 979]]
