In [1]:

# check xgboost version
import xgboost
print(xgboost.__version__)


2.1.2


In [2]:
# !pip install xgboost
# !pip install matplotlib
# !pip install seaborn
# !pip install scikit-learn


In [3]:
import numpy as np
import copy as cp
import matplotlib.pyplot as plt

import seaborn as sns
from typing import Tuple
from sklearn.metrics import confusion_matrix

def cross_val_predict(model, kfold, X, y ) -> Tuple[np.array, np.array, np.array]:

    model_ = cp.deepcopy(model)
    
    no_classes = len(np.unique(y))
    
    actual_classes = np.empty([0], dtype=int)
    actual_X_val = np.empty([0], dtype=int)
    predicted_classes = np.empty([0], dtype=int)
    predicted_proba = np.empty([0, no_classes]) 

    for train_ndx, test_ndx in kfold.split(X):

        train_X, train_y, test_X, test_y = X[train_ndx], y[train_ndx], X[test_ndx], y[test_ndx]

        actual_classes = np.append(actual_classes, test_y)
        actual_X_val = np.append(actual_X_val,test_X)

        model_.fit(train_X, train_y)
        predicted_classes = np.append(predicted_classes, model_.predict(test_X))

        try:
            predicted_proba = np.append(predicted_proba, model_.predict_proba(test_X), axis=0)
        except:
            predicted_proba = np.append(predicted_proba, np.zeros((len(test_X), no_classes), dtype=float), axis=0)

    return actual_classes, predicted_classes, predicted_proba, actual_X_val

In [4]:
def plot_confusion_matrix(actual_classes, predicted_classes, sorted_labels):

    matrix = confusion_matrix(actual_classes, predicted_classes, labels=sorted_labels)
    
    plt.figure(figsize=(12.8,6))
    sns.heatmap(matrix, annot=True, xticklabels=sorted_labels, yticklabels=sorted_labels, cmap="Blues", fmt="g")
    plt.xlabel('Predicted'); plt.ylabel('Actual'); plt.title('Confusion Matrix')

    plt.show()

In [5]:
# try cross validation
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(1) 
import csv
import sklearn
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold
# linear regression feature importance
# from sklearn.datasets import make_regression
from matplotlib import pyplot
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

# xgboost for feature importance on a classification problem
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from matplotlib import pyplot
# define dataset

data_name_vec = ['Fitting', 'Ade']
for data_name in data_name_vec :
    # data_name = 'Ade'
    XFileName = 'All_dose_X_codon_stim_' + data_name + '.csv'
    yFileName = 'All_dose_y_codon_stim_' + data_name + '.csv'
    SaveFileName = 'All_dose_RandomForest_conf_mat_' + data_name + '.csv'
            # SaveFileName = 'XGBRegression_only_randompara_' + FeatureLig + '_' + Ligand + '_' + DoseSymbol + '.csv'
            # SaveFileName = 'XGBRegression_with_randompara_' + FeatureLig + '_' + Ligand + '_' + DoseSymbol + '.csv'

    X = read_csv(XFileName,header = None)
    #print(X.shape)            
    y = read_csv(yFileName,header = None)
    #print(y.shape)

    # Convert to NumPy arrays
    X_np = X.to_numpy()
    y_np = y.to_numpy()

    # Ensure y is one-dimensional
    if y_np.ndim > 1 and y_np.shape[1] == 1:
        y_np = y_np.flatten()

    # Define the parameter grid for Random Forest
    param_dist = {
        'n_estimators': [100, 250, 500, 750, 1000],
        'max_depth': [None, 5, 10, 15, 20],
        'max_features': ['auto', 'sqrt', 'log2'],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }

    # Set up the cross-validation strategy
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)

    # Create the base model to tune
    rf = RandomForestClassifier(random_state=1)

    # Instantiate the random search model
    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        n_iter=50,  # Number of parameter settings that are sampled
        cv=kfold,
        verbose=1,
        random_state=1,
        n_jobs=-1  # Use all available cores
    )

    # Fit the random search model
    random_search.fit(X_np, y_np)

    # Use the best estimator for predictions
    best_model = random_search.best_estimator_

    # Evaluate using cross-validation
    actual_classes, predicted_classes, _, _ = cross_val_predict(best_model, kfold, X_np, y_np)

    #kfold = KFold(n_splits = 5, shuffle = True, random_state = 1)
    #model = RandomForestClassifier(random_state=1)
    # actual_classes, predicted_classes, _ , actual_X_val= cross_val_predict(model, kfold, X.to_numpy(), y.to_numpy())
    # plot_confusion_matrix(actual_classes, predicted_classes, ["TNF", "Pam3CSK", "CpG", "LPS", "PolyIC"])
    #print(actual_classes,predicted_classes)
    confu_mat = confusion_matrix(actual_classes,predicted_classes)
    np.savetxt(SaveFileName,confu_mat,  delimiter=",")  #fmt = '%d',

    print(data_name)
    print(confu_mat)
    #accuracy = model.score(actual_X_val, predicted_classes)
    #print(accuracy)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


110 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise Invali

Fitting
[[102   1   1  10   3   6   0   0   0   3   0   0  10   0   0]
 [  1 135  37   0   6  44   0   1   1   1   1   0   9   1   1]
 [  1  44 146   0   2  14   0   0   0   0   0   0   1   1   0]
 [ 12   2   0 269   7   4   7  10   0   9   7   1  97  17  37]
 [ 10   4   1  16 318  55   0   9  17   3  12  94  18   9  21]
 [  2  13   4   5  40 454   1  11  16   0   4  14   9   0   4]
 [  2   2   0  40  21   2  30  26  12  10   9  19  53   5   8]
 [  7   0   0  24  55  29  23  67  22   4  13  54  37  13  18]
 [  4   3   0   8  40  38   7  42  47   4  13  53  14   9   9]
 [  4   3   0  11  17   5   5   8   8  18  15   5  41  11   8]
 [  2   0   0   6  54   4   4  29  14  11  30  46  15   9   6]
 [  2   1   0   2 133  10   3  19  21   3  16 166  13  10  13]
 [  7   7   0  55  19   1   4   4   1   6   2   7 392  73  82]
 [  0   2   1  21  21   0   4   6   4   2   2  12 105 199 142]
 [  3   3   1  36  33   3   1   9   4   5   3   8  90 149 200]]
Fitting 5 folds for each of 50 candidates, tot

110 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
58 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise Invali

Ade
[[ 86   0   0   9   1   3   0   0   3   6   0   0  28   0   0]
 [  1  57  63   1   4  70   0   0   0   0   1   0  26   9   6]
 [  1  49  91   4   1  38   0   0   0   0   0   0  17   7   1]
 [ 11   2   1 185  43   3   1   1   0   2   1   1 157  31  40]
 [ 10   3   0  19 322  44   0   7   7   4  12  70  42  22  25]
 [  0  23  16   4  47 435   0  12  13   3   2   5  15   0   2]
 [  7   1   0  51  50   3   0  20   6   5  10  20  56   3   7]
 [  3   2   0  32  96  27   0  55  16   2  17  44  39  12  21]
 [  3   2   1   5  75  37   0  25  32   4  16  41  27  12  11]
 [  8   0   0  11  31   4   2   2   8  10  18   4  48   7   6]
 [  7   0   0   6  64   4   0  11   6   6  46  43  21  10   6]
 [  3   1   0   8 168  22   0  14  13   6  16 120  18  11  12]
 [ 17   5   3  86  46   6   0   1   3   6   3   3 366  56  59]
 [  1   1   1  31  56   5   0   4   1   1   9  15 107 164 125]
 [  4   1   1  59  82   8   0   1   0   3  10  15  97 144 123]]
