In [1]:

# check xgboost version
import xgboost
print(xgboost.__version__)


2.1.2


In [2]:
# !pip install xgboost
# !pip install matplotlib
# !pip install seaborn
# !pip install scikit-learn


In [3]:
import numpy as np
import copy as cp
import matplotlib.pyplot as plt

import seaborn as sns
from typing import Tuple
from sklearn.metrics import confusion_matrix

def cross_val_predict(model, kfold, X, y ) -> Tuple[np.array, np.array, np.array]:

    model_ = cp.deepcopy(model)
    
    no_classes = len(np.unique(y))
    
    actual_classes = np.empty([0], dtype=int)
    actual_X_val = np.empty([0], dtype=int)
    predicted_classes = np.empty([0], dtype=int)
    predicted_proba = np.empty([0, no_classes]) 

    for train_ndx, test_ndx in kfold.split(X):

        train_X, train_y, test_X, test_y = X[train_ndx], y[train_ndx], X[test_ndx], y[test_ndx]

        actual_classes = np.append(actual_classes, test_y)
        actual_X_val = np.append(actual_X_val,test_X)

        model_.fit(train_X, train_y)
        predicted_classes = np.append(predicted_classes, model_.predict(test_X))

        try:
            predicted_proba = np.append(predicted_proba, model_.predict_proba(test_X), axis=0)
        except:
            predicted_proba = np.append(predicted_proba, np.zeros((len(test_X), no_classes), dtype=float), axis=0)

    return actual_classes, predicted_classes, predicted_proba, actual_X_val

In [4]:
def plot_confusion_matrix(actual_classes, predicted_classes, sorted_labels):

    matrix = confusion_matrix(actual_classes, predicted_classes, labels=sorted_labels)
    
    plt.figure(figsize=(12.8,6))
    sns.heatmap(matrix, annot=True, xticklabels=sorted_labels, yticklabels=sorted_labels, cmap="Blues", fmt="g")
    plt.xlabel('Predicted'); plt.ylabel('Actual'); plt.title('Confusion Matrix')

    plt.show()

In [7]:
# try cross validation
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(1) 
import csv
import sklearn
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold
# linear regression feature importance
# from sklearn.datasets import make_regression
from matplotlib import pyplot
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

# xgboost for feature importance on a classification problem
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from matplotlib import pyplot
# define dataset

data_name_vec = ['Fitting', 'Ade']
for data_name in data_name_vec :
    # data_name = 'Ade'
    XFileName = 'All_dose_X_codon_stim_' + data_name + '.csv'
    yFileName = 'All_dose_y_codon_stim_' + data_name + '.csv'
    SaveFileName = 'All_dose_RandomForest_conf_mat_' + data_name + '.csv'
            # SaveFileName = 'XGBRegression_only_randompara_' + FeatureLig + '_' + Ligand + '_' + DoseSymbol + '.csv'
            # SaveFileName = 'XGBRegression_with_randompara_' + FeatureLig + '_' + Ligand + '_' + DoseSymbol + '.csv'

    X = read_csv(XFileName,header = None)
    #print(X.shape)            
    y = read_csv(yFileName,header = None)
    #print(y.shape)

    # Convert to NumPy arrays
    X_np = X.to_numpy()
    y_np = y.to_numpy()

    # Ensure y is one-dimensional
    if y_np.ndim > 1 and y_np.shape[1] == 1:
        y_np = y_np.flatten()

    # Define the parameter grid for Random Forest
    param_dist = {
        'n_estimators': [100, 250, 500, 750, 1000],
        'max_depth': [None, 5, 10, 15, 20],
        'max_features': ['auto', 'sqrt', 'log2'],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }

    # Set up the cross-validation strategy
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)

    # Create the base model to tune
    rf = RandomForestClassifier(random_state=1)

    # Instantiate the random search model
    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        n_iter=50,  # Number of parameter settings that are sampled
        cv=kfold,
        verbose=1,
        random_state=1,
        n_jobs=-1  # Use all available cores
    )

    # Fit the random search model
    random_search.fit(X_np, y_np)

    # Use the best estimator for predictions
    best_model = random_search.best_estimator_

    # Evaluate using cross-validation
    actual_classes, predicted_classes, _, _ = cross_val_predict(best_model, kfold, X_np, y_np)

    #kfold = KFold(n_splits = 5, shuffle = True, random_state = 1)
    #model = RandomForestClassifier(random_state=1)
    # actual_classes, predicted_classes, _ , actual_X_val= cross_val_predict(model, kfold, X.to_numpy(), y.to_numpy())
    # plot_confusion_matrix(actual_classes, predicted_classes, ["TNF", "Pam3CSK", "CpG", "LPS", "PolyIC"])
    #print(actual_classes,predicted_classes)
    confu_mat = confusion_matrix(actual_classes,predicted_classes)
    np.savetxt(SaveFileName,confu_mat,  delimiter=",")  #fmt = '%d',

    print(data_name)
    print(confu_mat)
    #accuracy = model.score(actual_X_val, predicted_classes)
    #print(accuracy)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


110 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
58 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise Invali

Fitting
[[101   1   0   4   2   1   0   1   1  10   0   0   8   0   7]
 [  2 140  33   1   1   0   0   1   1   7   1   0   1   6  44]
 [  1  41 148   0   0   0   0   0   0   0   3   0   0   2  14]
 [  5   3   0  21  11   3  10   9   8  42   9   9   8  17   4]
 [  2   0   0  14  32  47   2  21  10  22   8   6   8  52   6]
 [  1   1   0   4  14 163   3  19  22  11   9  14   4 135  12]
 [  2   3   0  10  10  17  25  29  11  52   9  11  39  20   1]
 [  7   0   0   3  15  57  18  62  32  42   5  22  27  51  25]
 [  4   2   0   8  14  46   4  41  51  15  11   7   7  38  43]
 [  5   6   0   5   2   3   7   4   3 389  80  74  61  19   2]
 [  0   2   1   4   3  12   7   7   3 108 208 124  18  24   0]
 [  2   3   1   5   3  10   3   9   2  81 149 207  38  33   2]
 [  9   0   0   9   5   1  12  10   0  98  17  40 269   6   3]
 [ 11   3   1   3  13  93   0   8  14  20  10  24  14 321  52]
 [  3  19   0   0   2  16   1  12  16  10   0   5   3  38 452]]
Fitting 5 folds for each of 50 candidates, tot

110 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
59 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/miniconda3/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise Invali

Ade
[[ 84   0   1   6   0   0   0   0   3  32   0   0   8   1   1]
 [  1  63  66   0   1   0   0   0   1  24   8   6   1   4  63]
 [  1  54  89   0   0   0   0   0   0  18   5   2   3   1  36]
 [  6   0   0  14  10   6   1   1   7  50   6   6  11  36   5]
 [  7   0   0   5  45  45   0  10   5  20   9   4   7  71   2]
 [  3   0   0   4  21 112   0  16  12  20  11  11   5 175  22]
 [  8   1   0   4  10  18   3  19  10  55   5   3  54  46   3]
 [  3   0   1   4  19  45   1  48  18  44  14  16  34  91  28]
 [  3   3   2   6  18  45   1  22  27  23  15   9   7  73  37]
 [ 16   3   2   6   3   3   1   2   3 353  66  46 102  48   6]
 [  1   1   1   2   9   8   0   5   0 104 154 128  38  64   6]
 [  3   1   2   2  11  13   0   1   1  91 130 137  65  82   9]
 [ 11   2   0   3   1   0   0   1   0 151  26  54 187  40   3]
 [  8   3   2   4  15  68   1   7  10  47  24  18  16 321  43]
 [  1  22  12   3   2  11   0  10  14  11   0   1   3  44 443]]


In [5]:
# random forest for feature importance on a classification problem
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import csv
import sklearn
# linear regression feature importance
# from sklearn.datasets import make_regression
from matplotlib import pyplot
from pandas import read_csv
from sklearn.model_selection import train_test_split

# xgboost for feature importance on a classification problem
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from matplotlib import pyplot
# define dataset

# 'Fitting' 'Ade' 'Sampling'
data_name = 'Ade'
XFileName = 'All_dose_X_codon_stim_' + data_name + '.csv'
yFileName = 'All_dose_y_codon_stim_' + data_name + '.csv'
SaveFileName = 'XGBClassification_' + data_name + '.csv'
            # SaveFileName = 'XGBRegression_only_randompara_' + FeatureLig + '_' + Ligand + '_' + DoseSymbol + '.csv'
            # SaveFileName = 'XGBRegression_with_randompara_' + FeatureLig + '_' + Ligand + '_' + DoseSymbol + '.csv'

X = read_csv(XFileName,header = None)
print(X.shape)            
y = read_csv(yFileName,header = None)
print(y.shape)

# X, y = make_classification(n_samples=1000, n_features=3, n_informative=2, n_redundant=1, random_state=1, n_classes = 2)
# print(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

#print(y_train)
#print(y)
# define the model
# define dataset
model = RandomForestClassifier()

# fit the model
y_train = y_train.iloc[:, 0]
print(y_train)

model.fit(X_train, y_train)
# evaluate the model
y_pred = model.predict(X_test)
print(y_pred.shape)
# get importance
importance = model.feature_importances_

# import fitting data:

data_name_2 = 'Fitting'
XFileName_2 = 'All_dose_X_codon_stim_' + data_name + '.csv'
yFileName_2 = 'All_dose_y_codon_stim_' + data_name + '.csv'
SaveFileName_2 = 'XGBClassification_' + data_name + '.csv'
            # SaveFileName = 'XGBRegression_only_randompara_' + FeatureLig + '_' + Ligand + '_' + DoseSymbol + '.csv'
            # SaveFileName = 'XGBRegression_with_randompara_' + FeatureLig + '_' + Ligand + '_' + DoseSymbol + '.csv'

X_2 = read_csv(XFileName,header = None)
print(X_2.shape)            
y_2 = read_csv(yFileName,header = None)
print(y_2.shape)

# X, y = make_classification(n_samples=1000, n_features=3, n_informative=2, n_redundant=1, random_state=1, n_classes = 2)
# print(X)
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size=0.33, random_state=1)

y_pred = model.predict(X_test)

confu_mat = confusion_matrix(y_test,y_pred)
# np.savetxt(SaveFileName,confu_mat,  delimiter=",")  #fmt = '%d',
accuracy = model.score(X_test, y_test)
print(accuracy)
print(confu_mat)

#y_pred_2 = model.predict(X_2)

#confu_mat = confusion_matrix(y_2,y_pred_2)
# np.savetxt(SaveFileName,confu_mat,  delimiter=",")  #fmt = '%d',
#accuracy = model.score(X_2, y_2)
#print(accuracy)
#print(confu_mat)

(5652, 6)
(5652, 1)
1330     5
2776     9
4360    12
1399     6
3884    11
        ..
905      4
5192    14
3980    11
235      1
5157    14
Name: 0, Length: 3786, dtype: int64
(1866,)
(5652, 6)
(5652, 1)
0.33815648445873525
[[ 34   0   0   1   0   0   0   0   3   5   0   0   5   1   1]
 [  0  18  26   0   0   0   0   0   0   7   5   2   0   2  17]
 [  1  19  22   0   0   0   0   0   0   2   1   1   2   0  14]
 [  5   0   0   5   6   1   0   3   2   9   0   3   5   8   2]
 [  1   0   0   3  19  15   1   7   3   4   3   2   1  14   0]
 [  0   0   1   1   7  39   0  12   4   9   3   3   1  48   7]
 [  3   1   0   0   5  10   5  10   2  13   0   5  18  13   1]
 [  1   0   1   1   8  16   3  22   6  16  12   3   6  23  10]
 [  0   3   0   3   7  15   0  10   9   6   7   0   2  21  15]
 [  6   5   1   3   1   2   2   2   3  96  29  19  33  13   1]
 [  0   1   0   0   4   0   1   0   0  28  50  43  11  20   3]
 [  2   0   1   0   5   5   2   3   0  33  61  50  14  17   2]
 [  6   1   1   3  

In [6]:
# random forest for feature importance on a classification problem
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import csv
import sklearn
# linear regression feature importance
# from sklearn.datasets import make_regression
from matplotlib import pyplot
from pandas import read_csv
from sklearn.model_selection import train_test_split

# xgboost for feature importance on a classification problem
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from matplotlib import pyplot
# define dataset

# 'Fitting' 'Ade' 'Sampling'
data_name = 'Fitting'
XFileName = 'All_dose_X_codon_stim_' + data_name + '.csv'
yFileName = 'All_dose_y_codon_stim_' + data_name + '.csv'
SaveFileName = 'XGBClassification_' + data_name + '.csv'
            # SaveFileName = 'XGBRegression_only_randompara_' + FeatureLig + '_' + Ligand + '_' + DoseSymbol + '.csv'
            # SaveFileName = 'XGBRegression_with_randompara_' + FeatureLig + '_' + Ligand + '_' + DoseSymbol + '.csv'

X = read_csv(XFileName,header = None)
print(X.shape)            
y = read_csv(yFileName,header = None)
print(y.shape)

# X, y = make_classification(n_samples=1000, n_features=3, n_informative=2, n_redundant=1, random_state=1, n_classes = 2)
# print(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

#print(y_train)
#print(y)
# define the model
# define dataset
model = RandomForestClassifier()

# fit the model
y_train = y_train.iloc[:, 0]
print(y_train)

model.fit(X_train, y_train)


# import fitting data:

data_name_2 = 'Ade'
XFileName_2 = 'All_dose_X_codon_stim_' + data_name + '.csv'
yFileName_2 = 'All_dose_y_codon_stim_' + data_name + '.csv'
SaveFileName_2 = 'XGBClassification_' + data_name + '.csv'
            # SaveFileName = 'XGBRegression_only_randompara_' + FeatureLig + '_' + Ligand + '_' + DoseSymbol + '.csv'
            # SaveFileName = 'XGBRegression_with_randompara_' + FeatureLig + '_' + Ligand + '_' + DoseSymbol + '.csv'

X_2 = read_csv(XFileName,header = None)
print(X_2.shape)            
y_2 = read_csv(yFileName,header = None)
print(y_2.shape)

# X, y = make_classification(n_samples=1000, n_features=3, n_informative=2, n_redundant=1, random_state=1, n_classes = 2)
# print(X)
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size=0.33, random_state=1)

y_pred = model.predict(X_test)

confu_mat = confusion_matrix(y_test,y_pred)
# np.savetxt(SaveFileName,confu_mat,  delimiter=",")  #fmt = '%d',
accuracy = model.score(X_test, y_test)
print(accuracy)
print(confu_mat)

(5652, 6)
(5652, 1)
1330     5
2776     9
4360    12
1399     6
3884    11
        ..
905      4
5192    14
3980    11
235      1
5157    14
Name: 0, Length: 3786, dtype: int64
(5652, 6)
(5652, 1)
0.42711682743837087
[[ 37   0   0   2   0   0   1   0   0   3   0   0   3   1   3]
 [  0  47   9   1   1   0   0   1   0   2   1   0   1   2  12]
 [  0  17  39   0   0   0   0   0   0   0   2   0   0   0   4]
 [  1   1   0   7   4   0   1   2   3   9   2   5   2   9   3]
 [  0   0   0   6  14  17   4  11   4   6   2   2   0   6   1]
 [  0   0   0   1   6  51   3   8   9   8   4   5   2  33   5]
 [  0   0   0   6   2   8   6  10   6  18   2   3  17   5   3]
 [  1   0   0   1   9  19   6  19  11  12   5   8  13  17   7]
 [  1   0   0   0   4  16   5  12  19   6   4   2   1   9  19]
 [  3   2   0   6   1   4   4   2   2 100  36  28  24   3   1]
 [  0   0   0   2   3   0   5   0   2  29  65  44   4   7   0]
 [  1   1   0   1   3   2   4   4   2  38  57  65   8   7   2]
 [  6   0   0   4   3   0  