In [1]:

# Print the installed xgboost version so the environment this notebook ran in
# is recorded together with its outputs.
import xgboost
print(xgboost.__version__)


2.1.2


In [2]:
# !pip install xgboost
# !pip install matplotlib
# !pip install seaborn
# !pip install scikit-learn


In [3]:
import numpy as np
import copy as cp
import matplotlib.pyplot as plt

import seaborn as sns
from typing import Tuple
from sklearn.metrics import confusion_matrix

def cross_val_predict(model, kfold, X, y ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Collect out-of-fold labels, predictions and probabilities for ``model``.

    A deep copy of ``model`` is refitted on the training part of every split
    produced by ``kfold``, so the estimator passed in is never modified.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.  ``predict_proba``
        is optional.
    kfold
        Splitter whose ``split(X)`` yields ``(train_indices, test_indices)``.
    X
        Feature array indexed with the integer index arrays from ``kfold``.
        Assumed to be a 2-D NumPy array with one row per sample -- TODO confirm
        against callers (a pandas DataFrame would not index this way).
    y
        Labels aligned with the rows of ``X``.

    Returns
    -------
    actual_classes : np.ndarray
        True labels of the held-out samples, in the order the folds visit them.
    predicted_classes : np.ndarray
        Predicted labels, in the same order.
    predicted_proba : np.ndarray
        Array of shape ``(n_samples, n_classes)`` whose columns follow
        ``np.unique(y)``.  Rows stay zero when the model offers no usable
        ``predict_proba`` output for that fold.
    actual_X_val : np.ndarray
        Held-out feature rows in the same order, stacked along axis 0 so that
        each row still corresponds to one sample.
    """
    model_ = cp.deepcopy(model)

    # np.unique returns the classes sorted, which fixes the probability column order.
    classes = np.unique(y)
    no_classes = len(classes)

    # Per-fold pieces are gathered in lists and joined once at the end instead
    # of re-allocating the result arrays with np.append on every iteration.
    actual_parts, predicted_parts, proba_parts, X_parts = [], [], [], []

    for train_ndx, test_ndx in kfold.split(X):

        train_X, train_y, test_X, test_y = X[train_ndx], y[train_ndx], X[test_ndx], y[test_ndx]

        model_.fit(train_X, train_y)

        actual_parts.append(np.ravel(test_y))
        # Kept as rows: flattening here would lose the sample/feature layout.
        X_parts.append(np.asarray(test_X))
        predicted_parts.append(np.ravel(model_.predict(test_X)))
        proba_parts.append(_fold_probabilities(model_, test_X, classes))

    if not actual_parts:
        # The splitter produced no folds: return empty results.
        return (
            np.empty([0], dtype=int),
            np.empty([0], dtype=int),
            np.empty([0, no_classes]),
            np.empty([0], dtype=int),
        )

    return (
        np.concatenate(actual_parts),
        np.concatenate(predicted_parts),
        np.concatenate(proba_parts, axis=0),
        np.concatenate(X_parts, axis=0),
    )


def _fold_probabilities(fitted_model, test_X, classes) -> np.ndarray:
    """Return one fold's class probabilities as an ``(n_test, n_classes)`` block.

    Columns follow ``classes``.  An all-zero block is returned when the model
    has no usable ``predict_proba`` or its output cannot be matched to
    ``classes``.
    """
    aligned = np.zeros((len(test_X), len(classes)), dtype=float)

    # getattr with a default also covers estimators that expose predict_proba
    # as a property raising AttributeError when it is unavailable.
    predict_proba = getattr(fitted_model, "predict_proba", None)
    if not callable(predict_proba):
        return aligned

    try:
        raw = np.asarray(predict_proba(test_X), dtype=float)
    except (AttributeError, NotImplementedError):
        # Probabilities are not supported by this model; any other error is a
        # real failure and is allowed to propagate.
        return aligned

    if raw.ndim == 2 and raw.shape[1] == len(classes):
        return raw

    # The training split may have lacked some classes, so the model reports
    # fewer columns.  Place them under the matching global class columns.
    fold_classes = getattr(fitted_model, "classes_", None)
    if (
        raw.ndim == 2
        and fold_classes is not None
        and len(fold_classes) == raw.shape[1]
        and np.isin(fold_classes, classes).all()
    ):
        aligned[:, np.searchsorted(classes, fold_classes)] = raw

    return aligned

`confusion_matrix` returns a matrix whose entry in the i-th row and j-th column is the number of samples whose true label is the i-th class and whose predicted label is the j-th class.


In [4]:
def plot_confusion_matrix(actual_classes, predicted_classes, sorted_labels):
    """Show the confusion matrix of the given labels as an annotated heatmap.

    Rows are the actual classes and columns the predicted classes; both axes
    are ordered and labelled according to ``sorted_labels``.
    """
    counts = confusion_matrix(actual_classes, predicted_classes, labels=sorted_labels)

    plt.figure(figsize=(12.8, 6))
    # seaborn draws on the current axes and hands them back for labelling
    ax = sns.heatmap(
        counts,
        annot=True,
        fmt="g",
        cmap="Blues",
        xticklabels=sorted_labels,
        yticklabels=sorted_labels,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix')

    plt.show()

In [13]:
# try cross validation
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(1) 
import csv
import sklearn
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold
# linear regression feature importance
# from sklearn.datasets import make_regression
from matplotlib import pyplot
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

# xgboost for feature importance on a classification problem
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from matplotlib import pyplot
# define dataset

# Data set identifiers: the feature file, the label file and the output file
# names are all derived from these two strings.
data_name = 'WT'
file_names = 'SimIkBamutClass'
XFileName = file_names + '_X_codon_stim_' + data_name + '.csv'
yFileName = file_names + '_y_codon_stim_' + data_name + '.csv'
SaveFileName = file_names + '_RandomForest_conf_mat_' + data_name + '.csv'

# Both CSV files are read without a header row.
X = read_csv(XFileName, header=None)
y = read_csv(yFileName, header=None)

# NumPy versions of the full data set; this cell only reports their shapes.
X_np, y_np = X.to_numpy(), y.to_numpy()
print(X_np.shape, y_np.shape, sep='\n')

# Split into training and testing sets (33 % held out, seeded split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

# The labels come as a one-column frame; take that column as a Series for fitting.
y_train = y_train.iloc[:, 0]
print(y_train)

# Random forest with default hyper-parameters, fitted on the training split.
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate on the held-out split and store the confusion matrix as CSV.
# savetxt uses its default float format here; fmt='%d' would write integers.
y_pred = model.predict(X_test)
confu_mat = confusion_matrix(y_test, y_pred)
np.savetxt(SaveFileName, confu_mat, delimiter=",")

print(data_name, confu_mat, sep='\n')

(3000, 6)
(3000, 1)
1382    1
23      0
2140    2
1117    1
933     0
       ..
2763    2
905     0
1096    1
235     0
1061    1
Name: 0, Length: 2010, dtype: int64
WT
[[333   5   0]
 [  8 311  10]
 [  4   7 312]]


In [14]:

# Apply the `model` fitted earlier in the notebook to the IkBamut data set and
# store the resulting confusion matrix under its own file name.
XFileName_IkBamut = file_names + '_X_codon_stim_' + 'IkBamut' + '.csv'
yFileName_IkBamut = file_names + '_y_codon_stim_' + 'IkBamut' + '.csv'
SaveFileName_IkBamut = file_names + '_RandomForest_conf_mat_' + 'WT_Train_IkBamut' + '.csv'

# Load features and labels (no header row), reporting each shape as it is read.
X_IkBamut = read_csv(XFileName_IkBamut, header=None)
print(X_IkBamut.shape)
y_IkBamut = read_csv(yFileName_IkBamut, header=None)
print(y_IkBamut.shape)

# Convert to NumPy arrays.
X_np_IkBamut, y_np_IkBamut = X_IkBamut.to_numpy(), y_IkBamut.to_numpy()

# A label array with a single column is collapsed to one dimension.
y_np_IkBamut = (
    y_np_IkBamut.flatten()
    if y_np_IkBamut.ndim > 1 and y_np_IkBamut.shape[1] == 1
    else y_np_IkBamut
)

y_pred_IkBamut = model.predict(X_np_IkBamut)

# savetxt uses its default float format here; fmt='%d' would write integers.
confu_mat = confusion_matrix(y_np_IkBamut, y_pred_IkBamut)
np.savetxt(SaveFileName_IkBamut, confu_mat, delimiter=",")

print('IkBamut', confu_mat, sep='\n')

(3000, 6)
(3000, 1)
IkBamut
[[702 271  27]
 [  5 966  29]
 [  3  20 977]]
