In [None]:
# pip install xgboost

In [None]:
from xgboost import XGBRegressor
import pandas as pd 
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import joblib

# Importing excel sheets as dfs, should both have samples in the same order
predictors_df = pd.read_excel('Predictors_Cleaned.xlsx', 0)
    # Rows are samples, columns are predictors
outcomes_df = pd.read_excel('Outcomes_Cleaned.xlsx', 0)
    # Rows are samples, columns are mechanisms

# Dropping unnecessary metadata
X = predictors_df.drop('SAMPLE NAME', axis=1)
Y = outcomes_df.drop('MECHANISM', axis=1) # Full DF must iterate through


# List of columns for iterating
columns = list(Y)

r2_list = []

r2: 0.09026134014129639


In [None]:
for i in columns:
    
    # resetting the r2 and best model value for each column
    r2 = 0
    best_model = XGBRegressor()
    
    # Iterate through common proportions of test/train data
    for j in [0.2, 0.4, 0.6, 0.8]:

        # Split Data
        x_train, x_test, y_train, y_test = train_test_split(X, Y[i], test_size=j) 

        # Iterate through common n_estimators
        for k in [100, 500, 1000]:

            # Create model
            xg = XGBRegressor(n_estimators=k) 
            xg = xg.fit(x_train, y_train)

            # Find model r squared
            predictions = xg.predict(x_test)
            new_r2 = abs(r2_score(y_test, predictions)) # using absolute value for comparision

            # Compare model, if model r2 is over 1 then its very bad fit so we don't save it
            if new_r2 <= 1:
                if new_r2 > r2:
                    r2 = new_r2
                    # Saving the better model
                    best_model = xg
    
    # Finally dumping the best model for the class
    joblib.dump(best_model, f"xgb_{i}.joblib")
    # Storing highest r2 and oob scores
    r2_list.append(r2)
    # Printing highest r2 value and oob
    print(f"{i} r2: {r2}")

In [1]:
# Plotting trees & features
from xgboost import plot_tree
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance

# Load model
loaded_xg = joblib.load("XGBoost/xgb_Betalactams.joblib")
feature_names = X.columns

plt.figure(figsize=(60, 40))  
plot_tree(loaded_xg) 
plt.gcf().set_size_inches(60, 40)
plt.show()

perm_importance = permutation_importance(loaded_xg, x_test, y_test)
sorted_idx = perm_importance.importances_mean.argsort()
plt.barh(feature_names[sorted_idx], perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")

NameError: name 'joblib' is not defined

In [None]:
# SINGLE TEST


# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, Y['Betalactams'], test_size=0.6)
# Create Model
xg = XGBRegressor(n_estimators=1000)
# fit model
xg.fit(X_train, y_train)

# make predictions
predictions = xg.predict(X_test)

# r squared value
r2 = abs(r2_score(y_test, predictions)) # using absolute value for comparision
print(f"r2: {r2}")

r2: 0.1462017297744751
