In [31]:
import os 
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

xgboost model on full CSV file as input (frame is included)

In [32]:
# set random seed
np.random.seed(69)

In [33]:
def load_data(folders):
    """ load csvs as flattened feature vectors with labels """
    data, labels = [], []

    for folder, _ in folders:
        if os.path.exists(folder):
            for file in os.listdir(folder):
                if file.endswith('.csv'):
                    file_path = os.path.join(folder, file)
                    df = pd.read_csv(file_path)
                    label = df.iloc[0, 0]  # extract label
                    features = df.iloc[:, 1:].values.flatten()  # flatten features
                    data.append(features)
                    labels.append(label)
        else:
            print(f"warning: folder {folder} not found.")

    return np.array(data), np.array(labels)

## Train XGboost

In [34]:
# define training folders
train_folders = [
    ('../rat_dance_csv/train/', 1),
    ('../neg_control_csv/train', 0)
]
# load dataset
X, y = load_data(train_folders)

# normalize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# shuffle dataset
indices = np.random.permutation(len(X))
X, y = X[indices], y[indices]

# define xgboost model
clf = xgb.XGBClassifier(
    n_estimators=100, # number of trees
    max_depth=6, # maximum depth of each tree
    learning_rate=0.1, # step size shrinkage to prevent overfitting
    subsample=0.8, # fraction of samples used per tree
    colsample_bytree=0.8,  # fraction of features used per tree
    use_label_encoder=False,  # suppress warning for new sklearn API
    eval_metric='logloss',  # evaluation metric
    random_state=69
)

# stratified k-folds cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)
cross_val_acc = cross_val_score(clf, X, y, cv=cv).mean()

# print results
print(f"cross-validation accuracy: {cross_val_acc:.4f}")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



cross-validation accuracy: 0.8333


## Validate XGBoost

In [35]:
# load validation dataset
val_folders = [
    ('../rat_dance_csv/val', 1),
    ('../neg_control_csv/val', 0)
]

X_val, y_val = load_data(val_folders)
X_val = scaler.transform(X_val)  # apply same scaling

# train on full training set
clf.fit(X, y)

# evaluate on validation set
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
class_report = classification_report(y_val, y_pred, target_names=["negative control (0)", "ratdance (1)"])
conf_matrix = confusion_matrix(y_val, y_pred)

# show results
print(f"\nvalidation performance:")
print(f"accuracy: {accuracy:.4f}")
print("\nclassification report:")
print(class_report)
print("\nconfusion matrix:")
print(conf_matrix)

Parameters: { "use_label_encoder" } are not used.




validation performance:
accuracy: 0.9286

classification report:
                      precision    recall  f1-score   support

negative control (0)       0.88      1.00      0.93         7
        ratdance (1)       1.00      0.86      0.92         7

            accuracy                           0.93        14
           macro avg       0.94      0.93      0.93        14
        weighted avg       0.94      0.93      0.93        14


confusion matrix:
[[7 0]
 [1 6]]


In [36]:
# load validation dataset
test_folders = [
    ('../rat_dance_csv/test', 1),
    ('../neg_control_csv/test', 0)
]

X_val, y_val = load_data(test_folders)
X_val = scaler.transform(X_val)  # apply same scaling

# train on full training set
clf.fit(X, y)

# evaluate on validation set
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
class_report = classification_report(y_val, y_pred, target_names=["negative control (0)", "ratdance (1)"])
conf_matrix = confusion_matrix(y_val, y_pred)

# show results
print(f"\nvalidation performance:")
print(f"accuracy: {accuracy:.4f}")
print("\nclassification report:")
print(class_report)
print("\nconfusion matrix:")
print(conf_matrix)

Parameters: { "use_label_encoder" } are not used.




validation performance:
accuracy: 0.7143

classification report:
                      precision    recall  f1-score   support

negative control (0)       1.00      0.43      0.60         7
        ratdance (1)       0.64      1.00      0.78         7

            accuracy                           0.71        14
           macro avg       0.82      0.71      0.69        14
        weighted avg       0.82      0.71      0.69        14


confusion matrix:
[[3 4]
 [0 7]]


In [37]:
import joblib

joblib.dump(clf, 'xg.pkl')


['xg.pkl']