In [15]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

Before running any of this code you will need to run rat_to_CSV.py and get all ratdanceCSV and negative_controlCSV (make sure that they are uncommented at end of the file) once downloaded click through the CSV_preproccessing.ipynb then you are good to run this random forrest classifier.

In [16]:
# set random seed
np.random.seed(69)

In [17]:
# data loader
def load_data(folders):
    """ load csvs as flattened feature vectors with labels """
    data, labels = [], []

    for folder, _ in folders:
        if os.path.exists(folder):
            for file in os.listdir(folder):
                if file.endswith('.csv'):
                    file_path = os.path.join(folder, file)
                    df = pd.read_csv(file_path)
                    label = df.iloc[0, 0]  # extract label
                    features = df.iloc[:, 1:].values.flatten()  # flatten features
                    data.append(features)
                    labels.append(label)
        else:
            print(f"warning: folder {folder} not found.")

    return np.array(data), np.array(labels)

In [None]:
# define training folders
train_folders = [
    ('./rat_dance_csv/train/', 1),  # label 1 for ratdance
    ('./neg_control_csv/train/', 0)  # label 0 for negative control
]

# load dataset
X, y = load_data(train_folders)

# normalize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# shuffle dataset
indices = np.random.permutation(len(X))
X, y = X[indices], y[indices]

# train model with better hyperparameters
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10, # allow trees to grow 
    min_samples_split=5, # require at least 5 samples to split
    min_samples_leaf=2, # each leaf must have at least 2 samples
    max_features='sqrt', # use sqrt(features) to increase randomness
    random_state=69
)

# stratified k-folds cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)
cross_val_acc = cross_val_score(clf, X, y, cv=cv).mean()

# print results
print(f"cross-validation accuracy: {cross_val_acc:.4f}")



ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
# load validation dataset
val_folders = [
    ('./rat_dance_csv/val/', 1),
    ('./neg_control_csv/val/', 0)
]

X_val, y_val = load_data(val_folders)
X_val = scaler.transform(X_val)  # apply same scaling

# train on full training set
clf.fit(X, y)

# evaluate on validation set
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
class_report = classification_report(y_val, y_pred, target_names=["negative control (0)", "ratdance (1)"])
conf_matrix = confusion_matrix(y_val, y_pred)

# show results
print(f"\nvalidation performance:")
print(f"accuracy: {accuracy:.4f}")
print("\nclassification report:")
print(class_report)
print("\nconfusion matrix:")
print(conf_matrix)


validation performance:
accuracy: 0.9286

classification report:
                      precision    recall  f1-score   support

negative control (0)       0.88      1.00      0.93         7
        ratdance (1)       1.00      0.86      0.92         7

            accuracy                           0.93        14
           macro avg       0.94      0.93      0.93        14
        weighted avg       0.94      0.93      0.93        14


confusion matrix:
[[7 0]
 [1 6]]
