# Feature-based classifiers

In [None]:
import sklearn
import xgboost

In [None]:
# Map every label name to the column it occupies in the dense label matrix.
label_dict = {name: idx for idx, name in enumerate(label_names)}

# Multi-hot encoding: one row per entry of label_strs, one column per label name.
dense_label_arr = np.zeros((len(label_strs), len(label_names)))
for row_idx, labels in enumerate(label_strs):
    label_idcs = [label_dict[label] for label in labels]
    dense_label_arr[row_idx, label_idcs] = 1

In [None]:
def _split_indices(all_ids, split_ids):
    """Return the positions in ``all_ids`` whose id is a member of ``split_ids``.

    The split ids are put in a set once, so each membership test is O(1)
    instead of re-materializing and scanning a list for every id.
    Assumes the ids are hashable (e.g. str or int) -- TODO confirm.
    """
    wanted = set(split_ids)
    return [idx for idx, img_id in enumerate(all_ids) if img_id in wanted]


# NOTE: despite the "_mask" suffix these are lists of integer positions into
# img_ids, not boolean masks, so the split size is len(...), not sum(...).
train_mask = _split_indices(img_ids, train_ids[0])
print(len(train_mask))
val_mask = _split_indices(img_ids, val_ids[0])
print(len(val_mask))
test_mask = _split_indices(img_ids, test_ids[0])
print(len(test_mask))

In [None]:
# Index features and multi-hot labels with the same per-split positions so
# that row k of the features and row k of the labels refer to the same image.
train_clip_feats, train_labels = img_features[train_mask], dense_label_arr[train_mask]
val_clip_feats, val_labels = img_features[val_mask], dense_label_arr[val_mask]
test_clip_feats, test_labels = img_features[test_mask], dense_label_arr[test_mask]

In [None]:
from sklearn.preprocessing import StandardScaler

def multilabel_acc(classifier, train_feats, train_labels, test_feats, test_labels, v=0):
    """Fit ``classifier`` once per label column and collect the test accuracies.

    The features are standardized with a ``StandardScaler`` that is fitted on
    the training features only and then applied to the test features. The same
    ``classifier`` object is refit for every column of ``train_labels``.

    Args:
        classifier: object exposing ``fit(X, y)`` and ``predict(X)``.
        train_feats: training feature matrix.
        train_labels: 2-D array with one column per label (indexed ``[:, j]``).
        test_feats: test feature matrix.
        test_labels: 2-D array laid out like ``train_labels``.
        v: when truthy, print each label's accuracy next to the larger of
            ``m`` and ``1 - m``, where ``m`` is the mean of that label's
            test column.

    Returns:
        List of per-label test accuracies, in column order.
    """
    normalizer = StandardScaler()
    train_feats = normalizer.fit_transform(train_feats)
    test_feats = normalizer.transform(test_feats)

    accs = []
    n_labels = train_labels.shape[1]
    for label_idx in tqdm(range(n_labels)):
        # One independent binary problem per label column.
        classifier.fit(train_feats, train_labels[:, label_idx])
        predictions = classifier.predict(test_feats)

        target_col = test_labels[:, label_idx]
        hits = predictions == target_col
        test_acc = hits.astype(float).mean()

        if v:
            positive_rate = target_col.mean()
            if positive_rate > 0.5:
                reference = positive_rate
            else:
                reference = 1 - positive_rate
            print(test_acc, reference)

        accs.append(test_acc)
    return accs

In [None]:
from xgboost import XGBClassifier
# XGBoost with the library's default number of trees, using the "gpu_hist"
# tree method; reports the mean of the per-label test accuracies.
classifier = XGBClassifier(
    tree_method="gpu_hist",
    eval_metric="error",
    n_jobs=1,
)
xgb_accs = multilabel_acc(
    classifier,
    train_feats=train_clip_feats.numpy(),
    train_labels=train_labels,
    test_feats=test_clip_feats.numpy(),
    test_labels=test_labels,
)
print(np.mean(xgb_accs))

In [None]:
from xgboost import XGBClassifier
# Same XGBoost setup as a smaller model: only 10 boosting rounds per label.
classifier = XGBClassifier(
    n_estimators=10,
    tree_method="gpu_hist",
    eval_metric="error",
    n_jobs=1,
)
xgb_10_accs = multilabel_acc(
    classifier,
    train_feats=train_clip_feats.numpy(),
    train_labels=train_labels,
    test_feats=test_clip_feats.numpy(),
    test_labels=test_labels,
)
print(np.mean(xgb_10_accs))

In [None]:
# Available solvers: 'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga' (default: 'lbfgs')
from sklearn.linear_model import LogisticRegression
# Linear baseline: one logistic regression per label on the standardized features.
classifier = LogisticRegression(solver="lbfgs", n_jobs=5)
logreg_accs = multilabel_acc(
    classifier,
    train_feats=train_clip_feats.numpy(),
    train_labels=train_labels,
    test_feats=test_clip_feats.numpy(),
    test_labels=test_labels,
    v=0,
)
print(np.mean(logreg_accs))

In [None]:
# Majority-class baseline: for every label, the accuracy obtained by always
# predicting that label's more frequent value on the test split.
# Iterate over the COLUMNS of test_labels (one per label) so the numbers are
# comparable with the per-label accuracies returned by multilabel_acc;
# iterating test_labels directly would walk over samples (rows) instead.
baselines = []
for label_column in test_labels.T:
    positive_rate = label_column.mean()
    if positive_rate < 0.5:
        positive_rate = 1 - positive_rate
    baselines.append(positive_rate)
np.mean(baselines)